diff --git a/fs/dax.c b/fs/dax.c
index 6bf81f931de3..68eef98cd9c4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1090,7 +1090,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
 static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-		struct iomap *iomap)
+		struct iomap *iomap, struct iomap *srcmap)
 {
 	struct block_device *bdev = iomap->bdev;
 	struct dax_device *dax_dev = iomap->dax_dev;
@@ -1247,7 +1247,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
-	struct iomap iomap = { 0 };
+	struct iomap iomap = { .type = IOMAP_HOLE };
+	struct iomap srcmap = { .type = IOMAP_HOLE };
 	unsigned flags = IOMAP_FAULT;
 	int error, major = 0;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -1292,7 +1293,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * the file system block size to be equal the page size, which means
 	 * that we never have to deal with more than a single extent here.
 	 */
-	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
 	if (iomap_errp)
 		*iomap_errp = error;
 	if (error) {
@@ -1471,7 +1472,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
 	vm_fault_t result = VM_FAULT_FALLBACK;
-	struct iomap iomap = { 0 };
+	struct iomap iomap = { .type = IOMAP_HOLE };
+	struct iomap srcmap = { .type = IOMAP_HOLE };
 	pgoff_t max_pgoff;
 	void *entry;
 	loff_t pos;
@@ -1546,7 +1548,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * to look up our filesystem block.
 	 */
 	pos = (loff_t)xas.xa_index << PAGE_SHIFT;
-	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
+	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
+			&srcmap);
 	if (error)
 		goto unlock_entry;
 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7004ce581a32..467c13ff6b40 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -801,7 +801,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
 
 #ifdef CONFIG_FS_DAX
 static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
-		unsigned flags, struct iomap *iomap)
+		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
 {
 	unsigned int blkbits = inode->i_blkbits;
 	unsigned long first_block = offset >> blkbits;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c3ea2c818542..61987c106511 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1584,7 +1584,6 @@ enum {
 	EXT4_STATE_NO_EXPAND,		/* No space for expansion */
 	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
-	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
 	EXT4_STATE_NEWENTRY,		/* File just added to dir */
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
@@ -2565,8 +2564,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
 			     struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create);
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
-		       struct buffer_head *bh_result, int create);
 int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 			   struct buffer_head *bh, int create);
 int ext4_walk_page_buffers(handle_t *handle,
@@ -3391,6 +3388,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 }
 
 extern const struct iomap_ops ext4_iomap_ops;
+extern const struct iomap_ops ext4_iomap_report_ops;
 
 static inline int ext4_buffer_uptodate(struct buffer_head *bh)
 {
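The ->iomap_begin prototype change above is mechanical for filesystems that do not do copy-on-write: srcmap arrives initialized to IOMAP_HOLE and is simply left untouched, in which case iomap_apply() hands the regular iomap to the actors in its place. A minimal sketch of the updated contract for a hypothetical non-COW filesystem "foofs" (the name and the single-block mapping policy are illustrative, not part of this patch):

static int foofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
	/*
	 * Only COW filesystems fill in srcmap, to describe where the old
	 * data for a read-modify-write cycle should be read from.
	 */
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = offset & ~((loff_t)i_blocksize(inode) - 1);
	iomap->length = i_blocksize(inode);
	iomap->type = IOMAP_HOLE;
	iomap->addr = IOMAP_NULL_ADDR;
	return 0;
}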
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 59f741bedf2f..0e8708b77da6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1765,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	 */
 	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
 		return 0;
-	/*
-	 * The check for IO to unwritten extent is somewhat racy as we
-	 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
-	 * dropping i_data_sem. But reserved blocks should save us in that
-	 * case.
-	 */
+
 	if (ext4_ext_is_unwritten(ex1) &&
-	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
-	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
-	     (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
+	    ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
 		return 0;
 #ifdef AGGRESSIVE_TEST
 	if (ext1_ee_len >= 4)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8d2bbcc2d813..6a7293a5cda2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -29,10 +29,58 @@
 #include
 #include
 #include
+#include
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "truncate.h"
+
+static bool ext4_dio_supported(struct inode *inode)
+{
+	if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
+		return false;
+	if (fsverity_active(inode))
+		return false;
+	if (ext4_should_journal_data(inode))
+		return false;
+	if (ext4_has_inline_data(inode))
+		return false;
+	return true;
+}
+
+static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	ssize_t ret;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!inode_trylock_shared(inode))
+			return -EAGAIN;
+	} else {
+		inode_lock_shared(inode);
+	}
+
+	if (!ext4_dio_supported(inode)) {
+		inode_unlock_shared(inode);
+		/*
+		 * Fall back to buffered I/O if the operation being performed
+		 * on the inode is not supported by direct I/O. The IOCB_DIRECT
+		 * flag needs to be cleared here in order to ensure that the
+		 * direct I/O path within generic_file_read_iter() is not
+		 * taken.
+		 */
+		iocb->ki_flags &= ~IOCB_DIRECT;
+		return generic_file_read_iter(iocb, to);
+	}
+
+	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
+			   is_sync_kiocb(iocb));
+	inode_unlock_shared(inode);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
 
 #ifdef CONFIG_FS_DAX
 static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
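iomap_dio_rw() grows a wait-for-completion argument in this series: passing true forces the call to wait for the I/O to complete even on an asynchronous kiocb, while passing is_sync_kiocb(iocb), as above, preserves the old behaviour of waiting only for synchronous requests. A hedged sketch of the assumed prototype after this change (the write path later in this patch exploits the flag by forcing synchronous completion whenever inode extension or unaligned AIO work must run afterwards):

/* assumed prototype of the updated helper */
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		bool wait_for_completion);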
@@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
 static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
-	if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
 
 	if (!iov_iter_count(to))
 		return 0; /* skip atime */
 
 #ifdef CONFIG_FS_DAX
-	if (IS_DAX(file_inode(iocb->ki_filp)))
+	if (IS_DAX(inode))
 		return ext4_dax_read_iter(iocb, to);
 #endif
+	if (iocb->ki_flags & IOCB_DIRECT)
+		return ext4_dio_read_iter(iocb, to);
+
 	return generic_file_read_iter(iocb, to);
 }
 
@@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static void ext4_unwritten_wait(struct inode *inode)
-{
-	wait_queue_head_t *wq = ext4_ioend_wq(inode);
-
-	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
-}
-
 /*
  * This tests whether the IO in question is block-aligned or not.
  * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
@@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
 
+	if (unlikely(IS_IMMUTABLE(inode)))
+		return -EPERM;
+
 	ret = generic_write_checks(iocb, from);
 	if (ret <= 0)
 		return ret;
 
-	if (unlikely(IS_IMMUTABLE(inode)))
-		return -EPERM;
-
 	/*
 	 * If we have encountered a bitmap-format file, the size limit
 	 * is smaller than s_maxbytes, which is for extent-mapped files.
@@ -180,32 +226,301 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 			return -EFBIG;
 		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
 	}
+
+	ret = file_modified(iocb->ki_filp);
+	if (ret)
+		return ret;
+
 	return iov_iter_count(from);
 }
 
+static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
+					struct iov_iter *from)
+{
+	ssize_t ret;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EOPNOTSUPP;
+
+	inode_lock(inode);
+	ret = ext4_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+
+	current->backing_dev_info = inode_to_bdi(inode);
+	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+	current->backing_dev_info = NULL;
+
+out:
+	inode_unlock(inode);
+	if (likely(ret > 0)) {
+		iocb->ki_pos += ret;
+		ret = generic_write_sync(iocb, ret);
+	}
+
+	return ret;
+}
+
+static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
+					   ssize_t written, size_t count)
+{
+	handle_t *handle;
+	bool truncate = false;
+	u8 blkbits = inode->i_blkbits;
+	ext4_lblk_t written_blk, end_blk;
+
+	/*
+	 * Note that EXT4_I(inode)->i_disksize can get extended up to
+	 * inode->i_size while the I/O was running due to writeback of delalloc
+	 * blocks. But, the code in ext4_iomap_alloc() is careful to use
+	 * zeroed/unwritten extents if this is possible; thus we won't leave
+	 * uninitialized blocks in a file even if we didn't succeed in writing
+	 * as much as we intended.
+	 */
+	WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
+	if (offset + count <= EXT4_I(inode)->i_disksize) {
+		/*
+		 * We need to ensure that the inode is removed from the orphan
+		 * list if it has been added prematurely, due to writeback of
+		 * delalloc blocks.
+		 */
+		if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+
+			if (IS_ERR(handle)) {
+				ext4_orphan_del(NULL, inode);
+				return PTR_ERR(handle);
+			}
+
+			ext4_orphan_del(handle, inode);
+			ext4_journal_stop(handle);
+		}
+
+		return written;
+	}
+
+	if (written < 0)
+		goto truncate;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle)) {
+		written = PTR_ERR(handle);
+		goto truncate;
+	}
+
+	if (ext4_update_inode_size(inode, offset + written))
+		ext4_mark_inode_dirty(handle, inode);
+
+	/*
+	 * We may need to truncate allocated but not written blocks beyond EOF.
+	 */
+	written_blk = ALIGN(offset + written, 1 << blkbits);
+	end_blk = ALIGN(offset + count, 1 << blkbits);
+	if (written_blk < end_blk && ext4_can_truncate(inode))
+		truncate = true;
+
+	/*
+	 * Remove the inode from the orphan list if it has been extended and
+	 * everything went OK.
+	 */
+	if (!truncate && inode->i_nlink)
+		ext4_orphan_del(handle, inode);
+	ext4_journal_stop(handle);
+
+	if (truncate) {
+truncate:
+		ext4_truncate_failed_write(inode);
+		/*
+		 * If the truncate operation failed early, then the inode may
+		 * still be on the orphan list. In that case, we need to try
+		 * to remove the inode from the in-memory linked list.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+	return written;
+}
+
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
+				 int error, unsigned int flags)
+{
+	loff_t offset = iocb->ki_pos;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (error)
+		return error;
+
+	if (size && flags & IOMAP_DIO_UNWRITTEN)
+		return ext4_convert_unwritten_extents(NULL, inode,
+						      offset, size);
+
+	return 0;
+}
+
+static const struct iomap_dio_ops ext4_dio_write_ops = {
+	.end_io = ext4_dio_write_end_io,
+};
+
+static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	ssize_t ret;
+	size_t count;
+	loff_t offset;
+	handle_t *handle;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	bool extend = false, overwrite = false, unaligned_aio = false;
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!inode_trylock(inode))
+			return -EAGAIN;
+	} else {
+		inode_lock(inode);
+	}
+
+	if (!ext4_dio_supported(inode)) {
+		inode_unlock(inode);
+		/*
+		 * Fall back to buffered I/O if the inode does not support
+		 * direct I/O.
+		 */
+		return ext4_buffered_write_iter(iocb, from);
+	}
+
+	ret = ext4_write_checks(iocb, from);
+	if (ret <= 0) {
+		inode_unlock(inode);
+		return ret;
+	}
+
+	/*
+	 * Unaligned asynchronous direct I/O must be serialized among each
+	 * other as the zeroing of partial blocks of two competing unaligned
+	 * asynchronous direct I/O writes can result in data corruption.
+	 */
+	offset = iocb->ki_pos;
+	count = iov_iter_count(from);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+	    !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
+		unaligned_aio = true;
+		inode_dio_wait(inode);
+	}
+
+	/*
+	 * Determine whether the I/O will overwrite allocated and initialized
+	 * blocks. If so, check to see whether it is possible to take the
+	 * dioread_nolock path.
+	 */
+	if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
+	    ext4_should_dioread_nolock(inode)) {
+		overwrite = true;
+		downgrade_write(&inode->i_rwsem);
+	}
+
+	if (offset + count > EXT4_I(inode)->i_disksize) {
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+
+		ret = ext4_orphan_add(handle, inode);
+		if (ret) {
+			ext4_journal_stop(handle);
+			goto out;
+		}
+
+		extend = true;
+		ext4_journal_stop(handle);
+	}
+
+	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
+			   is_sync_kiocb(iocb) || unaligned_aio || extend);
+
+	if (extend)
+		ret = ext4_handle_inode_extension(inode, offset, ret, count);
+
+out:
+	if (overwrite)
+		inode_unlock_shared(inode);
+	else
+		inode_unlock(inode);
+
+	if (ret >= 0 && iov_iter_count(from)) {
+		ssize_t err;
+		loff_t endbyte;
+
+		offset = iocb->ki_pos;
+		err = ext4_buffered_write_iter(iocb, from);
+		if (err < 0)
+			return err;
+
+		/*
+		 * We need to ensure that the pages within the page cache for
+		 * the range covered by this I/O are written to disk and
+		 * invalidated. This is an attempt to preserve the expected
+		 * direct I/O semantics in the case we fall back to buffered
+		 * I/O to complete the remainder of the I/O request.
+		 */
+		ret += err;
+		endbyte = offset + err - 1;
+		err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
+						   offset, endbyte);
+		if (!err)
+			invalidate_mapping_pages(iocb->ki_filp->f_mapping,
+						 offset >> PAGE_SHIFT,
+						 endbyte >> PAGE_SHIFT);
+	}
+
+	return ret;
+}
+
 #ifdef CONFIG_FS_DAX
 static ssize_t
 ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
-	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
+	size_t count;
+	loff_t offset;
+	handle_t *handle;
+	bool extend = false;
+	struct inode *inode = file_inode(iocb->ki_filp);
 
 	if (!inode_trylock(inode)) {
 		if (iocb->ki_flags & IOCB_NOWAIT)
 			return -EAGAIN;
 		inode_lock(inode);
 	}
+
 	ret = ext4_write_checks(iocb, from);
 	if (ret <= 0)
 		goto out;
 
-	ret = file_remove_privs(iocb->ki_filp);
-	if (ret)
-		goto out;
-	ret = file_update_time(iocb->ki_filp);
-	if (ret)
-		goto out;
+	offset = iocb->ki_pos;
+	count = iov_iter_count(from);
+
+	if (offset + count > EXT4_I(inode)->i_disksize) {
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+
+		ret = ext4_orphan_add(handle, inode);
+		if (ret) {
+			ext4_journal_stop(handle);
+			goto out;
+		}
+
+		extend = true;
+		ext4_journal_stop(handle);
+	}
 
 	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+
+	if (extend)
+		ret = ext4_handle_inode_extension(inode, offset, ret, count);
 out:
 	inode_unlock(inode);
 	if (ret > 0)
@@ -218,10 +533,6 @@ static ssize_t
 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
-	int o_direct = iocb->ki_flags & IOCB_DIRECT;
-	int unaligned_aio = 0;
-	int overwrite = 0;
-	ssize_t ret;
 
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
@@ -230,59 +541,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (IS_DAX(inode))
 		return ext4_dax_write_iter(iocb, from);
 #endif
+	if (iocb->ki_flags & IOCB_DIRECT)
+		return ext4_dio_write_iter(iocb, from);
 
-	if (!inode_trylock(inode)) {
-		if (iocb->ki_flags & IOCB_NOWAIT)
-			return -EAGAIN;
-		inode_lock(inode);
-	}
-
-	ret = ext4_write_checks(iocb, from);
-	if (ret <= 0)
-		goto out;
-
-	/*
-	 * Unaligned direct AIO must be serialized among each other as zeroing
-	 * of partial blocks of two competing unaligned AIOs can result in data
-	 * corruption.
-	 */
-	if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
-	    !is_sync_kiocb(iocb) &&
-	    ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
-		unaligned_aio = 1;
-		ext4_unwritten_wait(inode);
-	}
-
-	iocb->private = &overwrite;
-	/* Check whether we do a DIO overwrite or not */
-	if (o_direct && !unaligned_aio) {
-		if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
-			if (ext4_should_dioread_nolock(inode))
-				overwrite = 1;
-		} else if (iocb->ki_flags & IOCB_NOWAIT) {
-			ret = -EAGAIN;
-			goto out;
-		}
-	}
-
-	ret = __generic_file_write_iter(iocb, from);
-	/*
-	 * Unaligned direct AIO must be the only IO in flight. Otherwise
-	 * overlapping aligned IO after unaligned might result in data
-	 * corruption.
-	 */
-	if (ret == -EIOCBQUEUED && unaligned_aio)
-		ext4_unwritten_wait(inode);
-	inode_unlock(inode);
-
-	if (ret > 0)
-		ret = generic_write_sync(iocb, ret);
-
-	return ret;
-
-out:
-	inode_unlock(inode);
-	return ret;
+	return ext4_buffered_write_iter(iocb, from);
 }
 
 #ifdef CONFIG_FS_DAX
@@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
 					maxbytes, i_size_read(inode));
 	case SEEK_HOLE:
 		inode_lock_shared(inode);
-		offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
+		offset = iomap_seek_hole(inode, offset,
+					 &ext4_iomap_report_ops);
 		inode_unlock_shared(inode);
 		break;
 	case SEEK_DATA:
 		inode_lock_shared(inode);
-		offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
+		offset = iomap_seek_data(inode, offset,
+					 &ext4_iomap_report_ops);
 		inode_unlock_shared(inode);
 		break;
 	}
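With the iomap conversion, unwritten-extent conversion no longer happens in get_block callbacks via iocb->private; it hangs off the iomap_dio_ops ->end_io hook (ext4_dio_write_end_io() above), which runs once when the direct I/O completes, with flags carrying IOMAP_DIO_UNWRITTEN if the write dirtied any unwritten extents. A minimal sketch of the hook contract for a hypothetical filesystem (the foofs_ names are illustrative stand-ins):

static int foofs_dio_write_end_io(struct kiocb *iocb, ssize_t size,
		int error, unsigned int flags)
{
	/* a completion error must be propagated unchanged */
	if (error)
		return error;
	/* size is the number of bytes actually transferred */
	if (size && (flags & IOMAP_DIO_UNWRITTEN))
		return foofs_convert_unwritten(file_inode(iocb->ki_filp),
					       iocb->ki_pos, size);
	return 0;
}

static const struct iomap_dio_ops foofs_dio_write_ops = {
	.end_io = foofs_dio_write_end_io,
};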
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5508baa11bb6..e10206e7f4bb 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -80,6 +80,43 @@ static int ext4_sync_parent(struct inode *inode)
 	return ret;
 }
 
+static int ext4_fsync_nojournal(struct inode *inode, bool datasync,
+				bool *needs_barrier)
+{
+	int ret, err;
+
+	ret = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY_ALL))
+		return ret;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return ret;
+
+	err = sync_inode_metadata(inode, 1);
+	if (!ret)
+		ret = err;
+
+	if (!ret)
+		ret = ext4_sync_parent(inode);
+	if (test_opt(inode->i_sb, BARRIER))
+		*needs_barrier = true;
+
+	return ret;
+}
+
+static int ext4_fsync_journal(struct inode *inode, bool datasync,
+			      bool *needs_barrier)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+	tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+
+	if (journal->j_flags & JBD2_BARRIER &&
+	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+		*needs_barrier = true;
+
+	return jbd2_complete_transaction(journal, commit_tid);
+}
+
 /*
  * akpm: A new design for ext4_sync_file().
  *
@@ -91,17 +128,14 @@ static int ext4_sync_parent(struct inode *inode)
  * What we do is just kick off a commit and wait on it.  This will snapshot the
  * inode to disk.
  */
-
 int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	struct inode *inode = file->f_mapping->host;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
 	int ret = 0, err;
-	tid_t commit_tid;
 	bool needs_barrier = false;
+	struct inode *inode = file->f_mapping->host;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
-	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+	if (unlikely(ext4_forced_shutdown(sbi)))
 		return -EIO;
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -111,23 +145,15 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	if (sb_rdonly(inode->i_sb)) {
 		/* Make sure that we read updated s_mount_flags value */
 		smp_rmb();
-		if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+		if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
 			ret = -EROFS;
 		goto out;
 	}
 
-	if (!journal) {
-		ret = __generic_file_fsync(file, start, end, datasync);
-		if (!ret)
-			ret = ext4_sync_parent(inode);
-		if (test_opt(inode->i_sb, BARRIER))
-			goto issue_flush;
-		goto out;
-	}
-
 	ret = file_write_and_wait_range(file, start, end);
 	if (ret)
 		return ret;
+
 	/*
 	 * data=writeback,ordered:
 	 *   The caller's filemap_fdatawrite()/wait will sync the data.
@@ -142,18 +168,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 *  (they were dirtied by commit).  But that's OK - the blocks are
 	 *  safe in-journal, which is all fsync() needs to ensure.
 	 */
-	if (ext4_should_journal_data(inode)) {
+	if (!sbi->s_journal)
+		ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier);
+	else if (ext4_should_journal_data(inode))
 		ret = ext4_force_commit(inode->i_sb);
-		goto out;
-	}
+	else
+		ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
 
-	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
-	if (journal->j_flags & JBD2_BARRIER &&
-	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
-		needs_barrier = true;
-	ret = jbd2_complete_transaction(journal, commit_tid);
 	if (needs_barrier) {
-	issue_flush:
 		err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 		if (!ret)
 			ret = err;
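The ext4_fsync_journal() helper above mirrors the two commit targets ext4 tracks per inode: i_sync_tid covers any metadata change, while i_datasync_tid covers only changes that must reach the disk for data integrity (such as a size extension). Which one is waited on is selected by the caller's datasync flag; from userspace the two paths are reached as follows (an illustrative usage note, not part of the patch):

	fsync(fd);	/* datasync == 0: wait for ei->i_sync_tid */
	fdatasync(fd);	/* datasync == 1: ei->i_datasync_tid suffices */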
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 61052c67952e..381813205f99 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -809,136 +809,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
-/*
- * Get blocks function for the cases that need to start a transaction -
- * generally difference cases of direct IO and DAX IO. It also handles retries
- * in case of ENOSPC.
- */
-static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
-				struct buffer_head *bh_result, int flags)
-{
-	int dio_credits;
-	handle_t *handle;
-	int retries = 0;
-	int ret;
-
-	/* Trim mapping request to maximum we can map at once for DIO */
-	if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
-		bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
-	dio_credits = ext4_chunk_trans_blocks(inode,
-				      bh_result->b_size >> inode->i_blkbits);
-retry:
-	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	ret = _ext4_get_block(inode, iblock, bh_result, flags);
-	ext4_journal_stop(handle);
-
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-	return ret;
-}
-
-/* Get block function for DIO reads and writes to inodes without extents */
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
-		       struct buffer_head *bh, int create)
-{
-	/* We don't expect handle for direct IO */
-	WARN_ON_ONCE(ext4_journal_current_handle());
-
-	if (!create)
-		return _ext4_get_block(inode, iblock, bh, 0);
-	return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
-}
-
-/*
- * Get block function for AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete.
- */
-static int ext4_dio_get_block_unwritten_async(struct inode *inode,
-		sector_t iblock, struct buffer_head *bh_result, int create)
-{
-	int ret;
-
-	/* We don't expect handle for direct IO */
-	WARN_ON_ONCE(ext4_journal_current_handle());
-
-	ret = ext4_get_block_trans(inode, iblock, bh_result,
-				   EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
-	/*
-	 * When doing DIO using unwritten extents, we need io_end to convert
-	 * unwritten extents to written on IO completion. We allocate io_end
-	 * once we spot unwritten extent and store it in b_private. Generic
-	 * DIO code keeps b_private set and furthermore passes the value to
-	 * our completion callback in 'private' argument.
-	 */
-	if (!ret && buffer_unwritten(bh_result)) {
-		if (!bh_result->b_private) {
-			ext4_io_end_t *io_end;
-
-			io_end = ext4_init_io_end(inode, GFP_KERNEL);
-			if (!io_end)
-				return -ENOMEM;
-			bh_result->b_private = io_end;
-			ext4_set_io_unwritten_flag(inode, io_end);
-		}
-		set_buffer_defer_completion(bh_result);
-	}
-
-	return ret;
-}
-
-/*
- * Get block function for non-AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete by ext4_direct_IO_write().
- */
-static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
-		sector_t iblock, struct buffer_head *bh_result, int create)
-{
-	int ret;
-
-	/* We don't expect handle for direct IO */
-	WARN_ON_ONCE(ext4_journal_current_handle());
-
-	ret = ext4_get_block_trans(inode, iblock, bh_result,
-				   EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
-	/*
-	 * Mark inode as having pending DIO writes to unwritten extents.
-	 * ext4_direct_IO_write() checks this flag and converts extents to
-	 * written.
-	 */
-	if (!ret && buffer_unwritten(bh_result))
-		ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-
-	return ret;
-}
-
-static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
-		struct buffer_head *bh_result, int create)
-{
-	int ret;
-
-	ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
-		   inode->i_ino, create);
-	/* We don't expect handle for direct IO */
-	WARN_ON_ONCE(ext4_journal_current_handle());
-
-	ret = _ext4_get_block(inode, iblock, bh_result, 0);
-	/*
-	 * Blocks should have been preallocated! ext4_file_write_iter() checks
-	 * that.
-	 */
-	WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
-
-	return ret;
-}
-
-
 /*
  * `handle' can be NULL if create is zero
  */
@@ -3431,148 +3301,142 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
 	return inode->i_state & I_DIRTY_DATASYNC;
 }
 
-static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
-			    unsigned flags, struct iomap *iomap)
+static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
+			   struct ext4_map_blocks *map, loff_t offset,
+			   loff_t length)
+{
+	u8 blkbits = inode->i_blkbits;
+
+	/*
+	 * Writes that span EOF might trigger an I/O size update on completion,
+	 * so consider them to be dirty for the purpose of O_DSYNC, even if
+	 * there are no other metadata changes being made or pending.
+	 */
+	iomap->flags = 0;
+	if (ext4_inode_datasync_dirty(inode) ||
+	    offset + length > i_size_read(inode))
+		iomap->flags |= IOMAP_F_DIRTY;
+
+	if (map->m_flags & EXT4_MAP_NEW)
+		iomap->flags |= IOMAP_F_NEW;
+
+	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+	iomap->offset = (u64) map->m_lblk << blkbits;
+	iomap->length = (u64) map->m_len << blkbits;
+
+	/*
+	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
+	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
+	 * set. In order for any allocated unwritten extents to be converted
+	 * into written extents correctly within the ->end_io() handler, we
+	 * need to ensure that the iomap->type is set appropriately. Hence, the
+	 * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has
+	 * been set first.
+	 */
+	if (map->m_flags & EXT4_MAP_UNWRITTEN) {
+		iomap->type = IOMAP_UNWRITTEN;
+		iomap->addr = (u64) map->m_pblk << blkbits;
+	} else if (map->m_flags & EXT4_MAP_MAPPED) {
+		iomap->type = IOMAP_MAPPED;
+		iomap->addr = (u64) map->m_pblk << blkbits;
+	} else {
+		iomap->type = IOMAP_HOLE;
+		iomap->addr = IOMAP_NULL_ADDR;
+	}
+}
+
+static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
+			    unsigned int flags)
+{
+	handle_t *handle;
+	u8 blkbits = inode->i_blkbits;
+	int ret, dio_credits, m_flags = 0, retries = 0;
+
+	/*
+	 * Trim the mapping request to the maximum value that we can map at
+	 * once for direct I/O.
+	 */
+	if (map->m_len > DIO_MAX_BLOCKS)
+		map->m_len = DIO_MAX_BLOCKS;
+	dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+
+retry:
+	/*
+	 * Either we allocate blocks and then don't get an unwritten extent, so
+	 * in that case we have reserved enough credits. Or, the blocks are
+	 * already allocated and unwritten. In that case, the extent conversion
+	 * fits into the credits as well.
+	 */
+	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	/*
+	 * DAX and direct I/O are the only two operations that are currently
+	 * supported with IOMAP_WRITE.
+	 */
+	WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
+	if (IS_DAX(inode))
+		m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+	/*
+	 * We use i_size instead of i_disksize here because delalloc writeback
+	 * can complete at any point during the I/O and subsequently push the
+	 * i_disksize out to i_size. This could be beyond where direct I/O is
+	 * happening and thus expose allocated blocks to direct I/O reads.
+	 */
+	else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode))
+		m_flags = EXT4_GET_BLOCKS_CREATE;
+	else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
+
+	ret = ext4_map_blocks(handle, inode, map, m_flags);
+
+	/*
+	 * We cannot fill holes in indirect tree based inodes as that could
+	 * expose stale data in the case of a crash. Use the magic error code
+	 * to fall back to buffered I/O.
+	 */
+	if (!m_flags && !ret)
+		ret = -ENOTBLK;
+
+	ext4_journal_stop(handle);
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+	return ret;
+}
+
+
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+			    unsigned flags, struct iomap *iomap,
+			    struct iomap *srcmap)
 {
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	unsigned int blkbits = inode->i_blkbits;
-	unsigned long first_block, last_block;
-	struct ext4_map_blocks map;
-	bool delalloc = false;
 	int ret;
+	struct ext4_map_blocks map;
+	u8 blkbits = inode->i_blkbits;
 
 	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
 		return -EINVAL;
 
-	first_block = offset >> blkbits;
-	last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
-			   EXT4_MAX_LOGICAL_BLOCK);
-
-	if (flags & IOMAP_REPORT) {
-		if (ext4_has_inline_data(inode)) {
-			ret = ext4_inline_data_iomap(inode, iomap);
-			if (ret != -EAGAIN) {
-				if (ret == 0 && offset >= iomap->length)
-					ret = -ENOENT;
-				return ret;
-			}
-		}
-	} else {
-		if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
-			return -ERANGE;
-	}
+	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+		return -ERANGE;
 
-	map.m_lblk = first_block;
-	map.m_len = last_block - first_block + 1;
+	/*
+	 * Calculate the first and last logical blocks, respectively.
+	 */
+	map.m_lblk = offset >> blkbits;
+	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
 
-	if (flags & IOMAP_REPORT) {
+	if (flags & IOMAP_WRITE)
+		ret = ext4_iomap_alloc(inode, &map, flags);
+	else
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
-		if (ret < 0)
-			return ret;
-		if (ret == 0) {
-			ext4_lblk_t end = map.m_lblk + map.m_len - 1;
-			struct extent_status es;
 
+	if (ret < 0)
+		return ret;
 
-			ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
-						  map.m_lblk, end, &es);
-
-			if (!es.es_len || es.es_lblk > end) {
-				/* entire range is a hole */
-			} else if (es.es_lblk > map.m_lblk) {
-				/* range starts with a hole */
-				map.m_len = es.es_lblk - map.m_lblk;
-			} else {
-				ext4_lblk_t offs = 0;
-
-				if (es.es_lblk < map.m_lblk)
-					offs = map.m_lblk - es.es_lblk;
-				map.m_lblk = es.es_lblk + offs;
-				map.m_len = es.es_len - offs;
-				delalloc = true;
-			}
-		}
-	} else if (flags & IOMAP_WRITE) {
-		int dio_credits;
-		handle_t *handle;
-		int retries = 0;
-
-		/* Trim mapping request to maximum we can map at once for DIO */
-		if (map.m_len > DIO_MAX_BLOCKS)
-			map.m_len = DIO_MAX_BLOCKS;
-		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-retry:
-		/*
-		 * Either we allocate blocks and then we don't get unwritten
-		 * extent so we have reserved enough credits, or the blocks
-		 * are already allocated and unwritten and in that case
-		 * extent conversion fits in the credits as well.
-		 */
-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
-					    dio_credits);
-		if (IS_ERR(handle))
-			return PTR_ERR(handle);
-
-		ret = ext4_map_blocks(handle, inode, &map,
-				      EXT4_GET_BLOCKS_CREATE_ZERO);
-		if (ret < 0) {
-			ext4_journal_stop(handle);
-			if (ret == -ENOSPC &&
-			    ext4_should_retry_alloc(inode->i_sb, &retries))
-				goto retry;
-			return ret;
-		}
-
-		/*
-		 * If we added blocks beyond i_size, we need to make sure they
-		 * will get truncated if we crash before updating i_size in
-		 * ext4_iomap_end(). For faults we don't need to do that (and
-		 * even cannot because for orphan list operations inode_lock is
-		 * required) - if we happen to instantiate block beyond i_size,
-		 * it is because we race with truncate which has already added
-		 * the inode to the orphan list.
-		 */
-		if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
-		    (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
-			int err;
-
-			err = ext4_orphan_add(handle, inode);
-			if (err < 0) {
-				ext4_journal_stop(handle);
-				return err;
-			}
-		}
-		ext4_journal_stop(handle);
-	} else {
-		ret = ext4_map_blocks(NULL, inode, &map, 0);
-		if (ret < 0)
-			return ret;
-	}
-
-	iomap->flags = 0;
-	if (ext4_inode_datasync_dirty(inode))
-		iomap->flags |= IOMAP_F_DIRTY;
-	iomap->bdev = inode->i_sb->s_bdev;
-	iomap->dax_dev = sbi->s_daxdev;
-	iomap->offset = (u64)first_block << blkbits;
-	iomap->length = (u64)map.m_len << blkbits;
-
-	if (ret == 0) {
-		iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
-		iomap->addr = IOMAP_NULL_ADDR;
-	} else {
-		if (map.m_flags & EXT4_MAP_MAPPED) {
-			iomap->type = IOMAP_MAPPED;
-		} else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-			iomap->type = IOMAP_UNWRITTEN;
-		} else {
-			WARN_ON_ONCE(1);
-			return -EIO;
-		}
-		iomap->addr = (u64)map.m_pblk << blkbits;
-	}
-
-	if (map.m_flags & EXT4_MAP_NEW)
-		iomap->flags |= IOMAP_F_NEW;
+	ext4_set_iomap(inode, iomap, &map, offset, length);
 
 	return 0;
 }
@@ -3580,53 +3444,17 @@ retry:
 static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 			  ssize_t written, unsigned flags, struct iomap *iomap)
 {
-	int ret = 0;
-	handle_t *handle;
-	int blkbits = inode->i_blkbits;
-	bool truncate = false;
-
-	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
-		return 0;
-
-	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto orphan_del;
-	}
-	if (ext4_update_inode_size(inode, offset + written))
-		ext4_mark_inode_dirty(handle, inode);
 	/*
-	 * We may need to truncate allocated but not written blocks beyond EOF.
+	 * Check to see whether an error occurred while writing out the data to
+	 * the allocated blocks. If so, return the magic error code so that we
+	 * fall back to buffered I/O and attempt to complete the remainder of
+	 * the I/O. Any blocks that may have been allocated in preparation for
+	 * the direct I/O will be reused during buffered I/O.
 	 */
-	if (iomap->offset + iomap->length >
-	    ALIGN(inode->i_size, 1 << blkbits)) {
-		ext4_lblk_t written_blk, end_blk;
+	if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+		return -ENOTBLK;
 
-		written_blk = (offset + written) >> blkbits;
-		end_blk = (offset + length) >> blkbits;
-		if (written_blk < end_blk && ext4_can_truncate(inode))
-			truncate = true;
-	}
-	/*
-	 * Remove inode from orphan list if we were extending a inode and
-	 * everything went fine.
-	 */
-	if (!truncate && inode->i_nlink &&
-	    !list_empty(&EXT4_I(inode)->i_orphan))
-		ext4_orphan_del(handle, inode);
-	ext4_journal_stop(handle);
-	if (truncate) {
-		ext4_truncate_failed_write(inode);
-orphan_del:
-		/*
-		 * If truncate failed early the inode might still be on the
-		 * orphan list; we need to make sure the inode is removed from
-		 * the orphan list in that case.
-		 */
-		if (inode->i_nlink)
-			ext4_orphan_del(NULL, inode);
-	}
-	return ret;
+	return 0;
 }
 
 const struct iomap_ops ext4_iomap_ops = {
@@ -3634,271 +3462,73 @@ const struct iomap_ops ext4_iomap_ops = {
 	.iomap_end		= ext4_iomap_end,
 };
 
-static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-			   ssize_t size, void *private)
+static bool ext4_iomap_is_delalloc(struct inode *inode,
+				   struct ext4_map_blocks *map)
 {
-	ext4_io_end_t *io_end = private;
-	struct ext4_io_end_vec *io_end_vec;
+	struct extent_status es;
+	ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
 
-	/* if not async direct IO just return */
-	if (!io_end)
-		return 0;
+	ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+				  map->m_lblk, end, &es);
 
-	ext_debug("ext4_end_io_dio(): io_end 0x%p "
-		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
-		  io_end, io_end->inode->i_ino, iocb, offset, size);
+	if (!es.es_len || es.es_lblk > end)
+		return false;
+
+	if (es.es_lblk > map->m_lblk) {
+		map->m_len = es.es_lblk - map->m_lblk;
+		return false;
+	}
+
+	offset = map->m_lblk - es.es_lblk;
+	map->m_len = es.es_len - offset;
+
+	return true;
+}
+
+static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
+				   loff_t length, unsigned int flags,
+				   struct iomap *iomap, struct iomap *srcmap)
+{
+	int ret;
+	bool delalloc = false;
+	struct ext4_map_blocks map;
+	u8 blkbits = inode->i_blkbits;
+
+	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+		return -EINVAL;
+
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_inline_data_iomap(inode, iomap);
+		if (ret != -EAGAIN) {
+			if (ret == 0 && offset >= iomap->length)
+				ret = -ENOENT;
+			return ret;
+		}
+	}
 
 	/*
-	 * Error during AIO DIO. We cannot convert unwritten extents as the
-	 * data was not written. Just clear the unwritten flag and drop io_end.
+	 * Calculate the first and last logical blocks, respectively.
 	 */
-	if (size <= 0) {
-		ext4_clear_io_unwritten_flag(io_end);
-		size = 0;
-	}
-	io_end_vec = ext4_alloc_io_end_vec(io_end);
-	io_end_vec->offset = offset;
-	io_end_vec->size = size;
-	ext4_put_io_end(io_end);
+	map.m_lblk = offset >> blkbits;
+	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+	ret = ext4_map_blocks(NULL, inode, &map, 0);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		delalloc = ext4_iomap_is_delalloc(inode, &map);
+
+	ext4_set_iomap(inode, iomap, &map, offset, length);
+	if (delalloc && iomap->type == IOMAP_HOLE)
+		iomap->type = IOMAP_DELALLOC;
 
 	return 0;
 }
 
-/*
- * Handling of direct IO writes.
- *
- * For ext4 extent files, ext4 will do direct-io write even to holes,
- * preallocated extents, and those write extend the file, no need to
- * fall back to buffered IO.
- *
- * For holes, we fallocate those blocks, mark them as unwritten
- * If those blocks were preallocated, we mark sure they are split, but
- * still keep the range to write as unwritten.
- *
- * The unwritten extents will be converted to written when DIO is completed.
- * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the conversion
- * when async direct IO completed.
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- */
-static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	ssize_t ret;
-	loff_t offset = iocb->ki_pos;
-	size_t count = iov_iter_count(iter);
-	int overwrite = 0;
-	get_block_t *get_block_func = NULL;
-	int dio_flags = 0;
-	loff_t final_size = offset + count;
-	int orphan = 0;
-	handle_t *handle;
-
-	if (final_size > inode->i_size || final_size > ei->i_disksize) {
-		/* Credits for sb + inode write */
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			goto out;
-		}
-		ret = ext4_orphan_add(handle, inode);
-		if (ret) {
-			ext4_journal_stop(handle);
-			goto out;
-		}
-		orphan = 1;
-		ext4_update_i_disksize(inode, inode->i_size);
-		ext4_journal_stop(handle);
-	}
-
-	BUG_ON(iocb->private == NULL);
-
-	/*
-	 * Make all waiters for direct IO properly wait also for extent
-	 * conversion. This also disallows race between truncate() and
-	 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
-	 */
-	inode_dio_begin(inode);
-
-	/* If we do a overwrite dio, i_mutex locking can be released */
-	overwrite = *((int *)iocb->private);
-
-	if (overwrite)
-		inode_unlock(inode);
-
-	/*
-	 * For extent mapped files we could direct write to holes and fallocate.
-	 *
-	 * Allocated blocks to fill the hole are marked as unwritten to prevent
-	 * parallel buffered read to expose the stale data before DIO complete
-	 * the data IO.
-	 *
-	 * As to previously fallocated extents, ext4 get_block will just simply
-	 * mark the buffer mapped but still keep the extents unwritten.
-	 *
-	 * For non AIO case, we will convert those unwritten extents to written
-	 * after return back from blockdev_direct_IO. That way we save us from
-	 * allocating io_end structure and also the overhead of offloading
-	 * the extent convertion to a workqueue.
-	 *
-	 * For async DIO, the conversion needs to be deferred when the
-	 * IO is completed. The ext4 end_io callback function will be
-	 * called to take care of the conversion work. Here for async
-	 * case, we allocate an io_end structure to hook to the iocb.
-	 */
-	iocb->private = NULL;
-	if (overwrite)
-		get_block_func = ext4_dio_get_block_overwrite;
-	else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
-		 round_down(offset, i_blocksize(inode)) >= inode->i_size) {
-		get_block_func = ext4_dio_get_block;
-		dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
-	} else if (is_sync_kiocb(iocb)) {
-		get_block_func = ext4_dio_get_block_unwritten_sync;
-		dio_flags = DIO_LOCKING;
-	} else {
-		get_block_func = ext4_dio_get_block_unwritten_async;
-		dio_flags = DIO_LOCKING;
-	}
-	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-				   get_block_func, ext4_end_io_dio, NULL,
-				   dio_flags);
-
-	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
-						EXT4_STATE_DIO_UNWRITTEN)) {
-		int err;
-		/*
-		 * for non AIO case, since the IO is already
-		 * completed, we could do the conversion right here
-		 */
-		err = ext4_convert_unwritten_extents(NULL, inode,
-						     offset, ret);
-		if (err < 0)
-			ret = err;
-		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-	}
-
-	inode_dio_end(inode);
-	/* take i_mutex locking again if we do a ovewrite dio */
-	if (overwrite)
-		inode_lock(inode);
-
-	if (ret < 0 && final_size > inode->i_size)
-		ext4_truncate_failed_write(inode);
-
-	/* Handle extending of i_size after direct IO write */
-	if (orphan) {
-		int err;
-
-		/* Credits for sb + inode write */
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-		if (IS_ERR(handle)) {
-			/*
-			 * We wrote the data but cannot extend
-			 * i_size. Bail out. In async io case, we do
-			 * not return error here because we have
-			 * already submmitted the corresponding
-			 * bio. Returning error here makes the caller
-			 * think that this IO is done and failed
-			 * resulting in race with bio's completion
-			 * handler.
-			 */
-			if (!ret)
-				ret = PTR_ERR(handle);
-			if (inode->i_nlink)
-				ext4_orphan_del(NULL, inode);
-
-			goto out;
-		}
-		if (inode->i_nlink)
-			ext4_orphan_del(handle, inode);
-		if (ret > 0) {
-			loff_t end = offset + ret;
-			if (end > inode->i_size || end > ei->i_disksize) {
-				ext4_update_i_disksize(inode, end);
-				if (end > inode->i_size)
-					i_size_write(inode, end);
-				/*
-				 * We're going to return a positive `ret'
-				 * here due to non-zero-length I/O, so there's
-				 * no way of reporting error returns from
-				 * ext4_mark_inode_dirty() to userspace. So
-				 * ignore it.
-				 */
-				ext4_mark_inode_dirty(handle, inode);
-			}
-		}
-		err = ext4_journal_stop(handle);
-		if (ret == 0)
-			ret = err;
-	}
-out:
-	return ret;
-}
-
-static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct address_space *mapping = iocb->ki_filp->f_mapping;
-	struct inode *inode = mapping->host;
-	size_t count = iov_iter_count(iter);
-	ssize_t ret;
-
-	/*
-	 * Shared inode_lock is enough for us - it protects against concurrent
-	 * writes & truncates and since we take care of writing back page cache,
-	 * we are protected against page writeback as well.
-	 */
-	inode_lock_shared(inode);
-	ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
-					   iocb->ki_pos + count - 1);
-	if (ret)
-		goto out_unlock;
-	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-				   iter, ext4_dio_get_block, NULL, NULL, 0);
-out_unlock:
-	inode_unlock_shared(inode);
-	return ret;
-}
-
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	size_t count = iov_iter_count(iter);
-	loff_t offset = iocb->ki_pos;
-	ssize_t ret;
-
-#ifdef CONFIG_FS_ENCRYPTION
-	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
-		return 0;
-#endif
-	if (fsverity_active(inode))
-		return 0;
-
-	/*
-	 * If we are doing data journalling we don't support O_DIRECT
-	 */
-	if (ext4_should_journal_data(inode))
-		return 0;
-
-	/* Let buffer I/O handle the inline data case. */
-	if (ext4_has_inline_data(inode))
-		return 0;
-
-	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-	if (iov_iter_rw(iter) == READ)
-		ret = ext4_direct_IO_read(iocb, iter);
-	else
-		ret = ext4_direct_IO_write(iocb, iter);
-	trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
-	return ret;
-}
+const struct iomap_ops ext4_iomap_report_ops = {
+	.iomap_begin = ext4_iomap_begin_report,
+};
 
 /*
  * Pages can be marked dirty completely asynchronously from ext4's journalling
@@ -3937,7 +3567,7 @@ static const struct address_space_operations ext4_aops = {
 	.bmap			= ext4_bmap,
 	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
-	.direct_IO		= ext4_direct_IO,
+	.direct_IO		= noop_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
@@ -3954,7 +3584,7 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.bmap			= ext4_bmap,
 	.invalidatepage		= ext4_journalled_invalidatepage,
 	.releasepage		= ext4_releasepage,
-	.direct_IO		= ext4_direct_IO,
+	.direct_IO		= noop_direct_IO,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 };
@@ -3970,7 +3600,7 @@ static const struct address_space_operations ext4_da_aops = {
 	.bmap			= ext4_bmap,
 	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
-	.direct_IO		= ext4_direct_IO,
+	.direct_IO		= noop_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
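Here -ENOTBLK acts as an internal "magic" error code meaning "this range cannot (or could not) be serviced by direct I/O": ext4_iomap_alloc() raises it for holes in indirect-tree inodes, and ext4_iomap_end() raises it when none of the allocated blocks were written. The assumption is that the iomap direct I/O layer terminates the request early on this code, so the caller observes a non-negative return with bytes left in the iterator and finishes them through the page cache, roughly as ext4_dio_write_iter() does earlier in this patch (an illustrative reading of the contract, not new code from the series):

	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
			   wait);
	/* leftover bytes are redone through the buffered path */
	if (ret >= 0 && iov_iter_count(from))
		ret = ext4_buffered_write_iter(iocb, from);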
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index f63df54a08c6..516103248272 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1149,7 +1149,8 @@ static inline bool gfs2_iomap_need_write_lock(unsigned flags)
 }
 
 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
-			    unsigned flags, struct iomap *iomap)
+			    unsigned flags, struct iomap *iomap,
+			    struct iomap *srcmap)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct metapath mp = { .mp_aheight = 1, };
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 997b326247e2..f0caee2b7c00 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -732,7 +732,8 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to)
 	if (ret)
 		goto out_uninit;
 
-	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL);
+	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
+			   is_sync_kiocb(iocb));
 
 	gfs2_glock_dq(&gh);
 out_uninit:
@@ -767,7 +768,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	if (offset + len > i_size_read(&ip->i_inode))
 		goto out;
 
-	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL);
+	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
+			   is_sync_kiocb(iocb));
 
 out:
 	gfs2_glock_dq(&gh);
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 93cd11938bf5..eef2722d93a1 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -3,13 +3,15 @@
 # Copyright (c) 2019 Oracle.
 # All Rights Reserved.
 #
+
+ccflags-y += -I $(srctree)/$(src)		# needed for trace events
+
 obj-$(CONFIG_FS_IOMAP)		+= iomap.o
 
-iomap-y				+= \
-					apply.o \
-					buffered-io.o \
-					direct-io.o \
-					fiemap.o \
-					seek.o
-
+iomap-y				+= trace.o \
+					apply.o \
+					buffered-io.o \
+					direct-io.o \
+					fiemap.o \
+					seek.o
 iomap-$(CONFIG_SWAP)		+= swapfile.o
diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
index 54c02aecf3cd..484dd8eda861 100644
--- a/fs/iomap/apply.c
+++ b/fs/iomap/apply.c
@@ -23,8 +23,10 @@ loff_t
 iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 		const struct iomap_ops *ops, void *data, iomap_actor_t actor)
 {
-	struct iomap iomap = { 0 };
+	struct iomap iomap = { .type = IOMAP_HOLE };
+	struct iomap srcmap = { .type = IOMAP_HOLE };
 	loff_t written = 0, ret;
+	u64 end;
 
 	/*
 	 * Need to map a range from start position for length bytes. This can
@@ -38,7 +40,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 	 * expose transient stale data. If the reserve fails, we can safely
 	 * back out at this point as there is nothing to undo.
 	 */
-	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+	ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap);
 	if (ret)
 		return ret;
 	if (WARN_ON(iomap.offset > pos))
@@ -50,15 +52,26 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 	 * Cut down the length to the one actually provided by the filesystem,
 	 * as it might not be able to give us the whole size that we requested.
 	 */
-	if (iomap.offset + iomap.length < pos + length)
-		length = iomap.offset + iomap.length - pos;
+	end = iomap.offset + iomap.length;
+	if (srcmap.type != IOMAP_HOLE)
+		end = min(end, srcmap.offset + srcmap.length);
+	if (pos + length > end)
+		length = end - pos;
 
 	/*
-	 * Now that we have guaranteed that the space allocation will succeed.
+	 * Now that we have guaranteed that the space allocation will succeed,
 	 * we can do the copy-in page by page without having to worry about
 	 * failures exposing transient data.
+	 *
+	 * To support COW operations, we read in data for partial blocks from
+	 * the srcmap if the file system filled it in. In that case the
+	 * length needs to be limited to the earlier of the ends of the
+	 * iomaps. If the file system did not provide a srcmap we pass in the
+	 * normal iomap into the actors so that they don't need to have
+	 * special handling for the two cases.
 	 */
-	written = actor(inode, pos, length, data, &iomap);
+	written = actor(inode, pos, length, data, &iomap,
+			srcmap.type != IOMAP_HOLE ? &srcmap : &iomap);
 
 	/*
 	 * Now the data has been copied, commit the range we've copied. This
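After this change every actor receives two mappings: iomap says where new data is to be written, srcmap says where existing data should be read from. When the filesystem leaves srcmap as a hole, the actor is handed the same iomap pointer twice, so srcmap != iomap doubles as a cheap "is this a COW write?" test (iomap_write_begin() below relies on exactly that). A skeleton of the updated actor signature, with foofs_actor as an illustrative stand-in name:

static loff_t foofs_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap, struct iomap *srcmap)
{
	/*
	 * Read old contents through srcmap, write new contents through
	 * iomap; for non-COW filesystems the two are the same mapping.
	 */
	return length;	/* number of bytes processed */
}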
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e25901ae3ff4..c62e807956b6 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (C) 2016-2019 Christoph Hellwig.
  */
 #include
 #include
@@ -12,13 +12,34 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
+#include "trace.h"
 
 #include "../internal.h"
 
+/*
+ * Structure allocated for each page when block size < PAGE_SIZE to track
+ * sub-page uptodate status and I/O completions.
+ */
+struct iomap_page {
+	atomic_t		read_count;
+	atomic_t		write_count;
+	DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+};
+
+static inline struct iomap_page *to_iomap_page(struct page *page)
+{
+	if (page_has_private(page))
+		return (struct iomap_page *)page_private(page);
+	return NULL;
+}
+
+static struct bio_set iomap_ioend_bioset;
+
 static struct iomap_page *
 iomap_page_create(struct inode *inode, struct page *page)
 {
@@ -203,9 +224,17 @@ iomap_read_inline_data(struct inode *inode, struct page *page,
 	SetPageUptodate(page);
 }
 
+static inline bool iomap_block_needs_zeroing(struct inode *inode,
+		struct iomap *iomap, loff_t pos)
+{
+	return iomap->type != IOMAP_MAPPED ||
+		(iomap->flags & IOMAP_F_NEW) ||
+		pos >= i_size_read(inode);
+}
+
 static loff_t
 iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-		struct iomap *iomap)
+		struct iomap *iomap, struct iomap *srcmap)
 {
 	struct iomap_readpage_ctx *ctx = data;
 	struct page *page = ctx->cur_page;
@@ -226,7 +255,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	if (plen == 0)
 		goto done;
 
-	if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
+	if (iomap_block_needs_zeroing(inode, iomap, pos)) {
 		zero_user(page, poff, plen);
 		iomap_set_range_uptodate(page, poff, plen);
 		goto done;
@@ -293,6 +322,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
 	unsigned poff;
 	loff_t ret;
 
+	trace_iomap_readpage(page->mapping->host, 1);
+
 	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
 		ret = iomap_apply(inode, page_offset(page) + poff,
 				PAGE_SIZE - poff, 0, ops, &ctx,
@@ -351,7 +382,7 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
 
 static loff_t
 iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
-		void *data, struct iomap *iomap)
+		void *data, struct iomap *iomap, struct iomap *srcmap)
 {
 	struct iomap_readpage_ctx *ctx = data;
 	loff_t done, ret;
@@ -371,7 +402,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
 		ctx->cur_page_in_bio = false;
 	}
 	ret = iomap_readpage_actor(inode, pos + done, length - done,
-			ctx, iomap);
+			ctx, iomap, srcmap);
 	}
 
 	return done;
@@ -389,6 +420,8 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
 	loff_t last = page_offset(list_entry(pages->next, struct page, lru));
 	loff_t length = last - pos + PAGE_SIZE, ret = 0;
 
+	trace_iomap_readpages(mapping->host, nr_pages);
+
 	while (length > 0) {
 		ret = iomap_apply(mapping->host, pos, length, 0, ops,
 				&ctx, iomap_readpages_actor);
@@ -455,6 +488,8 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
 int
 iomap_releasepage(struct page *page, gfp_t gfp_mask)
 {
+	trace_iomap_releasepage(page->mapping->host, page, 0, 0);
+
 	/*
 	 * mm accommodates an old ext3 case where clean pages might not have had
 	 * the dirty bit cleared. Thus, it can send actual dirty pages to
@@ -470,6 +505,8 @@ EXPORT_SYMBOL_GPL(iomap_releasepage);
 void
 iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
 {
+	trace_iomap_invalidatepage(page->mapping->host, page, offset, len);
+
 	/*
 	 * If we are invalidating the entire page, clear the dirty state from it
 	 * and release it to avoid unnecessary buildup of the LRU.
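The uptodate bitmap in struct iomap_page above is sized for the smallest block size the block layer allows (512 bytes), giving PAGE_SIZE / 512 bits; a filesystem with larger blocks simply uses fewer bits, one per block within the page. An illustrative reading of the indexing scheme (assuming 4096-byte pages and 1024-byte blocks, so only the first four bits are used):

	/* block index of pos within its page, one uptodate bit per block */
	unsigned int first = offset_in_page(pos) >> inode->i_blkbits;
	set_bit(first, iop->uptodate);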
@@ -511,6 +548,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
 EXPORT_SYMBOL_GPL(iomap_migrate_page);
 #endif /* CONFIG_MIGRATION */
 
+enum {
+	IOMAP_WRITE_F_UNSHARE		= (1 << 0),
+};
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -525,19 +566,12 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 }
 
 static int
-iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
-		unsigned poff, unsigned plen, unsigned from, unsigned to,
-		struct iomap *iomap)
+iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
+		unsigned plen, struct iomap *iomap)
 {
 	struct bio_vec bvec;
 	struct bio bio;
 
-	if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
-		zero_user_segments(page, poff, from, to, poff + plen);
-		iomap_set_range_uptodate(page, poff, plen);
-		return 0;
-	}
-
 	bio_init(&bio, &bvec, 1);
 	bio.bi_opf = REQ_OP_READ;
 	bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
@@ -547,15 +581,15 @@ iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
 }
 
 static int
-__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
-		struct page *page, struct iomap *iomap)
+__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
+		struct page *page, struct iomap *srcmap)
 {
 	struct iomap_page *iop = iomap_page_create(inode, page);
 	loff_t block_size = i_blocksize(inode);
 	loff_t block_start = pos & ~(block_size - 1);
 	loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
 	unsigned from = offset_in_page(pos), to = from + len, poff, plen;
-	int status = 0;
+	int status;
 
 	if (PageUptodate(page))
 		return 0;
@@ -566,29 +600,39 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
 		if (plen == 0)
 			break;
 
-		if ((from > poff && from < poff + plen) ||
-		    (to > poff && to < poff + plen)) {
-			status = iomap_read_page_sync(inode, block_start, page,
-					poff, plen, from, to, iomap);
-			if (status)
-				break;
+		if (!(flags & IOMAP_WRITE_F_UNSHARE) &&
+		    (from <= poff || from >= poff + plen) &&
+		    (to <= poff || to >= poff + plen))
+			continue;
+
+		if (iomap_block_needs_zeroing(inode, srcmap, block_start)) {
+			if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
+				return -EIO;
+			zero_user_segments(page, poff, from, to, poff + plen);
+			iomap_set_range_uptodate(page, poff, plen);
+			continue;
 		}
+
+		status = iomap_read_page_sync(block_start, page, poff, plen,
+				srcmap);
+		if (status)
+			return status;
 	} while ((block_start += plen) < block_end);
 
-	return status;
+	return 0;
 }
 
 static int
 iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
-		struct page **pagep, struct iomap *iomap)
+		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
 {
 	const struct iomap_page_ops *page_ops = iomap->page_ops;
-	pgoff_t index = pos >> PAGE_SHIFT;
 	struct page *page;
 	int status = 0;
 
 	BUG_ON(pos + len > iomap->offset + iomap->length);
+	if (srcmap != iomap)
+		BUG_ON(pos + len > srcmap->offset + srcmap->length);
 
 	if (fatal_signal_pending(current))
 		return -EINTR;
@@ -599,18 +643,20 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 		return status;
 	}
 
-	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
+	page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
+			AOP_FLAG_NOFS);
 	if (!page) {
 		status = -ENOMEM;
 		goto out_no_page;
 	}
 
-	if (iomap->type == IOMAP_INLINE)
-		iomap_read_inline_data(inode, page, iomap);
+	if (srcmap->type == IOMAP_INLINE)
+		iomap_read_inline_data(inode, page, srcmap);
 	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
-		status = __block_write_begin_int(page, pos, len, NULL, iomap);
+		status = __block_write_begin_int(page, pos, len, NULL, srcmap);
 	else
-		status = __iomap_write_begin(inode, pos, len, page, iomap);
+		status = __iomap_write_begin(inode, pos, len, flags, page,
+				srcmap);
 
 	if (unlikely(status))
 		goto out_unlock;
@@ -656,7 +702,7 @@ EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
 
 static int
 __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
-		unsigned copied, struct page *page, struct iomap *iomap)
+		unsigned copied, struct page *page)
 {
 	flush_dcache_page(page);
 
@@ -696,20 +742,20 @@ iomap_write_end_inline(struct inode *inode, struct page *page,
 }
 
 static int
-iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
-		unsigned copied, struct page *page, struct iomap *iomap)
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied,
+		struct page *page, struct iomap *iomap, struct iomap *srcmap)
 {
 	const struct iomap_page_ops *page_ops = iomap->page_ops;
 	loff_t old_size = inode->i_size;
 	int ret;
 
-	if (iomap->type == IOMAP_INLINE) {
+	if (srcmap->type == IOMAP_INLINE) {
 		ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
-	} else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
+	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
 		ret = block_write_end(NULL, inode->i_mapping, pos, len, copied,
 				page, NULL);
 	} else {
-		ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
+		ret = __iomap_write_end(inode, pos, len, copied, page);
 	}
 
 	/*
@@ -736,12 +782,11 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
 
 static loff_t
 iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-		struct iomap *iomap)
+		struct iomap *iomap, struct iomap *srcmap)
 {
 	struct iov_iter *i = data;
 	long status = 0;
 	ssize_t written = 0;
-	unsigned int flags = AOP_FLAG_NOFS;
 
 	do {
 		struct page *page;
@@ -771,8 +816,8 @@ again:
 			break;
 		}
 
-		status = iomap_write_begin(inode, pos, bytes, flags, &page,
-				iomap);
+		status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
+				srcmap);
 		if (unlikely(status))
 			break;
 
@@ -783,8 +828,8 @@ again:
 
 		flush_dcache_page(page);
 
-		status = iomap_write_end(inode, pos, bytes, copied, page,
-				iomap);
+		status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
+				srcmap);
 		if (unlikely(status < 0))
 			break;
 		copied = status;
@@ -835,50 +880,32 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
-static struct page *
-__iomap_read_page(struct inode *inode, loff_t offset)
-{
-	struct address_space *mapping = inode->i_mapping;
-	struct page *page;
-
-	page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
-	if (IS_ERR(page))
-		return page;
-	if (!PageUptodate(page)) {
-		put_page(page);
-		return ERR_PTR(-EIO);
-	}
-	return page;
-}
-
 static loff_t
-iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-		struct iomap *iomap)
+iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap, struct iomap *srcmap)
 {
 	long status = 0;
 	ssize_t written = 0;
 
+	/* don't bother with blocks that are not shared to start with */
+	if (!(iomap->flags & IOMAP_F_SHARED))
+		return length;
+	/* don't bother with holes or unwritten extents */
+	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+		return length;
+
 	do {
-		struct page *page, *rpage;
-		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long bytes;	/* Bytes to write to page */
-
-		offset = offset_in_page(pos);
-		bytes = min_t(loff_t, PAGE_SIZE - offset, length);
-
-		rpage = __iomap_read_page(inode, pos);
-		if (IS_ERR(rpage))
-			return PTR_ERR(rpage);
+		unsigned long offset = offset_in_page(pos);
+		unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
+		struct page *page;
 
 		status = iomap_write_begin(inode, pos, bytes,
-				AOP_FLAG_NOFS, &page, iomap);
-		put_page(rpage);
+				IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap);
 		if (unlikely(status))
 			return status;
 
-		WARN_ON_ONCE(!PageUptodate(page));
-
-		status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
+		status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
+				srcmap);
 		if (unlikely(status <= 0)) {
 			if (WARN_ON_ONCE(status == 0))
 				return -EIO;
@@ -898,14 +925,14 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 }
 
 int
-iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops)
 {
 	loff_t ret;
 
 	while (len) {
 		ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
-				iomap_dirty_actor);
+				iomap_unshare_actor);
 		if (ret <= 0)
 			return ret;
 		pos += ret;
@@ -914,23 +941,22 @@ iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(iomap_file_dirty);
+EXPORT_SYMBOL_GPL(iomap_file_unshare);
 
 static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
-		unsigned bytes, struct iomap *iomap)
+		unsigned bytes, struct iomap *iomap, struct iomap *srcmap)
 {
 	struct page *page;
 	int status;
 
-	status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
-			iomap);
+	status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
 	if (status)
 		return status;
 
 	zero_user(page, offset, bytes);
 	mark_page_accessed(page);
 
-	return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
+	return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
 }
 
 static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
@@ -942,14 +968,14 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
 
 static loff_t
 iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
-		void *data, struct iomap *iomap)
+		void *data, struct iomap *iomap, struct iomap *srcmap)
 {
 	bool *did_zero = data;
 	loff_t written = 0;
 	int status;
 
 	/* already zeroed?  we're done.
*/ - if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) return count; do { @@ -961,7 +987,8 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, if (IS_DAX(inode)) status = iomap_dax_zero(pos, offset, bytes, iomap); else - status = iomap_zero(inode, pos, offset, bytes, iomap); + status = iomap_zero(inode, pos, offset, bytes, iomap, + srcmap); if (status < 0) return status; @@ -1011,7 +1038,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page); static loff_t iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { struct page *page = data; int ret; @@ -1071,3 +1098,551 @@ out_unlock: return block_page_mkwrite_return(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); + +static void +iomap_finish_page_writeback(struct inode *inode, struct page *page, + int error) +{ + struct iomap_page *iop = to_iomap_page(page); + + if (error) { + SetPageError(page); + mapping_set_error(inode->i_mapping, -EIO); + } + + WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop); + WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0); + + if (!iop || atomic_dec_and_test(&iop->write_count)) + end_page_writeback(page); +} + +/* + * We're now finished for good with this ioend structure. Update the page + * state, release holds on bios, and finally free up memory. Do not use the + * ioend after this. + */ +static void +iomap_finish_ioend(struct iomap_ioend *ioend, int error) +{ + struct inode *inode = ioend->io_inode; + struct bio *bio = &ioend->io_inline_bio; + struct bio *last = ioend->io_bio, *next; + u64 start = bio->bi_iter.bi_sector; + bool quiet = bio_flagged(bio, BIO_QUIET); + + for (bio = &ioend->io_inline_bio; bio; bio = next) { + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + /* + * For the last bio, bi_private points to the ioend, so we + * need to explicitly end the iteration here. + */ + if (bio == last) + next = NULL; + else + next = bio->bi_private; + + /* walk each page on bio, ending page IO on them */ + bio_for_each_segment_all(bv, bio, iter_all) + iomap_finish_page_writeback(inode, bv->bv_page, error); + bio_put(bio); + } + + if (unlikely(error && !quiet)) { + printk_ratelimited(KERN_ERR +"%s: writeback error on inode %lu, offset %lld, sector %llu", + inode->i_sb->s_id, inode->i_ino, ioend->io_offset, + start); + } +} + +void +iomap_finish_ioends(struct iomap_ioend *ioend, int error) +{ + struct list_head tmp; + + list_replace_init(&ioend->io_list, &tmp); + iomap_finish_ioend(ioend, error); + + while (!list_empty(&tmp)) { + ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); + list_del_init(&ioend->io_list); + iomap_finish_ioend(ioend, error); + } +} +EXPORT_SYMBOL_GPL(iomap_finish_ioends); + +/* + * We can merge two adjacent ioends if they have the same set of work to do. 
+ */ +static bool +iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) +{ + if (ioend->io_bio->bi_status != next->io_bio->bi_status) + return false; + if ((ioend->io_flags & IOMAP_F_SHARED) ^ + (next->io_flags & IOMAP_F_SHARED)) + return false; + if ((ioend->io_type == IOMAP_UNWRITTEN) ^ + (next->io_type == IOMAP_UNWRITTEN)) + return false; + if (ioend->io_offset + ioend->io_size != next->io_offset) + return false; + return true; +} + +void +iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends, + void (*merge_private)(struct iomap_ioend *ioend, + struct iomap_ioend *next)) +{ + struct iomap_ioend *next; + + INIT_LIST_HEAD(&ioend->io_list); + + while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, + io_list))) { + if (!iomap_ioend_can_merge(ioend, next)) + break; + list_move_tail(&next->io_list, &ioend->io_list); + ioend->io_size += next->io_size; + if (next->io_private && merge_private) + merge_private(ioend, next); + } +} +EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); + +static int +iomap_ioend_compare(void *priv, struct list_head *a, struct list_head *b) +{ + struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); + struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); + + if (ia->io_offset < ib->io_offset) + return -1; + if (ia->io_offset > ib->io_offset) + return 1; + return 0; +} + +void +iomap_sort_ioends(struct list_head *ioend_list) +{ + list_sort(NULL, ioend_list, iomap_ioend_compare); +} +EXPORT_SYMBOL_GPL(iomap_sort_ioends); + +static void iomap_writepage_end_bio(struct bio *bio) +{ + struct iomap_ioend *ioend = bio->bi_private; + + iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status)); +} + +/* + * Submit the final bio for an ioend. + * + * If @error is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked pages for writeback + * and unlocked them. In this situation, we need to fail the bio instead of + * submitting it. This typically only happens on a filesystem shutdown. + */ +static int +iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, + int error) +{ + ioend->io_bio->bi_private = ioend; + ioend->io_bio->bi_end_io = iomap_writepage_end_bio; + + if (wpc->ops->prepare_ioend) + error = wpc->ops->prepare_ioend(ioend, error); + if (error) { + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. 
+ */ + ioend->io_bio->bi_status = errno_to_blk_status(error); + bio_endio(ioend->io_bio); + return error; + } + + submit_bio(ioend->io_bio); + return 0; +} + +static struct iomap_ioend * +iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, + loff_t offset, sector_t sector, struct writeback_control *wbc) +{ + struct iomap_ioend *ioend; + struct bio *bio; + + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset); + bio_set_dev(bio, wpc->iomap.bdev); + bio->bi_iter.bi_sector = sector; + bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + bio->bi_write_hint = inode->i_write_hint; + wbc_init_bio(wbc, bio); + + ioend = container_of(bio, struct iomap_ioend, io_inline_bio); + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_type = wpc->iomap.type; + ioend->io_flags = wpc->iomap.flags; + ioend->io_inode = inode; + ioend->io_size = 0; + ioend->io_offset = offset; + ioend->io_private = NULL; + ioend->io_bio = bio; + return ioend; +} + +/* + * Allocate a new bio, and chain the old bio to the new one. + * + * Note that we have to perform the chaining in this unintuitive order + * so that the bi_private linkage is set up in the right direction for the + * traversal in iomap_finish_ioend(). + */ +static struct bio * +iomap_chain_bio(struct bio *prev) +{ + struct bio *new; + + new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + bio_copy_dev(new, prev); /* also copies over blkcg information */ + new->bi_iter.bi_sector = bio_end_sector(prev); + new->bi_opf = prev->bi_opf; + new->bi_write_hint = prev->bi_write_hint; + + bio_chain(prev, new); + bio_get(prev); /* for iomap_finish_ioend */ + submit_bio(prev); + return new; +} + +static bool +iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, + sector_t sector) +{ + if ((wpc->iomap.flags & IOMAP_F_SHARED) != + (wpc->ioend->io_flags & IOMAP_F_SHARED)) + return false; + if (wpc->iomap.type != wpc->ioend->io_type) + return false; + if (offset != wpc->ioend->io_offset + wpc->ioend->io_size) + return false; + if (sector != bio_end_sector(wpc->ioend->io_bio)) + return false; + return true; +} + +/* + * Test to see if we have an existing ioend structure that we could append to + * first; otherwise finish off the current ioend and start another. + */ +static void +iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, + struct iomap_page *iop, struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct list_head *iolist) +{ + sector_t sector = iomap_sector(&wpc->iomap, offset); + unsigned len = i_blocksize(inode); + unsigned poff = offset & (PAGE_SIZE - 1); + bool merged, same_page = false; + + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) { + if (wpc->ioend) + list_add(&wpc->ioend->io_list, iolist); + wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc); + } + + merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, + &same_page); + if (iop && !same_page) + atomic_inc(&iop->write_count); + + if (!merged) { + if (bio_full(wpc->ioend->io_bio, len)) { + wpc->ioend->io_bio = + iomap_chain_bio(wpc->ioend->io_bio); + } + bio_add_page(wpc->ioend->io_bio, page, len, poff); + } + + wpc->ioend->io_size += len; + wbc_account_cgroup_owner(wbc, page, len); +} + +/* + * We implement an immediate ioend submission policy here to avoid needing to + * chain multiple ioends and hence nest mempool allocations which can violate + * forward progress guarantees we need to provide. 
The current ioend we are + * adding blocks to is cached on the writepage context, and if the new block + * does not append to the cached ioend, it will create a new ioend and cache that + * instead. + * + * If a new ioend is created and cached, the old ioend is returned and queued + * locally for submission once the entire page is processed or an error has been + * detected. While ioends are submitted immediately after they are completed, + * batching optimisations are provided by higher level block plugging. + * + * At the end of a writeback pass, there will be a cached ioend remaining on the + * writepage context that the caller will need to submit. + */ +static int +iomap_writepage_map(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct inode *inode, + struct page *page, u64 end_offset) +{ + struct iomap_page *iop = to_iomap_page(page); + struct iomap_ioend *ioend, *next; + unsigned len = i_blocksize(inode); + u64 file_offset; /* file offset of page */ + int error = 0, count = 0, i; + LIST_HEAD(submit_list); + + WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop); + WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0); + + /* + * Walk through the page to find areas to write back. If we run off the + * end of the current map or find the current map invalid, grab a new + * one. + */ + for (i = 0, file_offset = page_offset(page); + i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset; + i++, file_offset += len) { + if (iop && !test_bit(i, iop->uptodate)) + continue; + + error = wpc->ops->map_blocks(wpc, inode, file_offset); + if (error) + break; + if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) + continue; + if (wpc->iomap.type == IOMAP_HOLE) + continue; + iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, + &submit_list); + count++; + } + + WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); + WARN_ON_ONCE(!PageLocked(page)); + WARN_ON_ONCE(PageWriteback(page)); + + /* + * We cannot cancel the ioend directly here on error. We may have + * already set other pages under writeback and hence we have to run I/O + * completion to mark the error state of the pages under writeback + * appropriately. + */ + if (unlikely(error)) { + if (!count) { + /* + * If the current page hasn't been added to ioend, it + * won't be affected by I/O completions and we must + * discard and unlock it right here. + */ + if (wpc->ops->discard_page) + wpc->ops->discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + goto done; + } + + /* + * If the page was not fully cleaned, we need to ensure that the + * higher layers come back to it correctly. That means we need + * to keep the page dirty, and for WB_SYNC_ALL writeback we need + * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed + * so another attempt to write this page in this writeback sweep + * will be made. + */ + set_page_writeback_keepwrite(page); + } else { + clear_page_dirty_for_io(page); + set_page_writeback(page); + } + + unlock_page(page); + + /* + * Preserve the original error if there was one; otherwise catch + * submission errors here and propagate into subsequent ioend + * submissions. + */ + list_for_each_entry_safe(ioend, next, &submit_list, io_list) { + int error2; + + list_del_init(&ioend->io_list); + error2 = iomap_submit_ioend(wpc, ioend, error); + if (error2 && !error) + error = error2; + } + + /* + * We can end up here with no error and nothing to write only if we race + * with a partial page truncate on a sub-page block sized filesystem. 
+ */ + if (!count) + end_page_writeback(page); +done: + mapping_set_error(page->mapping, error); + return error; +} + +/* + * Write out a dirty page. + * + * For delalloc space on the page we need to allocate space and flush it. + * For unwritten space on the page we need to start the conversion to + * regular allocated space. + */ +static int +iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) +{ + struct iomap_writepage_ctx *wpc = data; + struct inode *inode = page->mapping->host; + pgoff_t end_index; + u64 end_offset; + loff_t offset; + + trace_iomap_writepage(inode, page, 0, 0); + + /* + * Refuse to write the page out if we are called from reclaim context. + * + * This avoids stack overflows when called from deeply used stacks in + * random callers for direct reclaim or memcg reclaim. We explicitly + * allow reclaim from kswapd as the stack usage there is relatively low. + * + * This should never happen except in the case of a VM regression so + * warn about it. + */ + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == + PF_MEMALLOC)) + goto redirty; + + /* + * Given that we do not allow direct reclaim to call us, we should + * never be called in a recursive filesystem reclaim context. + */ + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) + goto redirty; + + /* + * Is this page beyond the end of the file? + * + * The page index is less than the end_index, adjust the end_offset + * to the highest offset that this page should represent. + * ----------------------------------------------------- + * | file mapping | | + * ----------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | | + * ^--------------------------------^----------|-------- + * | desired writeback range | see else | + * ---------------------------------^------------------| + */ + offset = i_size_read(inode); + end_index = offset >> PAGE_SHIFT; + if (page->index < end_index) + end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT; + else { + /* + * Check whether the page to write out is beyond or straddles + * i_size or not. + * ------------------------------------------------------- + * | file mapping | | + * ------------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | + * ^--------------------------------^-----------|--------- + * | | Straddles | + * ---------------------------------^-----------|--------| + */ + unsigned offset_into_page = offset & (PAGE_SIZE - 1); + + /* + * Skip the page if it is fully outside i_size, e.g. due to a + * truncate operation that is in progress. We must redirty the + * page so that reclaim stops reclaiming it. Otherwise + * iomap_releasepage() is called on it and gets confused. + * + * Note that the end_index is unsigned long. It would overflow + * if the given offset is greater than 16TB on a 32-bit system + * and if we checked "if (page->index >= end_index + 1)" to see + * whether the page is fully outside i_size, as "end_index + 1" + * would be evaluated to 0. Hence this page would be redirtied + * and written out repeatedly, which would result in an + * infinite loop; the user program that performs this operation + * would hang. Instead, we can verify this situation by checking + * if the page to write is totally beyond the i_size or if its + * offset is just equal to the EOF. + */ + if (page->index > end_index || + (page->index == end_index && offset_into_page == 0)) + goto redirty; + + /* + * The page straddles i_size. 
It must be zeroed out on each + * and every writepage invocation because it may be mmapped. + * "A file is mapped in multiples of the page size. For a file + * that is not a multiple of the page size, the remaining + * memory is zeroed when mapped, and writes to that region are + * not written out to the file." + */ + zero_user_segment(page, offset_into_page, PAGE_SIZE); + + /* Adjust the end_offset to the end of file */ + end_offset = offset; + } + + return iomap_writepage_map(wpc, wbc, inode, page, end_offset); + +redirty: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +int +iomap_writepage(struct page *page, struct writeback_control *wbc, + struct iomap_writepage_ctx *wpc, + const struct iomap_writeback_ops *ops) +{ + int ret; + + wpc->ops = ops; + ret = iomap_do_writepage(page, wbc, wpc); + if (!wpc->ioend) + return ret; + return iomap_submit_ioend(wpc, wpc->ioend, ret); +} +EXPORT_SYMBOL_GPL(iomap_writepage); + +int +iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, + struct iomap_writepage_ctx *wpc, + const struct iomap_writeback_ops *ops) +{ + int ret; + + wpc->ops = ops; + ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc); + if (!wpc->ioend) + return ret; + return iomap_submit_ioend(wpc, wpc->ioend, ret); +} +EXPORT_SYMBOL_GPL(iomap_writepages); + +static int __init iomap_init(void) +{ + return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), + offsetof(struct iomap_ioend, io_inline_bio), + BIOSET_NEED_BVECS); +} +fs_initcall(iomap_init); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 1fc28c2da279..2f88d64c2a4d 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -358,7 +358,7 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, static loff_t iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { struct iomap_dio *dio = data; @@ -392,7 +392,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, */ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, - const struct iomap_ops *ops, const struct iomap_dio_ops *dops) + const struct iomap_ops *ops, const struct iomap_dio_ops *dops, + bool wait_for_completion) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); @@ -400,7 +401,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos, start = pos; loff_t end = iocb->ki_pos + count - 1, ret = 0; unsigned int flags = IOMAP_DIRECT; - bool wait_for_completion = is_sync_kiocb(iocb); struct blk_plug plug; struct iomap_dio *dio; @@ -409,6 +409,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!count) return 0; + if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion)) + return -EIO; + dio = kmalloc(sizeof(*dio), GFP_KERNEL); if (!dio) return -ENOMEM; @@ -430,7 +433,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (pos >= dio->i_size) goto out_free_dio; - if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ) + if (iter_is_iovec(iter)) dio->flags |= IOMAP_DIO_DIRTY; } else { flags |= IOMAP_WRITE; diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index f26fdd36e383..690ef2d7c6c8 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -44,7 +44,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, static loff_t iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + 
struct iomap *iomap, struct iomap *srcmap) { struct fiemap_ctx *ctx = data; loff_t ret = length; @@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(iomap_fiemap); static loff_t iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { sector_t *bno = data, addr; diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index c04bad4b2b43..89f61d93c0bc 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -119,7 +119,7 @@ out: static loff_t iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { switch (iomap->type) { case IOMAP_UNWRITTEN: @@ -165,7 +165,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_hole); static loff_t iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { switch (iomap->type) { case IOMAP_HOLE: diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 152a230f668d..a648dbf6991e 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -76,7 +76,8 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) * distinction between written and unwritten extents. */ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, - loff_t count, void *data, struct iomap *iomap) + loff_t count, void *data, struct iomap *iomap, + struct iomap *srcmap) { struct iomap_swapfile_info *isi = data; int error; diff --git a/fs/iomap/trace.c b/fs/iomap/trace.c new file mode 100644 index 000000000000..da217246b1a9 --- /dev/null +++ b/fs/iomap/trace.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Christoph Hellwig + */ +#include <linux/iomap.h> + +/* + * We include this last to have the helpers above available for the trace + * event implementations. + */ +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h new file mode 100644 index 000000000000..4ca1aa2f3f6e --- /dev/null +++ b/fs/iomap/trace.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2009-2019 Christoph Hellwig + * + * NOTE: none of these tracepoints shall be considered a stable kernel ABI + * as they can change at any time. 
+ */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM iomap + +#if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _IOMAP_TRACE_H + +#include <linux/tracepoint.h> + +struct inode; + +DECLARE_EVENT_CLASS(iomap_readpage_class, + TP_PROTO(struct inode *inode, int nr_pages), + TP_ARGS(inode, nr_pages), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(int, nr_pages) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->nr_pages = nr_pages; + ), + TP_printk("dev %d:%d ino 0x%llx nr_pages %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->nr_pages) +) + +#define DEFINE_READPAGE_EVENT(name) \ +DEFINE_EVENT(iomap_readpage_class, name, \ + TP_PROTO(struct inode *inode, int nr_pages), \ + TP_ARGS(inode, nr_pages)) +DEFINE_READPAGE_EVENT(iomap_readpage); +DEFINE_READPAGE_EVENT(iomap_readpages); + +DECLARE_EVENT_CLASS(iomap_page_class, + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, + unsigned int len), + TP_ARGS(inode, page, off, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(pgoff_t, pgoff) + __field(loff_t, size) + __field(unsigned long, offset) + __field(unsigned int, length) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgoff = page_offset(page); + __entry->size = i_size_read(inode); + __entry->offset = off; + __entry->length = len; + ), + TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " + "length %x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->size, + __entry->offset, + __entry->length) +) + +#define DEFINE_PAGE_EVENT(name) \ +DEFINE_EVENT(iomap_page_class, name, \ + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ + unsigned int len), \ + TP_ARGS(inode, page, off, len)) +DEFINE_PAGE_EVENT(iomap_writepage); +DEFINE_PAGE_EVENT(iomap_releasepage); +DEFINE_PAGE_EVENT(iomap_invalidatepage); + +#endif /* _IOMAP_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 02469d59c787..ef75e223cb70 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -34,6 +34,7 @@ #include "xfs_ag_resv.h" #include "xfs_refcount.h" #include "xfs_icache.h" +#include "xfs_iomap.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -4456,16 +4457,21 @@ int xfs_bmapi_convert_delalloc( struct xfs_inode *ip, int whichfork, - xfs_fileoff_t offset_fsb, - struct xfs_bmbt_irec *imap, + xfs_off_t offset, + struct iomap *iomap, unsigned int *seq) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; + u16 flags = 0; struct xfs_trans *tp; int error; + if (whichfork == XFS_COW_FORK) + flags |= IOMAP_F_SHARED; + /* * Space for the extent and indirect blocks was reserved when the * delalloc extent was created so there's no need to do so here. @@ -4495,7 +4501,7 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. 
*/ if (!isnullstartblock(bma.got.br_startblock)) { - *imap = bma.got; + xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4528,7 +4534,7 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - *imap = bma.got; + xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e2798c6f3a5f..14d25e0b7d9c 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -228,8 +228,7 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap, - unsigned int *seq); + xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f16d5f196c6b..5936507c6f50 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -18,17 +18,18 @@ #include "xfs_bmap_util.h" #include "xfs_reflink.h" -/* - * structure owned by writepages passed to individual writepage calls - */ struct xfs_writepage_ctx { - struct xfs_bmbt_irec imap; - int fork; + struct iomap_writepage_ctx ctx; unsigned int data_seq; unsigned int cow_seq; - struct xfs_ioend *ioend; }; +static inline struct xfs_writepage_ctx * +XFS_WPC(struct iomap_writepage_ctx *ctx) +{ + return container_of(ctx, struct xfs_writepage_ctx, ctx); +} + struct block_device * xfs_find_bdev_for_inode( struct inode *inode) @@ -55,71 +56,10 @@ xfs_find_daxdev_for_inode( return mp->m_ddev_targp->bt_daxdev; } -static void -xfs_finish_page_writeback( - struct inode *inode, - struct bio_vec *bvec, - int error) -{ - struct iomap_page *iop = to_iomap_page(bvec->bv_page); - - if (error) { - SetPageError(bvec->bv_page); - mapping_set_error(inode->i_mapping, -EIO); - } - - ASSERT(iop || i_blocksize(inode) == PAGE_SIZE); - ASSERT(!iop || atomic_read(&iop->write_count) > 0); - - if (!iop || atomic_dec_and_test(&iop->write_count)) - end_page_writeback(bvec->bv_page); -} - -/* - * We're now finished for good with this ioend structure. Update the page - * state, release holds on bios, and finally free up memory. Do not use the - * ioend after this. - */ -STATIC void -xfs_destroy_ioend( - struct xfs_ioend *ioend, - int error) -{ - struct inode *inode = ioend->io_inode; - struct bio *bio = &ioend->io_inline_bio; - struct bio *last = ioend->io_bio, *next; - u64 start = bio->bi_iter.bi_sector; - bool quiet = bio_flagged(bio, BIO_QUIET); - - for (bio = &ioend->io_inline_bio; bio; bio = next) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * For the last bio, bi_private points to the ioend, so we - * need to explicitly end the iteration here. - */ - if (bio == last) - next = NULL; - else - next = bio->bi_private; - - /* walk each page on bio, ending page IO on them */ - bio_for_each_segment_all(bvec, bio, iter_all) - xfs_finish_page_writeback(inode, bvec, error); - bio_put(bio); - } - - if (unlikely(error && !quiet)) { - xfs_err_ratelimited(XFS_I(inode)->i_mount, - "writeback error on sector %llu", start); - } -} - /* * Fast and loose check if this write could update the on-disk inode size. 
*/ -static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) +static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend) { return ioend->io_offset + ioend->io_size > XFS_I(ioend->io_inode)->i_d.di_size; @@ -127,7 +67,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) STATIC int xfs_setfilesize_trans_alloc( - struct xfs_ioend *ioend) + struct iomap_ioend *ioend) { struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; struct xfs_trans *tp; @@ -137,7 +77,7 @@ xfs_setfilesize_trans_alloc( if (error) return error; - ioend->io_append_trans = tp; + ioend->io_private = tp; /* * We may pass freeze protection with a transaction. So tell lockdep @@ -200,11 +140,11 @@ xfs_setfilesize( STATIC int xfs_setfilesize_ioend( - struct xfs_ioend *ioend, + struct iomap_ioend *ioend, int error) { struct xfs_inode *ip = XFS_I(ioend->io_inode); - struct xfs_trans *tp = ioend->io_append_trans; + struct xfs_trans *tp = ioend->io_private; /* * The transaction may have been allocated in the I/O submission thread, @@ -228,9 +168,8 @@ xfs_setfilesize_ioend( */ STATIC void xfs_end_ioend( - struct xfs_ioend *ioend) + struct iomap_ioend *ioend) { - struct list_head ioend_list; struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; @@ -257,7 +196,7 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { - if (ioend->io_fork == XFS_COW_FORK) + if (ioend->io_flags & IOMAP_F_SHARED) xfs_reflink_cancel_cow_range(ip, offset, size, true); goto done; } @@ -265,49 +204,20 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. */ - if (ioend->io_fork == XFS_COW_FORK) + if (ioend->io_flags & IOMAP_F_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_state == XFS_EXT_UNWRITTEN) + else if (ioend->io_type == IOMAP_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); else - ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); + ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private); done: - if (ioend->io_append_trans) + if (ioend->io_private) error = xfs_setfilesize_ioend(ioend, error); - list_replace_init(&ioend->io_list, &ioend_list); - xfs_destroy_ioend(ioend, error); - - while (!list_empty(&ioend_list)) { - ioend = list_first_entry(&ioend_list, struct xfs_ioend, - io_list); - list_del_init(&ioend->io_list); - xfs_destroy_ioend(ioend, error); - } - + iomap_finish_ioends(ioend, error); memalloc_nofs_restore(nofs_flag); } -/* - * We can merge two adjacent ioends if they have the same set of work to do. - */ -static bool -xfs_ioend_can_merge( - struct xfs_ioend *ioend, - struct xfs_ioend *next) -{ - if (ioend->io_bio->bi_status != next->io_bio->bi_status) - return false; - if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK)) - return false; - if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^ - (next->io_state == XFS_EXT_UNWRITTEN)) - return false; - if (ioend->io_offset + ioend->io_size != next->io_offset) - return false; - return true; -} - /* * If the to be merged ioend has a preallocated transaction for file * size updates we need to ensure the ioend it is merged into also @@ -315,104 +225,65 @@ xfs_ioend_can_merge( * as it is guaranteed to be clean. 
*/ static void -xfs_ioend_merge_append_transactions( - struct xfs_ioend *ioend, - struct xfs_ioend *next) +xfs_ioend_merge_private( + struct iomap_ioend *ioend, + struct iomap_ioend *next) { - if (!ioend->io_append_trans) { - ioend->io_append_trans = next->io_append_trans; - next->io_append_trans = NULL; + if (!ioend->io_private) { + ioend->io_private = next->io_private; + next->io_private = NULL; } else { xfs_setfilesize_ioend(next, -ECANCELED); } } -/* Try to merge adjacent completions. */ -STATIC void -xfs_ioend_try_merge( - struct xfs_ioend *ioend, - struct list_head *more_ioends) -{ - struct xfs_ioend *next_ioend; - - while (!list_empty(more_ioends)) { - next_ioend = list_first_entry(more_ioends, struct xfs_ioend, - io_list); - if (!xfs_ioend_can_merge(ioend, next_ioend)) - break; - list_move_tail(&next_ioend->io_list, &ioend->io_list); - ioend->io_size += next_ioend->io_size; - if (next_ioend->io_append_trans) - xfs_ioend_merge_append_transactions(ioend, next_ioend); - } -} - -/* list_sort compare function for ioends */ -static int -xfs_ioend_compare( - void *priv, - struct list_head *a, - struct list_head *b) -{ - struct xfs_ioend *ia; - struct xfs_ioend *ib; - - ia = container_of(a, struct xfs_ioend, io_list); - ib = container_of(b, struct xfs_ioend, io_list); - if (ia->io_offset < ib->io_offset) - return -1; - else if (ia->io_offset > ib->io_offset) - return 1; - return 0; -} - /* Finish all pending io completions. */ void xfs_end_io( struct work_struct *work) { - struct xfs_inode *ip; - struct xfs_ioend *ioend; - struct list_head completion_list; + struct xfs_inode *ip = + container_of(work, struct xfs_inode, i_ioend_work); + struct iomap_ioend *ioend; + struct list_head tmp; unsigned long flags; - ip = container_of(work, struct xfs_inode, i_ioend_work); - spin_lock_irqsave(&ip->i_ioend_lock, flags); - list_replace_init(&ip->i_ioend_list, &completion_list); + list_replace_init(&ip->i_ioend_list, &tmp); spin_unlock_irqrestore(&ip->i_ioend_lock, flags); - list_sort(NULL, &completion_list, xfs_ioend_compare); - - while (!list_empty(&completion_list)) { - ioend = list_first_entry(&completion_list, struct xfs_ioend, - io_list); + iomap_sort_ioends(&tmp); + while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend, + io_list))) { list_del_init(&ioend->io_list); - xfs_ioend_try_merge(ioend, &completion_list); + iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private); xfs_end_ioend(ioend); } } +static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend) +{ + return ioend->io_private || + ioend->io_type == IOMAP_UNWRITTEN || + (ioend->io_flags & IOMAP_F_SHARED); +} + STATIC void xfs_end_bio( struct bio *bio) { - struct xfs_ioend *ioend = bio->bi_private; + struct iomap_ioend *ioend = bio->bi_private; struct xfs_inode *ip = XFS_I(ioend->io_inode); - struct xfs_mount *mp = ip->i_mount; unsigned long flags; - if (ioend->io_fork == XFS_COW_FORK || - ioend->io_state == XFS_EXT_UNWRITTEN || - ioend->io_append_trans != NULL) { - spin_lock_irqsave(&ip->i_ioend_lock, flags); - if (list_empty(&ip->i_ioend_list)) - WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue, - &ip->i_ioend_work)); - list_add_tail(&ioend->io_list, &ip->i_ioend_list); - spin_unlock_irqrestore(&ip->i_ioend_lock, flags); - } else - xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); + ASSERT(xfs_ioend_needs_workqueue(ioend)); + + spin_lock_irqsave(&ip->i_ioend_lock, flags); + if (list_empty(&ip->i_ioend_list)) + WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue, + &ip->i_ioend_work)); 
+ list_add_tail(&ioend->io_list, &ip->i_ioend_list); + spin_unlock_irqrestore(&ip->i_ioend_lock, flags); } /* @@ -421,19 +292,19 @@ xfs_end_bio( */ static bool xfs_imap_valid( - struct xfs_writepage_ctx *wpc, + struct iomap_writepage_ctx *wpc, struct xfs_inode *ip, - xfs_fileoff_t offset_fsb) + loff_t offset) { - if (offset_fsb < wpc->imap.br_startoff || - offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount) + if (offset < wpc->iomap.offset || + offset >= wpc->iomap.offset + wpc->iomap.length) return false; /* * If this is a COW mapping, it is sufficient to check that the mapping * covers the offset. Be careful to check this first because the caller * can revalidate a COW mapping without updating the data seqno. */ - if (wpc->fork == XFS_COW_FORK) + if (wpc->iomap.flags & IOMAP_F_SHARED) return true; /* @@ -443,17 +314,17 @@ xfs_imap_valid( * checked (and found nothing at this offset) could have added * overlapping blocks. */ - if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq)) + if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) return false; if (xfs_inode_has_cow_data(ip) && - wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) return false; return true; } /* * Pass in a dellalloc extent and convert it to real extents, return the real - * extent that maps offset_fsb in wpc->imap. + * extent that maps offset_fsb in wpc->iomap. * * The current page is held locked so nothing could have removed the block * backing offset_fsb, although it could have moved from the COW to the data @@ -461,32 +332,38 @@ xfs_imap_valid( */ static int xfs_convert_blocks( - struct xfs_writepage_ctx *wpc, + struct iomap_writepage_ctx *wpc, struct xfs_inode *ip, - xfs_fileoff_t offset_fsb) + int whichfork, + loff_t offset) { int error; + unsigned *seq; + + if (whichfork == XFS_COW_FORK) + seq = &XFS_WPC(wpc)->cow_seq; + else + seq = &XFS_WPC(wpc)->data_seq; /* - * Attempt to allocate whatever delalloc extent currently backs - * offset_fsb and put the result into wpc->imap. Allocate in a loop - * because it may take several attempts to allocate real blocks for a - * contiguous delalloc extent if free space is sufficiently fragmented. + * Attempt to allocate whatever delalloc extent currently backs offset + * and put the result into wpc->iomap. Allocate in a loop because it + * may take several attempts to allocate real blocks for a contiguous + * delalloc extent if free space is sufficiently fragmented. */ do { - error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb, - &wpc->imap, wpc->fork == XFS_COW_FORK ? - &wpc->cow_seq : &wpc->data_seq); + error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, + &wpc->iomap, seq); if (error) return error; - } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb); + } while (wpc->iomap.offset + wpc->iomap.length <= offset); return 0; } -STATIC int +static int xfs_map_blocks( - struct xfs_writepage_ctx *wpc, + struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset) { @@ -496,6 +373,7 @@ xfs_map_blocks( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_fileoff_t cow_fsb = NULLFILEOFF; + int whichfork = XFS_DATA_FORK; struct xfs_bmbt_irec imap; struct xfs_iext_cursor icur; int retries = 0; @@ -519,7 +397,7 @@ xfs_map_blocks( * against concurrent updates and provides a memory barrier on the way * out that ensures that we always see the current value. 
*/ - if (xfs_imap_valid(wpc, ip, offset_fsb)) + if (xfs_imap_valid(wpc, ip, offset)) return 0; /* @@ -541,10 +419,10 @@ retry: xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap)) cow_fsb = imap.br_startoff; if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { - wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); + XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - wpc->fork = XFS_COW_FORK; + whichfork = XFS_COW_FORK; goto allocate_blocks; } @@ -552,7 +430,7 @@ retry: * No COW extent overlap. Revalidate now that we may have updated * ->cow_seq. If the data mapping is still valid, we're done. */ - if (xfs_imap_valid(wpc, ip, offset_fsb)) { + if (xfs_imap_valid(wpc, ip, offset)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return 0; } @@ -564,11 +442,9 @@ retry: */ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) imap.br_startoff = end_fsb; /* fake a hole past EOF */ - wpc->data_seq = READ_ONCE(ip->i_df.if_seq); + XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - wpc->fork = XFS_DATA_FORK; - /* landed in a hole or beyond EOF? */ if (imap.br_startoff > offset_fsb) { imap.br_blockcount = imap.br_startoff - offset_fsb; @@ -592,11 +468,11 @@ retry: isnullstartblock(imap.br_startblock)) goto allocate_blocks; - wpc->imap = imap; - trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0); + trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: - error = xfs_convert_blocks(wpc, ip, offset_fsb); + error = xfs_convert_blocks(wpc, ip, whichfork, offset); if (error) { /* * If we failed to find the extent in the COW fork we might have @@ -605,7 +481,7 @@ allocate_blocks: * the former case, but prevent additional retries to avoid * looping forever for the latter case. */ - if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++) + if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++) goto retry; ASSERT(error != -EAGAIN); return error; @@ -616,34 +492,22 @@ allocate_blocks: * original delalloc one. Trim the return extent to the next COW * boundary again to force a re-lookup. */ - if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF && - cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount) - wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff; + if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) { + loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb); - ASSERT(wpc->imap.br_startoff <= offset_fsb); - ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb); - trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap); + if (cow_offset < wpc->iomap.offset + wpc->iomap.length) + wpc->iomap.length = cow_offset - wpc->iomap.offset; + } + + ASSERT(wpc->iomap.offset <= offset); + ASSERT(wpc->iomap.offset + wpc->iomap.length > offset); + trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap); return 0; } -/* - * Submit the bio for an ioend. We are passed an ioend with a bio attached to - * it, and we submit that bio. The ioend may be used for multiple bio - * submissions, so we only want to allocate an append transaction for the ioend - * once. In the case of multiple bio submission, each bio will take an IO - * reference to the ioend to ensure that the ioend completion is only done once - * all bios have been submitted and the ioend is really done. 
- * - * If @status is non-zero, it means that we have a situation where some part of - * the submission process has failed after we have marked paged for writeback - * and unlocked them. In this situation, we need to fail the bio and ioend - * rather than submit it to IO. This typically only happens on a filesystem - * shutdown. - */ -STATIC int -xfs_submit_ioend( - struct writeback_control *wbc, - struct xfs_ioend *ioend, +static int +xfs_prepare_ioend( + struct iomap_ioend *ioend, int status) { unsigned int nofs_flag; @@ -656,157 +520,24 @@ xfs_submit_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && ioend->io_fork == XFS_COW_FORK) { + if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && - (ioend->io_fork == XFS_COW_FORK || - ioend->io_state != XFS_EXT_UNWRITTEN) && + ((ioend->io_flags & IOMAP_F_SHARED) || + ioend->io_type != IOMAP_UNWRITTEN) && xfs_ioend_is_append(ioend) && - !ioend->io_append_trans) + !ioend->io_private) status = xfs_setfilesize_trans_alloc(ioend); memalloc_nofs_restore(nofs_flag); - ioend->io_bio->bi_private = ioend; - ioend->io_bio->bi_end_io = xfs_end_bio; - - /* - * If we are failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately - * as there is only one reference to the ioend at this point in - * time. - */ - if (status) { - ioend->io_bio->bi_status = errno_to_blk_status(status); - bio_endio(ioend->io_bio); - return status; - } - - submit_bio(ioend->io_bio); - return 0; -} - -static struct xfs_ioend * -xfs_alloc_ioend( - struct inode *inode, - int fork, - xfs_exntst_t state, - xfs_off_t offset, - struct block_device *bdev, - sector_t sector, - struct writeback_control *wbc) -{ - struct xfs_ioend *ioend; - struct bio *bio; - - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset); - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = sector; - bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); - bio->bi_write_hint = inode->i_write_hint; - wbc_init_bio(wbc, bio); - - ioend = container_of(bio, struct xfs_ioend, io_inline_bio); - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_fork = fork; - ioend->io_state = state; - ioend->io_inode = inode; - ioend->io_size = 0; - ioend->io_offset = offset; - ioend->io_append_trans = NULL; - ioend->io_bio = bio; - return ioend; -} - -/* - * Allocate a new bio, and chain the old bio to the new one. - * - * Note that we have to do perform the chaining in this unintuitive order - * so that the bi_private linkage is set up in the right direction for the - * traversal in xfs_destroy_ioend(). - */ -static struct bio * -xfs_chain_bio( - struct bio *prev) -{ - struct bio *new; - - new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); - bio_copy_dev(new, prev);/* also copies over blkcg information */ - new->bi_iter.bi_sector = bio_end_sector(prev); - new->bi_opf = prev->bi_opf; - new->bi_write_hint = prev->bi_write_hint; - - bio_chain(prev, new); - bio_get(prev); /* for xfs_destroy_ioend */ - submit_bio(prev); - return new; -} - -/* - * Test to see if we have an existing ioend structure that we could append to - * first, otherwise finish off the current ioend and start another. 
- */ -STATIC void -xfs_add_to_ioend( - struct inode *inode, - xfs_off_t offset, - struct page *page, - struct iomap_page *iop, - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc, - struct list_head *iolist) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct block_device *bdev = xfs_find_bdev_for_inode(inode); - unsigned len = i_blocksize(inode); - unsigned poff = offset & (PAGE_SIZE - 1); - bool merged, same_page = false; - sector_t sector; - - sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) + - ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9); - - if (!wpc->ioend || - wpc->fork != wpc->ioend->io_fork || - wpc->imap.br_state != wpc->ioend->io_state || - sector != bio_end_sector(wpc->ioend->io_bio) || - offset != wpc->ioend->io_offset + wpc->ioend->io_size) { - if (wpc->ioend) - list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, - wpc->imap.br_state, offset, bdev, sector, wbc); - } - - merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, - &same_page); - - if (iop && !same_page) - atomic_inc(&iop->write_count); - - if (!merged) { - if (bio_full(wpc->ioend->io_bio, len)) - wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio); - bio_add_page(wpc->ioend->io_bio, page, len, poff); - } - - wpc->ioend->io_size += len; - wbc_account_cgroup_owner(wbc, page, len); -} - -STATIC void -xfs_vm_invalidatepage( - struct page *page, - unsigned int offset, - unsigned int length) -{ - trace_xfs_invalidatepage(page->mapping->host, page, offset, length); - iomap_invalidatepage(page, offset, length); + if (xfs_ioend_needs_workqueue(ioend)) + ioend->io_bio->bi_end_io = xfs_end_bio; + return status; } /* @@ -820,8 +551,8 @@ xfs_vm_invalidatepage( * transaction as there is no space left for block reservation (typically why we * see a ENOSPC in writeback). */ -STATIC void -xfs_aops_discard_page( +static void +xfs_discard_page( struct page *page) { struct inode *inode = page->mapping->host; @@ -843,246 +574,14 @@ xfs_aops_discard_page( if (error && !XFS_FORCED_SHUTDOWN(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: - xfs_vm_invalidatepage(page, 0, PAGE_SIZE); + iomap_invalidatepage(page, 0, PAGE_SIZE); } -/* - * We implement an immediate ioend submission policy here to avoid needing to - * chain multiple ioends and hence nest mempool allocations which can violate - * forward progress guarantees we need to provide. The current ioend we are - * adding blocks to is cached on the writepage context, and if the new block - * does not append to the cached ioend it will create a new ioend and cache that - * instead. - * - * If a new ioend is created and cached, the old ioend is returned and queued - * locally for submission once the entire page is processed or an error has been - * detected. While ioends are submitted immediately after they are completed, - * batching optimisations are provided by higher level block plugging. - * - * At the end of a writeback pass, there will be a cached ioend remaining on the - * writepage context that the caller will need to submit. 
- */ -static int -xfs_writepage_map( - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc, - struct inode *inode, - struct page *page, - uint64_t end_offset) -{ - LIST_HEAD(submit_list); - struct iomap_page *iop = to_iomap_page(page); - unsigned len = i_blocksize(inode); - struct xfs_ioend *ioend, *next; - uint64_t file_offset; /* file offset of page */ - int error = 0, count = 0, i; - - ASSERT(iop || i_blocksize(inode) == PAGE_SIZE); - ASSERT(!iop || atomic_read(&iop->write_count) == 0); - - /* - * Walk through the page to find areas to write back. If we run off the - * end of the current map or find the current map invalid, grab a new - * one. - */ - for (i = 0, file_offset = page_offset(page); - i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset; - i++, file_offset += len) { - if (iop && !test_bit(i, iop->uptodate)) - continue; - - error = xfs_map_blocks(wpc, inode, file_offset); - if (error) - break; - if (wpc->imap.br_startblock == HOLESTARTBLOCK) - continue; - xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, - &submit_list); - count++; - } - - ASSERT(wpc->ioend || list_empty(&submit_list)); - ASSERT(PageLocked(page)); - ASSERT(!PageWriteback(page)); - - /* - * On error, we have to fail the ioend here because we may have set - * pages under writeback, we have to make sure we run IO completion to - * mark the error state of the IO appropriately, so we can't cancel the - * ioend directly here. That means we have to mark this page as under - * writeback if we included any blocks from it in the ioend chain so - * that completion treats it correctly. - * - * If we didn't include the page in the ioend, the on error we can - * simply discard and unlock it as there are no other users of the page - * now. The caller will still need to trigger submission of outstanding - * ioends on the writepage context so they are treated correctly on - * error. - */ - if (unlikely(error)) { - if (!count) { - xfs_aops_discard_page(page); - ClearPageUptodate(page); - unlock_page(page); - goto done; - } - - /* - * If the page was not fully cleaned, we need to ensure that the - * higher layers come back to it correctly. That means we need - * to keep the page dirty, and for WB_SYNC_ALL writeback we need - * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed - * so another attempt to write this page in this writeback sweep - * will be made. - */ - set_page_writeback_keepwrite(page); - } else { - clear_page_dirty_for_io(page); - set_page_writeback(page); - } - - unlock_page(page); - - /* - * Preserve the original error if there was one, otherwise catch - * submission errors here and propagate into subsequent ioend - * submissions. - */ - list_for_each_entry_safe(ioend, next, &submit_list, io_list) { - int error2; - - list_del_init(&ioend->io_list); - error2 = xfs_submit_ioend(wbc, ioend, error); - if (error2 && !error) - error = error2; - } - - /* - * We can end up here with no error and nothing to write only if we race - * with a partial page truncate on a sub-page block sized filesystem. - */ - if (!count) - end_page_writeback(page); -done: - mapping_set_error(page->mapping, error); - return error; -} - -/* - * Write out a dirty page. - * - * For delalloc space on the page we need to allocate space and flush it. - * For unwritten space on the page we need to start the conversion to - * regular allocated space. 
- */ -STATIC int -xfs_do_writepage( - struct page *page, - struct writeback_control *wbc, - void *data) -{ - struct xfs_writepage_ctx *wpc = data; - struct inode *inode = page->mapping->host; - loff_t offset; - uint64_t end_offset; - pgoff_t end_index; - - trace_xfs_writepage(inode, page, 0, 0); - - /* - * Refuse to write the page out if we are called from reclaim context. - * - * This avoids stack overflows when called from deeply used stacks in - * random callers for direct reclaim or memcg reclaim. We explicitly - * allow reclaim from kswapd as the stack usage there is relatively low. - * - * This should never happen except in the case of a VM regression so - * warn about it. - */ - if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC)) - goto redirty; - - /* - * Given that we do not allow direct reclaim to call us, we should - * never be called while in a filesystem transaction. - */ - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) - goto redirty; - - /* - * Is this page beyond the end of the file? - * - * The page index is less than the end_index, adjust the end_offset - * to the highest offset that this page should represent. - * ----------------------------------------------------- - * | file mapping | | - * ----------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | | - * ^--------------------------------^----------|-------- - * | desired writeback range | see else | - * ---------------------------------^------------------| - */ - offset = i_size_read(inode); - end_index = offset >> PAGE_SHIFT; - if (page->index < end_index) - end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT; - else { - /* - * Check whether the page to write out is beyond or straddles - * i_size or not. - * ------------------------------------------------------- - * | file mapping | | - * ------------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | - * ^--------------------------------^-----------|--------- - * | | Straddles | - * ---------------------------------^-----------|--------| - */ - unsigned offset_into_page = offset & (PAGE_SIZE - 1); - - /* - * Skip the page if it is fully outside i_size, e.g. due to a - * truncate operation that is in progress. We must redirty the - * page so that reclaim stops reclaiming it. Otherwise - * xfs_vm_releasepage() is called on it and gets confused. - * - * Note that the end_index is unsigned long, it would overflow - * if the given offset is greater than 16TB on 32-bit system - * and if we do check the page is fully outside i_size or not - * via "if (page->index >= end_index + 1)" as "end_index + 1" - * will be evaluated to 0. Hence this page will be redirtied - * and be written out repeatedly which would result in an - * infinite loop, the user program that perform this operation - * will hang. Instead, we can verify this situation by checking - * if the page to write is totally beyond the i_size or if it's - * offset is just equal to the EOF. - */ - if (page->index > end_index || - (page->index == end_index && offset_into_page == 0)) - goto redirty; - - /* - * The page straddles i_size. It must be zeroed out on each - * and every writepage invocation because it may be mmapped. - * "A file is mapped in multiples of the page size. For a file - * that is not a multiple of the page size, the remaining - * memory is zeroed when mapped, and writes to that region are - * not written out to the file." 
- */ - zero_user_segment(page, offset_into_page, PAGE_SIZE); - - /* Adjust the end_offset to the end of file */ - end_offset = offset; - } - - return xfs_writepage_map(wpc, wbc, inode, page, end_offset); - -redirty: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} +static const struct iomap_writeback_ops xfs_writeback_ops = { + .map_blocks = xfs_map_blocks, + .prepare_ioend = xfs_prepare_ioend, + .discard_page = xfs_discard_page, +}; STATIC int xfs_vm_writepage( @@ -1090,12 +589,8 @@ xfs_vm_writepage( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { }; - int ret; - ret = xfs_do_writepage(page, wbc, &wpc); - if (wpc.ioend) - ret = xfs_submit_ioend(wbc, wpc.ioend, ret); - return ret; + return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops); } STATIC int @@ -1104,13 +599,9 @@ xfs_vm_writepages( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { }; - int ret; xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc); - if (wpc.ioend) - ret = xfs_submit_ioend(wbc, wpc.ioend, ret); - return ret; + return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } STATIC int @@ -1123,15 +614,6 @@ xfs_dax_writepages( xfs_find_bdev_for_inode(mapping->host), wbc); } -STATIC int -xfs_vm_releasepage( - struct page *page, - gfp_t gfp_mask) -{ - trace_xfs_releasepage(page->mapping->host, page, 0, 0); - return iomap_releasepage(page, gfp_mask); -} - STATIC sector_t xfs_vm_bmap( struct address_space *mapping, @@ -1160,7 +642,6 @@ xfs_vm_readpage( struct file *unused, struct page *page) { - trace_xfs_vm_readpage(page->mapping->host, 1); return iomap_readpage(page, &xfs_iomap_ops); } @@ -1171,7 +652,6 @@ xfs_vm_readpages( struct list_head *pages, unsigned nr_pages) { - trace_xfs_vm_readpages(mapping->host, nr_pages); return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops); } @@ -1191,8 +671,8 @@ const struct address_space_operations xfs_address_space_operations = { .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = iomap_set_page_dirty, - .releasepage = xfs_vm_releasepage, - .invalidatepage = xfs_vm_invalidatepage, + .releasepage = iomap_releasepage, + .invalidatepage = iomap_invalidatepage, .bmap = xfs_vm_bmap, .direct_IO = noop_direct_IO, .migratepage = iomap_migrate_page, diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 45a1ea240cbb..687b11f34fa2 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -6,23 +6,6 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern struct bio_set xfs_ioend_bioset; - -/* - * Structure for buffered I/O completions. - */ -struct xfs_ioend { - struct list_head io_list; /* next ioend in chain */ - int io_fork; /* inode fork written back */ - xfs_exntst_t io_state; /* extent state */ - struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of the extent */ - xfs_off_t io_offset; /* offset in the file */ - struct xfs_trans *io_append_trans;/* xact. for size update */ - struct bio *io_bio; /* bio being built */ - struct bio io_inline_bio; /* MUST BE LAST! 
*/ -}; - extern const struct address_space_operations xfs_address_space_operations; extern const struct address_space_operations xfs_dax_aops; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1ffb179f35d2..c0620135a279 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -188,7 +188,7 @@ xfs_file_dio_aio_read( file_accessed(iocb->ki_filp); xfs_ilock(ip, XFS_IOLOCK_SHARED); - ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL, is_sync_kiocb(iocb)); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -547,15 +547,12 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); - ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops); - /* - * If unaligned, this is the only IO in-flight. If it has not yet - * completed, wait on it before we release the iolock to prevent - * subsequent overlapping IO. + * If unaligned, this is the only IO in-flight. Wait on it before we + * release the iolock to prevent subsequent overlapping IO. */ - if (ret == -EIOCBQUEUED && unaligned_io) - inode_dio_wait(inode); + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops, + is_sync_kiocb(iocb) || unaligned_io); out: xfs_iunlock(ip, iolock); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f780e223b118..95719e161286 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -54,7 +54,7 @@ xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, - bool shared) + u16 flags) { struct xfs_mount *mp = ip->i_mount; @@ -79,12 +79,11 @@ xfs_bmbt_to_iomap( iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + iomap->flags = flags; if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; - if (shared) - iomap->flags |= IOMAP_F_SHARED; return 0; } @@ -540,6 +539,7 @@ xfs_file_iomap_begin_delay( struct xfs_iext_cursor icur, ccur; xfs_fsblock_t prealloc_blocks = 0; bool eof = false, cow_eof = false, shared = false; + u16 iomap_flags = 0; int whichfork = XFS_DATA_FORK; int error = 0; @@ -707,22 +707,28 @@ retry: * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. */ - iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, count, whichfork, - whichfork == XFS_DATA_FORK ? 
&imap : &cmap);
+	if (whichfork == XFS_DATA_FORK) {
+		iomap_flags |= IOMAP_F_NEW;
+		trace_xfs_iomap_alloc(ip, offset, count, whichfork, &imap);
+	} else {
+		trace_xfs_iomap_alloc(ip, offset, count, whichfork, &cmap);
+	}
 done:
 	if (whichfork == XFS_COW_FORK) {
 		if (imap.br_startoff > offset_fsb) {
 			xfs_trim_extent(&cmap, offset_fsb,
 						imap.br_startoff - offset_fsb);
-			error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+			error = xfs_bmbt_to_iomap(ip, iomap, &cmap,
+						  IOMAP_F_SHARED);
 			goto out_unlock;
 		}
 		/* ensure we only report blocks we have a reservation for */
 		xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
 		shared = true;
 	}
-	error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+	if (shared)
+		iomap_flags |= IOMAP_F_SHARED;
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
 out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
@@ -922,7 +928,8 @@ xfs_file_iomap_begin(
 	loff_t offset,
 	loff_t length,
 	unsigned flags,
-	struct iomap *iomap)
+	struct iomap *iomap,
+	struct iomap *srcmap)
 {
 	struct xfs_inode *ip = XFS_I(inode);
 	struct xfs_mount *mp = ip->i_mount;
@@ -930,6 +937,7 @@ xfs_file_iomap_begin(
 	xfs_fileoff_t offset_fsb, end_fsb;
 	int nimaps = 1, error = 0;
 	bool shared = false;
+	u16 iomap_flags = 0;
 	unsigned lockmode;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
@@ -1045,11 +1053,20 @@ xfs_file_iomap_begin(
 	if (error)
 		return error;
 
-	iomap->flags |= IOMAP_F_NEW;
+	iomap_flags |= IOMAP_F_NEW;
 	trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
 
 out_finish:
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+	/*
+	 * Writes that span EOF might trigger an IO size update on completion,
+	 * so consider them to be dirty for the purposes of O_DSYNC even if
+	 * no other metadata changes are pending or have been made here.
+	 */
+	if ((flags & IOMAP_WRITE) && offset + length > i_size_read(inode))
+		iomap_flags |= IOMAP_F_DIRTY;
+	if (shared)
+		iomap_flags |= IOMAP_F_SHARED;
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
 
 out_found:
 	ASSERT(nimaps);
@@ -1145,7 +1162,8 @@ xfs_seek_iomap_begin(
 	loff_t offset,
 	loff_t length,
 	unsigned flags,
-	struct iomap *iomap)
+	struct iomap *iomap,
+	struct iomap *srcmap)
 {
 	struct xfs_inode *ip = XFS_I(inode);
 	struct xfs_mount *mp = ip->i_mount;
@@ -1193,7 +1211,7 @@ xfs_seek_iomap_begin(
 		if (data_fsb < cow_fsb + cmap.br_blockcount)
 			end_fsb = min(end_fsb, data_fsb);
 		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
-		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
 		/*
 		 * This is a COW extent, so we must probe the page cache
 		 * because there could be dirty page cache being backed
@@ -1215,7 +1233,7 @@ xfs_seek_iomap_begin(
 		imap.br_state = XFS_EXT_NORM;
 done:
 	xfs_trim_extent(&imap, offset_fsb, end_fsb);
-	error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+	error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
 out_unlock:
 	xfs_iunlock(ip, lockmode);
 	return error;
@@ -1231,7 +1249,8 @@ xfs_xattr_iomap_begin(
 	loff_t offset,
 	loff_t length,
 	unsigned flags,
-	struct iomap *iomap)
+	struct iomap *iomap,
+	struct iomap *srcmap)
 {
 	struct xfs_inode *ip = XFS_I(inode);
 	struct xfs_mount *mp = ip->i_mount;
@@ -1261,7 +1280,7 @@ out_unlock:
 	if (error)
 		return error;
 	ASSERT(nimaps);
-	return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+	return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
 }
 
 const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 5c2f6aa6d78f..71d0ae460c44 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -16,7 +16,7 @@ int 
xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *, bool shared); + struct xfs_bmbt_irec *, u16); xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); static inline xfs_filblks_t diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index a339bd5fa260..9c96493be9e0 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -178,7 +178,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 0f08153b4994..a9634110c783 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1442,7 +1442,7 @@ xfs_reflink_dirty_extents( flen = XFS_FSB_TO_B(mp, rlen); if (fpos + flen > isize) flen = isize - fpos; - error = iomap_file_dirty(VFS_I(ip), fpos, flen, + error = iomap_file_unshare(VFS_I(ip), fpos, flen, &xfs_iomap_ops); xfs_ilock(ip, XFS_ILOCK_EXCL); if (error) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8d1df9f8be07..0a8cf6b87a21 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -40,7 +40,6 @@ #include static const struct super_operations xfs_super_operations; -struct bio_set xfs_ioend_bioset; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG @@ -1853,15 +1852,10 @@ MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) { - if (bioset_init(&xfs_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct xfs_ioend, io_inline_bio), - BIOSET_NEED_BVECS)) - goto out; - xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), "xfs_log_ticket"); if (!xfs_log_ticket_zone) - goto out_free_ioend_bioset; + goto out; xfs_bmap_free_item_zone = kmem_zone_init( sizeof(struct xfs_extent_free_item), @@ -1996,8 +1990,6 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_bmap_free_item_zone); out_destroy_log_ticket_zone: kmem_zone_destroy(xfs_log_ticket_zone); - out_free_ioend_bioset: - bioset_exit(&xfs_ioend_bioset); out: return -ENOMEM; } @@ -2028,7 +2020,6 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); kmem_zone_destroy(xfs_log_ticket_zone); - bioset_exit(&xfs_ioend_bioset); } STATIC int __init diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index eaae275ed430..cbb23d7a3554 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1158,71 +1158,6 @@ DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); -DECLARE_EVENT_CLASS(xfs_page_class, - TP_PROTO(struct inode *inode, struct page *page, unsigned long off, - unsigned int len), - TP_ARGS(inode, page, off, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(pgoff_t, pgoff) - __field(loff_t, size) - __field(unsigned long, offset) - __field(unsigned int, length) - ), - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = XFS_I(inode)->i_ino; - __entry->pgoff = page_offset(page); - __entry->size = i_size_read(inode); - __entry->offset = off; - __entry->length = len; - ), - TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " - "length %x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->pgoff, - __entry->size, - __entry->offset, - __entry->length) -) - -#define 
DEFINE_PAGE_EVENT(name) \
-DEFINE_EVENT(xfs_page_class, name, \
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
-		 unsigned int len), \
-	TP_ARGS(inode, page, off, len))
-DEFINE_PAGE_EVENT(xfs_writepage);
-DEFINE_PAGE_EVENT(xfs_releasepage);
-DEFINE_PAGE_EVENT(xfs_invalidatepage);
-
-DECLARE_EVENT_CLASS(xfs_readpage_class,
-	TP_PROTO(struct inode *inode, int nr_pages),
-	TP_ARGS(inode, nr_pages),
-	TP_STRUCT__entry(
-		__field(dev_t, dev)
-		__field(xfs_ino_t, ino)
-		__field(int, nr_pages)
-	),
-	TP_fast_assign(
-		__entry->dev = inode->i_sb->s_dev;
-		__entry->ino = inode->i_ino;
-		__entry->nr_pages = nr_pages;
-	),
-	TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->ino,
-		  __entry->nr_pages)
-)
-
-#define DEFINE_READPAGE_EVENT(name) \
-DEFINE_EVENT(xfs_readpage_class, name, \
-	TP_PROTO(struct inode *inode, int nr_pages), \
-	TP_ARGS(inode, nr_pages))
-DEFINE_READPAGE_EVENT(xfs_vm_readpage);
-DEFINE_READPAGE_EVENT(xfs_vm_readpages);
-
 DECLARE_EVENT_CLASS(xfs_imap_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
 		 int whichfork, struct xfs_bmbt_irec *irec),
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 7aa5d6117936..8b09463dae0d 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -4,6 +4,7 @@
 
 #include <linux/atomic.h>
 #include <linux/bitmap.h>
+#include <linux/blk_types.h>
 #include <linux/mm.h>
 #include <linux/types.h>
 #include <linux/mm_types.h>
@@ -12,6 +13,7 @@
 struct address_space;
 struct fiemap_extent_info;
 struct inode;
+struct iomap_writepage_ctx;
 struct iov_iter;
 struct kiocb;
 struct page;
@@ -21,28 +23,45 @@ struct vm_fault;
 /*
  * Types of block ranges for iomap mappings:
  */
-#define IOMAP_HOLE	0x01	/* no blocks allocated, need allocation */
-#define IOMAP_DELALLOC	0x02	/* delayed allocation blocks */
-#define IOMAP_MAPPED	0x03	/* blocks allocated at @addr */
-#define IOMAP_UNWRITTEN	0x04	/* blocks allocated at @addr in unwritten state */
-#define IOMAP_INLINE	0x05	/* data inline in the inode */
+#define IOMAP_HOLE	0	/* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC	1	/* delayed allocation blocks */
+#define IOMAP_MAPPED	2	/* blocks allocated at @addr */
+#define IOMAP_UNWRITTEN	3	/* blocks allocated at @addr in unwritten state */
+#define IOMAP_INLINE	4	/* data inline in the inode */
 
 /*
- * Flags for all iomap mappings:
+ * Flags reported by the file system from iomap_begin:
+ *
+ * IOMAP_F_NEW indicates that the blocks have been newly allocated and need
+ * zeroing for areas to which no data is copied.
  *
  * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access
  * written data and requires fdatasync to commit them to persistent storage.
+ * This needs to take into account metadata changes that *may* be made at IO
+ * completion, such as file size updates from direct IO.
+ *
+ * IOMAP_F_SHARED indicates that the blocks are shared, and will need to be
+ * unshared as part of a write.
+ *
+ * IOMAP_F_MERGED indicates that the iomap contains the merge of multiple block
+ * mappings.
+ *
+ * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of
+ * buffer heads for this mapping.
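+ *
+ * IOMAP_F_SHARED and IOMAP_F_MERGED are no longer specific to IOMAP_REPORT
+ * requests and may now be reported for any operation.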
*/ -#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ -#define IOMAP_F_DIRTY 0x02 /* uncommitted metadata */ -#define IOMAP_F_BUFFER_HEAD 0x04 /* file system requires buffer heads */ -#define IOMAP_F_SIZE_CHANGED 0x08 /* file size has changed */ +#define IOMAP_F_NEW 0x01 +#define IOMAP_F_DIRTY 0x02 +#define IOMAP_F_SHARED 0x04 +#define IOMAP_F_MERGED 0x08 +#define IOMAP_F_BUFFER_HEAD 0x10 /* - * Flags that only need to be reported for IOMAP_REPORT requests: + * Flags set by the core iomap code during operations: + * + * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size + * has changed as the result of this write operation. */ -#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */ -#define IOMAP_F_SHARED 0x20 /* block shared with another file */ +#define IOMAP_F_SIZE_CHANGED 0x100 /* * Flags from 0x1000 up are for file system specific usage: @@ -110,7 +129,8 @@ struct iomap_ops { * The actual length is returned in iomap->length. */ int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, struct iomap *iomap); + unsigned flags, struct iomap *iomap, + struct iomap *srcmap); /* * Commit and/or unreserve space previous allocated using iomap_begin. @@ -126,29 +146,12 @@ struct iomap_ops { * Main iomap iterator function. */ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, - void *data, struct iomap *iomap); + void *data, struct iomap *iomap, struct iomap *srcmap); loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, const struct iomap_ops *ops, void *data, iomap_actor_t actor); -/* - * Structure allocate for each page when block size < PAGE_SIZE to track - * sub-page uptodate status and I/O completions. - */ -struct iomap_page { - atomic_t read_count; - atomic_t write_count; - DECLARE_BITMAP(uptodate, PAGE_SIZE / 512); -}; - -static inline struct iomap_page *to_iomap_page(struct page *page) -{ - if (page_has_private(page)) - return (struct iomap_page *)page_private(page); - return NULL; -} - ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_readpage(struct page *page, const struct iomap_ops *ops); @@ -166,7 +169,7 @@ int iomap_migrate_page(struct address_space *mapping, struct page *newpage, #else #define iomap_migrate_page NULL #endif -int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, +int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); @@ -183,6 +186,63 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset, sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); +/* + * Structure for writeback I/O completions. + */ +struct iomap_ioend { + struct list_head io_list; /* next ioend in chain */ + u16 io_type; + u16 io_flags; /* IOMAP_F_* */ + struct inode *io_inode; /* file being written to */ + size_t io_size; /* size of the extent */ + loff_t io_offset; /* offset in the file */ + void *io_private; /* file system private data */ + struct bio *io_bio; /* bio being built */ + struct bio io_inline_bio; /* MUST BE LAST! */ +}; + +struct iomap_writeback_ops { + /* + * Required, maps the blocks so that writeback can be performed on + * the range starting at offset. 
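+	 *
+	 * The mapping is returned in wpc->iomap; implementations typically
+	 * cache it there so that subsequent blocks that fall inside a still
+	 * valid mapping do not require another lookup.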
+	 */
+	int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
+			loff_t offset);
+
+	/*
+	 * Optional, allows the file system to perform actions just before
+	 * submitting the bio and/or to override the bio end_io handler for
+	 * complex operations like copy on write extent manipulation or
+	 * unwritten extent conversions.
+	 */
+	int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
+
+	/*
+	 * Optional, allows the file system to discard state on a page where
+	 * we failed to submit any I/O.
+	 */
+	void (*discard_page)(struct page *page);
+};
+
+struct iomap_writepage_ctx {
+	struct iomap		iomap;
+	struct iomap_ioend	*ioend;
+	const struct iomap_writeback_ops *ops;
+};
+
+void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
+void iomap_ioend_try_merge(struct iomap_ioend *ioend,
+		struct list_head *more_ioends,
+		void (*merge_private)(struct iomap_ioend *ioend,
+				struct iomap_ioend *next));
+void iomap_sort_ioends(struct list_head *ioend_list);
+int iomap_writepage(struct page *page, struct writeback_control *wbc,
+		struct iomap_writepage_ctx *wpc,
+		const struct iomap_writeback_ops *ops);
+int iomap_writepages(struct address_space *mapping,
+		struct writeback_control *wbc, struct iomap_writepage_ctx *wpc,
+		const struct iomap_writeback_ops *ops);
+
 /*
  * Flags for direct I/O ->end_io:
  */
@@ -195,7 +255,8 @@ struct iomap_dio_ops {
 };
 
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
-		const struct iomap_ops *ops, const struct iomap_dio_ops *dops);
+		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+		bool wait_for_completion);
 int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
 
 #ifdef CONFIG_SWAP