Merge branch 'mb/dio' into master
This commit is contained in:
commit
8d0d47ea16
13
fs/dax.c
13
fs/dax.c
@ -1090,7 +1090,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range);
|
||||
|
||||
static loff_t
|
||||
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
|
||||
struct iomap *iomap)
|
||||
struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
struct block_device *bdev = iomap->bdev;
|
||||
struct dax_device *dax_dev = iomap->dax_dev;
|
||||
@ -1247,7 +1247,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
|
||||
struct inode *inode = mapping->host;
|
||||
unsigned long vaddr = vmf->address;
|
||||
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
|
||||
struct iomap iomap = { 0 };
|
||||
struct iomap iomap = { .type = IOMAP_HOLE };
|
||||
struct iomap srcmap = { .type = IOMAP_HOLE };
|
||||
unsigned flags = IOMAP_FAULT;
|
||||
int error, major = 0;
|
||||
bool write = vmf->flags & FAULT_FLAG_WRITE;
|
||||
@ -1292,7 +1293,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
|
||||
* the file system block size to be equal the page size, which means
|
||||
* that we never have to deal with more than a single extent here.
|
||||
*/
|
||||
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
|
||||
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
|
||||
if (iomap_errp)
|
||||
*iomap_errp = error;
|
||||
if (error) {
|
||||
@ -1471,7 +1472,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
|
||||
unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
|
||||
struct inode *inode = mapping->host;
|
||||
vm_fault_t result = VM_FAULT_FALLBACK;
|
||||
struct iomap iomap = { 0 };
|
||||
struct iomap iomap = { .type = IOMAP_HOLE };
|
||||
struct iomap srcmap = { .type = IOMAP_HOLE };
|
||||
pgoff_t max_pgoff;
|
||||
void *entry;
|
||||
loff_t pos;
|
||||
@ -1546,7 +1548,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
|
||||
* to look up our filesystem block.
|
||||
*/
|
||||
pos = (loff_t)xas.xa_index << PAGE_SHIFT;
|
||||
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
|
||||
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
|
||||
&srcmap);
|
||||
if (error)
|
||||
goto unlock_entry;
|
||||
|
||||
|
@ -801,7 +801,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
|
||||
|
||||
#ifdef CONFIG_FS_DAX
|
||||
static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
|
||||
unsigned flags, struct iomap *iomap)
|
||||
unsigned flags, struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
unsigned int blkbits = inode->i_blkbits;
|
||||
unsigned long first_block = offset >> blkbits;
|
||||
|
@ -1584,7 +1584,6 @@ enum {
|
||||
EXT4_STATE_NO_EXPAND, /* No space for expansion */
|
||||
EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
|
||||
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
|
||||
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
|
||||
EXT4_STATE_NEWENTRY, /* File just added to dir */
|
||||
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
|
||||
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
|
||||
@ -2565,8 +2564,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create);
|
||||
int ext4_get_block(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create);
|
||||
int ext4_dio_get_block(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create);
|
||||
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh, int create);
|
||||
int ext4_walk_page_buffers(handle_t *handle,
|
||||
@ -3391,6 +3388,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
|
||||
}
|
||||
|
||||
extern const struct iomap_ops ext4_iomap_ops;
|
||||
extern const struct iomap_ops ext4_iomap_report_ops;
|
||||
|
||||
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
|
||||
{
|
||||
|
@ -1765,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
|
||||
*/
|
||||
if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
|
||||
return 0;
|
||||
/*
|
||||
* The check for IO to unwritten extent is somewhat racy as we
|
||||
* increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
|
||||
* dropping i_data_sem. But reserved blocks should save us in that
|
||||
* case.
|
||||
*/
|
||||
|
||||
if (ext4_ext_is_unwritten(ex1) &&
|
||||
(ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
|
||||
atomic_read(&EXT4_I(inode)->i_unwritten) ||
|
||||
(ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
|
||||
ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
|
||||
return 0;
|
||||
#ifdef AGGRESSIVE_TEST
|
||||
if (ext1_ee_len >= 4)
|
||||
|
418
fs/ext4/file.c
418
fs/ext4/file.c
@ -29,10 +29,58 @@
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include "ext4.h"
|
||||
#include "ext4_jbd2.h"
|
||||
#include "xattr.h"
|
||||
#include "acl.h"
|
||||
#include "truncate.h"
|
||||
|
||||
static bool ext4_dio_supported(struct inode *inode)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
|
||||
return false;
|
||||
if (fsverity_active(inode))
|
||||
return false;
|
||||
if (ext4_should_journal_data(inode))
|
||||
return false;
|
||||
if (ext4_has_inline_data(inode))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||
{
|
||||
ssize_t ret;
|
||||
struct inode *inode = file_inode(iocb->ki_filp);
|
||||
|
||||
if (iocb->ki_flags & IOCB_NOWAIT) {
|
||||
if (!inode_trylock_shared(inode))
|
||||
return -EAGAIN;
|
||||
} else {
|
||||
inode_lock_shared(inode);
|
||||
}
|
||||
|
||||
if (!ext4_dio_supported(inode)) {
|
||||
inode_unlock_shared(inode);
|
||||
/*
|
||||
* Fallback to buffered I/O if the operation being performed on
|
||||
* the inode is not supported by direct I/O. The IOCB_DIRECT
|
||||
* flag needs to be cleared here in order to ensure that the
|
||||
* direct I/O path within generic_file_read_iter() is not
|
||||
* taken.
|
||||
*/
|
||||
iocb->ki_flags &= ~IOCB_DIRECT;
|
||||
return generic_file_read_iter(iocb, to);
|
||||
}
|
||||
|
||||
ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
|
||||
is_sync_kiocb(iocb));
|
||||
inode_unlock_shared(inode);
|
||||
|
||||
file_accessed(iocb->ki_filp);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FS_DAX
|
||||
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||
@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||
|
||||
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||
{
|
||||
if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
|
||||
struct inode *inode = file_inode(iocb->ki_filp);
|
||||
|
||||
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
|
||||
return -EIO;
|
||||
|
||||
if (!iov_iter_count(to))
|
||||
return 0; /* skip atime */
|
||||
|
||||
#ifdef CONFIG_FS_DAX
|
||||
if (IS_DAX(file_inode(iocb->ki_filp)))
|
||||
if (IS_DAX(inode))
|
||||
return ext4_dax_read_iter(iocb, to);
|
||||
#endif
|
||||
if (iocb->ki_flags & IOCB_DIRECT)
|
||||
return ext4_dio_read_iter(iocb, to);
|
||||
|
||||
return generic_file_read_iter(iocb, to);
|
||||
}
|
||||
|
||||
@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ext4_unwritten_wait(struct inode *inode)
|
||||
{
|
||||
wait_queue_head_t *wq = ext4_ioend_wq(inode);
|
||||
|
||||
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
|
||||
}
|
||||
|
||||
/*
|
||||
* This tests whether the IO in question is block-aligned or not.
|
||||
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
|
||||
@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
|
||||
struct inode *inode = file_inode(iocb->ki_filp);
|
||||
ssize_t ret;
|
||||
|
||||
if (unlikely(IS_IMMUTABLE(inode)))
|
||||
return -EPERM;
|
||||
|
||||
ret = generic_write_checks(iocb, from);
|
||||
if (ret <= 0)
|
||||
return ret;
|
||||
|
||||
if (unlikely(IS_IMMUTABLE(inode)))
|
||||
return -EPERM;
|
||||
|
||||
/*
|
||||
* If we have encountered a bitmap-format file, the size limit
|
||||
* is smaller than s_maxbytes, which is for extent-mapped files.
|
||||
@ -180,32 +226,301 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
|
||||
return -EFBIG;
|
||||
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
|
||||
}
|
||||
|
||||
ret = file_modified(iocb->ki_filp);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return iov_iter_count(from);
|
||||
}
|
||||
|
||||
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
|
||||
struct iov_iter *from)
|
||||
{
|
||||
ssize_t ret;
|
||||
struct inode *inode = file_inode(iocb->ki_filp);
|
||||
|
||||
if (iocb->ki_flags & IOCB_NOWAIT)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
inode_lock(inode);
|
||||
ret = ext4_write_checks(iocb, from);
|
||||
if (ret <= 0)
|
||||
goto out;
|
||||
|
||||
current->backing_dev_info = inode_to_bdi(inode);
|
||||
ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
|
||||
current->backing_dev_info = NULL;
|
||||
|
||||
out:
|
||||
inode_unlock(inode);
|
||||
if (likely(ret > 0)) {
|
||||
iocb->ki_pos += ret;
|
||||
ret = generic_write_sync(iocb, ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Finish a write that was set up as possibly extending the inode past
 * i_disksize (the caller put the inode on the orphan list beforehand).
 *
 * @inode:   inode that was written to
 * @offset:  file position at which the write started
 * @written: bytes actually written, or a negative errno from the I/O
 * @count:   bytes the caller intended to write
 *
 * Returns @written, or a negative errno if a journal handle could not be
 * started.
 */
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
					   ssize_t written, size_t count)
{
	handle_t *handle;
	bool truncate = false;
	u8 blkbits = inode->i_blkbits;
	/*
	 * Block-aligned *byte* offsets.  These must be loff_t: storing them
	 * in a 32-bit block-number type would truncate them for positions
	 * at or beyond 4GiB and make the comparison below unreliable.
	 */
	loff_t written_end, alloc_end;

	/*
	 * Note that EXT4_I(inode)->i_disksize can get extended up to
	 * inode->i_size while the I/O was running due to writeback of delalloc
	 * blocks. But, the code in ext4_iomap_alloc() is careful to use
	 * zeroed/unwritten extents if this is possible; thus we won't leave
	 * uninitialized blocks in a file even if we didn't succeed in writing
	 * as much as we intended.
	 */
	WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
	if (offset + count <= EXT4_I(inode)->i_disksize) {
		/*
		 * We need to ensure that the inode is removed from the orphan
		 * list if it has been added prematurely, due to writeback of
		 * delalloc blocks.
		 */
		if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

			if (IS_ERR(handle)) {
				/* No handle: only drop the in-memory entry. */
				ext4_orphan_del(NULL, inode);
				return PTR_ERR(handle);
			}

			ext4_orphan_del(handle, inode);
			ext4_journal_stop(handle);
		}

		return written;
	}

	/* The I/O itself failed: trim whatever was allocated for it. */
	if (written < 0)
		goto truncate;

	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
	if (IS_ERR(handle)) {
		written = PTR_ERR(handle);
		goto truncate;
	}

	if (ext4_update_inode_size(inode, offset + written))
		ext4_mark_inode_dirty(handle, inode);

	/*
	 * We may need to truncate allocated but not written blocks beyond EOF.
	 */
	written_end = ALIGN(offset + written, 1 << blkbits);
	alloc_end = ALIGN(offset + count, 1 << blkbits);
	if (written_end < alloc_end && ext4_can_truncate(inode))
		truncate = true;

	/*
	 * Remove the inode from the orphan list if it has been extended and
	 * everything went OK.
	 */
	if (!truncate && inode->i_nlink)
		ext4_orphan_del(handle, inode);
	ext4_journal_stop(handle);

	if (truncate) {
truncate:
		ext4_truncate_failed_write(inode);
		/*
		 * If the truncate operation failed early, then the inode may
		 * still be on the orphan list. In that case, we need to try
		 * remove the inode from the in-memory linked list.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	return written;
}
|
||||
|
||||
/*
 * Completion callback for direct I/O writes (installed as
 * ext4_dio_write_ops.end_io).
 *
 * A non-zero @error is passed straight back.  Otherwise, when @size is
 * non-zero and @flags carries IOMAP_DIO_UNWRITTEN, the range starting at
 * iocb->ki_pos is handed to ext4_convert_unwritten_extents() and its result
 * is returned.  In every other case 0 is returned.
 */
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
				 int error, unsigned int flags)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	loff_t pos = iocb->ki_pos;

	if (error)
		return error;

	if (!size || !(flags & IOMAP_DIO_UNWRITTEN))
		return 0;

	return ext4_convert_unwritten_extents(NULL, inode, pos, size);
}
|
||||
|
||||
/*
 * iomap direct I/O callbacks used for writes; passed to iomap_dio_rw() by
 * ext4_dio_write_iter().  Only the completion hook is provided.
 */
static const struct iomap_dio_ops ext4_dio_write_ops = {
|
||||
.end_io = ext4_dio_write_end_io,
|
||||
};
|
||||
|
||||
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
{
|
||||
ssize_t ret;
|
||||
size_t count;
|
||||
loff_t offset;
|
||||
handle_t *handle;
|
||||
struct inode *inode = file_inode(iocb->ki_filp);
|
||||
bool extend = false, overwrite = false, unaligned_aio = false;
|
||||
|
||||
if (iocb->ki_flags & IOCB_NOWAIT) {
|
||||
if (!inode_trylock(inode))
|
||||
return -EAGAIN;
|
||||
} else {
|
||||
inode_lock(inode);
|
||||
}
|
||||
|
||||
if (!ext4_dio_supported(inode)) {
|
||||
inode_unlock(inode);
|
||||
/*
|
||||
* Fallback to buffered I/O if the inode does not support
|
||||
* direct I/O.
|
||||
*/
|
||||
return ext4_buffered_write_iter(iocb, from);
|
||||
}
|
||||
|
||||
ret = ext4_write_checks(iocb, from);
|
||||
if (ret <= 0) {
|
||||
inode_unlock(inode);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Unaligned asynchronous direct I/O must be serialized among each
|
||||
* other as the zeroing of partial blocks of two competing unaligned
|
||||
* asynchronous direct I/O writes can result in data corruption.
|
||||
*/
|
||||
offset = iocb->ki_pos;
|
||||
count = iov_iter_count(from);
|
||||
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
|
||||
!is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
|
||||
unaligned_aio = true;
|
||||
inode_dio_wait(inode);
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine whether the I/O will overwrite allocated and initialized
|
||||
* blocks. If so, check to see whether it is possible to take the
|
||||
* dioread_nolock path.
|
||||
*/
|
||||
if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
|
||||
ext4_should_dioread_nolock(inode)) {
|
||||
overwrite = true;
|
||||
downgrade_write(&inode->i_rwsem);
|
||||
}
|
||||
|
||||
if (offset + count > EXT4_I(inode)->i_disksize) {
|
||||
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
|
||||
if (IS_ERR(handle)) {
|
||||
ret = PTR_ERR(handle);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = ext4_orphan_add(handle, inode);
|
||||
if (ret) {
|
||||
ext4_journal_stop(handle);
|
||||
goto out;
|
||||
}
|
||||
|
||||
extend = true;
|
||||
ext4_journal_stop(handle);
|
||||
}
|
||||
|
||||
ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
|
||||
is_sync_kiocb(iocb) || unaligned_aio || extend);
|
||||
|
||||
if (extend)
|
||||
ret = ext4_handle_inode_extension(inode, offset, ret, count);
|
||||
|
||||
out:
|
||||
if (overwrite)
|
||||
inode_unlock_shared(inode);
|
||||
else
|
||||
inode_unlock(inode);
|
||||
|
||||
if (ret >= 0 && iov_iter_count(from)) {
|
||||
ssize_t err;
|
||||
loff_t endbyte;
|
||||
|
||||
offset = iocb->ki_pos;
|
||||
err = ext4_buffered_write_iter(iocb, from);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
/*
|
||||
* We need to ensure that the pages within the page cache for
|
||||
* the range covered by this I/O are written to disk and
|
||||
* invalidated. This is in attempt to preserve the expected
|
||||
* direct I/O semantics in the case we fallback to buffered I/O
|
||||
* to complete off the I/O request.
|
||||
*/
|
||||
ret += err;
|
||||
endbyte = offset + err - 1;
|
||||
err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
|
||||
offset, endbyte);
|
||||
if (!err)
|
||||
invalidate_mapping_pages(iocb->ki_filp->f_mapping,
|
||||
offset >> PAGE_SHIFT,
|
||||
endbyte >> PAGE_SHIFT);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FS_DAX
|
||||
static ssize_t
|
||||
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
{
|
||||
struct inode *inode = file_inode(iocb->ki_filp);
|
||||
ssize_t ret;
|
||||
size_t count;
|
||||
loff_t offset;
|
||||
handle_t *handle;
|
||||
bool extend = false;
|
||||
struct inode *inode = file_inode(iocb->ki_filp);
|
||||
|
||||
if (!inode_trylock(inode)) {
|
||||
if (iocb->ki_flags & IOCB_NOWAIT)
|
||||
return -EAGAIN;
|
||||
inode_lock(inode);
|
||||
}
|
||||
|
||||
ret = ext4_write_checks(iocb, from);
|
||||
if (ret <= 0)
|
||||
goto out;
|
||||
ret = file_remove_privs(iocb->ki_filp);
|
||||
if (ret)
|
||||
goto out;
|
||||
ret = file_update_time(iocb->ki_filp);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
offset = iocb->ki_pos;
|
||||
count = iov_iter_count(from);
|
||||
|
||||
if (offset + count > EXT4_I(inode)->i_disksize) {
|
||||
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
|
||||
if (IS_ERR(handle)) {
|
||||
ret = PTR_ERR(handle);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = ext4_orphan_add(handle, inode);
|
||||
if (ret) {
|
||||
ext4_journal_stop(handle);
|
||||
goto out;
|
||||
}
|
||||
|
||||
extend = true;
|
||||
ext4_journal_stop(handle);
|
||||
}
|
||||
|
||||
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
|
||||
|
||||
if (extend)
|
||||
ret = ext4_handle_inode_extension(inode, offset, ret, count);
|
||||
out:
|
||||
inode_unlock(inode);
|
||||
if (ret > 0)
|
||||
@ -218,10 +533,6 @@ static ssize_t
|
||||
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
{
|
||||
struct inode *inode = file_inode(iocb->ki_filp);
|
||||
int o_direct = iocb->ki_flags & IOCB_DIRECT;
|
||||
int unaligned_aio = 0;
|
||||
int overwrite = 0;
|
||||
ssize_t ret;
|
||||
|
||||
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
|
||||
return -EIO;
|
||||
@ -230,59 +541,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
if (IS_DAX(inode))
|
||||
return ext4_dax_write_iter(iocb, from);
|
||||
#endif
|
||||
if (iocb->ki_flags & IOCB_DIRECT)
|
||||
return ext4_dio_write_iter(iocb, from);
|
||||
|
||||
if (!inode_trylock(inode)) {
|
||||
if (iocb->ki_flags & IOCB_NOWAIT)
|
||||
return -EAGAIN;
|
||||
inode_lock(inode);
|
||||
}
|
||||
|
||||
ret = ext4_write_checks(iocb, from);
|
||||
if (ret <= 0)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Unaligned direct AIO must be serialized among each other as zeroing
|
||||
* of partial blocks of two competing unaligned AIOs can result in data
|
||||
* corruption.
|
||||
*/
|
||||
if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
|
||||
!is_sync_kiocb(iocb) &&
|
||||
ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
|
||||
unaligned_aio = 1;
|
||||
ext4_unwritten_wait(inode);
|
||||
}
|
||||
|
||||
iocb->private = &overwrite;
|
||||
/* Check whether we do a DIO overwrite or not */
|
||||
if (o_direct && !unaligned_aio) {
|
||||
if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
|
||||
if (ext4_should_dioread_nolock(inode))
|
||||
overwrite = 1;
|
||||
} else if (iocb->ki_flags & IOCB_NOWAIT) {
|
||||
ret = -EAGAIN;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
ret = __generic_file_write_iter(iocb, from);
|
||||
/*
|
||||
* Unaligned direct AIO must be the only IO in flight. Otherwise
|
||||
* overlapping aligned IO after unaligned might result in data
|
||||
* corruption.
|
||||
*/
|
||||
if (ret == -EIOCBQUEUED && unaligned_aio)
|
||||
ext4_unwritten_wait(inode);
|
||||
inode_unlock(inode);
|
||||
|
||||
if (ret > 0)
|
||||
ret = generic_write_sync(iocb, ret);
|
||||
|
||||
return ret;
|
||||
|
||||
out:
|
||||
inode_unlock(inode);
|
||||
return ret;
|
||||
return ext4_buffered_write_iter(iocb, from);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FS_DAX
|
||||
@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
|
||||
maxbytes, i_size_read(inode));
|
||||
case SEEK_HOLE:
|
||||
inode_lock_shared(inode);
|
||||
offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
|
||||
offset = iomap_seek_hole(inode, offset,
|
||||
&ext4_iomap_report_ops);
|
||||
inode_unlock_shared(inode);
|
||||
break;
|
||||
case SEEK_DATA:
|
||||
inode_lock_shared(inode);
|
||||
offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
|
||||
offset = iomap_seek_data(inode, offset,
|
||||
&ext4_iomap_report_ops);
|
||||
inode_unlock_shared(inode);
|
||||
break;
|
||||
}
|
||||
|
@ -80,6 +80,43 @@ static int ext4_sync_parent(struct inode *inode)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * fsync helper for the case where the filesystem has no journal.
 *
 * Syncs the mapping's buffers.  If the inode is dirty (and, for datasync,
 * dirty in a way that matters for datasync), it also syncs the inode
 * metadata and, if nothing has failed so far, the parent via
 * ext4_sync_parent().  *@needs_barrier is set to true when the BARRIER
 * mount option is enabled; it is never cleared here, and is left untouched
 * on the early-return path.
 *
 * Returns the first error encountered, or 0.
 */
static int ext4_fsync_nojournal(struct inode *inode, bool datasync,
				bool *needs_barrier)
{
	int status = sync_mapping_buffers(inode->i_mapping);
	int meta_err;

	/* Nothing beyond the buffers needs to reach the disk. */
	if (!(inode->i_state & I_DIRTY_ALL) ||
	    (datasync && !(inode->i_state & I_DIRTY_DATASYNC)))
		return status;

	meta_err = sync_inode_metadata(inode, 1);
	if (status == 0)
		status = meta_err;

	if (status == 0)
		status = ext4_sync_parent(inode);

	if (test_opt(inode->i_sb, BARRIER))
		*needs_barrier = true;

	return status;
}
|
||||
|
||||
/*
 * fsync helper for the case where the filesystem has a journal.
 *
 * Picks the inode's datasync or full-sync transaction id depending on
 * @datasync, sets *@needs_barrier when the journal has JBD2_BARRIER set and
 * jbd2_trans_will_send_data_barrier() reports false for that transaction,
 * and returns the result of jbd2_complete_transaction().
 */
static int ext4_fsync_journal(struct inode *inode, bool datasync,
			      bool *needs_barrier)
{
	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
	struct ext4_inode_info *ei = EXT4_I(inode);
	tid_t commit_tid;

	if (datasync)
		commit_tid = ei->i_datasync_tid;
	else
		commit_tid = ei->i_sync_tid;

	if ((journal->j_flags & JBD2_BARRIER) &&
	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
		*needs_barrier = true;

	return jbd2_complete_transaction(journal, commit_tid);
}
|
||||
|
||||
/*
|
||||
* akpm: A new design for ext4_sync_file().
|
||||
*
|
||||
@ -91,17 +128,14 @@ static int ext4_sync_parent(struct inode *inode)
|
||||
* What we do is just kick off a commit and wait on it. This will snapshot the
|
||||
* inode to disk.
|
||||
*/
|
||||
|
||||
int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
|
||||
{
|
||||
struct inode *inode = file->f_mapping->host;
|
||||
struct ext4_inode_info *ei = EXT4_I(inode);
|
||||
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
|
||||
int ret = 0, err;
|
||||
tid_t commit_tid;
|
||||
bool needs_barrier = false;
|
||||
struct inode *inode = file->f_mapping->host;
|
||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||
|
||||
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
|
||||
if (unlikely(ext4_forced_shutdown(sbi)))
|
||||
return -EIO;
|
||||
|
||||
J_ASSERT(ext4_journal_current_handle() == NULL);
|
||||
@ -111,23 +145,15 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
|
||||
if (sb_rdonly(inode->i_sb)) {
|
||||
/* Make sure that we read updated s_mount_flags value */
|
||||
smp_rmb();
|
||||
if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
|
||||
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
|
||||
ret = -EROFS;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!journal) {
|
||||
ret = __generic_file_fsync(file, start, end, datasync);
|
||||
if (!ret)
|
||||
ret = ext4_sync_parent(inode);
|
||||
if (test_opt(inode->i_sb, BARRIER))
|
||||
goto issue_flush;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = file_write_and_wait_range(file, start, end);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* data=writeback,ordered:
|
||||
* The caller's filemap_fdatawrite()/wait will sync the data.
|
||||
@ -142,18 +168,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
|
||||
* (they were dirtied by commit). But that's OK - the blocks are
|
||||
* safe in-journal, which is all fsync() needs to ensure.
|
||||
*/
|
||||
if (ext4_should_journal_data(inode)) {
|
||||
if (!sbi->s_journal)
|
||||
ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier);
|
||||
else if (ext4_should_journal_data(inode))
|
||||
ret = ext4_force_commit(inode->i_sb);
|
||||
goto out;
|
||||
}
|
||||
else
|
||||
ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
|
||||
|
||||
commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
|
||||
if (journal->j_flags & JBD2_BARRIER &&
|
||||
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
|
||||
needs_barrier = true;
|
||||
ret = jbd2_complete_transaction(journal, commit_tid);
|
||||
if (needs_barrier) {
|
||||
issue_flush:
|
||||
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
|
||||
if (!ret)
|
||||
ret = err;
|
||||
|
756
fs/ext4/inode.c
756
fs/ext4/inode.c
@ -809,136 +809,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
|
||||
/* Maximum number of blocks we map for direct IO at once. */
|
||||
#define DIO_MAX_BLOCKS 4096
|
||||
|
||||
/*
|
||||
* Get blocks function for the cases that need to start a transaction -
|
||||
* generally difference cases of direct IO and DAX IO. It also handles retries
|
||||
* in case of ENOSPC.
|
||||
*/
|
||||
static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int flags)
|
||||
{
|
||||
int dio_credits;
|
||||
handle_t *handle;
|
||||
int retries = 0;
|
||||
int ret;
|
||||
|
||||
/* Trim mapping request to maximum we can map at once for DIO */
|
||||
if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
|
||||
bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
|
||||
dio_credits = ext4_chunk_trans_blocks(inode,
|
||||
bh_result->b_size >> inode->i_blkbits);
|
||||
retry:
|
||||
handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
|
||||
if (IS_ERR(handle))
|
||||
return PTR_ERR(handle);
|
||||
|
||||
ret = _ext4_get_block(inode, iblock, bh_result, flags);
|
||||
ext4_journal_stop(handle);
|
||||
|
||||
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
|
||||
goto retry;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Get block function for DIO reads and writes to inodes without extents */
|
||||
int ext4_dio_get_block(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh, int create)
|
||||
{
|
||||
/* We don't expect handle for direct IO */
|
||||
WARN_ON_ONCE(ext4_journal_current_handle());
|
||||
|
||||
if (!create)
|
||||
return _ext4_get_block(inode, iblock, bh, 0);
|
||||
return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get block function for AIO DIO writes when we create unwritten extent if
|
||||
* blocks are not allocated yet. The extent will be converted to written
|
||||
* after IO is complete.
|
||||
*/
|
||||
static int ext4_dio_get_block_unwritten_async(struct inode *inode,
|
||||
sector_t iblock, struct buffer_head *bh_result, int create)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/* We don't expect handle for direct IO */
|
||||
WARN_ON_ONCE(ext4_journal_current_handle());
|
||||
|
||||
ret = ext4_get_block_trans(inode, iblock, bh_result,
|
||||
EXT4_GET_BLOCKS_IO_CREATE_EXT);
|
||||
|
||||
/*
|
||||
* When doing DIO using unwritten extents, we need io_end to convert
|
||||
* unwritten extents to written on IO completion. We allocate io_end
|
||||
* once we spot unwritten extent and store it in b_private. Generic
|
||||
* DIO code keeps b_private set and furthermore passes the value to
|
||||
* our completion callback in 'private' argument.
|
||||
*/
|
||||
if (!ret && buffer_unwritten(bh_result)) {
|
||||
if (!bh_result->b_private) {
|
||||
ext4_io_end_t *io_end;
|
||||
|
||||
io_end = ext4_init_io_end(inode, GFP_KERNEL);
|
||||
if (!io_end)
|
||||
return -ENOMEM;
|
||||
bh_result->b_private = io_end;
|
||||
ext4_set_io_unwritten_flag(inode, io_end);
|
||||
}
|
||||
set_buffer_defer_completion(bh_result);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get block function for non-AIO DIO writes when we create unwritten extent if
|
||||
* blocks are not allocated yet. The extent will be converted to written
|
||||
* after IO is complete by ext4_direct_IO_write().
|
||||
*/
|
||||
static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
|
||||
sector_t iblock, struct buffer_head *bh_result, int create)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/* We don't expect handle for direct IO */
|
||||
WARN_ON_ONCE(ext4_journal_current_handle());
|
||||
|
||||
ret = ext4_get_block_trans(inode, iblock, bh_result,
|
||||
EXT4_GET_BLOCKS_IO_CREATE_EXT);
|
||||
|
||||
/*
|
||||
* Mark inode as having pending DIO writes to unwritten extents.
|
||||
* ext4_direct_IO_write() checks this flag and converts extents to
|
||||
* written.
|
||||
*/
|
||||
if (!ret && buffer_unwritten(bh_result))
|
||||
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
|
||||
inode->i_ino, create);
|
||||
/* We don't expect handle for direct IO */
|
||||
WARN_ON_ONCE(ext4_journal_current_handle());
|
||||
|
||||
ret = _ext4_get_block(inode, iblock, bh_result, 0);
|
||||
/*
|
||||
* Blocks should have been preallocated! ext4_file_write_iter() checks
|
||||
* that.
|
||||
*/
|
||||
WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* `handle' can be NULL if create is zero
|
||||
*/
|
||||
@ -3431,148 +3301,142 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
|
||||
return inode->i_state & I_DIRTY_DATASYNC;
|
||||
}
|
||||
|
||||
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
|
||||
unsigned flags, struct iomap *iomap)
|
||||
/*
 * Fill @iomap from the result of a block mapping lookup.
 *
 * @inode:  inode the mapping belongs to
 * @iomap:  iomap to populate (flags, bdev, dax_dev, offset, length, type
 *          and addr are all written)
 * @map:    mapping result; m_lblk, m_len, m_pblk and m_flags are read
 * @offset: byte offset of the request, used only for the dirty check
 * @length: byte length of the request, used only for the dirty check
 */
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
			   struct ext4_map_blocks *map, loff_t offset,
			   loff_t length)
{
	u8 blkbits = inode->i_blkbits;
	bool past_eof;

	/* Where the mapping lives and which file range it covers. */
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
	iomap->offset = (u64) map->m_lblk << blkbits;
	iomap->length = (u64) map->m_len << blkbits;

	/*
	 * A write spanning EOF might trigger an i_size update on completion,
	 * so for the purpose of O_DSYNC it counts as dirty even when no other
	 * metadata change is being made or is pending.
	 */
	iomap->flags = 0;
	past_eof = false;
	if (!ext4_inode_datasync_dirty(inode))
		past_eof = offset + length > i_size_read(inode);
	if (ext4_inode_datasync_dirty(inode) || past_eof)
		iomap->flags |= IOMAP_F_DIRTY;

	if (map->m_flags & EXT4_MAP_NEW)
		iomap->flags |= IOMAP_F_NEW;

	/*
	 * For direct I/O writes m_flags can carry both EXT4_MAP_MAPPED and
	 * EXT4_MAP_UNWRITTEN.  The ->end_io() handler relies on iomap->type
	 * to know that an unwritten extent has to be converted, so
	 * EXT4_MAP_UNWRITTEN must take precedence.
	 */
	if (!(map->m_flags & (EXT4_MAP_UNWRITTEN | EXT4_MAP_MAPPED))) {
		iomap->type = IOMAP_HOLE;
		iomap->addr = IOMAP_NULL_ADDR;
		return;
	}

	if (map->m_flags & EXT4_MAP_UNWRITTEN)
		iomap->type = IOMAP_UNWRITTEN;
	else
		iomap->type = IOMAP_MAPPED;
	iomap->addr = (u64) map->m_pblk << blkbits;
}
|
||||
|
||||
static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
|
||||
unsigned int flags)
|
||||
{
|
||||
handle_t *handle;
|
||||
u8 blkbits = inode->i_blkbits;
|
||||
int ret, dio_credits, m_flags = 0, retries = 0;
|
||||
|
||||
/*
|
||||
* Trim the mapping request to the maximum value that we can map at
|
||||
* once for direct I/O.
|
||||
*/
|
||||
if (map->m_len > DIO_MAX_BLOCKS)
|
||||
map->m_len = DIO_MAX_BLOCKS;
|
||||
dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
|
||||
|
||||
retry:
|
||||
/*
|
||||
* Either we allocate blocks and then don't get an unwritten extent, so
|
||||
* in that case we have reserved enough credits. Or, the blocks are
|
||||
* already allocated and unwritten. In that case, the extent conversion
|
||||
* fits into the credits as well.
|
||||
*/
|
||||
handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
|
||||
if (IS_ERR(handle))
|
||||
return PTR_ERR(handle);
|
||||
|
||||
/*
|
||||
* DAX and direct I/O are the only two operations that are currently
|
||||
* supported with IOMAP_WRITE.
|
||||
*/
|
||||
WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
|
||||
if (IS_DAX(inode))
|
||||
m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
|
||||
/*
|
||||
* We use i_size instead of i_disksize here because delalloc writeback
|
||||
* can complete at any point during the I/O and subsequently push the
|
||||
* i_disksize out to i_size. This could be beyond where direct I/O is
|
||||
* happening and thus expose allocated blocks to direct I/O reads.
|
||||
*/
|
||||
else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode))
|
||||
m_flags = EXT4_GET_BLOCKS_CREATE;
|
||||
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
|
||||
m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
|
||||
|
||||
ret = ext4_map_blocks(handle, inode, map, m_flags);
|
||||
|
||||
/*
|
||||
* We cannot fill holes in indirect tree based inodes as that could
|
||||
* expose stale data in the case of a crash. Use the magic error code
|
||||
* to fallback to buffered I/O.
|
||||
*/
|
||||
if (!m_flags && !ret)
|
||||
ret = -ENOTBLK;
|
||||
|
||||
ext4_journal_stop(handle);
|
||||
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
|
||||
goto retry;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
|
||||
unsigned flags, struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||
unsigned int blkbits = inode->i_blkbits;
|
||||
unsigned long first_block, last_block;
|
||||
struct ext4_map_blocks map;
|
||||
bool delalloc = false;
|
||||
int ret;
|
||||
struct ext4_map_blocks map;
|
||||
u8 blkbits = inode->i_blkbits;
|
||||
|
||||
if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
|
||||
return -EINVAL;
|
||||
first_block = offset >> blkbits;
|
||||
last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
|
||||
EXT4_MAX_LOGICAL_BLOCK);
|
||||
|
||||
if (flags & IOMAP_REPORT) {
|
||||
if (ext4_has_inline_data(inode)) {
|
||||
ret = ext4_inline_data_iomap(inode, iomap);
|
||||
if (ret != -EAGAIN) {
|
||||
if (ret == 0 && offset >= iomap->length)
|
||||
ret = -ENOENT;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
|
||||
return -ERANGE;
|
||||
}
|
||||
if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
|
||||
return -ERANGE;
|
||||
|
||||
map.m_lblk = first_block;
|
||||
map.m_len = last_block - first_block + 1;
|
||||
/*
|
||||
* Calculate the first and last logical blocks respectively.
|
||||
*/
|
||||
map.m_lblk = offset >> blkbits;
|
||||
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
|
||||
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
|
||||
|
||||
if (flags & IOMAP_REPORT) {
|
||||
if (flags & IOMAP_WRITE)
|
||||
ret = ext4_iomap_alloc(inode, &map, flags);
|
||||
else
|
||||
ret = ext4_map_blocks(NULL, inode, &map, 0);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (ret == 0) {
|
||||
ext4_lblk_t end = map.m_lblk + map.m_len - 1;
|
||||
struct extent_status es;
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
|
||||
map.m_lblk, end, &es);
|
||||
|
||||
if (!es.es_len || es.es_lblk > end) {
|
||||
/* entire range is a hole */
|
||||
} else if (es.es_lblk > map.m_lblk) {
|
||||
/* range starts with a hole */
|
||||
map.m_len = es.es_lblk - map.m_lblk;
|
||||
} else {
|
||||
ext4_lblk_t offs = 0;
|
||||
|
||||
if (es.es_lblk < map.m_lblk)
|
||||
offs = map.m_lblk - es.es_lblk;
|
||||
map.m_lblk = es.es_lblk + offs;
|
||||
map.m_len = es.es_len - offs;
|
||||
delalloc = true;
|
||||
}
|
||||
}
|
||||
} else if (flags & IOMAP_WRITE) {
|
||||
int dio_credits;
|
||||
handle_t *handle;
|
||||
int retries = 0;
|
||||
|
||||
/* Trim mapping request to maximum we can map at once for DIO */
|
||||
if (map.m_len > DIO_MAX_BLOCKS)
|
||||
map.m_len = DIO_MAX_BLOCKS;
|
||||
dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
|
||||
retry:
|
||||
/*
|
||||
* Either we allocate blocks and then we don't get unwritten
|
||||
* extent so we have reserved enough credits, or the blocks
|
||||
* are already allocated and unwritten and in that case
|
||||
* extent conversion fits in the credits as well.
|
||||
*/
|
||||
handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
|
||||
dio_credits);
|
||||
if (IS_ERR(handle))
|
||||
return PTR_ERR(handle);
|
||||
|
||||
ret = ext4_map_blocks(handle, inode, &map,
|
||||
EXT4_GET_BLOCKS_CREATE_ZERO);
|
||||
if (ret < 0) {
|
||||
ext4_journal_stop(handle);
|
||||
if (ret == -ENOSPC &&
|
||||
ext4_should_retry_alloc(inode->i_sb, &retries))
|
||||
goto retry;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we added blocks beyond i_size, we need to make sure they
|
||||
* will get truncated if we crash before updating i_size in
|
||||
* ext4_iomap_end(). For faults we don't need to do that (and
|
||||
* even cannot because for orphan list operations inode_lock is
|
||||
* required) - if we happen to instantiate block beyond i_size,
|
||||
* it is because we race with truncate which has already added
|
||||
* the inode to the orphan list.
|
||||
*/
|
||||
if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
|
||||
(i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
|
||||
int err;
|
||||
|
||||
err = ext4_orphan_add(handle, inode);
|
||||
if (err < 0) {
|
||||
ext4_journal_stop(handle);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
ext4_journal_stop(handle);
|
||||
} else {
|
||||
ret = ext4_map_blocks(NULL, inode, &map, 0);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
}
|
||||
|
||||
iomap->flags = 0;
|
||||
if (ext4_inode_datasync_dirty(inode))
|
||||
iomap->flags |= IOMAP_F_DIRTY;
|
||||
iomap->bdev = inode->i_sb->s_bdev;
|
||||
iomap->dax_dev = sbi->s_daxdev;
|
||||
iomap->offset = (u64)first_block << blkbits;
|
||||
iomap->length = (u64)map.m_len << blkbits;
|
||||
|
||||
if (ret == 0) {
|
||||
iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
|
||||
iomap->addr = IOMAP_NULL_ADDR;
|
||||
} else {
|
||||
if (map.m_flags & EXT4_MAP_MAPPED) {
|
||||
iomap->type = IOMAP_MAPPED;
|
||||
} else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
|
||||
iomap->type = IOMAP_UNWRITTEN;
|
||||
} else {
|
||||
WARN_ON_ONCE(1);
|
||||
return -EIO;
|
||||
}
|
||||
iomap->addr = (u64)map.m_pblk << blkbits;
|
||||
}
|
||||
|
||||
if (map.m_flags & EXT4_MAP_NEW)
|
||||
iomap->flags |= IOMAP_F_NEW;
|
||||
ext4_set_iomap(inode, iomap, &map, offset, length);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -3580,53 +3444,17 @@ retry:
|
||||
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
|
||||
ssize_t written, unsigned flags, struct iomap *iomap)
|
||||
{
|
||||
int ret = 0;
|
||||
handle_t *handle;
|
||||
int blkbits = inode->i_blkbits;
|
||||
bool truncate = false;
|
||||
|
||||
if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
|
||||
return 0;
|
||||
|
||||
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
|
||||
if (IS_ERR(handle)) {
|
||||
ret = PTR_ERR(handle);
|
||||
goto orphan_del;
|
||||
}
|
||||
if (ext4_update_inode_size(inode, offset + written))
|
||||
ext4_mark_inode_dirty(handle, inode);
|
||||
/*
|
||||
* We may need to truncate allocated but not written blocks beyond EOF.
|
||||
* Check to see whether an error occurred while writing out the data to
|
||||
* the allocated blocks. If so, return the magic error code so that we
|
||||
* fallback to buffered I/O and attempt to complete the remainder of
|
||||
* the I/O. Any blocks that may have been allocated in preparation for
|
||||
* the direct I/O will be reused during buffered I/O.
|
||||
*/
|
||||
if (iomap->offset + iomap->length >
|
||||
ALIGN(inode->i_size, 1 << blkbits)) {
|
||||
ext4_lblk_t written_blk, end_blk;
|
||||
if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
|
||||
return -ENOTBLK;
|
||||
|
||||
written_blk = (offset + written) >> blkbits;
|
||||
end_blk = (offset + length) >> blkbits;
|
||||
if (written_blk < end_blk && ext4_can_truncate(inode))
|
||||
truncate = true;
|
||||
}
|
||||
/*
|
||||
* Remove inode from orphan list if we were extending an inode and
|
||||
* everything went fine.
|
||||
*/
|
||||
if (!truncate && inode->i_nlink &&
|
||||
!list_empty(&EXT4_I(inode)->i_orphan))
|
||||
ext4_orphan_del(handle, inode);
|
||||
ext4_journal_stop(handle);
|
||||
if (truncate) {
|
||||
ext4_truncate_failed_write(inode);
|
||||
orphan_del:
|
||||
/*
|
||||
* If truncate failed early the inode might still be on the
|
||||
* orphan list; we need to make sure the inode is removed from
|
||||
* the orphan list in that case.
|
||||
*/
|
||||
if (inode->i_nlink)
|
||||
ext4_orphan_del(NULL, inode);
|
||||
}
|
||||
return ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
const struct iomap_ops ext4_iomap_ops = {
|
||||
@ -3634,271 +3462,73 @@ const struct iomap_ops ext4_iomap_ops = {
|
||||
.iomap_end = ext4_iomap_end,
|
||||
};
|
||||
|
||||
static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
|
||||
ssize_t size, void *private)
|
||||
static bool ext4_iomap_is_delalloc(struct inode *inode,
|
||||
struct ext4_map_blocks *map)
|
||||
{
|
||||
ext4_io_end_t *io_end = private;
|
||||
struct ext4_io_end_vec *io_end_vec;
|
||||
struct extent_status es;
|
||||
ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
|
||||
|
||||
/* if not async direct IO just return */
|
||||
if (!io_end)
|
||||
return 0;
|
||||
ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
|
||||
map->m_lblk, end, &es);
|
||||
|
||||
ext_debug("ext4_end_io_dio(): io_end 0x%p "
|
||||
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
|
||||
io_end, io_end->inode->i_ino, iocb, offset, size);
|
||||
if (!es.es_len || es.es_lblk > end)
|
||||
return false;
|
||||
|
||||
if (es.es_lblk > map->m_lblk) {
|
||||
map->m_len = es.es_lblk - map->m_lblk;
|
||||
return false;
|
||||
}
|
||||
|
||||
offset = map->m_lblk - es.es_lblk;
|
||||
map->m_len = es.es_len - offset;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
|
||||
loff_t length, unsigned int flags,
|
||||
struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
int ret;
|
||||
bool delalloc = false;
|
||||
struct ext4_map_blocks map;
|
||||
u8 blkbits = inode->i_blkbits;
|
||||
|
||||
if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
|
||||
return -EINVAL;
|
||||
|
||||
if (ext4_has_inline_data(inode)) {
|
||||
ret = ext4_inline_data_iomap(inode, iomap);
|
||||
if (ret != -EAGAIN) {
|
||||
if (ret == 0 && offset >= iomap->length)
|
||||
ret = -ENOENT;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Error during AIO DIO. We cannot convert unwritten extents as the
|
||||
* data was not written. Just clear the unwritten flag and drop io_end.
|
||||
* Calculate the first and last logical block respectively.
|
||||
*/
|
||||
if (size <= 0) {
|
||||
ext4_clear_io_unwritten_flag(io_end);
|
||||
size = 0;
|
||||
}
|
||||
io_end_vec = ext4_alloc_io_end_vec(io_end);
|
||||
io_end_vec->offset = offset;
|
||||
io_end_vec->size = size;
|
||||
ext4_put_io_end(io_end);
|
||||
map.m_lblk = offset >> blkbits;
|
||||
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
|
||||
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
|
||||
|
||||
ret = ext4_map_blocks(NULL, inode, &map, 0);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret == 0)
|
||||
delalloc = ext4_iomap_is_delalloc(inode, &map);
|
||||
|
||||
ext4_set_iomap(inode, iomap, &map, offset, length);
|
||||
if (delalloc && iomap->type == IOMAP_HOLE)
|
||||
iomap->type = IOMAP_DELALLOC;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handling of direct IO writes.
|
||||
*
|
||||
* For ext4 extent files, ext4 will do direct-io writes even to holes,
|
||||
* preallocated extents, and writes that extend the file; there is no need to
|
||||
* fall back to buffered IO.
|
||||
*
|
||||
* For holes, we fallocate those blocks and mark them as unwritten.
|
||||
* If those blocks were preallocated, we make sure they are split, but
|
||||
* still keep the range to write as unwritten.
|
||||
*
|
||||
* The unwritten extents will be converted to written when DIO is completed.
|
||||
* For async direct IO, since the IO may still be pending on return, we
|
||||
* set up an end_io callback function, which will do the conversion
|
||||
* when the async direct IO has completed.
|
||||
*
|
||||
* If the O_DIRECT write will extend the file then add this inode to the
|
||||
* orphan list. So recovery will truncate it back to the original size
|
||||
* if the machine crashes during the write.
|
||||
*
|
||||
*/
|
||||
static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file->f_mapping->host;
|
||||
struct ext4_inode_info *ei = EXT4_I(inode);
|
||||
ssize_t ret;
|
||||
loff_t offset = iocb->ki_pos;
|
||||
size_t count = iov_iter_count(iter);
|
||||
int overwrite = 0;
|
||||
get_block_t *get_block_func = NULL;
|
||||
int dio_flags = 0;
|
||||
loff_t final_size = offset + count;
|
||||
int orphan = 0;
|
||||
handle_t *handle;
|
||||
|
||||
/*
* The write ends beyond i_size or i_disksize: add the inode to the orphan
* list before any I/O is issued, and remember that in 'orphan' so it is
* removed again further down.
*/
if (final_size > inode->i_size || final_size > ei->i_disksize) {
|
||||
/* Credits for sb + inode write */
|
||||
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
|
||||
if (IS_ERR(handle)) {
|
||||
ret = PTR_ERR(handle);
|
||||
goto out;
|
||||
}
|
||||
ret = ext4_orphan_add(handle, inode);
|
||||
if (ret) {
|
||||
ext4_journal_stop(handle);
|
||||
goto out;
|
||||
}
|
||||
orphan = 1;
|
||||
ext4_update_i_disksize(inode, inode->i_size);
|
||||
ext4_journal_stop(handle);
|
||||
}
|
||||
|
||||
/* iocb->private is dereferenced below to read the overwrite flag. */
BUG_ON(iocb->private == NULL);
|
||||
|
||||
/*
|
||||
* Make all waiters for direct IO properly wait also for extent
|
||||
* conversion. This also disallows race between truncate() and
|
||||
* overwrite DIO as i_dio_count needs to be incremented under i_mutex.
|
||||
*/
|
||||
inode_dio_begin(inode);
|
||||
|
||||
/* If we do an overwrite dio, i_mutex locking can be released */
|
||||
overwrite = *((int *)iocb->private);
|
||||
|
||||
if (overwrite)
|
||||
inode_unlock(inode);
|
||||
|
||||
/*
|
||||
* For extent mapped files we could direct write to holes and fallocate.
|
||||
*
|
||||
* Allocated blocks to fill the hole are marked as unwritten to prevent
|
||||
* parallel buffered read to expose the stale data before DIO complete
|
||||
* the data IO.
|
||||
*
|
||||
* As to previously fallocated extents, ext4 get_block will just simply
|
||||
* mark the buffer mapped but still keep the extents unwritten.
|
||||
*
|
||||
* For non AIO case, we will convert those unwritten extents to written
|
||||
* after return back from blockdev_direct_IO. That way we save us from
|
||||
* allocating io_end structure and also the overhead of offloading
|
||||
* the extent conversion to a workqueue.
|
||||
*
|
||||
* For async DIO, the conversion needs to be deferred when the
|
||||
* IO is completed. The ext4 end_io callback function will be
|
||||
* called to take care of the conversion work. Here for async
|
||||
* case, we allocate an io_end structure to hook to the iocb.
|
||||
*/
|
||||
iocb->private = NULL;
|
||||
if (overwrite)
|
||||
get_block_func = ext4_dio_get_block_overwrite;
|
||||
else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
|
||||
round_down(offset, i_blocksize(inode)) >= inode->i_size) {
|
||||
get_block_func = ext4_dio_get_block;
|
||||
dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
|
||||
} else if (is_sync_kiocb(iocb)) {
|
||||
get_block_func = ext4_dio_get_block_unwritten_sync;
|
||||
dio_flags = DIO_LOCKING;
|
||||
} else {
|
||||
get_block_func = ext4_dio_get_block_unwritten_async;
|
||||
dio_flags = DIO_LOCKING;
|
||||
}
|
||||
ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
|
||||
get_block_func, ext4_end_io_dio, NULL,
|
||||
dio_flags);
|
||||
|
||||
if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
|
||||
EXT4_STATE_DIO_UNWRITTEN)) {
|
||||
int err;
|
||||
/*
|
||||
* for non AIO case, since the IO is already
|
||||
* completed, we could do the conversion right here
|
||||
*/
|
||||
err = ext4_convert_unwritten_extents(NULL, inode,
|
||||
offset, ret);
|
||||
if (err < 0)
|
||||
ret = err;
|
||||
ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
|
||||
}
|
||||
|
||||
inode_dio_end(inode);
|
||||
/* take i_mutex locking again if we do an overwrite dio */
|
||||
if (overwrite)
|
||||
inode_lock(inode);
|
||||
|
||||
/* The write failed and would have extended the file: trim back. */
if (ret < 0 && final_size > inode->i_size)
|
||||
ext4_truncate_failed_write(inode);
|
||||
|
||||
/* Handle extending of i_size after direct IO write */
|
||||
if (orphan) {
|
||||
int err;
|
||||
|
||||
/* Credits for sb + inode write */
|
||||
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
|
||||
if (IS_ERR(handle)) {
|
||||
/*
|
||||
* We wrote the data but cannot extend
|
||||
* i_size. Bail out. In async io case, we do
|
||||
* not return error here because we have
|
||||
* already submitted the corresponding
|
||||
* bio. Returning error here makes the caller
|
||||
* think that this IO is done and failed
|
||||
* resulting in race with bio's completion
|
||||
* handler.
|
||||
*/
|
||||
if (!ret)
|
||||
ret = PTR_ERR(handle);
|
||||
if (inode->i_nlink)
|
||||
ext4_orphan_del(NULL, inode);
|
||||
|
||||
goto out;
|
||||
}
|
||||
if (inode->i_nlink)
|
||||
ext4_orphan_del(handle, inode);
|
||||
if (ret > 0) {
|
||||
loff_t end = offset + ret;
|
||||
if (end > inode->i_size || end > ei->i_disksize) {
|
||||
ext4_update_i_disksize(inode, end);
|
||||
if (end > inode->i_size)
|
||||
i_size_write(inode, end);
|
||||
/*
|
||||
* We're going to return a positive `ret'
|
||||
* here due to non-zero-length I/O, so there's
|
||||
* no way of reporting error returns from
|
||||
* ext4_mark_inode_dirty() to userspace. So
|
||||
* ignore it.
|
||||
*/
|
||||
ext4_mark_inode_dirty(handle, inode);
|
||||
}
|
||||
}
|
||||
err = ext4_journal_stop(handle);
|
||||
/* Only report the journal error when there is no byte count to return. */
if (ret == 0)
|
||||
ret = err;
|
||||
}
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
|
||||
{
|
||||
struct address_space *mapping = iocb->ki_filp->f_mapping;
|
||||
struct inode *inode = mapping->host;
|
||||
size_t count = iov_iter_count(iter);
|
||||
ssize_t ret;
|
||||
|
||||
/*
|
||||
* Shared inode_lock is enough for us - it protects against concurrent
|
||||
* writes & truncates and since we take care of writing back page cache,
|
||||
* we are protected against page writeback as well.
|
||||
*/
|
||||
inode_lock_shared(inode);
|
||||
ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
|
||||
iocb->ki_pos + count - 1);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
|
||||
iter, ext4_dio_get_block, NULL, NULL, 0);
|
||||
out_unlock:
|
||||
inode_unlock_shared(inode);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file->f_mapping->host;
|
||||
size_t count = iov_iter_count(iter);
|
||||
loff_t offset = iocb->ki_pos;
|
||||
ssize_t ret;
|
||||
|
||||
#ifdef CONFIG_FS_ENCRYPTION
|
||||
if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
|
||||
return 0;
|
||||
#endif
|
||||
if (fsverity_active(inode))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* If we are doing data journalling we don't support O_DIRECT
|
||||
*/
|
||||
if (ext4_should_journal_data(inode))
|
||||
return 0;
|
||||
|
||||
/* Let buffer I/O handle the inline data case. */
|
||||
if (ext4_has_inline_data(inode))
|
||||
return 0;
|
||||
|
||||
trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
|
||||
if (iov_iter_rw(iter) == READ)
|
||||
ret = ext4_direct_IO_read(iocb, iter);
|
||||
else
|
||||
ret = ext4_direct_IO_write(iocb, iter);
|
||||
trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
|
||||
return ret;
|
||||
}
|
||||
/*
* iomap operations table that wires ->iomap_begin to
* ext4_iomap_begin_report(); no ->iomap_end handler is set.
*/
const struct iomap_ops ext4_iomap_report_ops = {
|
||||
.iomap_begin = ext4_iomap_begin_report,
|
||||
};
|
||||
|
||||
/*
|
||||
* Pages can be marked dirty completely asynchronously from ext4's journalling
|
||||
@ -3937,7 +3567,7 @@ static const struct address_space_operations ext4_aops = {
|
||||
.bmap = ext4_bmap,
|
||||
.invalidatepage = ext4_invalidatepage,
|
||||
.releasepage = ext4_releasepage,
|
||||
.direct_IO = ext4_direct_IO,
|
||||
.direct_IO = noop_direct_IO,
|
||||
.migratepage = buffer_migrate_page,
|
||||
.is_partially_uptodate = block_is_partially_uptodate,
|
||||
.error_remove_page = generic_error_remove_page,
|
||||
@ -3954,7 +3584,7 @@ static const struct address_space_operations ext4_journalled_aops = {
|
||||
.bmap = ext4_bmap,
|
||||
.invalidatepage = ext4_journalled_invalidatepage,
|
||||
.releasepage = ext4_releasepage,
|
||||
.direct_IO = ext4_direct_IO,
|
||||
.direct_IO = noop_direct_IO,
|
||||
.is_partially_uptodate = block_is_partially_uptodate,
|
||||
.error_remove_page = generic_error_remove_page,
|
||||
};
|
||||
@ -3970,7 +3600,7 @@ static const struct address_space_operations ext4_da_aops = {
|
||||
.bmap = ext4_bmap,
|
||||
.invalidatepage = ext4_invalidatepage,
|
||||
.releasepage = ext4_releasepage,
|
||||
.direct_IO = ext4_direct_IO,
|
||||
.direct_IO = noop_direct_IO,
|
||||
.migratepage = buffer_migrate_page,
|
||||
.is_partially_uptodate = block_is_partially_uptodate,
|
||||
.error_remove_page = generic_error_remove_page,
|
||||
|
@ -1149,7 +1149,8 @@ static inline bool gfs2_iomap_need_write_lock(unsigned flags)
|
||||
}
|
||||
|
||||
static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
|
||||
unsigned flags, struct iomap *iomap)
|
||||
unsigned flags, struct iomap *iomap,
|
||||
struct iomap *srcmap)
|
||||
{
|
||||
struct gfs2_inode *ip = GFS2_I(inode);
|
||||
struct metapath mp = { .mp_aheight = 1, };
|
||||
|
@ -732,7 +732,8 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to)
|
||||
if (ret)
|
||||
goto out_uninit;
|
||||
|
||||
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL);
|
||||
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
|
||||
is_sync_kiocb(iocb));
|
||||
|
||||
gfs2_glock_dq(&gh);
|
||||
out_uninit:
|
||||
@ -767,7 +768,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
|
||||
if (offset + len > i_size_read(&ip->i_inode))
|
||||
goto out;
|
||||
|
||||
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL);
|
||||
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
|
||||
is_sync_kiocb(iocb));
|
||||
|
||||
out:
|
||||
gfs2_glock_dq(&gh);
|
||||
|
@ -3,13 +3,15 @@
|
||||
# Copyright (c) 2019 Oracle.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
|
||||
ccflags-y += -I $(srctree)/$(src) # needed for trace events
|
||||
|
||||
obj-$(CONFIG_FS_IOMAP) += iomap.o
|
||||
|
||||
iomap-y += \
|
||||
apply.o \
|
||||
buffered-io.o \
|
||||
direct-io.o \
|
||||
fiemap.o \
|
||||
seek.o
|
||||
|
||||
iomap-y += trace.o \
|
||||
apply.o \
|
||||
buffered-io.o \
|
||||
direct-io.o \
|
||||
fiemap.o \
|
||||
seek.o
|
||||
iomap-$(CONFIG_SWAP) += swapfile.o
|
||||
|
@ -23,8 +23,10 @@ loff_t
|
||||
iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
|
||||
const struct iomap_ops *ops, void *data, iomap_actor_t actor)
|
||||
{
|
||||
struct iomap iomap = { 0 };
|
||||
struct iomap iomap = { .type = IOMAP_HOLE };
|
||||
struct iomap srcmap = { .type = IOMAP_HOLE };
|
||||
loff_t written = 0, ret;
|
||||
u64 end;
|
||||
|
||||
/*
|
||||
* Need to map a range from start position for length bytes. This can
|
||||
@ -38,7 +40,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
|
||||
* expose transient stale data. If the reserve fails, we can safely
|
||||
* back out at this point as there is nothing to undo.
|
||||
*/
|
||||
ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
|
||||
ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (WARN_ON(iomap.offset > pos))
|
||||
@ -50,15 +52,26 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
|
||||
* Cut down the length to the one actually provided by the filesystem,
|
||||
* as it might not be able to give us the whole size that we requested.
|
||||
*/
|
||||
if (iomap.offset + iomap.length < pos + length)
|
||||
length = iomap.offset + iomap.length - pos;
|
||||
end = iomap.offset + iomap.length;
|
||||
if (srcmap.type != IOMAP_HOLE)
|
||||
end = min(end, srcmap.offset + srcmap.length);
|
||||
if (pos + length > end)
|
||||
length = end - pos;
|
||||
|
||||
/*
|
||||
* Now that we have guaranteed that the space allocation will succeed.
|
||||
* Now that we have guaranteed that the space allocation will succeed,
|
||||
* we can do the copy-in page by page without having to worry about
|
||||
* failures exposing transient data.
|
||||
*
|
||||
* To support COW operations, we read in data for partial blocks from
|
||||
* the srcmap if the file system filled it in. In that case the
|
||||
* length needs to be limited to the earlier of the ends of the iomaps.
|
||||
* If the file system did not provide a srcmap we pass in the normal
|
||||
* iomap into the actors so that they don't need to have special
|
||||
* handling for the two cases.
|
||||
*/
|
||||
written = actor(inode, pos, length, data, &iomap);
|
||||
written = actor(inode, pos, length, data, &iomap,
|
||||
srcmap.type != IOMAP_HOLE ? &srcmap : &iomap);
|
||||
|
||||
/*
|
||||
* Now the data has been copied, commit the range we've copied. This
|
||||
|
@ -1,7 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (C) 2010 Red Hat, Inc.
|
||||
* Copyright (c) 2016-2018 Christoph Hellwig.
|
||||
* Copyright (C) 2016-2019 Christoph Hellwig.
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/compiler.h>
|
||||
@ -12,13 +12,34 @@
|
||||
#include <linux/buffer_head.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/list_sort.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <linux/migrate.h>
|
||||
#include "trace.h"
|
||||
|
||||
#include "../internal.h"
|
||||
|
||||
/*
|
||||
* Structure allocated for each page when block size < PAGE_SIZE to track
|
||||
* sub-page uptodate status and I/O completions.
|
||||
*/
|
||||
struct iomap_page {
|
||||
/* NOTE(review): presumably counts reads still in flight on the page - confirm against users */
atomic_t read_count;
|
||||
/* NOTE(review): presumably counts writes still in flight on the page - confirm against users */
atomic_t write_count;
|
||||
/* uptodate bitmap sized at one bit per 512 bytes of the page */
DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
|
||||
};
|
||||
|
||||
static inline struct iomap_page *to_iomap_page(struct page *page)
|
||||
{
|
||||
if (page_has_private(page))
|
||||
return (struct iomap_page *)page_private(page);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct bio_set iomap_ioend_bioset;
|
||||
|
||||
static struct iomap_page *
|
||||
iomap_page_create(struct inode *inode, struct page *page)
|
||||
{
|
||||
@ -203,9 +224,17 @@ iomap_read_inline_data(struct inode *inode, struct page *page,
|
||||
SetPageUptodate(page);
|
||||
}
|
||||
|
||||
/*
 * Decide whether the block at @pos must be zero-filled instead of read:
 * true when @iomap is not a mapped extent, when it is flagged as newly
 * allocated, or when @pos lies at or beyond the inode's current size.
 */
static inline bool iomap_block_needs_zeroing(struct inode *inode,
		struct iomap *iomap, loff_t pos)
{
	if (iomap->type != IOMAP_MAPPED)
		return true;
	if (iomap->flags & IOMAP_F_NEW)
		return true;
	return pos >= i_size_read(inode);
}
|
||||
|
||||
static loff_t
|
||||
iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
|
||||
struct iomap *iomap)
|
||||
struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
struct iomap_readpage_ctx *ctx = data;
|
||||
struct page *page = ctx->cur_page;
|
||||
@ -226,7 +255,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
|
||||
if (plen == 0)
|
||||
goto done;
|
||||
|
||||
if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
|
||||
if (iomap_block_needs_zeroing(inode, iomap, pos)) {
|
||||
zero_user(page, poff, plen);
|
||||
iomap_set_range_uptodate(page, poff, plen);
|
||||
goto done;
|
||||
@ -293,6 +322,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
|
||||
unsigned poff;
|
||||
loff_t ret;
|
||||
|
||||
trace_iomap_readpage(page->mapping->host, 1);
|
||||
|
||||
for (poff = 0; poff < PAGE_SIZE; poff += ret) {
|
||||
ret = iomap_apply(inode, page_offset(page) + poff,
|
||||
PAGE_SIZE - poff, 0, ops, &ctx,
|
||||
@ -351,7 +382,7 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
|
||||
|
||||
static loff_t
|
||||
iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
|
||||
void *data, struct iomap *iomap)
|
||||
void *data, struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
struct iomap_readpage_ctx *ctx = data;
|
||||
loff_t done, ret;
|
||||
@ -371,7 +402,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
|
||||
ctx->cur_page_in_bio = false;
|
||||
}
|
||||
ret = iomap_readpage_actor(inode, pos + done, length - done,
|
||||
ctx, iomap);
|
||||
ctx, iomap, srcmap);
|
||||
}
|
||||
|
||||
return done;
|
||||
@ -389,6 +420,8 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
|
||||
loff_t last = page_offset(list_entry(pages->next, struct page, lru));
|
||||
loff_t length = last - pos + PAGE_SIZE, ret = 0;
|
||||
|
||||
trace_iomap_readpages(mapping->host, nr_pages);
|
||||
|
||||
while (length > 0) {
|
||||
ret = iomap_apply(mapping->host, pos, length, 0, ops,
|
||||
&ctx, iomap_readpages_actor);
|
||||
@ -455,6 +488,8 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
|
||||
int
|
||||
iomap_releasepage(struct page *page, gfp_t gfp_mask)
|
||||
{
|
||||
trace_iomap_releasepage(page->mapping->host, page, 0, 0);
|
||||
|
||||
/*
|
||||
* mm accommodates an old ext3 case where clean pages might not have had
|
||||
* the dirty bit cleared. Thus, it can send actual dirty pages to
|
||||
@ -470,6 +505,8 @@ EXPORT_SYMBOL_GPL(iomap_releasepage);
|
||||
void
|
||||
iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
|
||||
{
|
||||
trace_iomap_invalidatepage(page->mapping->host, page, offset, len);
|
||||
|
||||
/*
|
||||
* If we are invalidating the entire page, clear the dirty state from it
|
||||
* and release it to avoid unnecessary buildup of the LRU.
|
||||
@ -511,6 +548,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
|
||||
EXPORT_SYMBOL_GPL(iomap_migrate_page);
|
||||
#endif /* CONFIG_MIGRATION */
|
||||
|
||||
/*
* Flag bits for the 'flags' argument of __iomap_write_begin().
* NOTE(review): IOMAP_WRITE_F_UNSHARE appears to force every block of the
* page to be read in rather than skipped or zeroed - confirm against the
* complete function.
*/
enum {
|
||||
IOMAP_WRITE_F_UNSHARE = (1 << 0),
|
||||
};
|
||||
|
||||
static void
|
||||
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
|
||||
{
|
||||
@ -525,19 +566,12 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
|
||||
}
|
||||
|
||||
static int
|
||||
iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
|
||||
unsigned poff, unsigned plen, unsigned from, unsigned to,
|
||||
struct iomap *iomap)
|
||||
iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
|
||||
unsigned plen, struct iomap *iomap)
|
||||
{
|
||||
struct bio_vec bvec;
|
||||
struct bio bio;
|
||||
|
||||
if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
|
||||
zero_user_segments(page, poff, from, to, poff + plen);
|
||||
iomap_set_range_uptodate(page, poff, plen);
|
||||
return 0;
|
||||
}
|
||||
|
||||
bio_init(&bio, &bvec, 1);
|
||||
bio.bi_opf = REQ_OP_READ;
|
||||
bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
|
||||
@ -547,15 +581,15 @@ iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
|
||||
}
|
||||
|
||||
static int
|
||||
__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
|
||||
struct page *page, struct iomap *iomap)
|
||||
__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
|
||||
struct page *page, struct iomap *srcmap)
|
||||
{
|
||||
struct iomap_page *iop = iomap_page_create(inode, page);
|
||||
loff_t block_size = i_blocksize(inode);
|
||||
loff_t block_start = pos & ~(block_size - 1);
|
||||
loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
|
||||
unsigned from = offset_in_page(pos), to = from + len, poff, plen;
|
||||
int status = 0;
|
||||
int status;
|
||||
|
||||
if (PageUptodate(page))
|
||||
return 0;
|
||||
@ -566,29 +600,39 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
|
||||
if (plen == 0)
|
||||
break;
|
||||
|
||||
if ((from > poff && from < poff + plen) ||
|
||||
(to > poff && to < poff + plen)) {
|
||||
status = iomap_read_page_sync(inode, block_start, page,
|
||||
poff, plen, from, to, iomap);
|
||||
if (status)
|
||||
break;
|
||||
if (!(flags & IOMAP_WRITE_F_UNSHARE) &&
|
||||
(from <= poff || from >= poff + plen) &&
|
||||
(to <= poff || to >= poff + plen))
|
||||
continue;
|
||||
|
||||
if (iomap_block_needs_zeroing(inode, srcmap, block_start)) {
|
||||
if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
|
||||
return -EIO;
|
||||
zero_user_segments(page, poff, from, to, poff + plen);
|
||||
iomap_set_range_uptodate(page, poff, plen);
|
||||
continue;
|
||||
}
|
||||
|
||||
status = iomap_read_page_sync(block_start, page, poff, plen,
|
||||
srcmap);
|
||||
if (status)
|
||||
return status;
|
||||
} while ((block_start += plen) < block_end);
|
||||
|
||||
return status;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
|
||||
struct page **pagep, struct iomap *iomap)
|
||||
struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
const struct iomap_page_ops *page_ops = iomap->page_ops;
|
||||
pgoff_t index = pos >> PAGE_SHIFT;
|
||||
struct page *page;
|
||||
int status = 0;
|
||||
|
||||
BUG_ON(pos + len > iomap->offset + iomap->length);
|
||||
if (srcmap != iomap)
|
||||
BUG_ON(pos + len > srcmap->offset + srcmap->length);
|
||||
|
||||
if (fatal_signal_pending(current))
|
||||
return -EINTR;
|
||||
@ -599,18 +643,20 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
|
||||
return status;
|
||||
}
|
||||
|
||||
page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
|
||||
page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
|
||||
AOP_FLAG_NOFS);
|
||||
if (!page) {
|
||||
status = -ENOMEM;
|
||||
goto out_no_page;
|
||||
}
|
||||
|
||||
if (iomap->type == IOMAP_INLINE)
|
||||
iomap_read_inline_data(inode, page, iomap);
|
||||
if (srcmap->type == IOMAP_INLINE)
|
||||
iomap_read_inline_data(inode, page, srcmap);
|
||||
else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
|
||||
status = __block_write_begin_int(page, pos, len, NULL, iomap);
|
||||
status = __block_write_begin_int(page, pos, len, NULL, srcmap);
|
||||
else
|
||||
status = __iomap_write_begin(inode, pos, len, page, iomap);
|
||||
status = __iomap_write_begin(inode, pos, len, flags, page,
|
||||
srcmap);
|
||||
|
||||
if (unlikely(status))
|
||||
goto out_unlock;
|
||||
@ -656,7 +702,7 @@ EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
|
||||
|
||||
static int
|
||||
__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
|
||||
unsigned copied, struct page *page, struct iomap *iomap)
|
||||
unsigned copied, struct page *page)
|
||||
{
|
||||
flush_dcache_page(page);
|
||||
|
||||
@ -696,20 +742,20 @@ iomap_write_end_inline(struct inode *inode, struct page *page,
|
||||
}
|
||||
|
||||
static int
|
||||
iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
|
||||
unsigned copied, struct page *page, struct iomap *iomap)
|
||||
iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied,
|
||||
struct page *page, struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
const struct iomap_page_ops *page_ops = iomap->page_ops;
|
||||
loff_t old_size = inode->i_size;
|
||||
int ret;
|
||||
|
||||
if (iomap->type == IOMAP_INLINE) {
|
||||
if (srcmap->type == IOMAP_INLINE) {
|
||||
ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
|
||||
} else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
|
||||
} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
|
||||
ret = block_write_end(NULL, inode->i_mapping, pos, len, copied,
|
||||
page, NULL);
|
||||
} else {
|
||||
ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
|
||||
ret = __iomap_write_end(inode, pos, len, copied, page);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -736,12 +782,11 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
|
||||
|
||||
static loff_t
|
||||
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
|
||||
struct iomap *iomap)
|
||||
struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
struct iov_iter *i = data;
|
||||
long status = 0;
|
||||
ssize_t written = 0;
|
||||
unsigned int flags = AOP_FLAG_NOFS;
|
||||
|
||||
do {
|
||||
struct page *page;
|
||||
@ -771,8 +816,8 @@ again:
|
||||
break;
|
||||
}
|
||||
|
||||
status = iomap_write_begin(inode, pos, bytes, flags, &page,
|
||||
iomap);
|
||||
status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
|
||||
srcmap);
|
||||
if (unlikely(status))
|
||||
break;
|
||||
|
||||
@ -783,8 +828,8 @@ again:
|
||||
|
||||
flush_dcache_page(page);
|
||||
|
||||
status = iomap_write_end(inode, pos, bytes, copied, page,
|
||||
iomap);
|
||||
status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
|
||||
srcmap);
|
||||
if (unlikely(status < 0))
|
||||
break;
|
||||
copied = status;
|
||||
@ -835,50 +880,32 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
|
||||
|
||||
static struct page *
|
||||
__iomap_read_page(struct inode *inode, loff_t offset)
|
||||
{
|
||||
struct address_space *mapping = inode->i_mapping;
|
||||
struct page *page;
|
||||
|
||||
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
|
||||
if (IS_ERR(page))
|
||||
return page;
|
||||
if (!PageUptodate(page)) {
|
||||
put_page(page);
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
static loff_t
|
||||
iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
|
||||
struct iomap *iomap)
|
||||
iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
|
||||
struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
long status = 0;
|
||||
ssize_t written = 0;
|
||||
|
||||
/* don't bother with blocks that are not shared to start with */
|
||||
if (!(iomap->flags & IOMAP_F_SHARED))
|
||||
return length;
|
||||
/* don't bother with holes or unwritten extents */
|
||||
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
|
||||
return length;
|
||||
|
||||
do {
|
||||
struct page *page, *rpage;
|
||||
unsigned long offset; /* Offset into pagecache page */
|
||||
unsigned long bytes; /* Bytes to write to page */
|
||||
|
||||
offset = offset_in_page(pos);
|
||||
bytes = min_t(loff_t, PAGE_SIZE - offset, length);
|
||||
|
||||
rpage = __iomap_read_page(inode, pos);
|
||||
if (IS_ERR(rpage))
|
||||
return PTR_ERR(rpage);
|
||||
unsigned long offset = offset_in_page(pos);
|
||||
unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
|
||||
struct page *page;
|
||||
|
||||
status = iomap_write_begin(inode, pos, bytes,
|
||||
AOP_FLAG_NOFS, &page, iomap);
|
||||
put_page(rpage);
|
||||
IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap);
|
||||
if (unlikely(status))
|
||||
return status;
|
||||
|
||||
WARN_ON_ONCE(!PageUptodate(page));
|
||||
|
||||
status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
|
||||
status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
|
||||
srcmap);
|
||||
if (unlikely(status <= 0)) {
|
||||
if (WARN_ON_ONCE(status == 0))
|
||||
return -EIO;
|
||||
@ -898,14 +925,14 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
|
||||
}
|
||||
|
||||
int
|
||||
iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
|
||||
iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
|
||||
const struct iomap_ops *ops)
|
||||
{
|
||||
loff_t ret;
|
||||
|
||||
while (len) {
|
||||
ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
|
||||
iomap_dirty_actor);
|
||||
iomap_unshare_actor);
|
||||
if (ret <= 0)
|
||||
return ret;
|
||||
pos += ret;
|
||||
@ -914,23 +941,22 @@ iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(iomap_file_dirty);
|
||||
EXPORT_SYMBOL_GPL(iomap_file_unshare);
|
||||
|
||||
static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
|
||||
unsigned bytes, struct iomap *iomap)
|
||||
unsigned bytes, struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
struct page *page;
|
||||
int status;
|
||||
|
||||
status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
|
||||
iomap);
|
||||
status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
|
||||
if (status)
|
||||
return status;
|
||||
|
||||
zero_user(page, offset, bytes);
|
||||
mark_page_accessed(page);
|
||||
|
||||
return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
|
||||
return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
|
||||
}
|
||||
|
||||
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
|
||||
@ -942,14 +968,14 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
|
||||
|
||||
static loff_t
|
||||
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
|
||||
void *data, struct iomap *iomap)
|
||||
void *data, struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
bool *did_zero = data;
|
||||
loff_t written = 0;
|
||||
int status;
|
||||
|
||||
/* already zeroed? we're done. */
|
||||
if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
|
||||
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
|
||||
return count;
|
||||
|
||||
do {
|
||||
@ -961,7 +987,8 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
|
||||
if (IS_DAX(inode))
|
||||
status = iomap_dax_zero(pos, offset, bytes, iomap);
|
||||
else
|
||||
status = iomap_zero(inode, pos, offset, bytes, iomap);
|
||||
status = iomap_zero(inode, pos, offset, bytes, iomap,
|
||||
srcmap);
|
||||
if (status < 0)
|
||||
return status;
|
||||
|
||||
@ -1011,7 +1038,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page);
|
||||
|
||||
static loff_t
|
||||
iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
|
||||
void *data, struct iomap *iomap)
|
||||
void *data, struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
struct page *page = data;
|
||||
int ret;
|
||||
@ -1071,3 +1098,551 @@ out_unlock:
|
||||
return block_page_mkwrite_return(ret);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
|
||||
|
||||
static void
|
||||
iomap_finish_page_writeback(struct inode *inode, struct page *page,
|
||||
int error)
|
||||
{
|
||||
struct iomap_page *iop = to_iomap_page(page);
|
||||
|
||||
if (error) {
|
||||
SetPageError(page);
|
||||
mapping_set_error(inode->i_mapping, -EIO);
|
||||
}
|
||||
|
||||
WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
|
||||
WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0);
|
||||
|
||||
if (!iop || atomic_dec_and_test(&iop->write_count))
|
||||
end_page_writeback(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* We're now finished for good with this ioend structure. Update the page
|
||||
* state, release holds on bios, and finally free up memory. Do not use the
|
||||
* ioend after this.
|
||||
*/
|
||||
static void
|
||||
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
|
||||
{
|
||||
struct inode *inode = ioend->io_inode;
|
||||
struct bio *bio = &ioend->io_inline_bio;
|
||||
struct bio *last = ioend->io_bio, *next;
|
||||
u64 start = bio->bi_iter.bi_sector;
|
||||
bool quiet = bio_flagged(bio, BIO_QUIET);
|
||||
|
||||
for (bio = &ioend->io_inline_bio; bio; bio = next) {
|
||||
struct bio_vec *bv;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
/*
|
||||
* For the last bio, bi_private points to the ioend, so we
|
||||
* need to explicitly end the iteration here.
|
||||
*/
|
||||
if (bio == last)
|
||||
next = NULL;
|
||||
else
|
||||
next = bio->bi_private;
|
||||
|
||||
/* walk each page on bio, ending page IO on them */
|
||||
bio_for_each_segment_all(bv, bio, iter_all)
|
||||
iomap_finish_page_writeback(inode, bv->bv_page, error);
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
if (unlikely(error && !quiet)) {
|
||||
printk_ratelimited(KERN_ERR
|
||||
"%s: writeback error on inode %lu, offset %lld, sector %llu",
|
||||
inode->i_sb->s_id, inode->i_ino, ioend->io_offset,
|
||||
start);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
|
||||
{
|
||||
struct list_head tmp;
|
||||
|
||||
list_replace_init(&ioend->io_list, &tmp);
|
||||
iomap_finish_ioend(ioend, error);
|
||||
|
||||
while (!list_empty(&tmp)) {
|
||||
ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
|
||||
list_del_init(&ioend->io_list);
|
||||
iomap_finish_ioend(ioend, error);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(iomap_finish_ioends);
|
||||
|
||||
/*
|
||||
* We can merge two adjacent ioends if they have the same set of work to do.
|
||||
*/
|
||||
static bool
|
||||
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
|
||||
{
|
||||
if (ioend->io_bio->bi_status != next->io_bio->bi_status)
|
||||
return false;
|
||||
if ((ioend->io_flags & IOMAP_F_SHARED) ^
|
||||
(next->io_flags & IOMAP_F_SHARED))
|
||||
return false;
|
||||
if ((ioend->io_type == IOMAP_UNWRITTEN) ^
|
||||
(next->io_type == IOMAP_UNWRITTEN))
|
||||
return false;
|
||||
if (ioend->io_offset + ioend->io_size != next->io_offset)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
|
||||
void (*merge_private)(struct iomap_ioend *ioend,
|
||||
struct iomap_ioend *next))
|
||||
{
|
||||
struct iomap_ioend *next;
|
||||
|
||||
INIT_LIST_HEAD(&ioend->io_list);
|
||||
|
||||
while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
|
||||
io_list))) {
|
||||
if (!iomap_ioend_can_merge(ioend, next))
|
||||
break;
|
||||
list_move_tail(&next->io_list, &ioend->io_list);
|
||||
ioend->io_size += next->io_size;
|
||||
if (next->io_private && merge_private)
|
||||
merge_private(ioend, next);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
|
||||
|
||||
static int
|
||||
iomap_ioend_compare(void *priv, struct list_head *a, struct list_head *b)
|
||||
{
|
||||
struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
|
||||
struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
|
||||
|
||||
if (ia->io_offset < ib->io_offset)
|
||||
return -1;
|
||||
if (ia->io_offset > ib->io_offset)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
iomap_sort_ioends(struct list_head *ioend_list)
|
||||
{
|
||||
list_sort(NULL, ioend_list, iomap_ioend_compare);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(iomap_sort_ioends);
|
||||
|
||||
static void iomap_writepage_end_bio(struct bio *bio)
|
||||
{
|
||||
struct iomap_ioend *ioend = bio->bi_private;
|
||||
|
||||
iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
|
||||
}
|
||||
|
||||
/*
|
||||
* Submit the final bio for an ioend.
|
||||
*
|
||||
* If @error is non-zero, it means that we have a situation where some part of
|
||||
* the submission process has failed after we have marked paged for writeback
|
||||
* and unlocked them. In this situation, we need to fail the bio instead of
|
||||
* submitting it. This typically only happens on a filesystem shutdown.
|
||||
*/
|
||||
static int
|
||||
iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
|
||||
int error)
|
||||
{
|
||||
ioend->io_bio->bi_private = ioend;
|
||||
ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
|
||||
|
||||
if (wpc->ops->prepare_ioend)
|
||||
error = wpc->ops->prepare_ioend(ioend, error);
|
||||
if (error) {
|
||||
/*
|
||||
* If we are failing the IO now, just mark the ioend with an
|
||||
* error and finish it. This will run IO completion immediately
|
||||
* as there is only one reference to the ioend at this point in
|
||||
* time.
|
||||
*/
|
||||
ioend->io_bio->bi_status = errno_to_blk_status(error);
|
||||
bio_endio(ioend->io_bio);
|
||||
return error;
|
||||
}
|
||||
|
||||
submit_bio(ioend->io_bio);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct iomap_ioend *
|
||||
iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
|
||||
loff_t offset, sector_t sector, struct writeback_control *wbc)
|
||||
{
|
||||
struct iomap_ioend *ioend;
|
||||
struct bio *bio;
|
||||
|
||||
bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset);
|
||||
bio_set_dev(bio, wpc->iomap.bdev);
|
||||
bio->bi_iter.bi_sector = sector;
|
||||
bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
|
||||
bio->bi_write_hint = inode->i_write_hint;
|
||||
wbc_init_bio(wbc, bio);
|
||||
|
||||
ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
|
||||
INIT_LIST_HEAD(&ioend->io_list);
|
||||
ioend->io_type = wpc->iomap.type;
|
||||
ioend->io_flags = wpc->iomap.flags;
|
||||
ioend->io_inode = inode;
|
||||
ioend->io_size = 0;
|
||||
ioend->io_offset = offset;
|
||||
ioend->io_private = NULL;
|
||||
ioend->io_bio = bio;
|
||||
return ioend;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a new bio, and chain the old bio to the new one.
|
||||
*
|
||||
* Note that we have to do perform the chaining in this unintuitive order
|
||||
* so that the bi_private linkage is set up in the right direction for the
|
||||
* traversal in iomap_finish_ioend().
|
||||
*/
|
||||
static struct bio *
|
||||
iomap_chain_bio(struct bio *prev)
|
||||
{
|
||||
struct bio *new;
|
||||
|
||||
new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
|
||||
bio_copy_dev(new, prev);/* also copies over blkcg information */
|
||||
new->bi_iter.bi_sector = bio_end_sector(prev);
|
||||
new->bi_opf = prev->bi_opf;
|
||||
new->bi_write_hint = prev->bi_write_hint;
|
||||
|
||||
bio_chain(prev, new);
|
||||
bio_get(prev); /* for iomap_finish_ioend */
|
||||
submit_bio(prev);
|
||||
return new;
|
||||
}
|
||||
|
||||
static bool
|
||||
iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
|
||||
sector_t sector)
|
||||
{
|
||||
if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
|
||||
(wpc->ioend->io_flags & IOMAP_F_SHARED))
|
||||
return false;
|
||||
if (wpc->iomap.type != wpc->ioend->io_type)
|
||||
return false;
|
||||
if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
|
||||
return false;
|
||||
if (sector != bio_end_sector(wpc->ioend->io_bio))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Test to see if we have an existing ioend structure that we could append to
|
||||
* first, otherwise finish off the current ioend and start another.
|
||||
*/
|
||||
static void
|
||||
iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
|
||||
struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
|
||||
struct writeback_control *wbc, struct list_head *iolist)
|
||||
{
|
||||
sector_t sector = iomap_sector(&wpc->iomap, offset);
|
||||
unsigned len = i_blocksize(inode);
|
||||
unsigned poff = offset & (PAGE_SIZE - 1);
|
||||
bool merged, same_page = false;
|
||||
|
||||
if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) {
|
||||
if (wpc->ioend)
|
||||
list_add(&wpc->ioend->io_list, iolist);
|
||||
wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc);
|
||||
}
|
||||
|
||||
merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
|
||||
&same_page);
|
||||
if (iop && !same_page)
|
||||
atomic_inc(&iop->write_count);
|
||||
|
||||
if (!merged) {
|
||||
if (bio_full(wpc->ioend->io_bio, len)) {
|
||||
wpc->ioend->io_bio =
|
||||
iomap_chain_bio(wpc->ioend->io_bio);
|
||||
}
|
||||
bio_add_page(wpc->ioend->io_bio, page, len, poff);
|
||||
}
|
||||
|
||||
wpc->ioend->io_size += len;
|
||||
wbc_account_cgroup_owner(wbc, page, len);
|
||||
}
|
||||
|
||||
/*
|
||||
* We implement an immediate ioend submission policy here to avoid needing to
|
||||
* chain multiple ioends and hence nest mempool allocations which can violate
|
||||
* forward progress guarantees we need to provide. The current ioend we are
|
||||
* adding blocks to is cached on the writepage context, and if the new block
|
||||
* does not append to the cached ioend it will create a new ioend and cache that
|
||||
* instead.
|
||||
*
|
||||
* If a new ioend is created and cached, the old ioend is returned and queued
|
||||
* locally for submission once the entire page is processed or an error has been
|
||||
* detected. While ioends are submitted immediately after they are completed,
|
||||
* batching optimisations are provided by higher level block plugging.
|
||||
*
|
||||
* At the end of a writeback pass, there will be a cached ioend remaining on the
|
||||
* writepage context that the caller will need to submit.
|
||||
*/
|
||||
static int
|
||||
iomap_writepage_map(struct iomap_writepage_ctx *wpc,
|
||||
struct writeback_control *wbc, struct inode *inode,
|
||||
struct page *page, u64 end_offset)
|
||||
{
|
||||
struct iomap_page *iop = to_iomap_page(page);
|
||||
struct iomap_ioend *ioend, *next;
|
||||
unsigned len = i_blocksize(inode);
|
||||
u64 file_offset; /* file offset of page */
|
||||
int error = 0, count = 0, i;
|
||||
LIST_HEAD(submit_list);
|
||||
|
||||
WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
|
||||
WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0);
|
||||
|
||||
/*
|
||||
* Walk through the page to find areas to write back. If we run off the
|
||||
* end of the current map or find the current map invalid, grab a new
|
||||
* one.
|
||||
*/
|
||||
for (i = 0, file_offset = page_offset(page);
|
||||
i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
|
||||
i++, file_offset += len) {
|
||||
if (iop && !test_bit(i, iop->uptodate))
|
||||
continue;
|
||||
|
||||
error = wpc->ops->map_blocks(wpc, inode, file_offset);
|
||||
if (error)
|
||||
break;
|
||||
if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
|
||||
continue;
|
||||
if (wpc->iomap.type == IOMAP_HOLE)
|
||||
continue;
|
||||
iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
|
||||
&submit_list);
|
||||
count++;
|
||||
}
|
||||
|
||||
WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
|
||||
WARN_ON_ONCE(!PageLocked(page));
|
||||
WARN_ON_ONCE(PageWriteback(page));
|
||||
|
||||
/*
|
||||
* We cannot cancel the ioend directly here on error. We may have
|
||||
* already set other pages under writeback and hence we have to run I/O
|
||||
* completion to mark the error state of the pages under writeback
|
||||
* appropriately.
|
||||
*/
|
||||
if (unlikely(error)) {
|
||||
if (!count) {
|
||||
/*
|
||||
* If the current page hasn't been added to ioend, it
|
||||
* won't be affected by I/O completions and we must
|
||||
* discard and unlock it right here.
|
||||
*/
|
||||
if (wpc->ops->discard_page)
|
||||
wpc->ops->discard_page(page);
|
||||
ClearPageUptodate(page);
|
||||
unlock_page(page);
|
||||
goto done;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the page was not fully cleaned, we need to ensure that the
|
||||
* higher layers come back to it correctly. That means we need
|
||||
* to keep the page dirty, and for WB_SYNC_ALL writeback we need
|
||||
* to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
|
||||
* so another attempt to write this page in this writeback sweep
|
||||
* will be made.
|
||||
*/
|
||||
set_page_writeback_keepwrite(page);
|
||||
} else {
|
||||
clear_page_dirty_for_io(page);
|
||||
set_page_writeback(page);
|
||||
}
|
||||
|
||||
unlock_page(page);
|
||||
|
||||
/*
|
||||
* Preserve the original error if there was one, otherwise catch
|
||||
* submission errors here and propagate into subsequent ioend
|
||||
* submissions.
|
||||
*/
|
||||
list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
|
||||
int error2;
|
||||
|
||||
list_del_init(&ioend->io_list);
|
||||
error2 = iomap_submit_ioend(wpc, ioend, error);
|
||||
if (error2 && !error)
|
||||
error = error2;
|
||||
}
|
||||
|
||||
/*
|
||||
* We can end up here with no error and nothing to write only if we race
|
||||
* with a partial page truncate on a sub-page block sized filesystem.
|
||||
*/
|
||||
if (!count)
|
||||
end_page_writeback(page);
|
||||
done:
|
||||
mapping_set_error(page->mapping, error);
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Write out a dirty page.
|
||||
*
|
||||
* For delalloc space on the page we need to allocate space and flush it.
|
||||
* For unwritten space on the page we need to start the conversion to
|
||||
* regular allocated space.
|
||||
*/
|
||||
static int
|
||||
iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
|
||||
{
|
||||
struct iomap_writepage_ctx *wpc = data;
|
||||
struct inode *inode = page->mapping->host;
|
||||
pgoff_t end_index;
|
||||
u64 end_offset;
|
||||
loff_t offset;
|
||||
|
||||
trace_iomap_writepage(inode, page, 0, 0);
|
||||
|
||||
/*
|
||||
* Refuse to write the page out if we are called from reclaim context.
|
||||
*
|
||||
* This avoids stack overflows when called from deeply used stacks in
|
||||
* random callers for direct reclaim or memcg reclaim. We explicitly
|
||||
* allow reclaim from kswapd as the stack usage there is relatively low.
|
||||
*
|
||||
* This should never happen except in the case of a VM regression so
|
||||
* warn about it.
|
||||
*/
|
||||
if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
|
||||
PF_MEMALLOC))
|
||||
goto redirty;
|
||||
|
||||
/*
|
||||
* Given that we do not allow direct reclaim to call us, we should
|
||||
* never be called in a recursive filesystem reclaim context.
|
||||
*/
|
||||
if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
|
||||
goto redirty;
|
||||
|
||||
/*
|
||||
* Is this page beyond the end of the file?
|
||||
*
|
||||
* The page index is less than the end_index, adjust the end_offset
|
||||
* to the highest offset that this page should represent.
|
||||
* -----------------------------------------------------
|
||||
* | file mapping | <EOF> |
|
||||
* -----------------------------------------------------
|
||||
* | Page ... | Page N-2 | Page N-1 | Page N | |
|
||||
* ^--------------------------------^----------|--------
|
||||
* | desired writeback range | see else |
|
||||
* ---------------------------------^------------------|
|
||||
*/
|
||||
offset = i_size_read(inode);
|
||||
end_index = offset >> PAGE_SHIFT;
|
||||
if (page->index < end_index)
|
||||
end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT;
|
||||
else {
|
||||
/*
|
||||
* Check whether the page to write out is beyond or straddles
|
||||
* i_size or not.
|
||||
* -------------------------------------------------------
|
||||
* | file mapping | <EOF> |
|
||||
* -------------------------------------------------------
|
||||
* | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
|
||||
* ^--------------------------------^-----------|---------
|
||||
* | | Straddles |
|
||||
* ---------------------------------^-----------|--------|
|
||||
*/
|
||||
unsigned offset_into_page = offset & (PAGE_SIZE - 1);
|
||||
|
||||
/*
|
||||
* Skip the page if it is fully outside i_size, e.g. due to a
|
||||
* truncate operation that is in progress. We must redirty the
|
||||
* page so that reclaim stops reclaiming it. Otherwise
|
||||
* iomap_vm_releasepage() is called on it and gets confused.
|
||||
*
|
||||
* Note that the end_index is unsigned long, it would overflow
|
||||
* if the given offset is greater than 16TB on 32-bit system
|
||||
* and if we do check the page is fully outside i_size or not
|
||||
* via "if (page->index >= end_index + 1)" as "end_index + 1"
|
||||
* will be evaluated to 0. Hence this page will be redirtied
|
||||
* and be written out repeatedly which would result in an
|
||||
* infinite loop, the user program that perform this operation
|
||||
* will hang. Instead, we can verify this situation by checking
|
||||
* if the page to write is totally beyond the i_size or if it's
|
||||
* offset is just equal to the EOF.
|
||||
*/
|
||||
if (page->index > end_index ||
|
||||
(page->index == end_index && offset_into_page == 0))
|
||||
goto redirty;
|
||||
|
||||
/*
|
||||
* The page straddles i_size. It must be zeroed out on each
|
||||
* and every writepage invocation because it may be mmapped.
|
||||
* "A file is mapped in multiples of the page size. For a file
|
||||
* that is not a multiple of the page size, the remaining
|
||||
* memory is zeroed when mapped, and writes to that region are
|
||||
* not written out to the file."
|
||||
*/
|
||||
zero_user_segment(page, offset_into_page, PAGE_SIZE);
|
||||
|
||||
/* Adjust the end_offset to the end of file */
|
||||
end_offset = offset;
|
||||
}
|
||||
|
||||
return iomap_writepage_map(wpc, wbc, inode, page, end_offset);
|
||||
|
||||
redirty:
|
||||
redirty_page_for_writepage(wbc, page);
|
||||
unlock_page(page);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
iomap_writepage(struct page *page, struct writeback_control *wbc,
|
||||
struct iomap_writepage_ctx *wpc,
|
||||
const struct iomap_writeback_ops *ops)
|
||||
{
|
||||
int ret;
|
||||
|
||||
wpc->ops = ops;
|
||||
ret = iomap_do_writepage(page, wbc, wpc);
|
||||
if (!wpc->ioend)
|
||||
return ret;
|
||||
return iomap_submit_ioend(wpc, wpc->ioend, ret);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(iomap_writepage);
|
||||
|
||||
int
|
||||
iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
|
||||
struct iomap_writepage_ctx *wpc,
|
||||
const struct iomap_writeback_ops *ops)
|
||||
{
|
||||
int ret;
|
||||
|
||||
wpc->ops = ops;
|
||||
ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
|
||||
if (!wpc->ioend)
|
||||
return ret;
|
||||
return iomap_submit_ioend(wpc, wpc->ioend, ret);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(iomap_writepages);
|
||||
|
||||
/*
 * Set up the bioset that ioends are allocated from.  The front pad is the
 * offset of io_inline_bio within struct iomap_ioend, which is what lets
 * iomap_alloc_ioend() recover the ioend from the bio with container_of().
 */
static int __init iomap_init(void)
{
	const unsigned int pool_size = 4 * (PAGE_SIZE / SECTOR_SIZE);
	const unsigned int front_pad =
		offsetof(struct iomap_ioend, io_inline_bio);

	return bioset_init(&iomap_ioend_bioset, pool_size, front_pad,
			   BIOSET_NEED_BVECS);
}
fs_initcall(iomap_init);
|
||||
|
@ -358,7 +358,7 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
|
||||
|
||||
static loff_t
|
||||
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
|
||||
void *data, struct iomap *iomap)
|
||||
void *data, struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
struct iomap_dio *dio = data;
|
||||
|
||||
@ -392,7 +392,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
|
||||
*/
|
||||
ssize_t
|
||||
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
|
||||
const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
|
||||
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
|
||||
bool wait_for_completion)
|
||||
{
|
||||
struct address_space *mapping = iocb->ki_filp->f_mapping;
|
||||
struct inode *inode = file_inode(iocb->ki_filp);
|
||||
@ -400,7 +401,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
|
||||
loff_t pos = iocb->ki_pos, start = pos;
|
||||
loff_t end = iocb->ki_pos + count - 1, ret = 0;
|
||||
unsigned int flags = IOMAP_DIRECT;
|
||||
bool wait_for_completion = is_sync_kiocb(iocb);
|
||||
struct blk_plug plug;
|
||||
struct iomap_dio *dio;
|
||||
|
||||
@ -409,6 +409,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
|
||||
return -EIO;
|
||||
|
||||
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
|
||||
if (!dio)
|
||||
return -ENOMEM;
|
||||
@ -430,7 +433,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
|
||||
if (pos >= dio->i_size)
|
||||
goto out_free_dio;
|
||||
|
||||
if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
|
||||
if (iter_is_iovec(iter))
|
||||
dio->flags |= IOMAP_DIO_DIRTY;
|
||||
} else {
|
||||
flags |= IOMAP_WRITE;
|
||||
|
@ -44,7 +44,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
|
||||
|
||||
static loff_t
|
||||
iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
|
||||
struct iomap *iomap)
|
||||
struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
struct fiemap_ctx *ctx = data;
|
||||
loff_t ret = length;
|
||||
@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(iomap_fiemap);
|
||||
|
||||
static loff_t
|
||||
iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
|
||||
void *data, struct iomap *iomap)
|
||||
void *data, struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
sector_t *bno = data, addr;
|
||||
|
||||
|
@ -119,7 +119,7 @@ out:
|
||||
|
||||
static loff_t
|
||||
iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
|
||||
void *data, struct iomap *iomap)
|
||||
void *data, struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
switch (iomap->type) {
|
||||
case IOMAP_UNWRITTEN:
|
||||
@ -165,7 +165,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_hole);
|
||||
|
||||
static loff_t
|
||||
iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
|
||||
void *data, struct iomap *iomap)
|
||||
void *data, struct iomap *iomap, struct iomap *srcmap)
|
||||
{
|
||||
switch (iomap->type) {
|
||||
case IOMAP_HOLE:
|
||||
|
@ -76,7 +76,8 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
|
||||
* distinction between written and unwritten extents.
|
||||
*/
|
||||
static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
|
||||
loff_t count, void *data, struct iomap *iomap)
|
||||
loff_t count, void *data, struct iomap *iomap,
|
||||
struct iomap *srcmap)
|
||||
{
|
||||
struct iomap_swapfile_info *isi = data;
|
||||
int error;
|
||||
|
12
fs/iomap/trace.c
Normal file
12
fs/iomap/trace.c
Normal file
@ -0,0 +1,12 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (c) 2019 Christoph Hellwig
|
||||
*/
|
||||
#include <linux/iomap.h>
|
||||
|
||||
/*
|
||||
* We include this last to have the helpers above available for the trace
|
||||
* event implementations.
|
||||
*/
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include "trace.h"
|
88
fs/iomap/trace.h
Normal file
88
fs/iomap/trace.h
Normal file
@ -0,0 +1,88 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (c) 2009-2019 Christoph Hellwig
|
||||
*
|
||||
* NOTE: none of these tracepoints shall be considered a stable kernel ABI
|
||||
* as they can change at any time.
|
||||
*/
|
||||
#undef TRACE_SYSTEM
|
||||
#define TRACE_SYSTEM iomap
|
||||
|
||||
#if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
|
||||
#define _IOMAP_TRACE_H
|
||||
|
||||
#include <linux/tracepoint.h>
|
||||
|
||||
struct inode;
|
||||
|
||||
DECLARE_EVENT_CLASS(iomap_readpage_class,
|
||||
TP_PROTO(struct inode *inode, int nr_pages),
|
||||
TP_ARGS(inode, nr_pages),
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(u64, ino)
|
||||
__field(int, nr_pages)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->dev = inode->i_sb->s_dev;
|
||||
__entry->ino = inode->i_ino;
|
||||
__entry->nr_pages = nr_pages;
|
||||
),
|
||||
TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->ino,
|
||||
__entry->nr_pages)
|
||||
)
|
||||
|
||||
#define DEFINE_READPAGE_EVENT(name) \
|
||||
DEFINE_EVENT(iomap_readpage_class, name, \
|
||||
TP_PROTO(struct inode *inode, int nr_pages), \
|
||||
TP_ARGS(inode, nr_pages))
|
||||
DEFINE_READPAGE_EVENT(iomap_readpage);
|
||||
DEFINE_READPAGE_EVENT(iomap_readpages);
|
||||
|
||||
DECLARE_EVENT_CLASS(iomap_page_class,
|
||||
TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
|
||||
unsigned int len),
|
||||
TP_ARGS(inode, page, off, len),
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(u64, ino)
|
||||
__field(pgoff_t, pgoff)
|
||||
__field(loff_t, size)
|
||||
__field(unsigned long, offset)
|
||||
__field(unsigned int, length)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->dev = inode->i_sb->s_dev;
|
||||
__entry->ino = inode->i_ino;
|
||||
__entry->pgoff = page_offset(page);
|
||||
__entry->size = i_size_read(inode);
|
||||
__entry->offset = off;
|
||||
__entry->length = len;
|
||||
),
|
||||
TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
|
||||
"length %x",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->ino,
|
||||
__entry->pgoff,
|
||||
__entry->size,
|
||||
__entry->offset,
|
||||
__entry->length)
|
||||
)
|
||||
|
||||
#define DEFINE_PAGE_EVENT(name) \
|
||||
DEFINE_EVENT(iomap_page_class, name, \
|
||||
TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
|
||||
unsigned int len), \
|
||||
TP_ARGS(inode, page, off, len))
|
||||
DEFINE_PAGE_EVENT(iomap_writepage);
|
||||
DEFINE_PAGE_EVENT(iomap_releasepage);
|
||||
DEFINE_PAGE_EVENT(iomap_invalidatepage);
|
||||
|
||||
#endif /* _IOMAP_TRACE_H */
|
||||
|
||||
#undef TRACE_INCLUDE_PATH
|
||||
#define TRACE_INCLUDE_PATH .
|
||||
#define TRACE_INCLUDE_FILE trace
|
||||
#include <trace/define_trace.h>
|
@ -34,6 +34,7 @@
|
||||
#include "xfs_ag_resv.h"
|
||||
#include "xfs_refcount.h"
|
||||
#include "xfs_icache.h"
|
||||
#include "xfs_iomap.h"
|
||||
|
||||
|
||||
kmem_zone_t *xfs_bmap_free_item_zone;
|
||||
@ -4456,16 +4457,21 @@ int
|
||||
xfs_bmapi_convert_delalloc(
|
||||
struct xfs_inode *ip,
|
||||
int whichfork,
|
||||
xfs_fileoff_t offset_fsb,
|
||||
struct xfs_bmbt_irec *imap,
|
||||
xfs_off_t offset,
|
||||
struct iomap *iomap,
|
||||
unsigned int *seq)
|
||||
{
|
||||
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
|
||||
struct xfs_mount *mp = ip->i_mount;
|
||||
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
|
||||
struct xfs_bmalloca bma = { NULL };
|
||||
u16 flags = 0;
|
||||
struct xfs_trans *tp;
|
||||
int error;
|
||||
|
||||
if (whichfork == XFS_COW_FORK)
|
||||
flags |= IOMAP_F_SHARED;
|
||||
|
||||
/*
|
||||
* Space for the extent and indirect blocks was reserved when the
|
||||
* delalloc extent was created so there's no need to do so here.
|
||||
@ -4495,7 +4501,7 @@ xfs_bmapi_convert_delalloc(
|
||||
* the extent. Just return the real extent at this offset.
|
||||
*/
|
||||
if (!isnullstartblock(bma.got.br_startblock)) {
|
||||
*imap = bma.got;
|
||||
xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
|
||||
*seq = READ_ONCE(ifp->if_seq);
|
||||
goto out_trans_cancel;
|
||||
}
|
||||
@ -4528,7 +4534,7 @@ xfs_bmapi_convert_delalloc(
|
||||
XFS_STATS_INC(mp, xs_xstrat_quick);
|
||||
|
||||
ASSERT(!isnullstartblock(bma.got.br_startblock));
|
||||
*imap = bma.got;
|
||||
xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
|
||||
*seq = READ_ONCE(ifp->if_seq);
|
||||
|
||||
if (whichfork == XFS_COW_FORK)
|
||||
|
@ -228,8 +228,7 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
|
||||
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
|
||||
int eof);
|
||||
int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
|
||||
xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
|
||||
unsigned int *seq);
|
||||
xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
|
||||
int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
|
||||
struct xfs_inode *ip, int whichfork,
|
||||
struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -6,23 +6,6 @@
|
||||
#ifndef __XFS_AOPS_H__
|
||||
#define __XFS_AOPS_H__
|
||||
|
||||
extern struct bio_set xfs_ioend_bioset;
|
||||
|
||||
/*
|
||||
* Structure for buffered I/O completions.
|
||||
*/
|
||||
struct xfs_ioend {
|
||||
struct list_head io_list; /* next ioend in chain */
|
||||
int io_fork; /* inode fork written back */
|
||||
xfs_exntst_t io_state; /* extent state */
|
||||
struct inode *io_inode; /* file being written to */
|
||||
size_t io_size; /* size of the extent */
|
||||
xfs_off_t io_offset; /* offset in the file */
|
||||
struct xfs_trans *io_append_trans;/* xact. for size update */
|
||||
struct bio *io_bio; /* bio being built */
|
||||
struct bio io_inline_bio; /* MUST BE LAST! */
|
||||
};
|
||||
|
||||
extern const struct address_space_operations xfs_address_space_operations;
|
||||
extern const struct address_space_operations xfs_dax_aops;
|
||||
|
||||
|
@ -188,7 +188,7 @@ xfs_file_dio_aio_read(
|
||||
file_accessed(iocb->ki_filp);
|
||||
|
||||
xfs_ilock(ip, XFS_IOLOCK_SHARED);
|
||||
ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
|
||||
ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL, is_sync_kiocb(iocb));
|
||||
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
|
||||
|
||||
return ret;
|
||||
@ -547,15 +547,12 @@ xfs_file_dio_aio_write(
|
||||
}
|
||||
|
||||
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
|
||||
ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops);
|
||||
|
||||
/*
|
||||
* If unaligned, this is the only IO in-flight. If it has not yet
|
||||
* completed, wait on it before we release the iolock to prevent
|
||||
* subsequent overlapping IO.
|
||||
* If unaligned, this is the only IO in-flight. Wait on it before we
|
||||
* release the iolock to prevent subsequent overlapping IO.
|
||||
*/
|
||||
if (ret == -EIOCBQUEUED && unaligned_io)
|
||||
inode_dio_wait(inode);
|
||||
ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops,
|
||||
is_sync_kiocb(iocb) || unaligned_io);
|
||||
out:
|
||||
xfs_iunlock(ip, iolock);
|
||||
|
||||
|
@ -54,7 +54,7 @@ xfs_bmbt_to_iomap(
|
||||
struct xfs_inode *ip,
|
||||
struct iomap *iomap,
|
||||
struct xfs_bmbt_irec *imap,
|
||||
bool shared)
|
||||
u16 flags)
|
||||
{
|
||||
struct xfs_mount *mp = ip->i_mount;
|
||||
|
||||
@ -79,12 +79,11 @@ xfs_bmbt_to_iomap(
|
||||
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
|
||||
iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
|
||||
iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
|
||||
iomap->flags = flags;
|
||||
|
||||
if (xfs_ipincount(ip) &&
|
||||
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
|
||||
iomap->flags |= IOMAP_F_DIRTY;
|
||||
if (shared)
|
||||
iomap->flags |= IOMAP_F_SHARED;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -540,6 +539,7 @@ xfs_file_iomap_begin_delay(
|
||||
struct xfs_iext_cursor icur, ccur;
|
||||
xfs_fsblock_t prealloc_blocks = 0;
|
||||
bool eof = false, cow_eof = false, shared = false;
|
||||
u16 iomap_flags = 0;
|
||||
int whichfork = XFS_DATA_FORK;
|
||||
int error = 0;
|
||||
|
||||
@ -707,22 +707,28 @@ retry:
|
||||
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
|
||||
* them out if the write happens to fail.
|
||||
*/
|
||||
iomap->flags |= IOMAP_F_NEW;
|
||||
trace_xfs_iomap_alloc(ip, offset, count, whichfork,
|
||||
whichfork == XFS_DATA_FORK ? &imap : &cmap);
|
||||
if (whichfork == XFS_DATA_FORK) {
|
||||
iomap_flags |= IOMAP_F_NEW;
|
||||
trace_xfs_iomap_alloc(ip, offset, count, whichfork, &imap);
|
||||
} else {
|
||||
trace_xfs_iomap_alloc(ip, offset, count, whichfork, &cmap);
|
||||
}
|
||||
done:
|
||||
if (whichfork == XFS_COW_FORK) {
|
||||
if (imap.br_startoff > offset_fsb) {
|
||||
xfs_trim_extent(&cmap, offset_fsb,
|
||||
imap.br_startoff - offset_fsb);
|
||||
error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
|
||||
error = xfs_bmbt_to_iomap(ip, iomap, &cmap,
|
||||
IOMAP_F_SHARED);
|
||||
goto out_unlock;
|
||||
}
|
||||
/* ensure we only report blocks we have a reservation for */
|
||||
xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
|
||||
shared = true;
|
||||
}
|
||||
error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
|
||||
if (shared)
|
||||
iomap_flags |= IOMAP_F_SHARED;
|
||||
error = xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
|
||||
out_unlock:
|
||||
xfs_iunlock(ip, XFS_ILOCK_EXCL);
|
||||
return error;
|
||||
@ -922,7 +928,8 @@ xfs_file_iomap_begin(
|
||||
loff_t offset,
|
||||
loff_t length,
|
||||
unsigned flags,
|
||||
struct iomap *iomap)
|
||||
struct iomap *iomap,
|
||||
struct iomap *srcmap)
|
||||
{
|
||||
struct xfs_inode *ip = XFS_I(inode);
|
||||
struct xfs_mount *mp = ip->i_mount;
|
||||
@ -930,6 +937,7 @@ xfs_file_iomap_begin(
|
||||
xfs_fileoff_t offset_fsb, end_fsb;
|
||||
int nimaps = 1, error = 0;
|
||||
bool shared = false;
|
||||
u16 iomap_flags = 0;
|
||||
unsigned lockmode;
|
||||
|
||||
if (XFS_FORCED_SHUTDOWN(mp))
|
||||
@ -1045,11 +1053,20 @@ xfs_file_iomap_begin(
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
iomap->flags |= IOMAP_F_NEW;
|
||||
iomap_flags |= IOMAP_F_NEW;
|
||||
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
|
||||
|
||||
out_finish:
|
||||
return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
|
||||
/*
|
||||
* Writes that span EOF might trigger an IO size update on completion,
|
||||
* so consider them to be dirty for the purposes of O_DSYNC even if
|
||||
* there are no other metadata changes pending or made here.
|
||||
*/
|
||||
if ((flags & IOMAP_WRITE) && offset + length > i_size_read(inode))
|
||||
iomap_flags |= IOMAP_F_DIRTY;
|
||||
if (shared)
|
||||
iomap_flags |= IOMAP_F_SHARED;
|
||||
return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
|
||||
|
||||
out_found:
|
||||
ASSERT(nimaps);
|
||||
@ -1145,7 +1162,8 @@ xfs_seek_iomap_begin(
|
||||
loff_t offset,
|
||||
loff_t length,
|
||||
unsigned flags,
|
||||
struct iomap *iomap)
|
||||
struct iomap *iomap,
|
||||
struct iomap *srcmap)
|
||||
{
|
||||
struct xfs_inode *ip = XFS_I(inode);
|
||||
struct xfs_mount *mp = ip->i_mount;
|
||||
@ -1193,7 +1211,7 @@ xfs_seek_iomap_begin(
|
||||
if (data_fsb < cow_fsb + cmap.br_blockcount)
|
||||
end_fsb = min(end_fsb, data_fsb);
|
||||
xfs_trim_extent(&cmap, offset_fsb, end_fsb);
|
||||
error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
|
||||
error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
|
||||
/*
|
||||
* This is a COW extent, so we must probe the page cache
|
||||
* because there could be dirty page cache being backed
|
||||
@ -1215,7 +1233,7 @@ xfs_seek_iomap_begin(
|
||||
imap.br_state = XFS_EXT_NORM;
|
||||
done:
|
||||
xfs_trim_extent(&imap, offset_fsb, end_fsb);
|
||||
error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
|
||||
error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
|
||||
out_unlock:
|
||||
xfs_iunlock(ip, lockmode);
|
||||
return error;
|
||||
@ -1231,7 +1249,8 @@ xfs_xattr_iomap_begin(
|
||||
loff_t offset,
|
||||
loff_t length,
|
||||
unsigned flags,
|
||||
struct iomap *iomap)
|
||||
struct iomap *iomap,
|
||||
struct iomap *srcmap)
|
||||
{
|
||||
struct xfs_inode *ip = XFS_I(inode);
|
||||
struct xfs_mount *mp = ip->i_mount;
|
||||
@ -1261,7 +1280,7 @@ out_unlock:
|
||||
if (error)
|
||||
return error;
|
||||
ASSERT(nimaps);
|
||||
return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
|
||||
return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
|
||||
}
|
||||
|
||||
const struct iomap_ops xfs_xattr_iomap_ops = {
|
||||
|
@ -16,7 +16,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
|
||||
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
|
||||
|
||||
int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
|
||||
struct xfs_bmbt_irec *, bool shared);
|
||||
struct xfs_bmbt_irec *, u16);
|
||||
xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
|
||||
|
||||
static inline xfs_filblks_t
|
||||
|
@ -178,7 +178,7 @@ xfs_fs_map_blocks(
|
||||
}
|
||||
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
|
||||
|
||||
error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
|
||||
error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
|
||||
*device_generation = mp->m_generation;
|
||||
return error;
|
||||
out_unlock:
|
||||
|
@ -1442,7 +1442,7 @@ xfs_reflink_dirty_extents(
|
||||
flen = XFS_FSB_TO_B(mp, rlen);
|
||||
if (fpos + flen > isize)
|
||||
flen = isize - fpos;
|
||||
error = iomap_file_dirty(VFS_I(ip), fpos, flen,
|
||||
error = iomap_file_unshare(VFS_I(ip), fpos, flen,
|
||||
&xfs_iomap_ops);
|
||||
xfs_ilock(ip, XFS_ILOCK_EXCL);
|
||||
if (error)
|
||||
|
@ -40,7 +40,6 @@
|
||||
#include <linux/parser.h>
|
||||
|
||||
static const struct super_operations xfs_super_operations;
|
||||
struct bio_set xfs_ioend_bioset;
|
||||
|
||||
static struct kset *xfs_kset; /* top-level xfs sysfs dir */
|
||||
#ifdef DEBUG
|
||||
@ -1853,15 +1852,10 @@ MODULE_ALIAS_FS("xfs");
|
||||
STATIC int __init
|
||||
xfs_init_zones(void)
|
||||
{
|
||||
if (bioset_init(&xfs_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
|
||||
offsetof(struct xfs_ioend, io_inline_bio),
|
||||
BIOSET_NEED_BVECS))
|
||||
goto out;
|
||||
|
||||
xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
|
||||
"xfs_log_ticket");
|
||||
if (!xfs_log_ticket_zone)
|
||||
goto out_free_ioend_bioset;
|
||||
goto out;
|
||||
|
||||
xfs_bmap_free_item_zone = kmem_zone_init(
|
||||
sizeof(struct xfs_extent_free_item),
|
||||
@ -1996,8 +1990,6 @@ xfs_init_zones(void)
|
||||
kmem_zone_destroy(xfs_bmap_free_item_zone);
|
||||
out_destroy_log_ticket_zone:
|
||||
kmem_zone_destroy(xfs_log_ticket_zone);
|
||||
out_free_ioend_bioset:
|
||||
bioset_exit(&xfs_ioend_bioset);
|
||||
out:
|
||||
return -ENOMEM;
|
||||
}
|
||||
@ -2028,7 +2020,6 @@ xfs_destroy_zones(void)
|
||||
kmem_zone_destroy(xfs_btree_cur_zone);
|
||||
kmem_zone_destroy(xfs_bmap_free_item_zone);
|
||||
kmem_zone_destroy(xfs_log_ticket_zone);
|
||||
bioset_exit(&xfs_ioend_bioset);
|
||||
}
|
||||
|
||||
STATIC int __init
|
||||
|
@ -1158,71 +1158,6 @@ DEFINE_RW_EVENT(xfs_file_buffered_write);
|
||||
DEFINE_RW_EVENT(xfs_file_direct_write);
|
||||
DEFINE_RW_EVENT(xfs_file_dax_write);
|
||||
|
||||
DECLARE_EVENT_CLASS(xfs_page_class,
|
||||
TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
|
||||
unsigned int len),
|
||||
TP_ARGS(inode, page, off, len),
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(xfs_ino_t, ino)
|
||||
__field(pgoff_t, pgoff)
|
||||
__field(loff_t, size)
|
||||
__field(unsigned long, offset)
|
||||
__field(unsigned int, length)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->dev = inode->i_sb->s_dev;
|
||||
__entry->ino = XFS_I(inode)->i_ino;
|
||||
__entry->pgoff = page_offset(page);
|
||||
__entry->size = i_size_read(inode);
|
||||
__entry->offset = off;
|
||||
__entry->length = len;
|
||||
),
|
||||
TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
|
||||
"length %x",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->ino,
|
||||
__entry->pgoff,
|
||||
__entry->size,
|
||||
__entry->offset,
|
||||
__entry->length)
|
||||
)
|
||||
|
||||
#define DEFINE_PAGE_EVENT(name) \
|
||||
DEFINE_EVENT(xfs_page_class, name, \
|
||||
TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
|
||||
unsigned int len), \
|
||||
TP_ARGS(inode, page, off, len))
|
||||
DEFINE_PAGE_EVENT(xfs_writepage);
|
||||
DEFINE_PAGE_EVENT(xfs_releasepage);
|
||||
DEFINE_PAGE_EVENT(xfs_invalidatepage);
|
||||
|
||||
DECLARE_EVENT_CLASS(xfs_readpage_class,
|
||||
TP_PROTO(struct inode *inode, int nr_pages),
|
||||
TP_ARGS(inode, nr_pages),
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(xfs_ino_t, ino)
|
||||
__field(int, nr_pages)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->dev = inode->i_sb->s_dev;
|
||||
__entry->ino = inode->i_ino;
|
||||
__entry->nr_pages = nr_pages;
|
||||
),
|
||||
TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->ino,
|
||||
__entry->nr_pages)
|
||||
)
|
||||
|
||||
#define DEFINE_READPAGE_EVENT(name) \
|
||||
DEFINE_EVENT(xfs_readpage_class, name, \
|
||||
TP_PROTO(struct inode *inode, int nr_pages), \
|
||||
TP_ARGS(inode, nr_pages))
|
||||
DEFINE_READPAGE_EVENT(xfs_vm_readpage);
|
||||
DEFINE_READPAGE_EVENT(xfs_vm_readpages);
|
||||
|
||||
DECLARE_EVENT_CLASS(xfs_imap_class,
|
||||
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
|
||||
int whichfork, struct xfs_bmbt_irec *irec),
|
||||
|
@ -4,6 +4,7 @@
|
||||
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/bitmap.h>
|
||||
#include <linux/blk_types.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/mm_types.h>
|
||||
@ -12,6 +13,7 @@
|
||||
struct address_space;
|
||||
struct fiemap_extent_info;
|
||||
struct inode;
|
||||
struct iomap_writepage_ctx;
|
||||
struct iov_iter;
|
||||
struct kiocb;
|
||||
struct page;
|
||||
@ -21,28 +23,45 @@ struct vm_fault;
|
||||
/*
|
||||
* Types of block ranges for iomap mappings:
|
||||
*/
|
||||
#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
|
||||
#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
|
||||
#define IOMAP_MAPPED 0x03 /* blocks allocated at @addr */
|
||||
#define IOMAP_UNWRITTEN 0x04 /* blocks allocated at @addr in unwritten state */
|
||||
#define IOMAP_INLINE 0x05 /* data inline in the inode */
|
||||
#define IOMAP_HOLE 0 /* no blocks allocated, need allocation */
|
||||
#define IOMAP_DELALLOC 1 /* delayed allocation blocks */
|
||||
#define IOMAP_MAPPED 2 /* blocks allocated at @addr */
|
||||
#define IOMAP_UNWRITTEN 3 /* blocks allocated at @addr in unwritten state */
|
||||
#define IOMAP_INLINE 4 /* data inline in the inode */
|
||||
|
||||
/*
|
||||
* Flags for all iomap mappings:
|
||||
* Flags reported by the file system from iomap_begin:
|
||||
*
|
||||
* IOMAP_F_NEW indicates that the blocks have been newly allocated and need
|
||||
* zeroing for areas that no data is copied to.
|
||||
*
|
||||
* IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access
|
||||
* written data and requires fdatasync to commit them to persistent storage.
|
||||
* This needs to take into account metadata changes that *may* be made at IO
|
||||
* completion, such as file size updates from direct IO.
|
||||
*
|
||||
* IOMAP_F_SHARED indicates that the blocks are shared, and will need to be
|
||||
* unshared as part of a write.
|
||||
*
|
||||
* IOMAP_F_MERGED indicates that the iomap contains the merge of multiple block
|
||||
* mappings.
|
||||
*
|
||||
* IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of
|
||||
* buffer heads for this mapping.
|
||||
*/
|
||||
#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */
|
||||
#define IOMAP_F_DIRTY 0x02 /* uncommitted metadata */
|
||||
#define IOMAP_F_BUFFER_HEAD 0x04 /* file system requires buffer heads */
|
||||
#define IOMAP_F_SIZE_CHANGED 0x08 /* file size has changed */
|
||||
#define IOMAP_F_NEW 0x01
|
||||
#define IOMAP_F_DIRTY 0x02
|
||||
#define IOMAP_F_SHARED 0x04
|
||||
#define IOMAP_F_MERGED 0x08
|
||||
#define IOMAP_F_BUFFER_HEAD 0x10
|
||||
|
||||
/*
|
||||
* Flags that only need to be reported for IOMAP_REPORT requests:
|
||||
* Flags set by the core iomap code during operations:
|
||||
*
|
||||
* IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size
|
||||
* has changed as the result of this write operation.
|
||||
*/
|
||||
#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */
|
||||
#define IOMAP_F_SHARED 0x20 /* block shared with another file */
|
||||
#define IOMAP_F_SIZE_CHANGED 0x100
|
||||
|
||||
/*
|
||||
* Flags from 0x1000 up are for file system specific usage:
|
||||
@ -110,7 +129,8 @@ struct iomap_ops {
|
||||
* The actual length is returned in iomap->length.
|
||||
*/
|
||||
int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
|
||||
unsigned flags, struct iomap *iomap);
|
||||
unsigned flags, struct iomap *iomap,
|
||||
struct iomap *srcmap);
|
||||
|
||||
/*
|
||||
* Commit and/or unreserve space previously allocated using iomap_begin.
|
||||
@ -126,29 +146,12 @@ struct iomap_ops {
|
||||
* Main iomap iterator function.
|
||||
*/
|
||||
typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
|
||||
void *data, struct iomap *iomap);
|
||||
void *data, struct iomap *iomap, struct iomap *srcmap);
|
||||
|
||||
loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
|
||||
unsigned flags, const struct iomap_ops *ops, void *data,
|
||||
iomap_actor_t actor);
|
||||
|
||||
/*
|
||||
* Structure allocate for each page when block size < PAGE_SIZE to track
|
||||
* sub-page uptodate status and I/O completions.
|
||||
*/
|
||||
struct iomap_page {
|
||||
atomic_t read_count;
|
||||
atomic_t write_count;
|
||||
DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
|
||||
};
|
||||
|
||||
static inline struct iomap_page *to_iomap_page(struct page *page)
|
||||
{
|
||||
if (page_has_private(page))
|
||||
return (struct iomap_page *)page_private(page);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
|
||||
const struct iomap_ops *ops);
|
||||
int iomap_readpage(struct page *page, const struct iomap_ops *ops);
|
||||
@ -166,7 +169,7 @@ int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
|
||||
#else
|
||||
#define iomap_migrate_page NULL
|
||||
#endif
|
||||
int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
|
||||
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
|
||||
const struct iomap_ops *ops);
|
||||
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
|
||||
bool *did_zero, const struct iomap_ops *ops);
|
||||
@ -183,6 +186,63 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset,
|
||||
sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
|
||||
const struct iomap_ops *ops);
|
||||
|
||||
/*
|
||||
* Structure for writeback I/O completions.
|
||||
*/
|
||||
struct iomap_ioend {
|
||||
struct list_head io_list; /* next ioend in chain */
|
||||
u16 io_type;
|
||||
u16 io_flags; /* IOMAP_F_* */
|
||||
struct inode *io_inode; /* file being written to */
|
||||
size_t io_size; /* size of the extent */
|
||||
loff_t io_offset; /* offset in the file */
|
||||
void *io_private; /* file system private data */
|
||||
struct bio *io_bio; /* bio being built */
|
||||
struct bio io_inline_bio; /* MUST BE LAST! */
|
||||
};
|
||||
|
||||
struct iomap_writeback_ops {
|
||||
/*
|
||||
* Required, maps the blocks so that writeback can be performed on
|
||||
* the range starting at offset.
|
||||
*/
|
||||
int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
|
||||
loff_t offset);
|
||||
|
||||
/*
|
||||
* Optional, allows the file systems to perform actions just before
|
||||
* submitting the bio and/or override the bio end_io handler for complex
|
||||
* operations like copy on write extent manipulation or unwritten extent
|
||||
* conversions.
|
||||
*/
|
||||
int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
|
||||
|
||||
/*
|
||||
* Optional, allows the file system to discard state on a page where
|
||||
* we failed to submit any I/O.
|
||||
*/
|
||||
void (*discard_page)(struct page *page);
|
||||
};
|
||||
|
||||
struct iomap_writepage_ctx {
|
||||
struct iomap iomap;
|
||||
struct iomap_ioend *ioend;
|
||||
const struct iomap_writeback_ops *ops;
|
||||
};
|
||||
|
||||
void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
|
||||
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
|
||||
struct list_head *more_ioends,
|
||||
void (*merge_private)(struct iomap_ioend *ioend,
|
||||
struct iomap_ioend *next));
|
||||
void iomap_sort_ioends(struct list_head *ioend_list);
|
||||
int iomap_writepage(struct page *page, struct writeback_control *wbc,
|
||||
struct iomap_writepage_ctx *wpc,
|
||||
const struct iomap_writeback_ops *ops);
|
||||
int iomap_writepages(struct address_space *mapping,
|
||||
struct writeback_control *wbc, struct iomap_writepage_ctx *wpc,
|
||||
const struct iomap_writeback_ops *ops);
|
||||
|
||||
/*
|
||||
* Flags for direct I/O ->end_io:
|
||||
*/
|
||||
@ -195,7 +255,8 @@ struct iomap_dio_ops {
|
||||
};
|
||||
|
||||
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
|
||||
const struct iomap_ops *ops, const struct iomap_dio_ops *dops);
|
||||
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
|
||||
bool wait_for_completion);
|
||||
int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
|
||||
|
||||
#ifdef CONFIG_SWAP
|
||||
|
Loading…
Reference in New Issue
Block a user