9f67672a81
casefold, ensure that deleted file names are cleared in directory blocks by zeroing directory entries when they are unlinked or moved as part of a hash tree node split. We also improve the block allocator's performance on a freshly mounted file system by prefetching block bitmaps. There are also the usual cleanups and bug fixes, including fixing a page cache invalidation race when there is mixed buffered and direct I/O and the block size is less than page size, and allow the dax flag to be set and cleared on inline directories. -----BEGIN PGP SIGNATURE----- iQEzBAABCAAdFiEEK2m5VNv+CHkogTfJ8vlZVpUNgaMFAmCLei4ACgkQ8vlZVpUN gaPZkgf/VH08xjMf3VthC+BpvVmChQXfV4yjigHbO2pmPyYWZhyJzkEGCQD8u2eB b7ShW+B1NCifcTU34xAkKHwEtakzzEv3WIMrT1oZNWrpfo8tt850EkwQggaGGDpd /HnP1/wLtziJ5hE6DwutmX7qB4VFghVj898MjDrEPSOBqItOjWps9mn/JWL7SHyI Dqzhf5XZTYPaXWuJmSmKw3q8O70JDHnZe/rRWlfX1jLI5KDtqp71Nw1B+gszUB66 IUdncyZKvInsyjYhkbCQ8U6WFih82MrbKeuGYDp/RFvg5eMELEYkwT9j0ofuDHq8 zn62sAlbOXv1DiqkPDHKVm9GkHx8/g== =UpnH -----END PGP SIGNATURE----- Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4 Pull ext4 updates from Ted Ts'o: "New features for ext4 this cycle include support for encrypted casefold, ensure that deleted file names are cleared in directory blocks by zeroing directory entries when they are unlinked or moved as part of a hash tree node split. We also improve the block allocator's performance on a freshly mounted file system by prefetching block bitmaps. 
There are also the usual cleanups and bug fixes, including fixing a page cache invalidation race when there is mixed buffered and direct I/O and the block size is less than page size, and allow the dax flag to be set and cleared on inline directories" * tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (32 commits) ext4: wipe ext4_dir_entry2 upon file deletion ext4: Fix occasional generic/418 failure fs: fix reporting supported extra file attributes for statx() ext4: allow the dax flag to be set and cleared on inline directories ext4: fix debug format string warning ext4: fix trailing whitespace ext4: fix various seppling typos ext4: fix error return code in ext4_fc_perform_commit() ext4: annotate data race in jbd2_journal_dirty_metadata() ext4: annotate data race in start_this_handle() ext4: fix ext4_error_err save negative errno into superblock ext4: fix error code in ext4_commit_super ext4: always panic when errors=panic is specified ext4: delete redundant uptodate check for buffer ext4: do not set SB_ACTIVE in ext4_orphan_cleanup() ext4: make prefetch_block_bitmaps default ext4: add proc files to monitor new structures ext4: improve cr 0 / cr 1 group scanning ext4: add MB_NUM_ORDERS macro ext4: add mballoc stats proc file ...
943 lines
24 KiB
C
943 lines
24 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* linux/fs/ext4/file.c
|
|
*
|
|
* Copyright (C) 1992, 1993, 1994, 1995
|
|
* Remy Card (card@masi.ibp.fr)
|
|
* Laboratoire MASI - Institut Blaise Pascal
|
|
* Universite Pierre et Marie Curie (Paris VI)
|
|
*
|
|
* from
|
|
*
|
|
* linux/fs/minix/file.c
|
|
*
|
|
* Copyright (C) 1991, 1992 Linus Torvalds
|
|
*
|
|
* ext4 fs regular file handling primitives
|
|
*
|
|
* 64-bit file support on 64-bit platforms by Jakub Jelinek
|
|
* (jj@sunsite.ms.mff.cuni.cz)
|
|
*/
|
|
|
|
#include <linux/time.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/iomap.h>
|
|
#include <linux/mount.h>
|
|
#include <linux/path.h>
|
|
#include <linux/dax.h>
|
|
#include <linux/quotaops.h>
|
|
#include <linux/pagevec.h>
|
|
#include <linux/uio.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/backing-dev.h>
|
|
#include "ext4.h"
|
|
#include "ext4_jbd2.h"
|
|
#include "xattr.h"
|
|
#include "acl.h"
|
|
#include "truncate.h"
|
|
|
|
static bool ext4_dio_supported(struct inode *inode)
|
|
{
|
|
if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
|
|
return false;
|
|
if (fsverity_active(inode))
|
|
return false;
|
|
if (ext4_should_journal_data(inode))
|
|
return false;
|
|
if (ext4_has_inline_data(inode))
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
|
{
|
|
ssize_t ret;
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
|
|
if (iocb->ki_flags & IOCB_NOWAIT) {
|
|
if (!inode_trylock_shared(inode))
|
|
return -EAGAIN;
|
|
} else {
|
|
inode_lock_shared(inode);
|
|
}
|
|
|
|
if (!ext4_dio_supported(inode)) {
|
|
inode_unlock_shared(inode);
|
|
/*
|
|
* Fallback to buffered I/O if the operation being performed on
|
|
* the inode is not supported by direct I/O. The IOCB_DIRECT
|
|
* flag needs to be cleared here in order to ensure that the
|
|
* direct I/O path within generic_file_read_iter() is not
|
|
* taken.
|
|
*/
|
|
iocb->ki_flags &= ~IOCB_DIRECT;
|
|
return generic_file_read_iter(iocb, to);
|
|
}
|
|
|
|
ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
|
|
inode_unlock_shared(inode);
|
|
|
|
file_accessed(iocb->ki_filp);
|
|
return ret;
|
|
}
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
|
{
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
ssize_t ret;
|
|
|
|
if (iocb->ki_flags & IOCB_NOWAIT) {
|
|
if (!inode_trylock_shared(inode))
|
|
return -EAGAIN;
|
|
} else {
|
|
inode_lock_shared(inode);
|
|
}
|
|
/*
|
|
* Recheck under inode lock - at this point we are sure it cannot
|
|
* change anymore
|
|
*/
|
|
if (!IS_DAX(inode)) {
|
|
inode_unlock_shared(inode);
|
|
/* Fallback to buffered IO in case we cannot support DAX */
|
|
return generic_file_read_iter(iocb, to);
|
|
}
|
|
ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
|
|
inode_unlock_shared(inode);
|
|
|
|
file_accessed(iocb->ki_filp);
|
|
return ret;
|
|
}
|
|
#endif
|
|
|
|
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
|
{
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
|
|
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
|
|
return -EIO;
|
|
|
|
if (!iov_iter_count(to))
|
|
return 0; /* skip atime */
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
if (IS_DAX(inode))
|
|
return ext4_dax_read_iter(iocb, to);
|
|
#endif
|
|
if (iocb->ki_flags & IOCB_DIRECT)
|
|
return ext4_dio_read_iter(iocb, to);
|
|
|
|
return generic_file_read_iter(iocb, to);
|
|
}
|
|
|
|
/*
|
|
* Called when an inode is released. Note that this is different
|
|
* from ext4_file_open: open gets called at every open, but release
|
|
* gets called only when /all/ the files are closed.
|
|
*/
|
|
static int ext4_release_file(struct inode *inode, struct file *filp)
|
|
{
|
|
if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
|
|
ext4_alloc_da_blocks(inode);
|
|
ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
|
|
}
|
|
/* if we are the last writer on the inode, drop the block reservation */
|
|
if ((filp->f_mode & FMODE_WRITE) &&
|
|
(atomic_read(&inode->i_writecount) == 1) &&
|
|
!EXT4_I(inode)->i_reserved_data_blocks) {
|
|
down_write(&EXT4_I(inode)->i_data_sem);
|
|
ext4_discard_preallocations(inode, 0);
|
|
up_write(&EXT4_I(inode)->i_data_sem);
|
|
}
|
|
if (is_dx(inode) && filp->private_data)
|
|
ext4_htree_free_dir_info(filp->private_data);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* This tests whether the IO in question is block-aligned or not.
|
|
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
|
|
* are converted to written only after the IO is complete. Until they are
|
|
* mapped, these blocks appear as holes, so dio_zero_block() will assume that
|
|
* it needs to zero out portions of the start and/or end block. If 2 AIO
|
|
* threads are at work on the same unwritten block, they must be synchronized
|
|
* or one thread will zero the other's data, causing corruption.
|
|
*/
|
|
static bool
|
|
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
|
|
{
|
|
struct super_block *sb = inode->i_sb;
|
|
unsigned long blockmask = sb->s_blocksize - 1;
|
|
|
|
if ((pos | iov_iter_alignment(from)) & blockmask)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
static bool
|
|
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
|
|
{
|
|
if (offset + len > i_size_read(inode) ||
|
|
offset + len > EXT4_I(inode)->i_disksize)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
/* Is IO overwriting allocated and initialized blocks? */
|
|
static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
|
|
{
|
|
struct ext4_map_blocks map;
|
|
unsigned int blkbits = inode->i_blkbits;
|
|
int err, blklen;
|
|
|
|
if (pos + len > i_size_read(inode))
|
|
return false;
|
|
|
|
map.m_lblk = pos >> blkbits;
|
|
map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
|
|
blklen = map.m_len;
|
|
|
|
err = ext4_map_blocks(NULL, inode, &map, 0);
|
|
/*
|
|
* 'err==len' means that all of the blocks have been preallocated,
|
|
* regardless of whether they have been initialized or not. To exclude
|
|
* unwritten extents, we need to check m_flags.
|
|
*/
|
|
return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
|
|
}
|
|
|
|
static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
|
|
struct iov_iter *from)
|
|
{
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
ssize_t ret;
|
|
|
|
if (unlikely(IS_IMMUTABLE(inode)))
|
|
return -EPERM;
|
|
|
|
ret = generic_write_checks(iocb, from);
|
|
if (ret <= 0)
|
|
return ret;
|
|
|
|
/*
|
|
* If we have encountered a bitmap-format file, the size limit
|
|
* is smaller than s_maxbytes, which is for extent-mapped files.
|
|
*/
|
|
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
|
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
|
|
|
if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
|
|
return -EFBIG;
|
|
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
|
|
}
|
|
|
|
return iov_iter_count(from);
|
|
}
|
|
|
|
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
|
|
{
|
|
ssize_t ret, count;
|
|
|
|
count = ext4_generic_write_checks(iocb, from);
|
|
if (count <= 0)
|
|
return count;
|
|
|
|
ret = file_modified(iocb->ki_filp);
|
|
if (ret)
|
|
return ret;
|
|
return count;
|
|
}
|
|
|
|
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
|
|
struct iov_iter *from)
|
|
{
|
|
ssize_t ret;
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
|
|
if (iocb->ki_flags & IOCB_NOWAIT)
|
|
return -EOPNOTSUPP;
|
|
|
|
ext4_fc_start_update(inode);
|
|
inode_lock(inode);
|
|
ret = ext4_write_checks(iocb, from);
|
|
if (ret <= 0)
|
|
goto out;
|
|
|
|
current->backing_dev_info = inode_to_bdi(inode);
|
|
ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
|
|
current->backing_dev_info = NULL;
|
|
|
|
out:
|
|
inode_unlock(inode);
|
|
ext4_fc_stop_update(inode);
|
|
if (likely(ret > 0)) {
|
|
iocb->ki_pos += ret;
|
|
ret = generic_write_sync(iocb, ret);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Finish a write that was set up as extending the file.
 *
 * @inode:   inode that was written
 * @offset:  file position the write started at
 * @written: bytes actually written, or a negative errno from the I/O
 * @count:   bytes the write intended to transfer
 *
 * If the intended range already lies within i_disksize, only a prematurely
 * added orphan-list entry is removed.  Otherwise the inode size is updated
 * to cover the written bytes, the inode is taken off the orphan list, and -
 * when fewer blocks were written than were intended - blocks allocated
 * beyond the new end of file are truncated away.
 *
 * Returns @written on success, or a negative errno if starting the journal
 * handle or dirtying the inode failed (or if @written was already negative).
 */
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
					   ssize_t written, size_t count)
{
	handle_t *handle;
	bool truncate = false;
	u8 blkbits = inode->i_blkbits;
	/*
	 * Block-aligned *byte* offsets.  They are held in loff_t rather than
	 * in ext4_lblk_t (a 32-bit block-number type) so that positions at or
	 * beyond 4 GiB are not truncated before being compared.
	 */
	loff_t written_end, io_end;
	int ret;

	/*
	 * Note that EXT4_I(inode)->i_disksize can get extended up to
	 * inode->i_size while the I/O was running due to writeback of delalloc
	 * blocks. But, the code in ext4_iomap_alloc() is careful to use
	 * zeroed/unwritten extents if this is possible; thus we won't leave
	 * uninitialized blocks in a file even if we didn't succeed in writing
	 * as much as we intended.
	 */
	WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
	if (offset + count <= EXT4_I(inode)->i_disksize) {
		/*
		 * We need to ensure that the inode is removed from the orphan
		 * list if it has been added prematurely, due to writeback of
		 * delalloc blocks.
		 */
		if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

			if (IS_ERR(handle)) {
				/* No handle: drop the in-memory entry only. */
				ext4_orphan_del(NULL, inode);
				return PTR_ERR(handle);
			}

			ext4_orphan_del(handle, inode);
			ext4_journal_stop(handle);
		}

		return written;
	}

	if (written < 0)
		goto truncate;

	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
	if (IS_ERR(handle)) {
		written = PTR_ERR(handle);
		goto truncate;
	}

	if (ext4_update_inode_size(inode, offset + written)) {
		ret = ext4_mark_inode_dirty(handle, inode);
		if (unlikely(ret)) {
			written = ret;
			ext4_journal_stop(handle);
			goto truncate;
		}
	}

	/*
	 * We may need to truncate allocated but not written blocks beyond EOF.
	 */
	written_end = ALIGN(offset + written, 1 << blkbits);
	io_end = ALIGN(offset + count, 1 << blkbits);
	if (written_end < io_end && ext4_can_truncate(inode))
		truncate = true;

	/*
	 * Remove the inode from the orphan list if it has been extended and
	 * everything went OK.
	 */
	if (!truncate && inode->i_nlink)
		ext4_orphan_del(handle, inode);
	ext4_journal_stop(handle);

	if (truncate) {
truncate:
		ext4_truncate_failed_write(inode);
		/*
		 * If the truncate operation failed early, then the inode may
		 * still be on the orphan list. In that case, we need to try
		 * to remove the inode from the in-memory linked list.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	return written;
}
|
|
|
|
/*
 * iomap direct I/O completion callback for writes.
 *
 * @iocb:  the kiocb of the write; iocb->ki_pos is used as the start offset
 * @size:  number of bytes the I/O covered
 * @error: error reported for the I/O, returned unchanged when non-zero
 * @flags: IOMAP_DIO_* flags; IOMAP_DIO_UNWRITTEN triggers extent conversion
 *
 * Returns 0 on success, @error if it was set, or the negative value returned
 * by ext4_convert_unwritten_extents().
 */
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
				 int error, unsigned int flags)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	loff_t start = iocb->ki_pos;
	loff_t end;
	int ret;

	if (error)
		return error;

	if (size && (flags & IOMAP_DIO_UNWRITTEN)) {
		ret = ext4_convert_unwritten_extents(NULL, inode, start, size);
		if (ret < 0)
			return ret;
	}

	/*
	 * If we are extending the file, we have to update i_size here before
	 * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
	 * buffered reads could zero out too much from page cache pages. Update
	 * of on-disk size will happen later in ext4_dio_write_iter() where
	 * we have enough information to also perform orphan list handling etc.
	 * Note that we perform all extending writes synchronously under
	 * i_rwsem held exclusively so i_size update is safe here in that case.
	 * If the write was not extending, we cannot see pos > i_size here
	 * because operations reducing i_size like truncate wait for all
	 * outstanding DIO before updating i_size.
	 */
	end = start + size;
	if (end > i_size_read(inode))
		i_size_write(inode, end);

	return 0;
}
|
|
|
|
static const struct iomap_dio_ops ext4_dio_write_ops = {
|
|
.end_io = ext4_dio_write_end_io,
|
|
};
|
|
|
|
/*
|
|
* The intention here is to start with shared lock acquired then see if any
|
|
* condition requires an exclusive inode lock. If yes, then we restart the
|
|
* whole operation by releasing the shared lock and acquiring exclusive lock.
|
|
*
|
|
* - For unaligned_io we never take shared lock as it may cause data corruption
|
|
* when two unaligned IO tries to modify the same block e.g. while zeroing.
|
|
*
|
|
* - For extending writes case we don't take the shared lock, since it requires
|
|
* updating inode i_disksize and/or orphan handling with exclusive lock.
|
|
*
|
|
* - shared locking will only be true mostly with overwrites. Otherwise we will
|
|
* switch to exclusive i_rwsem lock.
|
|
*/
|
|
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
|
|
bool *ilock_shared, bool *extend)
|
|
{
|
|
struct file *file = iocb->ki_filp;
|
|
struct inode *inode = file_inode(file);
|
|
loff_t offset;
|
|
size_t count;
|
|
ssize_t ret;
|
|
|
|
restart:
|
|
ret = ext4_generic_write_checks(iocb, from);
|
|
if (ret <= 0)
|
|
goto out;
|
|
|
|
offset = iocb->ki_pos;
|
|
count = ret;
|
|
if (ext4_extending_io(inode, offset, count))
|
|
*extend = true;
|
|
/*
|
|
* Determine whether the IO operation will overwrite allocated
|
|
* and initialized blocks.
|
|
* We need exclusive i_rwsem for changing security info
|
|
* in file_modified().
|
|
*/
|
|
if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
|
|
!ext4_overwrite_io(inode, offset, count))) {
|
|
if (iocb->ki_flags & IOCB_NOWAIT) {
|
|
ret = -EAGAIN;
|
|
goto out;
|
|
}
|
|
inode_unlock_shared(inode);
|
|
*ilock_shared = false;
|
|
inode_lock(inode);
|
|
goto restart;
|
|
}
|
|
|
|
ret = file_modified(file);
|
|
if (ret < 0)
|
|
goto out;
|
|
|
|
return count;
|
|
out:
|
|
if (*ilock_shared)
|
|
inode_unlock_shared(inode);
|
|
else
|
|
inode_unlock(inode);
|
|
return ret;
|
|
}
|
|
|
|
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
|
{
|
|
ssize_t ret;
|
|
handle_t *handle;
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
loff_t offset = iocb->ki_pos;
|
|
size_t count = iov_iter_count(from);
|
|
const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
|
|
bool extend = false, unaligned_io = false;
|
|
bool ilock_shared = true;
|
|
|
|
/*
|
|
* We initially start with shared inode lock unless it is
|
|
* unaligned IO which needs exclusive lock anyways.
|
|
*/
|
|
if (ext4_unaligned_io(inode, from, offset)) {
|
|
unaligned_io = true;
|
|
ilock_shared = false;
|
|
}
|
|
/*
|
|
* Quick check here without any i_rwsem lock to see if it is extending
|
|
* IO. A more reliable check is done in ext4_dio_write_checks() with
|
|
* proper locking in place.
|
|
*/
|
|
if (offset + count > i_size_read(inode))
|
|
ilock_shared = false;
|
|
|
|
if (iocb->ki_flags & IOCB_NOWAIT) {
|
|
if (ilock_shared) {
|
|
if (!inode_trylock_shared(inode))
|
|
return -EAGAIN;
|
|
} else {
|
|
if (!inode_trylock(inode))
|
|
return -EAGAIN;
|
|
}
|
|
} else {
|
|
if (ilock_shared)
|
|
inode_lock_shared(inode);
|
|
else
|
|
inode_lock(inode);
|
|
}
|
|
|
|
/* Fallback to buffered I/O if the inode does not support direct I/O. */
|
|
if (!ext4_dio_supported(inode)) {
|
|
if (ilock_shared)
|
|
inode_unlock_shared(inode);
|
|
else
|
|
inode_unlock(inode);
|
|
return ext4_buffered_write_iter(iocb, from);
|
|
}
|
|
|
|
ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
|
|
if (ret <= 0)
|
|
return ret;
|
|
|
|
/* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
|
|
if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
|
|
ret = -EAGAIN;
|
|
goto out;
|
|
}
|
|
|
|
offset = iocb->ki_pos;
|
|
count = ret;
|
|
|
|
/*
|
|
* Unaligned direct IO must be serialized among each other as zeroing
|
|
* of partial blocks of two competing unaligned IOs can result in data
|
|
* corruption.
|
|
*
|
|
* So we make sure we don't allow any unaligned IO in flight.
|
|
* For IOs where we need not wait (like unaligned non-AIO DIO),
|
|
* below inode_dio_wait() may anyway become a no-op, since we start
|
|
* with exclusive lock.
|
|
*/
|
|
if (unaligned_io)
|
|
inode_dio_wait(inode);
|
|
|
|
if (extend) {
|
|
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
|
|
if (IS_ERR(handle)) {
|
|
ret = PTR_ERR(handle);
|
|
goto out;
|
|
}
|
|
|
|
ext4_fc_start_update(inode);
|
|
ret = ext4_orphan_add(handle, inode);
|
|
ext4_fc_stop_update(inode);
|
|
if (ret) {
|
|
ext4_journal_stop(handle);
|
|
goto out;
|
|
}
|
|
|
|
ext4_journal_stop(handle);
|
|
}
|
|
|
|
if (ilock_shared)
|
|
iomap_ops = &ext4_iomap_overwrite_ops;
|
|
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
|
|
(unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
|
|
if (ret == -ENOTBLK)
|
|
ret = 0;
|
|
|
|
if (extend)
|
|
ret = ext4_handle_inode_extension(inode, offset, ret, count);
|
|
|
|
out:
|
|
if (ilock_shared)
|
|
inode_unlock_shared(inode);
|
|
else
|
|
inode_unlock(inode);
|
|
|
|
if (ret >= 0 && iov_iter_count(from)) {
|
|
ssize_t err;
|
|
loff_t endbyte;
|
|
|
|
offset = iocb->ki_pos;
|
|
err = ext4_buffered_write_iter(iocb, from);
|
|
if (err < 0)
|
|
return err;
|
|
|
|
/*
|
|
* We need to ensure that the pages within the page cache for
|
|
* the range covered by this I/O are written to disk and
|
|
* invalidated. This is in attempt to preserve the expected
|
|
* direct I/O semantics in the case we fallback to buffered I/O
|
|
* to complete off the I/O request.
|
|
*/
|
|
ret += err;
|
|
endbyte = offset + err - 1;
|
|
err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
|
|
offset, endbyte);
|
|
if (!err)
|
|
invalidate_mapping_pages(iocb->ki_filp->f_mapping,
|
|
offset >> PAGE_SHIFT,
|
|
endbyte >> PAGE_SHIFT);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
static ssize_t
|
|
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
|
{
|
|
ssize_t ret;
|
|
size_t count;
|
|
loff_t offset;
|
|
handle_t *handle;
|
|
bool extend = false;
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
|
|
if (iocb->ki_flags & IOCB_NOWAIT) {
|
|
if (!inode_trylock(inode))
|
|
return -EAGAIN;
|
|
} else {
|
|
inode_lock(inode);
|
|
}
|
|
|
|
ret = ext4_write_checks(iocb, from);
|
|
if (ret <= 0)
|
|
goto out;
|
|
|
|
offset = iocb->ki_pos;
|
|
count = iov_iter_count(from);
|
|
|
|
if (offset + count > EXT4_I(inode)->i_disksize) {
|
|
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
|
|
if (IS_ERR(handle)) {
|
|
ret = PTR_ERR(handle);
|
|
goto out;
|
|
}
|
|
|
|
ret = ext4_orphan_add(handle, inode);
|
|
if (ret) {
|
|
ext4_journal_stop(handle);
|
|
goto out;
|
|
}
|
|
|
|
extend = true;
|
|
ext4_journal_stop(handle);
|
|
}
|
|
|
|
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
|
|
|
|
if (extend)
|
|
ret = ext4_handle_inode_extension(inode, offset, ret, count);
|
|
out:
|
|
inode_unlock(inode);
|
|
if (ret > 0)
|
|
ret = generic_write_sync(iocb, ret);
|
|
return ret;
|
|
}
|
|
#endif
|
|
|
|
static ssize_t
|
|
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
|
{
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
|
|
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
|
|
return -EIO;
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
if (IS_DAX(inode))
|
|
return ext4_dax_write_iter(iocb, from);
|
|
#endif
|
|
if (iocb->ki_flags & IOCB_DIRECT)
|
|
return ext4_dio_write_iter(iocb, from);
|
|
else
|
|
return ext4_buffered_write_iter(iocb, from);
|
|
}
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
|
|
enum page_entry_size pe_size)
|
|
{
|
|
int error = 0;
|
|
vm_fault_t result;
|
|
int retries = 0;
|
|
handle_t *handle = NULL;
|
|
struct inode *inode = file_inode(vmf->vma->vm_file);
|
|
struct super_block *sb = inode->i_sb;
|
|
|
|
/*
|
|
* We have to distinguish real writes from writes which will result in a
|
|
* COW page; COW writes should *not* poke the journal (the file will not
|
|
* be changed). Doing so would cause unintended failures when mounted
|
|
* read-only.
|
|
*
|
|
* We check for VM_SHARED rather than vmf->cow_page since the latter is
|
|
* unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
|
|
* other sizes, dax_iomap_fault will handle splitting / fallback so that
|
|
* we eventually come back with a COW page.
|
|
*/
|
|
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
|
|
(vmf->vma->vm_flags & VM_SHARED);
|
|
pfn_t pfn;
|
|
|
|
if (write) {
|
|
sb_start_pagefault(sb);
|
|
file_update_time(vmf->vma->vm_file);
|
|
down_read(&EXT4_I(inode)->i_mmap_sem);
|
|
retry:
|
|
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
|
|
EXT4_DATA_TRANS_BLOCKS(sb));
|
|
if (IS_ERR(handle)) {
|
|
up_read(&EXT4_I(inode)->i_mmap_sem);
|
|
sb_end_pagefault(sb);
|
|
return VM_FAULT_SIGBUS;
|
|
}
|
|
} else {
|
|
down_read(&EXT4_I(inode)->i_mmap_sem);
|
|
}
|
|
result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
|
|
if (write) {
|
|
ext4_journal_stop(handle);
|
|
|
|
if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
|
|
ext4_should_retry_alloc(sb, &retries))
|
|
goto retry;
|
|
/* Handling synchronous page fault? */
|
|
if (result & VM_FAULT_NEEDDSYNC)
|
|
result = dax_finish_sync_fault(vmf, pe_size, pfn);
|
|
up_read(&EXT4_I(inode)->i_mmap_sem);
|
|
sb_end_pagefault(sb);
|
|
} else {
|
|
up_read(&EXT4_I(inode)->i_mmap_sem);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
|
|
{
|
|
return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
|
|
}
|
|
|
|
static const struct vm_operations_struct ext4_dax_vm_ops = {
|
|
.fault = ext4_dax_fault,
|
|
.huge_fault = ext4_dax_huge_fault,
|
|
.page_mkwrite = ext4_dax_fault,
|
|
.pfn_mkwrite = ext4_dax_fault,
|
|
};
|
|
#else
|
|
#define ext4_dax_vm_ops ext4_file_vm_ops
|
|
#endif
|
|
|
|
static const struct vm_operations_struct ext4_file_vm_ops = {
|
|
.fault = ext4_filemap_fault,
|
|
.map_pages = filemap_map_pages,
|
|
.page_mkwrite = ext4_page_mkwrite,
|
|
};
|
|
|
|
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
|
|
{
|
|
struct inode *inode = file->f_mapping->host;
|
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
|
struct dax_device *dax_dev = sbi->s_daxdev;
|
|
|
|
if (unlikely(ext4_forced_shutdown(sbi)))
|
|
return -EIO;
|
|
|
|
/*
|
|
* We don't support synchronous mappings for non-DAX files and
|
|
* for DAX files if underneath dax_device is not synchronous.
|
|
*/
|
|
if (!daxdev_mapping_supported(vma, dax_dev))
|
|
return -EOPNOTSUPP;
|
|
|
|
file_accessed(file);
|
|
if (IS_DAX(file_inode(file))) {
|
|
vma->vm_ops = &ext4_dax_vm_ops;
|
|
vma->vm_flags |= VM_HUGEPAGE;
|
|
} else {
|
|
vma->vm_ops = &ext4_file_vm_ops;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static int ext4_sample_last_mounted(struct super_block *sb,
|
|
struct vfsmount *mnt)
|
|
{
|
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
struct path path;
|
|
char buf[64], *cp;
|
|
handle_t *handle;
|
|
int err;
|
|
|
|
if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
|
|
return 0;
|
|
|
|
if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
|
|
return 0;
|
|
|
|
ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
|
|
/*
|
|
* Sample where the filesystem has been mounted and
|
|
* store it in the superblock for sysadmin convenience
|
|
* when trying to sort through large numbers of block
|
|
* devices or filesystem images.
|
|
*/
|
|
memset(buf, 0, sizeof(buf));
|
|
path.mnt = mnt;
|
|
path.dentry = mnt->mnt_root;
|
|
cp = d_path(&path, buf, sizeof(buf));
|
|
err = 0;
|
|
if (IS_ERR(cp))
|
|
goto out;
|
|
|
|
handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
|
|
err = PTR_ERR(handle);
|
|
if (IS_ERR(handle))
|
|
goto out;
|
|
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
|
|
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
|
|
if (err)
|
|
goto out_journal;
|
|
lock_buffer(sbi->s_sbh);
|
|
strncpy(sbi->s_es->s_last_mounted, cp,
|
|
sizeof(sbi->s_es->s_last_mounted));
|
|
ext4_superblock_csum_set(sb);
|
|
unlock_buffer(sbi->s_sbh);
|
|
ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
|
|
out_journal:
|
|
ext4_journal_stop(handle);
|
|
out:
|
|
sb_end_intwrite(sb);
|
|
return err;
|
|
}
|
|
|
|
static int ext4_file_open(struct inode *inode, struct file *filp)
|
|
{
|
|
int ret;
|
|
|
|
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
|
|
return -EIO;
|
|
|
|
ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
|
|
if (ret)
|
|
return ret;
|
|
|
|
ret = fscrypt_file_open(inode, filp);
|
|
if (ret)
|
|
return ret;
|
|
|
|
ret = fsverity_file_open(inode, filp);
|
|
if (ret)
|
|
return ret;
|
|
|
|
/*
|
|
* Set up the jbd2_inode if we are opening the inode for
|
|
* writing and the journal is present
|
|
*/
|
|
if (filp->f_mode & FMODE_WRITE) {
|
|
ret = ext4_inode_attach_jinode(inode);
|
|
if (ret < 0)
|
|
return ret;
|
|
}
|
|
|
|
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
|
|
return dquot_file_open(inode, filp);
|
|
}
|
|
|
|
/*
|
|
* ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
|
|
* by calling generic_file_llseek_size() with the appropriate maxbytes
|
|
* value for each.
|
|
*/
|
|
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
|
|
{
|
|
struct inode *inode = file->f_mapping->host;
|
|
loff_t maxbytes;
|
|
|
|
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
|
|
maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
|
|
else
|
|
maxbytes = inode->i_sb->s_maxbytes;
|
|
|
|
switch (whence) {
|
|
default:
|
|
return generic_file_llseek_size(file, offset, whence,
|
|
maxbytes, i_size_read(inode));
|
|
case SEEK_HOLE:
|
|
inode_lock_shared(inode);
|
|
offset = iomap_seek_hole(inode, offset,
|
|
&ext4_iomap_report_ops);
|
|
inode_unlock_shared(inode);
|
|
break;
|
|
case SEEK_DATA:
|
|
inode_lock_shared(inode);
|
|
offset = iomap_seek_data(inode, offset,
|
|
&ext4_iomap_report_ops);
|
|
inode_unlock_shared(inode);
|
|
break;
|
|
}
|
|
|
|
if (offset < 0)
|
|
return offset;
|
|
return vfs_setpos(file, offset, maxbytes);
|
|
}
|
|
|
|
const struct file_operations ext4_file_operations = {
|
|
.llseek = ext4_llseek,
|
|
.read_iter = ext4_file_read_iter,
|
|
.write_iter = ext4_file_write_iter,
|
|
.iopoll = iomap_dio_iopoll,
|
|
.unlocked_ioctl = ext4_ioctl,
|
|
#ifdef CONFIG_COMPAT
|
|
.compat_ioctl = ext4_compat_ioctl,
|
|
#endif
|
|
.mmap = ext4_file_mmap,
|
|
.mmap_supported_flags = MAP_SYNC,
|
|
.open = ext4_file_open,
|
|
.release = ext4_release_file,
|
|
.fsync = ext4_sync_file,
|
|
.get_unmapped_area = thp_get_unmapped_area,
|
|
.splice_read = generic_file_splice_read,
|
|
.splice_write = iter_file_splice_write,
|
|
.fallocate = ext4_fallocate,
|
|
};
|
|
|
|
const struct inode_operations ext4_file_inode_operations = {
|
|
.setattr = ext4_setattr,
|
|
.getattr = ext4_file_getattr,
|
|
.listxattr = ext4_listxattr,
|
|
.get_acl = ext4_get_acl,
|
|
.set_acl = ext4_set_acl,
|
|
.fiemap = ext4_fiemap,
|
|
.fileattr_get = ext4_fileattr_get,
|
|
.fileattr_set = ext4_fileattr_set,
|
|
};
|
|
|