for-5.20/io_uring-buffered-writes-2022-07-29
-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmLkm7UQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpldTEADTg/96R+eq78UZBNZmdifY9/qwQD+kzNiK ACDoYZFSbWUjMeOWqRxbYr6mXBKHnGHyTGlraTTpLDzhpB1xwoWfgOK9uOYXW/Ik eWfgTujPW/8v/l/z86khE+GH9b/maGCRqNZgS6uLVLzhxG6oCkoYTyOh1iHaF1VM Rma4nbJ8GSEDtiXNDl0Bznnyks/pzwoz/9slwzZ7PxtFwZsBxKuxgMUR5HIXdRp7 5iUoFJhZrGWyi/dbQZUsK/9VYVVnKkcBCz2pb4GEmC+3dS/vlPEoeWUpPHInNyd1 9NB9v8c+KFmQaWnCxuxcdHvCfmRRQrX8Pr8/OBNZKO6McYrKWKA+lurp4EGClE3m cZdK+P/9FS/Eeua8hum9UnbPAqsJPqLTbpbrySeBdd4iFA6u7rRqDX2+nz3PNe9U 1b7V1bWBIEY/Rsw/PKo59oIeV0auD8v9OCHJ0lF2pv6dRln2/W0y1Qfd1DI18xFG +9bBnQzhF7R0O8UP5ApVayQCYrd906YsSVUOqAiLmUs/BoOgRq6g/0BqSOVVKE2u 5iq8zTsVMkxY0ZpExwZST/700JwkPIV4SVPEYRC6QssFTcylvlisIek6XYSS9HX4 Z6gzMwJW1H47bEfG4JolTI8uBjp0hQLCPX0O0XFLVnbHQwN0kjIBmv3axAwJO2NV qrrHXjf09w== =hV7G -----END PGP SIGNATURE----- Merge tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block Pull io_uring buffered writes support from Jens Axboe: "This contains support for buffered writes, specifically for XFS. btrfs is in progress, will be coming in the next release. io_uring does support buffered writes on any file type, but since the buffered write path always fails with -EAGAIN (or -EOPNOTSUPP) on any attempt to do so if IOCB_NOWAIT is set, any buffered write will effectively be handled by io-wq offload. This isn't very efficient, and we even have specific code in io-wq to serialize buffered writes to the same inode to avoid further inefficiencies with thread offload. This is particularly sad since most buffered writes don't block, they simply copy data to a page and dirty it. With this pull request, we can handle buffered writes a lot more efficiently. If balance_dirty_pages() needs to block, we back off on writes as indicated. This improves buffered write support by 2-3x.
Jan Kara helped with the mm bits for this, and Stefan handled the fs/iomap/xfs/io_uring parts of it" * tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block: mm: honor FGP_NOWAIT for page cache page allocation xfs: Add async buffered write support xfs: Specify lockmode when calling xfs_ilock_for_iomap() io_uring: Add tracepoint for short writes io_uring: fix issue with io_write() not always undoing sb_start_write() io_uring: Add support for async buffered writes fs: Add async write file modification handling. fs: Split off inode_needs_update_time and __file_update_time fs: add __remove_file_privs() with flags parameter fs: add a FMODE_BUF_WASYNC flags for f_mode iomap: Return -EAGAIN from iomap_write_iter() iomap: Add async buffered write support iomap: Add flags parameter to iomap_page_create() mm: Add balance_dirty_pages_ratelimited_flags() function mm: Move updates of dirty_exceeded into one place mm: Move starting of background writeback into the main balancing loop
This commit is contained in:
commit
98e2474640
170
fs/inode.c
170
fs/inode.c
@ -2010,67 +2010,57 @@ static int __remove_privs(struct user_namespace *mnt_userns,
|
||||
return notify_change(mnt_userns, dentry, &newattrs, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove special file privileges (suid, capabilities) when file is written
|
||||
* to or truncated.
|
||||
*/
|
||||
int file_remove_privs(struct file *file)
|
||||
static int __file_remove_privs(struct file *file, unsigned int flags)
|
||||
{
|
||||
struct dentry *dentry = file_dentry(file);
|
||||
struct inode *inode = file_inode(file);
|
||||
int error;
|
||||
int kill;
|
||||
int error = 0;
|
||||
|
||||
/*
|
||||
* Fast path for nothing security related.
|
||||
* As well for non-regular files, e.g. blkdev inodes.
|
||||
* For example, blkdev_write_iter() might get here
|
||||
* trying to remove privs which it is not allowed to.
|
||||
*/
|
||||
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
|
||||
return 0;
|
||||
|
||||
kill = dentry_needs_remove_privs(dentry);
|
||||
if (kill < 0)
|
||||
if (kill <= 0)
|
||||
return kill;
|
||||
if (kill)
|
||||
error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
|
||||
|
||||
if (flags & IOCB_NOWAIT)
|
||||
return -EAGAIN;
|
||||
|
||||
error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
|
||||
if (!error)
|
||||
inode_has_no_xattr(inode);
|
||||
|
||||
return error;
|
||||
}
|
||||
EXPORT_SYMBOL(file_remove_privs);
|
||||
|
||||
/**
|
||||
* file_update_time - update mtime and ctime time
|
||||
* @file: file accessed
|
||||
* file_remove_privs - remove special file privileges (suid, capabilities)
|
||||
* @file: file to remove privileges from
|
||||
*
|
||||
* Update the mtime and ctime members of an inode and mark the inode
|
||||
* for writeback. Note that this function is meant exclusively for
|
||||
* usage in the file write path of filesystems, and filesystems may
|
||||
* choose to explicitly ignore update via this function with the
|
||||
* S_NOCMTIME inode flag, e.g. for network filesystem where these
|
||||
* timestamps are handled by the server. This can return an error for
|
||||
* file systems who need to allocate space in order to update an inode.
|
||||
* When file is modified by a write or truncation ensure that special
|
||||
* file privileges are removed.
|
||||
*
|
||||
* Return: 0 on success, negative errno on failure.
|
||||
*/
|
||||
|
||||
int file_update_time(struct file *file)
|
||||
int file_remove_privs(struct file *file)
|
||||
{
|
||||
return __file_remove_privs(file, 0);
|
||||
}
|
||||
EXPORT_SYMBOL(file_remove_privs);
|
||||
|
||||
static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
struct timespec64 now;
|
||||
int sync_it = 0;
|
||||
int ret;
|
||||
|
||||
/* First try to exhaust all avenues to not sync */
|
||||
if (IS_NOCMTIME(inode))
|
||||
return 0;
|
||||
|
||||
now = current_time(inode);
|
||||
if (!timespec64_equal(&inode->i_mtime, &now))
|
||||
if (!timespec64_equal(&inode->i_mtime, now))
|
||||
sync_it = S_MTIME;
|
||||
|
||||
if (!timespec64_equal(&inode->i_ctime, &now))
|
||||
if (!timespec64_equal(&inode->i_ctime, now))
|
||||
sync_it |= S_CTIME;
|
||||
|
||||
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
|
||||
@ -2079,37 +2069,127 @@ int file_update_time(struct file *file)
|
||||
if (!sync_it)
|
||||
return 0;
|
||||
|
||||
/* Finally allowed to write? Takes lock. */
|
||||
if (__mnt_want_write_file(file))
|
||||
return 0;
|
||||
return sync_it;
|
||||
}
|
||||
|
||||
ret = inode_update_time(inode, &now, sync_it);
|
||||
__mnt_drop_write_file(file);
|
||||
static int __file_update_time(struct file *file, struct timespec64 *now,
|
||||
int sync_mode)
|
||||
{
|
||||
int ret = 0;
|
||||
struct inode *inode = file_inode(file);
|
||||
|
||||
/* try to update time settings */
|
||||
if (!__mnt_want_write_file(file)) {
|
||||
ret = inode_update_time(inode, now, sync_mode);
|
||||
__mnt_drop_write_file(file);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* file_update_time - update mtime and ctime time
|
||||
* @file: file accessed
|
||||
*
|
||||
* Update the mtime and ctime members of an inode and mark the inode for
|
||||
* writeback. Note that this function is meant exclusively for usage in
|
||||
* the file write path of filesystems, and filesystems may choose to
|
||||
* explicitly ignore updates via this function with the S_NOCMTIME inode
|
||||
* flag, e.g. for network filesystem where these timestamps are handled
|
||||
* by the server. This can return an error for file systems who need to
|
||||
* allocate space in order to update an inode.
|
||||
*
|
||||
* Return: 0 on success, negative errno on failure.
|
||||
*/
|
||||
int file_update_time(struct file *file)
|
||||
{
|
||||
int ret;
|
||||
struct inode *inode = file_inode(file);
|
||||
struct timespec64 now = current_time(inode);
|
||||
|
||||
ret = inode_needs_update_time(inode, &now);
|
||||
if (ret <= 0)
|
||||
return ret;
|
||||
|
||||
return __file_update_time(file, &now, ret);
|
||||
}
|
||||
EXPORT_SYMBOL(file_update_time);
|
||||
|
||||
/* Caller must hold the file's inode lock */
|
||||
int file_modified(struct file *file)
|
||||
/**
|
||||
* file_modified_flags - handle mandated vfs changes when modifying a file
|
||||
* @file: file that was modified
|
||||
* @flags: kiocb flags
|
||||
*
|
||||
* When file has been modified ensure that special
|
||||
* file privileges are removed and time settings are updated.
|
||||
*
|
||||
* If IOCB_NOWAIT is set, special file privileges will not be removed and
|
||||
* time settings will not be updated. It will return -EAGAIN.
|
||||
*
|
||||
* Context: Caller must hold the file's inode lock.
|
||||
*
|
||||
* Return: 0 on success, negative errno on failure.
|
||||
*/
|
||||
static int file_modified_flags(struct file *file, int flags)
|
||||
{
|
||||
int err;
|
||||
int ret;
|
||||
struct inode *inode = file_inode(file);
|
||||
struct timespec64 now = current_time(inode);
|
||||
|
||||
/*
|
||||
* Clear the security bits if the process is not being run by root.
|
||||
* This keeps people from modifying setuid and setgid binaries.
|
||||
*/
|
||||
err = file_remove_privs(file);
|
||||
if (err)
|
||||
return err;
|
||||
ret = __file_remove_privs(file, flags);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (unlikely(file->f_mode & FMODE_NOCMTIME))
|
||||
return 0;
|
||||
|
||||
return file_update_time(file);
|
||||
ret = inode_needs_update_time(inode, &now);
|
||||
if (ret <= 0)
|
||||
return ret;
|
||||
if (flags & IOCB_NOWAIT)
|
||||
return -EAGAIN;
|
||||
|
||||
return __file_update_time(file, &now, ret);
|
||||
}
|
||||
|
||||
/**
|
||||
* file_modified - handle mandated vfs changes when modifying a file
|
||||
* @file: file that was modified
|
||||
*
|
||||
* When file has been modified ensure that special
|
||||
* file privileges are removed and time settings are updated.
|
||||
*
|
||||
* Context: Caller must hold the file's inode lock.
|
||||
*
|
||||
* Return: 0 on success, negative errno on failure.
|
||||
*/
|
||||
int file_modified(struct file *file)
|
||||
{
|
||||
return file_modified_flags(file, 0);
|
||||
}
|
||||
EXPORT_SYMBOL(file_modified);
|
||||
|
||||
/**
|
||||
* kiocb_modified - handle mandated vfs changes when modifying a file
|
||||
* @iocb: iocb that was modified
|
||||
*
|
||||
* When file has been modified ensure that special
|
||||
* file privileges are removed and time settings are updated.
|
||||
*
|
||||
* Context: Caller must hold the file's inode lock.
|
||||
*
|
||||
* Return: 0 on success, negative errno on failure.
|
||||
*/
|
||||
int kiocb_modified(struct kiocb *iocb)
|
||||
{
|
||||
return file_modified_flags(iocb->ki_filp, iocb->ki_flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kiocb_modified);
|
||||
|
||||
int inode_needs_sync(struct inode *inode)
|
||||
{
|
||||
if (IS_SYNC(inode))
|
||||
|
@ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
|
||||
static struct bio_set iomap_ioend_bioset;
|
||||
|
||||
static struct iomap_page *
|
||||
iomap_page_create(struct inode *inode, struct folio *folio)
|
||||
iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
|
||||
{
|
||||
struct iomap_page *iop = to_iomap_page(folio);
|
||||
unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
|
||||
gfp_t gfp;
|
||||
|
||||
if (iop || nr_blocks <= 1)
|
||||
return iop;
|
||||
|
||||
if (flags & IOMAP_NOWAIT)
|
||||
gfp = GFP_NOWAIT;
|
||||
else
|
||||
gfp = GFP_NOFS | __GFP_NOFAIL;
|
||||
|
||||
iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
|
||||
GFP_NOFS | __GFP_NOFAIL);
|
||||
spin_lock_init(&iop->uptodate_lock);
|
||||
if (folio_test_uptodate(folio))
|
||||
bitmap_fill(iop->uptodate, nr_blocks);
|
||||
folio_attach_private(folio, iop);
|
||||
gfp);
|
||||
if (iop) {
|
||||
spin_lock_init(&iop->uptodate_lock);
|
||||
if (folio_test_uptodate(folio))
|
||||
bitmap_fill(iop->uptodate, nr_blocks);
|
||||
folio_attach_private(folio, iop);
|
||||
}
|
||||
return iop;
|
||||
}
|
||||
|
||||
@ -226,7 +234,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
|
||||
if (WARN_ON_ONCE(size > iomap->length))
|
||||
return -EIO;
|
||||
if (offset > 0)
|
||||
iop = iomap_page_create(iter->inode, folio);
|
||||
iop = iomap_page_create(iter->inode, folio, iter->flags);
|
||||
else
|
||||
iop = to_iomap_page(folio);
|
||||
|
||||
@ -264,7 +272,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
|
||||
return iomap_read_inline_data(iter, folio);
|
||||
|
||||
/* zero post-eof blocks as the page may be mapped */
|
||||
iop = iomap_page_create(iter->inode, folio);
|
||||
iop = iomap_page_create(iter->inode, folio, iter->flags);
|
||||
iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
|
||||
if (plen == 0)
|
||||
goto done;
|
||||
@ -547,10 +555,11 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
|
||||
size_t len, struct folio *folio)
|
||||
{
|
||||
const struct iomap *srcmap = iomap_iter_srcmap(iter);
|
||||
struct iomap_page *iop = iomap_page_create(iter->inode, folio);
|
||||
struct iomap_page *iop;
|
||||
loff_t block_size = i_blocksize(iter->inode);
|
||||
loff_t block_start = round_down(pos, block_size);
|
||||
loff_t block_end = round_up(pos + len, block_size);
|
||||
unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
|
||||
size_t from = offset_in_folio(folio, pos), to = from + len;
|
||||
size_t poff, plen;
|
||||
|
||||
@ -558,6 +567,10 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
|
||||
return 0;
|
||||
folio_clear_error(folio);
|
||||
|
||||
iop = iomap_page_create(iter->inode, folio, iter->flags);
|
||||
if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
|
||||
return -EAGAIN;
|
||||
|
||||
do {
|
||||
iomap_adjust_read_range(iter->inode, folio, &block_start,
|
||||
block_end - block_start, &poff, &plen);
|
||||
@ -574,7 +587,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
|
||||
return -EIO;
|
||||
folio_zero_segments(folio, poff, from, to, poff + plen);
|
||||
} else {
|
||||
int status = iomap_read_folio_sync(block_start, folio,
|
||||
int status;
|
||||
|
||||
if (iter->flags & IOMAP_NOWAIT)
|
||||
return -EAGAIN;
|
||||
|
||||
status = iomap_read_folio_sync(block_start, folio,
|
||||
poff, plen, srcmap);
|
||||
if (status)
|
||||
return status;
|
||||
@ -603,6 +621,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
|
||||
unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
|
||||
int status = 0;
|
||||
|
||||
if (iter->flags & IOMAP_NOWAIT)
|
||||
fgp |= FGP_NOWAIT;
|
||||
|
||||
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
|
||||
if (srcmap != &iter->iomap)
|
||||
BUG_ON(pos + len > srcmap->offset + srcmap->length);
|
||||
@ -622,7 +643,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
|
||||
folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
|
||||
fgp, mapping_gfp_mask(iter->inode->i_mapping));
|
||||
if (!folio) {
|
||||
status = -ENOMEM;
|
||||
status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
|
||||
goto out_no_page;
|
||||
}
|
||||
if (pos + len > folio_pos(folio) + folio_size(folio))
|
||||
@ -740,6 +761,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
|
||||
loff_t pos = iter->pos;
|
||||
ssize_t written = 0;
|
||||
long status = 0;
|
||||
struct address_space *mapping = iter->inode->i_mapping;
|
||||
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
|
||||
|
||||
do {
|
||||
struct folio *folio;
|
||||
@ -752,6 +775,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
|
||||
bytes = min_t(unsigned long, PAGE_SIZE - offset,
|
||||
iov_iter_count(i));
|
||||
again:
|
||||
status = balance_dirty_pages_ratelimited_flags(mapping,
|
||||
bdp_flags);
|
||||
if (unlikely(status))
|
||||
break;
|
||||
|
||||
if (bytes > length)
|
||||
bytes = length;
|
||||
|
||||
@ -760,6 +788,10 @@ again:
|
||||
* Otherwise there's a nasty deadlock on copying from the
|
||||
* same page as we're writing to, without it being marked
|
||||
* up-to-date.
|
||||
*
|
||||
* For async buffered writes the assumption is that the user
|
||||
* page has already been faulted in. This can be optimized by
|
||||
* faulting the user page.
|
||||
*/
|
||||
if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
|
||||
status = -EFAULT;
|
||||
@ -771,7 +803,7 @@ again:
|
||||
break;
|
||||
|
||||
page = folio_file_page(folio, pos >> PAGE_SHIFT);
|
||||
if (mapping_writably_mapped(iter->inode->i_mapping))
|
||||
if (mapping_writably_mapped(mapping))
|
||||
flush_dcache_page(page);
|
||||
|
||||
copied = copy_page_from_iter_atomic(page, offset, bytes, i);
|
||||
@ -796,10 +828,12 @@ again:
|
||||
pos += status;
|
||||
written += status;
|
||||
length -= status;
|
||||
|
||||
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
|
||||
} while (iov_iter_count(i) && length);
|
||||
|
||||
if (status == -EAGAIN) {
|
||||
iov_iter_revert(i, written);
|
||||
return -EAGAIN;
|
||||
}
|
||||
return written ? written : status;
|
||||
}
|
||||
|
||||
@ -815,6 +849,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
|
||||
};
|
||||
int ret;
|
||||
|
||||
if (iocb->ki_flags & IOCB_NOWAIT)
|
||||
iter.flags |= IOMAP_NOWAIT;
|
||||
|
||||
while ((ret = iomap_iter(&iter, ops)) > 0)
|
||||
iter.processed = iomap_write_iter(&iter, i);
|
||||
if (iter.pos == iocb->ki_pos)
|
||||
@ -1329,7 +1366,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
|
||||
struct writeback_control *wbc, struct inode *inode,
|
||||
struct folio *folio, u64 end_pos)
|
||||
{
|
||||
struct iomap_page *iop = iomap_page_create(inode, folio);
|
||||
struct iomap_page *iop = iomap_page_create(inode, folio, 0);
|
||||
struct iomap_ioend *ioend, *next;
|
||||
unsigned len = i_blocksize(inode);
|
||||
unsigned nblocks = i_blocks_per_folio(inode, folio);
|
||||
|
@ -1663,7 +1663,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
|
||||
if (iocb->ki_flags & IOCB_APPEND)
|
||||
iocb->ki_pos = i_size_read(inode);
|
||||
|
||||
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
|
||||
if ((iocb->ki_flags & IOCB_NOWAIT) &&
|
||||
!((iocb->ki_flags & IOCB_DIRECT) ||
|
||||
(file->f_mode & FMODE_BUF_WASYNC)))
|
||||
return -EINVAL;
|
||||
|
||||
return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
|
||||
|
@ -410,7 +410,7 @@ restart:
|
||||
spin_unlock(&ip->i_flags_lock);
|
||||
|
||||
out:
|
||||
return file_modified(file);
|
||||
return kiocb_modified(iocb);
|
||||
}
|
||||
|
||||
static int
|
||||
@ -700,12 +700,11 @@ xfs_file_buffered_write(
|
||||
bool cleared_space = false;
|
||||
unsigned int iolock;
|
||||
|
||||
if (iocb->ki_flags & IOCB_NOWAIT)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
write_retry:
|
||||
iolock = XFS_IOLOCK_EXCL;
|
||||
xfs_ilock(ip, iolock);
|
||||
ret = xfs_ilock_iocb(iocb, iolock);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = xfs_file_write_checks(iocb, from, &iolock);
|
||||
if (ret)
|
||||
@ -1165,7 +1164,7 @@ xfs_file_open(
|
||||
{
|
||||
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
|
||||
return -EIO;
|
||||
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
|
||||
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
|
||||
return generic_file_open(inode, file);
|
||||
}
|
||||
|
||||
|
@ -664,7 +664,7 @@ xfs_ilock_for_iomap(
|
||||
unsigned flags,
|
||||
unsigned *lockmode)
|
||||
{
|
||||
unsigned mode = XFS_ILOCK_SHARED;
|
||||
unsigned int mode = *lockmode;
|
||||
bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
|
||||
|
||||
/*
|
||||
@ -742,7 +742,7 @@ xfs_direct_write_iomap_begin(
|
||||
int nimaps = 1, error = 0;
|
||||
bool shared = false;
|
||||
u16 iomap_flags = 0;
|
||||
unsigned lockmode;
|
||||
unsigned int lockmode = XFS_ILOCK_SHARED;
|
||||
|
||||
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
|
||||
|
||||
@ -886,6 +886,7 @@ xfs_buffered_write_iomap_begin(
|
||||
bool eof = false, cow_eof = false, shared = false;
|
||||
int allocfork = XFS_DATA_FORK;
|
||||
int error = 0;
|
||||
unsigned int lockmode = XFS_ILOCK_EXCL;
|
||||
|
||||
if (xfs_is_shutdown(mp))
|
||||
return -EIO;
|
||||
@ -897,7 +898,9 @@ xfs_buffered_write_iomap_begin(
|
||||
|
||||
ASSERT(!XFS_IS_REALTIME_INODE(ip));
|
||||
|
||||
xfs_ilock(ip, XFS_ILOCK_EXCL);
|
||||
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
|
||||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
|
||||
@ -1172,7 +1175,7 @@ xfs_read_iomap_begin(
|
||||
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
|
||||
int nimaps = 1, error = 0;
|
||||
bool shared = false;
|
||||
unsigned lockmode;
|
||||
unsigned int lockmode = XFS_ILOCK_SHARED;
|
||||
|
||||
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
|
||||
|
||||
|
@ -180,6 +180,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
|
||||
/* File supports async buffered reads */
|
||||
#define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000)
|
||||
|
||||
/* File supports async nowait buffered writes */
|
||||
#define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000)
|
||||
|
||||
/*
|
||||
* Attribute flags. These should be or-ed together to figure out what
|
||||
* has been changed!
|
||||
@ -2515,6 +2518,7 @@ static inline void file_accessed(struct file *file)
|
||||
}
|
||||
|
||||
extern int file_modified(struct file *file);
|
||||
int kiocb_modified(struct kiocb *iocb);
|
||||
|
||||
int sync_inode_metadata(struct inode *inode, int wait);
|
||||
|
||||
|
@ -364,7 +364,14 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
|
||||
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
|
||||
|
||||
void wb_update_bandwidth(struct bdi_writeback *wb);
|
||||
|
||||
/* Invoke balance dirty pages in async mode. */
|
||||
#define BDP_ASYNC 0x0001
|
||||
|
||||
void balance_dirty_pages_ratelimited(struct address_space *mapping);
|
||||
int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
|
||||
unsigned int flags);
|
||||
|
||||
bool wb_over_bg_thresh(struct bdi_writeback *wb);
|
||||
|
||||
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
|
||||
|
@ -630,6 +630,31 @@ TRACE_EVENT(io_uring_task_work_run,
|
||||
__entry->tctx, __entry->count, __entry->loops)
|
||||
);
|
||||
|
||||
TRACE_EVENT(io_uring_short_write,
|
||||
|
||||
TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got),
|
||||
|
||||
TP_ARGS(ctx, fpos, wanted, got),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(void *, ctx)
|
||||
__field(u64, fpos)
|
||||
__field(u64, wanted)
|
||||
__field(u64, got)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->ctx = ctx;
|
||||
__entry->fpos = fpos;
|
||||
__entry->wanted = wanted;
|
||||
__entry->got = got;
|
||||
),
|
||||
|
||||
TP_printk("ring %p, fpos %lld, wanted %lld, got %lld",
|
||||
__entry->ctx, __entry->fpos,
|
||||
__entry->wanted, __entry->got)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_IO_URING_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
|
@ -641,7 +641,7 @@ static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static bool need_read_all(struct io_kiocb *req)
|
||||
static bool need_complete_io(struct io_kiocb *req)
|
||||
{
|
||||
return req->flags & REQ_F_ISREG ||
|
||||
S_ISBLK(file_inode(req->file)->i_mode);
|
||||
@ -775,7 +775,7 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
|
||||
kfree(iovec);
|
||||
return IOU_ISSUE_SKIP_COMPLETE;
|
||||
} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
|
||||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
|
||||
(req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
|
||||
/* read all, failed, already did sync or don't want to retry */
|
||||
goto done;
|
||||
}
|
||||
@ -870,9 +870,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
|
||||
if (unlikely(!io_file_supports_nowait(req)))
|
||||
goto copy_iov;
|
||||
|
||||
/* file path doesn't support NOWAIT for non-direct_IO */
|
||||
if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
|
||||
(req->flags & REQ_F_ISREG))
|
||||
/* File path supports NOWAIT for non-direct_IO only for block devices. */
|
||||
if (!(kiocb->ki_flags & IOCB_DIRECT) &&
|
||||
!(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) &&
|
||||
(req->flags & REQ_F_ISREG))
|
||||
goto copy_iov;
|
||||
|
||||
kiocb->ki_flags |= IOCB_NOWAIT;
|
||||
@ -928,13 +929,41 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
|
||||
/* IOPOLL retry should happen for io-wq threads */
|
||||
if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
|
||||
goto copy_iov;
|
||||
|
||||
if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
|
||||
struct io_async_rw *rw;
|
||||
|
||||
trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
|
||||
req->cqe.res, ret2);
|
||||
|
||||
/* This is a partial write. The file pos has already been
|
||||
* updated, setup the async struct to complete the request
|
||||
* in the worker. Also update bytes_done to account for
|
||||
* the bytes already written.
|
||||
*/
|
||||
iov_iter_save_state(&s->iter, &s->iter_state);
|
||||
ret = io_setup_async_rw(req, iovec, s, true);
|
||||
|
||||
rw = req->async_data;
|
||||
if (rw)
|
||||
rw->bytes_done += ret2;
|
||||
|
||||
if (kiocb->ki_flags & IOCB_WRITE)
|
||||
kiocb_end_write(req);
|
||||
return ret ? ret : -EAGAIN;
|
||||
}
|
||||
done:
|
||||
ret = kiocb_done(req, ret2, issue_flags);
|
||||
} else {
|
||||
copy_iov:
|
||||
iov_iter_restore(&s->iter, &s->iter_state);
|
||||
ret = io_setup_async_rw(req, iovec, s, false);
|
||||
return ret ?: -EAGAIN;
|
||||
if (!ret) {
|
||||
if (kiocb->ki_flags & IOCB_WRITE)
|
||||
kiocb_end_write(req);
|
||||
return -EAGAIN;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
/* it's reportedly faster than delegating the null check to kfree() */
|
||||
if (iovec)
|
||||
|
@ -1988,6 +1988,10 @@ no_page:
|
||||
gfp |= __GFP_WRITE;
|
||||
if (fgp_flags & FGP_NOFS)
|
||||
gfp &= ~__GFP_FS;
|
||||
if (fgp_flags & FGP_NOWAIT) {
|
||||
gfp &= ~GFP_KERNEL;
|
||||
gfp |= GFP_NOWAIT | __GFP_NOWARN;
|
||||
}
|
||||
|
||||
folio = filemap_alloc_folio(gfp, 0);
|
||||
if (!folio)
|
||||
|
@ -1554,8 +1554,8 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
|
||||
* If we're over `background_thresh' then the writeback threads are woken to
|
||||
* perform some writeout.
|
||||
*/
|
||||
static void balance_dirty_pages(struct bdi_writeback *wb,
|
||||
unsigned long pages_dirtied)
|
||||
static int balance_dirty_pages(struct bdi_writeback *wb,
|
||||
unsigned long pages_dirtied, unsigned int flags)
|
||||
{
|
||||
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
|
||||
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
|
||||
@ -1575,6 +1575,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
|
||||
struct backing_dev_info *bdi = wb->bdi;
|
||||
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
|
||||
unsigned long start_time = jiffies;
|
||||
int ret = 0;
|
||||
|
||||
for (;;) {
|
||||
unsigned long now = jiffies;
|
||||
@ -1627,6 +1628,19 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* In laptop mode, we wait until hitting the higher threshold
|
||||
* before starting background writeout, and then write out all
|
||||
* the way down to the lower threshold. So slow writers cause
|
||||
* minimal disk activity.
|
||||
*
|
||||
* In normal mode, we start background writeout at the lower
|
||||
* background_thresh, to keep the amount of dirty memory low.
|
||||
*/
|
||||
if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
|
||||
!writeback_in_progress(wb))
|
||||
wb_start_background_writeback(wb);
|
||||
|
||||
/*
|
||||
* Throttle it only when the background writeback cannot
|
||||
* catch-up. This avoids (excessively) small writeouts
|
||||
@ -1657,6 +1671,7 @@ free_running:
|
||||
break;
|
||||
}
|
||||
|
||||
/* Start writeback even when in laptop mode */
|
||||
if (unlikely(!writeback_in_progress(wb)))
|
||||
wb_start_background_writeback(wb);
|
||||
|
||||
@ -1715,8 +1730,8 @@ free_running:
|
||||
sdtc = mdtc;
|
||||
}
|
||||
|
||||
if (dirty_exceeded && !wb->dirty_exceeded)
|
||||
wb->dirty_exceeded = 1;
|
||||
if (dirty_exceeded != wb->dirty_exceeded)
|
||||
wb->dirty_exceeded = dirty_exceeded;
|
||||
|
||||
if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
|
||||
BANDWIDTH_INTERVAL))
|
||||
@ -1789,6 +1804,10 @@ pause:
|
||||
period,
|
||||
pause,
|
||||
start_time);
|
||||
if (flags & BDP_ASYNC) {
|
||||
ret = -EAGAIN;
|
||||
break;
|
||||
}
|
||||
__set_current_state(TASK_KILLABLE);
|
||||
wb->dirty_sleep = now;
|
||||
io_schedule_timeout(pause);
|
||||
@ -1820,26 +1839,7 @@ pause:
|
||||
if (fatal_signal_pending(current))
|
||||
break;
|
||||
}
|
||||
|
||||
if (!dirty_exceeded && wb->dirty_exceeded)
|
||||
wb->dirty_exceeded = 0;
|
||||
|
||||
if (writeback_in_progress(wb))
|
||||
return;
|
||||
|
||||
/*
|
||||
* In laptop mode, we wait until hitting the higher threshold before
|
||||
* starting background writeout, and then write out all the way down
|
||||
* to the lower threshold. So slow writers cause minimal disk activity.
|
||||
*
|
||||
* In normal mode, we start background writeout at the lower
|
||||
* background_thresh, to keep the amount of dirty memory low.
|
||||
*/
|
||||
if (laptop_mode)
|
||||
return;
|
||||
|
||||
if (nr_reclaimable > gdtc->bg_thresh)
|
||||
wb_start_background_writeback(wb);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(int, bdp_ratelimits);
|
||||
@ -1861,27 +1861,34 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
|
||||
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
|
||||
|
||||
/**
|
||||
* balance_dirty_pages_ratelimited - balance dirty memory state
|
||||
* @mapping: address_space which was dirtied
|
||||
* balance_dirty_pages_ratelimited_flags - Balance dirty memory state.
|
||||
* @mapping: address_space which was dirtied.
|
||||
* @flags: BDP flags.
|
||||
*
|
||||
* Processes which are dirtying memory should call in here once for each page
|
||||
* which was newly dirtied. The function will periodically check the system's
|
||||
* dirty state and will initiate writeback if needed.
|
||||
*
|
||||
* Once we're over the dirty memory limit we decrease the ratelimiting
|
||||
* by a lot, to prevent individual processes from overshooting the limit
|
||||
* by (ratelimit_pages) each.
|
||||
* See balance_dirty_pages_ratelimited() for details.
|
||||
*
|
||||
* Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to
|
||||
* indicate that memory is out of balance and the caller must wait
|
||||
* for I/O to complete. Otherwise, it will return 0 to indicate
|
||||
* that either memory was already in balance, or it was able to sleep
|
||||
* until the amount of dirty memory returned to balance.
|
||||
*/
|
||||
void balance_dirty_pages_ratelimited(struct address_space *mapping)
|
||||
int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
|
||||
unsigned int flags)
|
||||
{
|
||||
struct inode *inode = mapping->host;
|
||||
struct backing_dev_info *bdi = inode_to_bdi(inode);
|
||||
struct bdi_writeback *wb = NULL;
|
||||
int ratelimit;
|
||||
int ret = 0;
|
||||
int *p;
|
||||
|
||||
if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
|
||||
return;
|
||||
return ret;
|
||||
|
||||
if (inode_cgwb_enabled(inode))
|
||||
wb = wb_get_create_current(bdi, GFP_KERNEL);
|
||||
@ -1921,9 +1928,27 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
|
||||
preempt_enable();
|
||||
|
||||
if (unlikely(current->nr_dirtied >= ratelimit))
|
||||
balance_dirty_pages(wb, current->nr_dirtied);
|
||||
ret = balance_dirty_pages(wb, current->nr_dirtied, flags);
|
||||
|
||||
wb_put(wb);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* balance_dirty_pages_ratelimited - balance dirty memory state.
|
||||
* @mapping: address_space which was dirtied.
|
||||
*
|
||||
* Processes which are dirtying memory should call in here once for each page
|
||||
* which was newly dirtied. The function will periodically check the system's
|
||||
* dirty state and will initiate writeback if needed.
|
||||
*
|
||||
* Once we're over the dirty memory limit we decrease the ratelimiting
|
||||
* by a lot, to prevent individual processes from overshooting the limit
|
||||
* by (ratelimit_pages) each.
|
||||
*/
|
||||
void balance_dirty_pages_ratelimited(struct address_space *mapping)
|
||||
{
|
||||
balance_dirty_pages_ratelimited_flags(mapping, 0);
|
||||
}
|
||||
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user