for-5.20/io_uring-buffered-writes-2022-07-29

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmLkm7UQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpldTEADTg/96R+eq78UZBNZmdifY9/qwQD+kzNiK
 ACDoYZFSbWUjMeOWqRxbYr6mXBKHnGHyTGlraTTpLDzhpB1xwoWfgOK9uOYXW/Ik
 eWfgTujPW/8v/l/z86khE+GH9b/maGCRqNZgS6uLVLzhxG6oCkoYTyOh1iHaF1VM
 Rma4nbJ8GSEDtiXNDl0Bznnyks/pzwoz/9slwzZ7PxtFwZsBxKuxgMUR5HIXdRp7
 5iUoFJhZrGWyi/dbQZUsK/9VYVVnKkcBCz2pb4GEmC+3dS/vlPEoeWUpPHInNyd1
 9NB9v8c+KFmQaWnCxuxcdHvCfmRRQrX8Pr8/OBNZKO6McYrKWKA+lurp4EGClE3m
 cZdK+P/9FS/Eeua8hum9UnbPAqsJPqLTbpbrySeBdd4iFA6u7rRqDX2+nz3PNe9U
 1b7V1bWBIEY/Rsw/PKo59oIeV0auD8v9OCHJ0lF2pv6dRln2/W0y1Qfd1DI18xFG
 +9bBnQzhF7R0O8UP5ApVayQCYrd906YsSVUOqAiLmUs/BoOgRq6g/0BqSOVVKE2u
 5iq8zTsVMkxY0ZpExwZST/700JwkPIV4SVPEYRC6QssFTcylvlisIek6XYSS9HX4
 Z6gzMwJW1H47bEfG4JolTI8uBjp0hQLCPX0O0XFLVnbHQwN0kjIBmv3axAwJO2NV
 qrrHXjf09w==
 =hV7G
 -----END PGP SIGNATURE-----

Merge tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block

Pull io_uring buffered writes support from Jens Axboe:
 "This contains support for buffered writes, specifically for XFS. btrfs
  is in progress, will be coming in the next release.

  io_uring does support buffered writes on any file type, but since the
  buffered write path just always -EAGAIN (or -EOPNOTSUPP) any attempt
  to do so if IOCB_NOWAIT is set, any buffered write will effectively be
  handled by io-wq offload. This isn't very efficient, and we even have
  specific code in io-wq to serialize buffered writes to the same inode
  to avoid further inefficiencies with thread offload.

  This is particularly sad since most buffered writes don't block, they
  simply copy data to a page and dirty it. With this pull request, we
  can handle buffered writes a lot more effiently.

  If balance_dirty_pages() needs to block, we back off on writes as
  indicated.

  This improves buffered write support by 2-3x.

  Jan Kara helped with the mm bits for this, and Stefan handled the
  fs/iomap/xfs/io_uring parts of it"

* tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block:
  mm: honor FGP_NOWAIT for page cache page allocation
  xfs: Add async buffered write support
  xfs: Specify lockmode when calling xfs_ilock_for_iomap()
  io_uring: Add tracepoint for short writes
  io_uring: fix issue with io_write() not always undoing sb_start_write()
  io_uring: Add support for async buffered writes
  fs: Add async write file modification handling.
  fs: Split off inode_needs_update_time and __file_update_time
  fs: add __remove_file_privs() with flags parameter
  fs: add a FMODE_BUF_WASYNC flags for f_mode
  iomap: Return -EAGAIN from iomap_write_iter()
  iomap: Add async buffered write support
  iomap: Add flags parameter to iomap_page_create()
  mm: Add balance_dirty_pages_ratelimited_flags() function
  mm: Move updates of dirty_exceeded into one place
  mm: Move starting of background writeback into the main balancing loop
This commit is contained in:
Linus Torvalds 2022-08-02 13:27:23 -07:00
commit 98e2474640
11 changed files with 324 additions and 109 deletions

View File

@ -2010,67 +2010,57 @@ static int __remove_privs(struct user_namespace *mnt_userns,
return notify_change(mnt_userns, dentry, &newattrs, NULL);
}
/*
* Remove special file priviledges (suid, capabilities) when file is written
* to or truncated.
*/
int file_remove_privs(struct file *file)
static int __file_remove_privs(struct file *file, unsigned int flags)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
int error;
int kill;
int error = 0;
/*
* Fast path for nothing security related.
* As well for non-regular files, e.g. blkdev inodes.
* For example, blkdev_write_iter() might get here
* trying to remove privs which it is not allowed to.
*/
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
kill = dentry_needs_remove_privs(dentry);
if (kill < 0)
if (kill <= 0)
return kill;
if (kill)
error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
if (flags & IOCB_NOWAIT)
return -EAGAIN;
error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
if (!error)
inode_has_no_xattr(inode);
return error;
}
EXPORT_SYMBOL(file_remove_privs);
/**
* file_update_time - update mtime and ctime time
* @file: file accessed
* file_remove_privs - remove special file privileges (suid, capabilities)
* @file: file to remove privileges from
*
* Update the mtime and ctime members of an inode and mark the inode
* for writeback. Note that this function is meant exclusively for
* usage in the file write path of filesystems, and filesystems may
* choose to explicitly ignore update via this function with the
* S_NOCMTIME inode flag, e.g. for network filesystem where these
* timestamps are handled by the server. This can return an error for
* file systems who need to allocate space in order to update an inode.
* When file is modified by a write or truncation ensure that special
* file privileges are removed.
*
* Return: 0 on success, negative errno on failure.
*/
int file_update_time(struct file *file)
int file_remove_privs(struct file *file)
{
return __file_remove_privs(file, 0);
}
EXPORT_SYMBOL(file_remove_privs);
static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
{
struct inode *inode = file_inode(file);
struct timespec64 now;
int sync_it = 0;
int ret;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
now = current_time(inode);
if (!timespec64_equal(&inode->i_mtime, &now))
if (!timespec64_equal(&inode->i_mtime, now))
sync_it = S_MTIME;
if (!timespec64_equal(&inode->i_ctime, &now))
if (!timespec64_equal(&inode->i_ctime, now))
sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@ -2079,37 +2069,127 @@ int file_update_time(struct file *file)
if (!sync_it)
return 0;
/* Finally allowed to write? Takes lock. */
if (__mnt_want_write_file(file))
return 0;
return sync_it;
}
ret = inode_update_time(inode, &now, sync_it);
__mnt_drop_write_file(file);
static int __file_update_time(struct file *file, struct timespec64 *now,
int sync_mode)
{
int ret = 0;
struct inode *inode = file_inode(file);
/* try to update time settings */
if (!__mnt_want_write_file(file)) {
ret = inode_update_time(inode, now, sync_mode);
__mnt_drop_write_file(file);
}
return ret;
}
/**
* file_update_time - update mtime and ctime time
* @file: file accessed
*
* Update the mtime and ctime members of an inode and mark the inode for
* writeback. Note that this function is meant exclusively for usage in
* the file write path of filesystems, and filesystems may choose to
* explicitly ignore updates via this function with the _NOCMTIME inode
* flag, e.g. for network filesystem where these imestamps are handled
* by the server. This can return an error for file systems who need to
* allocate space in order to update an inode.
*
* Return: 0 on success, negative errno on failure.
*/
int file_update_time(struct file *file)
{
int ret;
struct inode *inode = file_inode(file);
struct timespec64 now = current_time(inode);
ret = inode_needs_update_time(inode, &now);
if (ret <= 0)
return ret;
return __file_update_time(file, &now, ret);
}
EXPORT_SYMBOL(file_update_time);
/* Caller must hold the file's inode lock */
int file_modified(struct file *file)
/**
* file_modified_flags - handle mandated vfs changes when modifying a file
* @file: file that was modified
* @flags: kiocb flags
*
* When file has been modified ensure that special
* file privileges are removed and time settings are updated.
*
* If IOCB_NOWAIT is set, special file privileges will not be removed and
* time settings will not be updated. It will return -EAGAIN.
*
* Context: Caller must hold the file's inode lock.
*
* Return: 0 on success, negative errno on failure.
*/
static int file_modified_flags(struct file *file, int flags)
{
int err;
int ret;
struct inode *inode = file_inode(file);
struct timespec64 now = current_time(inode);
/*
* Clear the security bits if the process is not being run by root.
* This keeps people from modifying setuid and setgid binaries.
*/
err = file_remove_privs(file);
if (err)
return err;
ret = __file_remove_privs(file, flags);
if (ret)
return ret;
if (unlikely(file->f_mode & FMODE_NOCMTIME))
return 0;
return file_update_time(file);
ret = inode_needs_update_time(inode, &now);
if (ret <= 0)
return ret;
if (flags & IOCB_NOWAIT)
return -EAGAIN;
return __file_update_time(file, &now, ret);
}
/**
* file_modified - handle mandated vfs changes when modifying a file
* @file: file that was modified
*
* When file has been modified ensure that special
* file privileges are removed and time settings are updated.
*
* Context: Caller must hold the file's inode lock.
*
* Return: 0 on success, negative errno on failure.
*/
int file_modified(struct file *file)
{
return file_modified_flags(file, 0);
}
EXPORT_SYMBOL(file_modified);
/**
* kiocb_modified - handle mandated vfs changes when modifying a file
* @iocb: iocb that was modified
*
* When file has been modified ensure that special
* file privileges are removed and time settings are updated.
*
* Context: Caller must hold the file's inode lock.
*
* Return: 0 on success, negative errno on failure.
*/
int kiocb_modified(struct kiocb *iocb)
{
return file_modified_flags(iocb->ki_filp, iocb->ki_flags);
}
EXPORT_SYMBOL_GPL(kiocb_modified);
int inode_needs_sync(struct inode *inode)
{
if (IS_SYNC(inode))

View File

@ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
static struct bio_set iomap_ioend_bioset;
static struct iomap_page *
iomap_page_create(struct inode *inode, struct folio *folio)
iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
{
struct iomap_page *iop = to_iomap_page(folio);
unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
gfp_t gfp;
if (iop || nr_blocks <= 1)
return iop;
if (flags & IOMAP_NOWAIT)
gfp = GFP_NOWAIT;
else
gfp = GFP_NOFS | __GFP_NOFAIL;
iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
GFP_NOFS | __GFP_NOFAIL);
spin_lock_init(&iop->uptodate_lock);
if (folio_test_uptodate(folio))
bitmap_fill(iop->uptodate, nr_blocks);
folio_attach_private(folio, iop);
gfp);
if (iop) {
spin_lock_init(&iop->uptodate_lock);
if (folio_test_uptodate(folio))
bitmap_fill(iop->uptodate, nr_blocks);
folio_attach_private(folio, iop);
}
return iop;
}
@ -226,7 +234,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
if (WARN_ON_ONCE(size > iomap->length))
return -EIO;
if (offset > 0)
iop = iomap_page_create(iter->inode, folio);
iop = iomap_page_create(iter->inode, folio, iter->flags);
else
iop = to_iomap_page(folio);
@ -264,7 +272,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio);
/* zero post-eof blocks as the page may be mapped */
iop = iomap_page_create(iter->inode, folio);
iop = iomap_page_create(iter->inode, folio, iter->flags);
iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
if (plen == 0)
goto done;
@ -547,10 +555,11 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t len, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
struct iomap_page *iop = iomap_page_create(iter->inode, folio);
struct iomap_page *iop;
loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size);
unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
size_t from = offset_in_folio(folio, pos), to = from + len;
size_t poff, plen;
@ -558,6 +567,10 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return 0;
folio_clear_error(folio);
iop = iomap_page_create(iter->inode, folio, iter->flags);
if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
return -EAGAIN;
do {
iomap_adjust_read_range(iter->inode, folio, &block_start,
block_end - block_start, &poff, &plen);
@ -574,7 +587,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return -EIO;
folio_zero_segments(folio, poff, from, to, poff + plen);
} else {
int status = iomap_read_folio_sync(block_start, folio,
int status;
if (iter->flags & IOMAP_NOWAIT)
return -EAGAIN;
status = iomap_read_folio_sync(block_start, folio,
poff, plen, srcmap);
if (status)
return status;
@ -603,6 +621,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
int status = 0;
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
if (srcmap != &iter->iomap)
BUG_ON(pos + len > srcmap->offset + srcmap->length);
@ -622,7 +643,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping));
if (!folio) {
status = -ENOMEM;
status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
goto out_no_page;
}
if (pos + len > folio_pos(folio) + folio_size(folio))
@ -740,6 +761,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
loff_t pos = iter->pos;
ssize_t written = 0;
long status = 0;
struct address_space *mapping = iter->inode->i_mapping;
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
do {
struct folio *folio;
@ -752,6 +775,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i));
again:
status = balance_dirty_pages_ratelimited_flags(mapping,
bdp_flags);
if (unlikely(status))
break;
if (bytes > length)
bytes = length;
@ -760,6 +788,10 @@ again:
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*
* For async buffered writes the assumption is that the user
* page has already been faulted in. This can be optimized by
* faulting the user page.
*/
if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT;
@ -771,7 +803,7 @@ again:
break;
page = folio_file_page(folio, pos >> PAGE_SHIFT);
if (mapping_writably_mapped(iter->inode->i_mapping))
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
copied = copy_page_from_iter_atomic(page, offset, bytes, i);
@ -796,10 +828,12 @@ again:
pos += status;
written += status;
length -= status;
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
} while (iov_iter_count(i) && length);
if (status == -EAGAIN) {
iov_iter_revert(i, written);
return -EAGAIN;
}
return written ? written : status;
}
@ -815,6 +849,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
};
int ret;
if (iocb->ki_flags & IOCB_NOWAIT)
iter.flags |= IOMAP_NOWAIT;
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_write_iter(&iter, i);
if (iter.pos == iocb->ki_pos)
@ -1329,7 +1366,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode,
struct folio *folio, u64 end_pos)
{
struct iomap_page *iop = iomap_page_create(inode, folio);
struct iomap_page *iop = iomap_page_create(inode, folio, 0);
struct iomap_ioend *ioend, *next;
unsigned len = i_blocksize(inode);
unsigned nblocks = i_blocks_per_folio(inode, folio);

View File

@ -1663,7 +1663,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
if ((iocb->ki_flags & IOCB_NOWAIT) &&
!((iocb->ki_flags & IOCB_DIRECT) ||
(file->f_mode & FMODE_BUF_WASYNC)))
return -EINVAL;
return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);

View File

@ -410,7 +410,7 @@ restart:
spin_unlock(&ip->i_flags_lock);
out:
return file_modified(file);
return kiocb_modified(iocb);
}
static int
@ -700,12 +700,11 @@ xfs_file_buffered_write(
bool cleared_space = false;
unsigned int iolock;
if (iocb->ki_flags & IOCB_NOWAIT)
return -EOPNOTSUPP;
write_retry:
iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, iolock);
ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
@ -1165,7 +1164,7 @@ xfs_file_open(
{
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
return generic_file_open(inode, file);
}

View File

@ -664,7 +664,7 @@ xfs_ilock_for_iomap(
unsigned flags,
unsigned *lockmode)
{
unsigned mode = XFS_ILOCK_SHARED;
unsigned int mode = *lockmode;
bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
/*
@ -742,7 +742,7 @@ xfs_direct_write_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
unsigned lockmode;
unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
@ -886,6 +886,7 @@ xfs_buffered_write_iomap_begin(
bool eof = false, cow_eof = false, shared = false;
int allocfork = XFS_DATA_FORK;
int error = 0;
unsigned int lockmode = XFS_ILOCK_EXCL;
if (xfs_is_shutdown(mp))
return -EIO;
@ -897,7 +898,9 @@ xfs_buffered_write_iomap_begin(
ASSERT(!XFS_IS_REALTIME_INODE(ip));
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
@ -1172,7 +1175,7 @@ xfs_read_iomap_begin(
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
int nimaps = 1, error = 0;
bool shared = false;
unsigned lockmode;
unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));

View File

@ -180,6 +180,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File supports async buffered reads */
#define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000)
/* File supports async nowait buffered writes */
#define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000)
/*
* Attribute flags. These should be or-ed together to figure out what
* has been changed!
@ -2515,6 +2518,7 @@ static inline void file_accessed(struct file *file)
}
extern int file_modified(struct file *file);
int kiocb_modified(struct kiocb *iocb);
int sync_inode_metadata(struct inode *inode, int wait);

View File

@ -364,7 +364,14 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
void wb_update_bandwidth(struct bdi_writeback *wb);
/* Invoke balance dirty pages in async mode. */
#define BDP_ASYNC 0x0001
void balance_dirty_pages_ratelimited(struct address_space *mapping);
int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
unsigned int flags);
bool wb_over_bg_thresh(struct bdi_writeback *wb);
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,

View File

@ -630,6 +630,31 @@ TRACE_EVENT(io_uring_task_work_run,
__entry->tctx, __entry->count, __entry->loops)
);
TRACE_EVENT(io_uring_short_write,
TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got),
TP_ARGS(ctx, fpos, wanted, got),
TP_STRUCT__entry(
__field(void *, ctx)
__field(u64, fpos)
__field(u64, wanted)
__field(u64, got)
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->fpos = fpos;
__entry->wanted = wanted;
__entry->got = got;
),
TP_printk("ring %p, fpos %lld, wanted %lld, got %lld",
__entry->ctx, __entry->fpos,
__entry->wanted, __entry->got)
);
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */

View File

@ -641,7 +641,7 @@ static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
return -EINVAL;
}
static bool need_read_all(struct io_kiocb *req)
static bool need_complete_io(struct io_kiocb *req)
{
return req->flags & REQ_F_ISREG ||
S_ISBLK(file_inode(req->file)->i_mode);
@ -775,7 +775,7 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
kfree(iovec);
return IOU_ISSUE_SKIP_COMPLETE;
} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
(req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
}
@ -870,9 +870,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!io_file_supports_nowait(req)))
goto copy_iov;
/* file path doesn't support NOWAIT for non-direct_IO */
if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
(req->flags & REQ_F_ISREG))
/* File path supports NOWAIT for non-direct_IO only for block devices. */
if (!(kiocb->ki_flags & IOCB_DIRECT) &&
!(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) &&
(req->flags & REQ_F_ISREG))
goto copy_iov;
kiocb->ki_flags |= IOCB_NOWAIT;
@ -928,13 +929,41 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
/* IOPOLL retry should happen for io-wq threads */
if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
goto copy_iov;
if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
struct io_async_rw *rw;
trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
req->cqe.res, ret2);
/* This is a partial write. The file pos has already been
* updated, setup the async struct to complete the request
* in the worker. Also update bytes_done to account for
* the bytes already written.
*/
iov_iter_save_state(&s->iter, &s->iter_state);
ret = io_setup_async_rw(req, iovec, s, true);
rw = req->async_data;
if (rw)
rw->bytes_done += ret2;
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
return ret ? ret : -EAGAIN;
}
done:
ret = kiocb_done(req, ret2, issue_flags);
} else {
copy_iov:
iov_iter_restore(&s->iter, &s->iter_state);
ret = io_setup_async_rw(req, iovec, s, false);
return ret ?: -EAGAIN;
if (!ret) {
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
return -EAGAIN;
}
return ret;
}
/* it's reportedly faster than delegating the null check to kfree() */
if (iovec)

View File

@ -1988,6 +1988,10 @@ no_page:
gfp |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
gfp &= ~__GFP_FS;
if (fgp_flags & FGP_NOWAIT) {
gfp &= ~GFP_KERNEL;
gfp |= GFP_NOWAIT | __GFP_NOWARN;
}
folio = filemap_alloc_folio(gfp, 0);
if (!folio)

View File

@ -1554,8 +1554,8 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
* If we're over `background_thresh' then the writeback threads are woken to
* perform some writeout.
*/
static void balance_dirty_pages(struct bdi_writeback *wb,
unsigned long pages_dirtied)
static int balance_dirty_pages(struct bdi_writeback *wb,
unsigned long pages_dirtied, unsigned int flags)
{
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
@ -1575,6 +1575,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
struct backing_dev_info *bdi = wb->bdi;
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
int ret = 0;
for (;;) {
unsigned long now = jiffies;
@ -1627,6 +1628,19 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
}
}
/*
* In laptop mode, we wait until hitting the higher threshold
* before starting background writeout, and then write out all
* the way down to the lower threshold. So slow writers cause
* minimal disk activity.
*
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
!writeback_in_progress(wb))
wb_start_background_writeback(wb);
/*
* Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts
@ -1657,6 +1671,7 @@ free_running:
break;
}
/* Start writeback even when in laptop mode */
if (unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb);
@ -1715,8 +1730,8 @@ free_running:
sdtc = mdtc;
}
if (dirty_exceeded && !wb->dirty_exceeded)
wb->dirty_exceeded = 1;
if (dirty_exceeded != wb->dirty_exceeded)
wb->dirty_exceeded = dirty_exceeded;
if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
BANDWIDTH_INTERVAL))
@ -1789,6 +1804,10 @@ pause:
period,
pause,
start_time);
if (flags & BDP_ASYNC) {
ret = -EAGAIN;
break;
}
__set_current_state(TASK_KILLABLE);
wb->dirty_sleep = now;
io_schedule_timeout(pause);
@ -1820,26 +1839,7 @@ pause:
if (fatal_signal_pending(current))
break;
}
if (!dirty_exceeded && wb->dirty_exceeded)
wb->dirty_exceeded = 0;
if (writeback_in_progress(wb))
return;
/*
* In laptop mode, we wait until hitting the higher threshold before
* starting background writeout, and then write out all the way down
* to the lower threshold. So slow writers cause minimal disk activity.
*
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
if (laptop_mode)
return;
if (nr_reclaimable > gdtc->bg_thresh)
wb_start_background_writeback(wb);
return ret;
}
static DEFINE_PER_CPU(int, bdp_ratelimits);
@ -1861,27 +1861,34 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
/**
* balance_dirty_pages_ratelimited - balance dirty memory state
* @mapping: address_space which was dirtied
* balance_dirty_pages_ratelimited_flags - Balance dirty memory state.
* @mapping: address_space which was dirtied.
* @flags: BDP flags.
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
* Once we're over the dirty memory limit we decrease the ratelimiting
* by a lot, to prevent individual processes from overshooting the limit
* by (ratelimit_pages) each.
* See balance_dirty_pages_ratelimited() for details.
*
* Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to
* indicate that memory is out of balance and the caller must wait
* for I/O to complete. Otherwise, it will return 0 to indicate
* that either memory was already in balance, or it was able to sleep
* until the amount of dirty memory returned to balance.
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
unsigned int flags)
{
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct bdi_writeback *wb = NULL;
int ratelimit;
int ret = 0;
int *p;
if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
return;
return ret;
if (inode_cgwb_enabled(inode))
wb = wb_get_create_current(bdi, GFP_KERNEL);
@ -1921,9 +1928,27 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
preempt_enable();
if (unlikely(current->nr_dirtied >= ratelimit))
balance_dirty_pages(wb, current->nr_dirtied);
ret = balance_dirty_pages(wb, current->nr_dirtied, flags);
wb_put(wb);
return ret;
}
/**
* balance_dirty_pages_ratelimited - balance dirty memory state.
* @mapping: address_space which was dirtied.
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
* Once we're over the dirty memory limit we decrease the ratelimiting
* by a lot, to prevent individual processes from overshooting the limit
* by (ratelimit_pages) each.
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
balance_dirty_pages_ratelimited_flags(mapping, 0);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);