From 0db1d53937fafa8bb96e077375691e16902f4899 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 23 Nov 2023 11:20:00 +0200 Subject: [PATCH 01/30] scsi: target: core: add missing file_{start,end}_write() The callers of vfs_iter_write() are required to hold file_start_write(). file_start_write() is a no-op for the S_ISBLK() case, but it is really needed when the backing file is a regular file. We are going to move file_{start,end}_write() into vfs_iter_write(), but we need to fix this first, so that the fix could be backported to stable kernels. Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/r/ZV8ETIpM+wZa33B5@infradead.org/ Cc: Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231123092000.2665902-1-amir73il@gmail.com Acked-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- drivers/target/target_core_file.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 4d447520bab8..4e4cf6c34a77 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -332,11 +332,13 @@ static int fd_do_rw(struct se_cmd *cmd, struct file *fd, } iov_iter_bvec(&iter, is_write, bvec, sgl_nents, len); - if (is_write) + if (is_write) { + file_start_write(fd); ret = vfs_iter_write(fd, &iter, &pos, 0); - else + file_end_write(fd); + } else { ret = vfs_iter_read(fd, &iter, &pos, 0); - + } if (is_write) { if (ret < 0 || ret != data_length) { pr_err("%s() write returned %d\n", __func__, ret); @@ -467,7 +469,9 @@ fd_execute_write_same(struct se_cmd *cmd) } iov_iter_bvec(&iter, ITER_SOURCE, bvec, nolb, len); + file_start_write(fd_dev->fd_file); ret = vfs_iter_write(fd_dev->fd_file, &iter, &pos, 0); + file_end_write(fd_dev->fd_file); kfree(bvec); if (ret < 0 || ret != len) { From ca7ab482401cf0a7497dad05f4918dc64115538b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:00 +0200 Subject: [PATCH 02/30] ovl: add permission hooks outside of do_splice_direct() The main callers of do_splice_direct() also call rw_verify_area() for the entire range that is being copied, e.g. by vfs_copy_file_range() or do_sendfile() before calling do_splice_direct(). The only caller that does not have those checks for entire range is ovl_copy_up_file(). In preparation for removing the checks inside do_splice_direct(), add rw_verify_area() call in ovl_copy_up_file(). For extra safety, perform minimal sanity checks from rw_verify_area() for non negative offsets also in the copy up do_splice_direct() loop without calling the file permission hooks. This is needed for fanotify "pre content" events. Reviewed-by: Josef Bacik Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-2-amir73il@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/overlayfs/copy_up.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 4382881b0709..106f8643af3b 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -230,6 +230,19 @@ static int ovl_copy_fileattr(struct inode *inode, const struct path *old, return ovl_real_fileattr_set(new, &newfa); } +static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen) +{ + loff_t tmp; + + if (WARN_ON_ONCE(pos != pos2)) + return -EIO; + if (WARN_ON_ONCE(pos < 0 || len < 0 || totlen < 0)) + return -EIO; + if (WARN_ON_ONCE(check_add_overflow(pos, len, &tmp))) + return -EIO; + return 0; +} + static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, struct file *new_file, loff_t len) { @@ -244,13 +257,20 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, int error = 0; ovl_path_lowerdata(dentry, &datapath); - if (WARN_ON(datapath.dentry == NULL)) + if (WARN_ON_ONCE(datapath.dentry == NULL) || + WARN_ON_ONCE(len < 0)) return -EIO; old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY); if (IS_ERR(old_file)) return PTR_ERR(old_file); + error = rw_verify_area(READ, old_file, &old_pos, len); + if (!error) + error = rw_verify_area(WRITE, new_file, &new_pos, len); + if (error) + goto out_fput; + /* Try to use clone_file_range to clone up within the same fs */ ovl_start_write(dentry); cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); @@ -309,6 +329,10 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, } } + error = ovl_verify_area(old_pos, new_pos, this_len, len); + if (error) + break; + ovl_start_write(dentry); bytes = do_splice_direct(old_file, &old_pos, new_file, &new_pos, From 2a33e2ddc6ebf9b5468091aded8a38f57de9a580 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:01 +0200 Subject: [PATCH 03/30] splice: remove permission hook from do_splice_direct() All callers of do_splice_direct() have a call to rw_verify_area() for the entire range that is being copied, e.g. by vfs_copy_file_range() or do_sendfile() before calling do_splice_direct(). The rw_verify_area() check inside do_splice_direct() is redundant and is called after sb_start_write(), so it is not "start-write-safe". Remove this redundant check. This is needed for fanotify "pre content" events. Reviewed-by: Josef Bacik Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-3-amir73il@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/splice.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index d983d375ff11..6e917db6f49a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1166,6 +1166,7 @@ static void direct_file_splice_eof(struct splice_desc *sd) * (splice in + splice out, as compared to just sendfile()). So this helper * can splice directly through a process-private pipe. * + * Callers already called rw_verify_area() on the entire range. */ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len, unsigned int flags) @@ -1187,10 +1188,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; - ret = rw_verify_area(WRITE, out, opos, len); - if (unlikely(ret < 0)) - return ret; - ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) *ppos = sd.pos; From feebea75bdf499aefd11d0df7b02d384a9f92fc1 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:02 +0200 Subject: [PATCH 04/30] splice: move permission hook out of splice_direct_to_actor() vfs_splice_read() has a permission hook inside rw_verify_area() and it is called from do_splice_direct() -> splice_direct_to_actor(). The callers of do_splice_direct() (e.g. vfs_copy_file_range()) already call rw_verify_area() for the entire range, but the other caller of splice_direct_to_actor() (nfsd) does not. Add the rw_verify_area() checks in nfsd_splice_read() and use a variant of vfs_splice_read() without rw_verify_area() check in splice_direct_to_actor() to avoid the redundant rw_verify_area() checks. This is needed for fanotify "pre content" events. Acked-by: Chuck Lever Reviewed-by: Josef Bacik Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-4-amir73il@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/nfsd/vfs.c | 5 ++++- fs/splice.c | 58 +++++++++++++++++++++++++++++++-------------------- 2 files changed, 39 insertions(+), 24 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index fbbea7498f02..5d704461e3b4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1046,7 +1046,10 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); - host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); + host_err = rw_verify_area(READ, file, &offset, *count); + if (!host_err) + host_err = splice_direct_to_actor(file, &sd, + nfsd_direct_splice_actor); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } diff --git a/fs/splice.c b/fs/splice.c index 6e917db6f49a..6fc2c27e9520 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -944,6 +944,39 @@ static void do_splice_eof(struct splice_desc *sd) sd->splice_eof(sd); } +/* + * Callers already called rw_verify_area() on the entire range. + * No need to call it for sub ranges. + */ +static long do_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + unsigned int p_space; + + if (unlikely(!(in->f_mode & FMODE_READ))) + return -EBADF; + if (!len) + return 0; + + /* Don't try to read more the pipe has space for. */ + p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); + len = min_t(size_t, len, p_space << PAGE_SHIFT); + + if (unlikely(len > MAX_RW_COUNT)) + len = MAX_RW_COUNT; + + if (unlikely(!in->f_op->splice_read)) + return warn_unsupported(in, "read"); + /* + * O_DIRECT and DAX don't deal with the pagecache, so we allocate a + * buffer, copy into it and splice that into the pipe. + */ + if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host)) + return copy_splice_read(in, ppos, pipe, len, flags); + return in->f_op->splice_read(in, ppos, pipe, len, flags); +} + /** * vfs_splice_read - Read data from a file and splice it into a pipe * @in: File to splice from @@ -963,34 +996,13 @@ long vfs_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - unsigned int p_space; int ret; - if (unlikely(!(in->f_mode & FMODE_READ))) - return -EBADF; - if (!len) - return 0; - - /* Don't try to read more the pipe has space for. */ - p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); - len = min_t(size_t, len, p_space << PAGE_SHIFT); - ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; - if (unlikely(len > MAX_RW_COUNT)) - len = MAX_RW_COUNT; - - if (unlikely(!in->f_op->splice_read)) - return warn_unsupported(in, "read"); - /* - * O_DIRECT and DAX don't deal with the pagecache, so we allocate a - * buffer, copy into it and splice that into the pipe. - */ - if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host)) - return copy_splice_read(in, ppos, pipe, len, flags); - return in->f_op->splice_read(in, ppos, pipe, len, flags); + return do_splice_read(in, ppos, pipe, len, flags); } EXPORT_SYMBOL_GPL(vfs_splice_read); @@ -1066,7 +1078,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, size_t read_len; loff_t pos = sd->pos, prev_pos = pos; - ret = vfs_splice_read(in, &pos, pipe, len, flags); + ret = do_splice_read(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) goto read_failure; From b70d8e2b8ce56c79d9d18d20955e6de1631e9509 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:03 +0200 Subject: [PATCH 05/30] splice: move permission hook out of splice_file_to_pipe() vfs_splice_read() has a permission hook inside rw_verify_area() and it is called from splice_file_to_pipe(), which is called from do_splice() and do_sendfile(). do_sendfile() already has a rw_verify_area() check for the entire range. do_splice() has a rw_verify_check() for the splice to file case, not for the splice from file case. Add the rw_verify_area() check for splice from file case in do_splice() and use a variant of vfs_splice_read() without rw_verify_area() check in splice_file_to_pipe() to avoid the redundant rw_verify_area() checks. This is needed for fanotify "pre content" events. Reviewed-by: Josef Bacik Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-5-amir73il@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/splice.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/splice.c b/fs/splice.c index 6fc2c27e9520..d4fdd44c0b32 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1239,7 +1239,7 @@ long splice_file_to_pipe(struct file *in, pipe_lock(opipe); ret = wait_for_space(opipe, flags); if (!ret) - ret = vfs_splice_read(in, offset, opipe, len, flags); + ret = do_splice_read(in, offset, opipe, len, flags); pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); @@ -1316,6 +1316,10 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, offset = in->f_pos; } + ret = rw_verify_area(READ, in, &offset, len); + if (unlikely(ret < 0)) + return ret; + if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; From d53471ba6f7ae97a4e223539029528108b705af1 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 23 Nov 2023 18:51:44 +0100 Subject: [PATCH 06/30] splice: remove permission hook from iter_file_splice_write() All the callers of ->splice_write(), (e.g. do_splice_direct() and do_splice()) already check rw_verify_area() for the entire range and perform all the other checks that are in vfs_write_iter(). Instead of creating another tiny helper for special caller, just open-code it. This is needed for fanotify "pre content" events. Suggested-by: Jan Kara Reviewed-by: Josef Bacik Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-6-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/splice.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index d4fdd44c0b32..3fce5f6072dd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -673,10 +673,13 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; int nbufs = pipe->max_usage; - struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), - GFP_KERNEL); + struct bio_vec *array; ssize_t ret; + if (!out->f_op->write_iter) + return -EINVAL; + + array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (unlikely(!array)) return -ENOMEM; @@ -684,6 +687,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, splice_from_pipe_begin(&sd); while (sd.total_len) { + struct kiocb kiocb; struct iov_iter from; unsigned int head, tail, mask; size_t left; @@ -733,7 +737,10 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, } iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); - ret = vfs_iter_write(out, &from, &sd.pos, 0); + init_sync_kiocb(&kiocb, out); + kiocb.ki_pos = sd.pos; + ret = call_write_iter(out, &kiocb, &from); + sd.pos = kiocb.ki_pos; if (ret <= 0) break; From dfad37051ade6ac0d404ef4913f3bd01954ee51c Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:05 +0200 Subject: [PATCH 07/30] remap_range: move permission hooks out of do_clone_file_range() In many of the vfs helpers, file permission hook is called before taking sb_start_write(), making them "start-write-safe". do_clone_file_range() is an exception to this rule. do_clone_file_range() has two callers - vfs_clone_file_range() and overlayfs. Move remap_verify_area() checks from do_clone_file_range() out to vfs_clone_file_range() to make them "start-write-safe". Overlayfs already has calls to rw_verify_area() with the same security permission hooks as remap_verify_area() has. The rest of the checks in remap_verify_area() are irrelevant for overlayfs that calls do_clone_file_range() offset 0 and positive length. This is needed for fanotify "pre content" events. Reviewed-by: Josef Bacik Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-7-amir73il@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/remap_range.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/remap_range.c b/fs/remap_range.c index 87ae4f0dc3aa..42f79cb2b1b1 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -385,14 +385,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; - ret = remap_verify_area(file_in, pos_in, len, false); - if (ret) - return ret; - - ret = remap_verify_area(file_out, pos_out, len, true); - if (ret) - return ret; - ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, len, remap_flags); if (ret < 0) @@ -410,6 +402,14 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, { loff_t ret; + ret = remap_verify_area(file_in, pos_in, len, false); + if (ret) + return ret; + + ret = remap_verify_area(file_out, pos_out, len, true); + if (ret) + return ret; + file_start_write(file_out); ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, remap_flags); From 0b5263d12aed0437c1bdb7ba0be27437fc12c274 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:06 +0200 Subject: [PATCH 08/30] remap_range: move file_start_write() to after permission hook In vfs code, file_start_write() is usually called after the permission hook in rw_verify_area(). vfs_dedupe_file_range_one() is an exception to this rule. In vfs_dedupe_file_range_one(), move file_start_write() to after the the rw_verify_area() checks to make them "start-write-safe". This is needed for fanotify "pre content" events. Reviewed-by: Josef Bacik Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-8-amir73il@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/remap_range.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/remap_range.c b/fs/remap_range.c index 42f79cb2b1b1..12131f2a6c9e 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -420,7 +420,7 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, EXPORT_SYMBOL(vfs_clone_file_range); /* Check whether we are allowed to dedupe the destination file */ -static bool allow_file_dedupe(struct file *file) +static bool may_dedupe_file(struct file *file) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); @@ -445,24 +445,29 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)); - ret = mnt_want_write_file(dst_file); - if (ret) - return ret; - /* * This is redundant if called from vfs_dedupe_file_range(), but other * callers need it and it's not performance sesitive... */ ret = remap_verify_area(src_file, src_pos, len, false); if (ret) - goto out_drop_write; + return ret; ret = remap_verify_area(dst_file, dst_pos, len, true); if (ret) - goto out_drop_write; + return ret; + + /* + * This needs to be called after remap_verify_area() because of + * sb_start_write() and before may_dedupe_file() because the mount's + * MAY_WRITE need to be checked with mnt_get_write_access_file() held. + */ + ret = mnt_want_write_file(dst_file); + if (ret) + return ret; ret = -EPERM; - if (!allow_file_dedupe(dst_file)) + if (!may_dedupe_file(dst_file)) goto out_drop_write; ret = -EXDEV; From 2f4d8ad82511a336f8a805e3759c5189a25bb286 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:07 +0200 Subject: [PATCH 09/30] btrfs: move file_start_write() to after permission hook In vfs code, file_start_write() is usually called after the permission hook in rw_verify_area(). btrfs_ioctl_encoded_write() in an exception to this rule. Move file_start_write() to after the rw_verify_area() check in encoded write to make the permission hook "start-write-safe". This is needed for fanotify "pre content" events. Reviewed-by: Josef Bacik Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-9-amir73il@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/btrfs/ioctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 752acff2c734..e691770c25aa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4523,29 +4523,29 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool if (ret < 0) goto out_acct; - file_start_write(file); - if (iov_iter_count(&iter) == 0) { ret = 0; - goto out_end_write; + goto out_iov; } pos = args.offset; ret = rw_verify_area(WRITE, file, &pos, args.len); if (ret < 0) - goto out_end_write; + goto out_iov; init_sync_kiocb(&kiocb, file); ret = kiocb_set_rw_flags(&kiocb, 0); if (ret) - goto out_end_write; + goto out_iov; kiocb.ki_pos = pos; + file_start_write(file); + ret = btrfs_do_write_iter(&kiocb, &iter, &args); if (ret > 0) fsnotify_modify(file); -out_end_write: file_end_write(file); +out_iov: kfree(iov); out_acct: if (ret > 0) From e389b76a7ee1b62392ab52c22f9ba81f23145824 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:08 +0200 Subject: [PATCH 10/30] coda: change locking order in coda_file_write_iter() The coda host file is a backing file for the coda inode on a different filesystem than the coda inode. Change the locking order to take the coda inode lock before taking the backing host file freeze protection, same as in ovl_write_iter() and in network filesystems that use cachefiles. Link: https://lore.kernel.org/r/CAOQ4uxjcnwuF1gMxe64WLODGA_MyAy8x-DtqkCUxqVQKk3Xbng@mail.gmail.com/ Acked-by: Jan Harkes Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-10-amir73il@gmail.com Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- fs/coda/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/coda/file.c b/fs/coda/file.c index 16acc58311ea..e62315c37386 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -79,14 +79,14 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) if (ret) goto finish_write; - file_start_write(host_file); inode_lock(coda_inode); + file_start_write(host_file); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode)); - inode_unlock(coda_inode); file_end_write(host_file); + inode_unlock(coda_inode); finish_write: venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), From 269aed7014b3db9acdbc5a5e163d8a6c62e0e770 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:09 +0200 Subject: [PATCH 11/30] fs: move file_start_write() into vfs_iter_write() All the callers of vfs_iter_write() call file_start_write() just before calling vfs_iter_write() except for target_core_file's fd_do_rw(). Move file_start_write() from the callers into vfs_iter_write(). fd_do_rw() calls vfs_iter_write() with a non-regular file, so file_start_write() is a no-op. This is needed for fanotify "pre content" events. Suggested-by: Jan Kara Reviewed-by: Josef Bacik Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-11-amir73il@gmail.com Signed-off-by: Christian Brauner --- drivers/block/loop.c | 2 -- drivers/target/target_core_file.c | 10 +++------- fs/coda/file.c | 2 -- fs/nfsd/vfs.c | 2 -- fs/overlayfs/file.c | 2 -- fs/read_write.c | 13 ++++++++++--- 6 files changed, 13 insertions(+), 18 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 9f2d412fc560..8a8cd4fc9238 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -245,9 +245,7 @@ static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) iov_iter_bvec(&i, ITER_SOURCE, bvec, 1, bvec->bv_len); - file_start_write(file); bw = vfs_iter_write(file, &i, ppos, 0); - file_end_write(file); if (likely(bw == bvec->bv_len)) return 0; diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 4e4cf6c34a77..4d447520bab8 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -332,13 +332,11 @@ static int fd_do_rw(struct se_cmd *cmd, struct file *fd, } iov_iter_bvec(&iter, is_write, bvec, sgl_nents, len); - if (is_write) { - file_start_write(fd); + if (is_write) ret = vfs_iter_write(fd, &iter, &pos, 0); - file_end_write(fd); - } else { + else ret = vfs_iter_read(fd, &iter, &pos, 0); - } + if (is_write) { if (ret < 0 || ret != data_length) { pr_err("%s() write returned %d\n", __func__, ret); @@ -469,9 +467,7 @@ fd_execute_write_same(struct se_cmd *cmd) } iov_iter_bvec(&iter, ITER_SOURCE, bvec, nolb, len); - file_start_write(fd_dev->fd_file); ret = vfs_iter_write(fd_dev->fd_file, &iter, &pos, 0); - file_end_write(fd_dev->fd_file); kfree(bvec); if (ret < 0 || ret != len) { diff --git a/fs/coda/file.c b/fs/coda/file.c index e62315c37386..148856a582a9 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -80,12 +80,10 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) goto finish_write; inode_lock(coda_inode); - file_start_write(host_file); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode)); - file_end_write(host_file); inode_unlock(coda_inode); finish_write: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 5d704461e3b4..35c9546b3396 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1186,9 +1186,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); - file_start_write(file); host_err = vfs_iter_write(file, &iter, &pos, flags); - file_end_write(file); if (host_err < 0) { commit_reset_write_verifier(nn, rqstp, host_err); goto out_nfserr; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 131621daeb13..690b173f34fc 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -436,9 +436,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (is_sync_kiocb(iocb)) { rwf_t rwf = iocb_to_rw_flags(ifl); - file_start_write(real.file); ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf); - file_end_write(real.file); /* Update size */ ovl_file_modified(file); } else { diff --git a/fs/read_write.c b/fs/read_write.c index 4771701c896b..3aa3bce18075 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -839,7 +839,7 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, EXPORT_SYMBOL(vfs_iter_read); static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, - loff_t *pos, rwf_t flags) + loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; @@ -894,11 +894,18 @@ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, - rwf_t flags) + rwf_t flags) { + int ret; + if (!file->f_op->write_iter) return -EINVAL; - return do_iter_write(file, iter, ppos, flags); + + file_start_write(file); + ret = do_iter_write(file, iter, ppos, flags); + file_end_write(file); + + return ret; } EXPORT_SYMBOL(vfs_iter_write); From 1c8aa833034a00617866ea4738a40491e3e23902 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 16 Jul 2023 14:47:14 +0300 Subject: [PATCH 12/30] fs: move permission hook out of do_iter_write() In many of the vfs helpers, the rw_verity_area() checks are called before taking sb_start_write(), making them "start-write-safe". do_iter_write() is an exception to this rule. do_iter_write() has two callers - vfs_iter_write() and vfs_writev(). Move rw_verify_area() and other checks from do_iter_write() out to its callers to make them "start-write-safe". Move also the fsnotify_modify() hook to align with similar pattern used in vfs_write() and other vfs helpers. This is needed for fanotify "pre content" events. Suggested-by: Jan Kara Reviewed-by: Josef Bacik Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-12-amir73il@gmail.com Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/read_write.c | 86 +++++++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 3aa3bce18075..d08d0a3ff7de 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -838,33 +838,6 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, } EXPORT_SYMBOL(vfs_iter_read); -static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, - loff_t *pos, rwf_t flags) -{ - size_t tot_len; - ssize_t ret = 0; - - if (!(file->f_mode & FMODE_WRITE)) - return -EBADF; - if (!(file->f_mode & FMODE_CAN_WRITE)) - return -EINVAL; - - tot_len = iov_iter_count(iter); - if (!tot_len) - return 0; - ret = rw_verify_area(WRITE, file, pos, tot_len); - if (ret < 0) - return ret; - - if (file->f_op->write_iter) - ret = do_iter_readv_writev(file, iter, pos, WRITE, flags); - else - ret = do_loop_readv_writev(file, iter, pos, WRITE, flags); - if (ret > 0) - fsnotify_modify(file); - return ret; -} - ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { @@ -896,13 +869,28 @@ EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { - int ret; + size_t tot_len; + ssize_t ret; + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; if (!file->f_op->write_iter) return -EINVAL; + tot_len = iov_iter_count(iter); + if (!tot_len) + return 0; + + ret = rw_verify_area(WRITE, file, ppos, tot_len); + if (ret < 0) + return ret; + file_start_write(file); - ret = do_iter_write(file, iter, ppos, flags); + ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); + if (ret > 0) + fsnotify_modify(file); file_end_write(file); return ret; @@ -927,20 +915,42 @@ static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - ssize_t ret; + size_t tot_len; + ssize_t ret = 0; - ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); - if (ret >= 0) { - file_start_write(file); - ret = do_iter_write(file, &iter, pos, flags); - file_end_write(file); - kfree(iov); - } + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + + ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, + &iter); + if (ret < 0) + return ret; + + tot_len = iov_iter_count(&iter); + if (!tot_len) + goto out; + + ret = rw_verify_area(WRITE, file, pos, tot_len); + if (ret < 0) + goto out; + + file_start_write(file); + if (file->f_op->write_iter) + ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); + else + ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); + if (ret > 0) + fsnotify_modify(file); + file_end_write(file); +out: + kfree(iov); return ret; } From b8e1425bae856b189e2365ff795e30fdd9e77049 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 16 Jul 2023 14:47:14 +0300 Subject: [PATCH 13/30] fs: move permission hook out of do_iter_read() We recently moved fsnotify hook, rw_verify_area() and other checks from do_iter_write() out to its two callers. for consistency, do the same thing for do_iter_read() - move the rw_verify_area() checks and fsnotify hook to the callers vfs_iter_read() and vfs_readv(). This aligns those vfs helpers with the pattern used in vfs_read() and vfs_iocb_iter_read() and the vfs write helpers, where all the checks are in the vfs helpers and the do_* or call_* helpers do the work. This is needed for fanotify "pre content" events. Suggested-by: Jan Kara Reviewed-by: Josef Bacik Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-13-amir73il@gmail.com Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/read_write.c | 86 ++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index d08d0a3ff7de..9fce8d6968df 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -773,34 +773,6 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, return ret; } -static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, - loff_t *pos, rwf_t flags) -{ - size_t tot_len; - ssize_t ret = 0; - - if (!(file->f_mode & FMODE_READ)) - return -EBADF; - if (!(file->f_mode & FMODE_CAN_READ)) - return -EINVAL; - - tot_len = iov_iter_count(iter); - if (!tot_len) - goto out; - ret = rw_verify_area(READ, file, pos, tot_len); - if (ret < 0) - return ret; - - if (file->f_op->read_iter) - ret = do_iter_readv_writev(file, iter, pos, READ, flags); - else - ret = do_loop_readv_writev(file, iter, pos, READ, flags); -out: - if (ret >= 0) - fsnotify_access(file); - return ret; -} - ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { @@ -830,11 +802,30 @@ out: EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, - rwf_t flags) + rwf_t flags) { + size_t tot_len; + ssize_t ret = 0; + if (!file->f_op->read_iter) return -EINVAL; - return do_iter_read(file, iter, ppos, flags); + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + + tot_len = iov_iter_count(iter); + if (!tot_len) + goto out; + ret = rw_verify_area(READ, file, ppos, tot_len); + if (ret < 0) + return ret; + + ret = do_iter_readv_writev(file, iter, ppos, READ, flags); +out: + if (ret >= 0) + fsnotify_access(file); + return ret; } EXPORT_SYMBOL(vfs_iter_read); @@ -898,19 +889,40 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - ssize_t ret; + size_t tot_len; + ssize_t ret = 0; - ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); - if (ret >= 0) { - ret = do_iter_read(file, &iter, pos, flags); - kfree(iov); - } + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, + &iter); + if (ret < 0) + return ret; + + tot_len = iov_iter_count(&iter); + if (!tot_len) + goto out; + + ret = rw_verify_area(READ, file, pos, tot_len); + if (ret < 0) + goto out; + + if (file->f_op->read_iter) + ret = do_iter_readv_writev(file, &iter, pos, READ, flags); + else + ret = do_loop_readv_writev(file, &iter, pos, READ, flags); +out: + if (ret >= 0) + fsnotify_access(file); + kfree(iov); return ret; } From 6ae654392bb516a0baa47fed1f085d84e8cad739 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:12 +0200 Subject: [PATCH 14/30] fs: move kiocb_start_write() into vfs_iocb_iter_write() In vfs code, sb_start_write() is usually called after the permission hook in rw_verify_area(). vfs_iocb_iter_write() is an exception to this rule, where kiocb_start_write() is called by its callers. Move kiocb_start_write() from the callers into vfs_iocb_iter_write() after the rw_verify_area() checks, to make them "start-write-safe". The semantics of vfs_iocb_iter_write() is changed, so that the caller is responsible for calling kiocb_end_write() on completion only if async iocb was queued. The completion handlers of both callers were adapted to this semantic change. This is needed for fanotify "pre content" events. Suggested-by: Jan Kara Suggested-by: Josef Bacik Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-14-amir73il@gmail.com Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/cachefiles/io.c | 5 ++--- fs/overlayfs/file.c | 8 ++++---- fs/read_write.c | 7 +++++++ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 009d23cd435b..5857241c5918 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -259,7 +259,8 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) _enter("%ld", ret); - kiocb_end_write(iocb); + if (ki->was_async) + kiocb_end_write(iocb); if (ret < 0) trace_cachefiles_io_error(object, inode, ret, @@ -319,8 +320,6 @@ int __cachefiles_write(struct cachefiles_object *object, ki->iocb.ki_complete = cachefiles_write_complete; atomic_long_add(ki->b_writing, &cache->b_writing); - kiocb_start_write(&ki->iocb); - get_file(ki->iocb.ki_filp); cachefiles_grab_object(object, cachefiles_obj_get_ioreq); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 690b173f34fc..4e46420c8fdd 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -295,10 +295,8 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) struct kiocb *iocb = &aio_req->iocb; struct kiocb *orig_iocb = aio_req->orig_iocb; - if (iocb->ki_flags & IOCB_WRITE) { - kiocb_end_write(iocb); + if (iocb->ki_flags & IOCB_WRITE) ovl_file_modified(orig_iocb->ki_filp); - } orig_iocb->ki_pos = iocb->ki_pos; ovl_aio_put(aio_req); @@ -310,6 +308,9 @@ static void ovl_aio_rw_complete(struct kiocb *iocb, long res) struct ovl_aio_req, iocb); struct kiocb *orig_iocb = aio_req->orig_iocb; + if (iocb->ki_flags & IOCB_WRITE) + kiocb_end_write(iocb); + ovl_aio_cleanup_handler(aio_req); orig_iocb->ki_complete(orig_iocb, res); } @@ -456,7 +457,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) aio_req->iocb.ki_flags = ifl; aio_req->iocb.ki_complete = ovl_aio_queue_completion; refcount_set(&aio_req->ref, 2); - kiocb_start_write(&aio_req->iocb); ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); ovl_aio_put(aio_req); if (ret != -EIOCBQUEUED) diff --git a/fs/read_write.c b/fs/read_write.c index 9fce8d6968df..92c68ab4f221 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -829,6 +829,10 @@ out: } EXPORT_SYMBOL(vfs_iter_read); +/* + * Caller is responsible for calling kiocb_end_write() on completion + * if async iocb was queued. + */ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { @@ -849,7 +853,10 @@ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, if (ret < 0) return ret; + kiocb_start_write(iocb); ret = call_write_iter(file, iocb, iter); + if (ret != -EIOCBQUEUED) + kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); From 8802e580ee643e3f63c6b39ff64e7c7baa4a55ba Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:13 +0200 Subject: [PATCH 15/30] fs: create __sb_write_started() helper Similar to sb_write_started() for use by other sb freeze levels. Unlike the boolean sb_write_started(), this helper returns a tristate to distiguish the cases of lockdep disabled or unknown lock state. This is needed for fanotify "pre content" events. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-15-amir73il@gmail.com Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..ac8b5a9b467b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1645,9 +1645,23 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) +/** + * __sb_write_started - check if sb freeze level is held + * @sb: the super we write to + * @level: the freeze level + * + * > 0 sb freeze level is held + * 0 sb freeze level is not held + * < 0 !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN + */ +static inline int __sb_write_started(const struct super_block *sb, int level) +{ + return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); +} + static inline bool sb_write_started(const struct super_block *sb) { - return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1); + return __sb_write_started(sb, SB_FREEZE_WRITE); } /** From 3d5cd4911e04683df8f4439fddd788e00a2510a8 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:14 +0200 Subject: [PATCH 16/30] fs: create file_write_started() helper Convenience wrapper for sb_write_started(file_inode(inode)->i_sb)), which has a single occurrence in the code right now. Document the false negatives of those helpers, which makes them unusable to assert that sb_start_write() is not held. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-16-amir73il@gmail.com Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- fs/read_write.c | 2 +- include/linux/fs.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/read_write.c b/fs/read_write.c index 92c68ab4f221..f791555fa246 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1423,7 +1423,7 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { - lockdep_assert(sb_write_started(file_inode(file_out)->i_sb)); + lockdep_assert(file_write_started(file_out)); return do_splice_direct(file_in, &pos_in, file_out, &pos_out, len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); diff --git a/include/linux/fs.h b/include/linux/fs.h index ac8b5a9b467b..75a10b632edd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1659,11 +1659,32 @@ static inline int __sb_write_started(const struct super_block *sb, int level) return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); } +/** + * sb_write_started - check if SB_FREEZE_WRITE is held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ static inline bool sb_write_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE); } +/** + * file_write_started - check if SB_FREEZE_WRITE is held + * @file: the file we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + * May be false positive with !S_ISREG, because file_start_write() has + * no effect on !S_ISREG. + */ +static inline bool file_write_started(const struct file *file) +{ + if (!S_ISREG(file_inode(file)->i_mode)) + return true; + return sb_write_started(file_inode(file)->i_sb); +} + /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to From 21b32e6a0ab5b174fa1ca2fb4c212577cf405d83 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:15 +0200 Subject: [PATCH 17/30] fs: create {sb,file}_write_not_started() helpers Create new helpers {sb,file}_write_not_started() that can be used to assert that sb_start_write() is not held. This is needed for fanotify "pre content" events. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-17-amir73il@gmail.com Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- include/linux/fs.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 75a10b632edd..ae0e2fb7bcea 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1670,6 +1670,17 @@ static inline bool sb_write_started(const struct super_block *sb) return __sb_write_started(sb, SB_FREEZE_WRITE); } +/** + * sb_write_not_started - check if SB_FREEZE_WRITE is not held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ +static inline bool sb_write_not_started(const struct super_block *sb) +{ + return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; +} + /** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to @@ -1685,6 +1696,21 @@ static inline bool file_write_started(const struct file *file) return sb_write_started(file_inode(file)->i_sb); } +/** + * file_write_not_started - check if SB_FREEZE_WRITE is not held + * @file: the file we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + * May be false positive with !S_ISREG, because file_start_write() has + * no effect on !S_ISREG. + */ +static inline bool file_write_not_started(const struct file *file) +{ + if (!S_ISREG(file_inode(file)->i_mode)) + return true; + return sb_write_not_started(file_inode(file)->i_sb); +} + /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to From 488e8f685207e0758398963d6834f81e5e61c162 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 30 Nov 2023 16:16:22 +0200 Subject: [PATCH 18/30] fs: fork splice_file_range() from do_splice_direct() In preparation of calling do_splice_direct() without file_start_write() held, create a new helper splice_file_range(), to be called from context of ->copy_file_range() methods instead of do_splice_direct(). Currently, the only difference is that splice_file_range() does not take flags argument and that it asserts that file_start_write() is held, but we factor out a common helper do_splice_direct_actor() that will be used later. Use the new helper from __ceph_copy_file_range(), that was incorrectly passing to do_splice_direct() the copy flags argument as splice flags. The value of copy flags in ceph is always 0, so it is a smenatic bug fix. Move the declaration of both helpers to linux/splice.h. Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231130141624.3338942-2-amir73il@gmail.com Acked-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/ceph/file.c | 9 ++--- fs/read_write.c | 6 ++-- fs/splice.c | 75 +++++++++++++++++++++++++++++------------- include/linux/fs.h | 2 -- include/linux/splice.h | 13 +++++--- 5 files changed, 68 insertions(+), 37 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3b5aae29e944..f11de6e1f1c1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -3010,8 +3011,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * {read,write}_iter, which will get caps again. */ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); - ret = do_splice_direct(src_file, &src_off, dst_file, - &dst_off, src_objlen, flags); + ret = splice_file_range(src_file, &src_off, dst_file, &dst_off, + src_objlen); /* Abort on short copies or on error */ if (ret < (long)src_objlen) { doutc(cl, "Failed partial copy (%zd)\n", ret); @@ -3065,8 +3066,8 @@ out_caps: */ if (len && (len < src_ci->i_layout.object_size)) { doutc(cl, "Final partial copy of %zu bytes\n", len); - bytes = do_splice_direct(src_file, &src_off, dst_file, - &dst_off, len, flags); + bytes = splice_file_range(src_file, &src_off, dst_file, + &dst_off, len); if (bytes > 0) ret += bytes; else diff --git a/fs/read_write.c b/fs/read_write.c index f791555fa246..642c7ce1ced1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1423,10 +1423,8 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { - lockdep_assert(file_write_started(file_out)); - - return do_splice_direct(file_in, &pos_in, file_out, &pos_out, - len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); + return splice_file_range(file_in, &pos_in, file_out, &pos_out, + min_t(size_t, len, MAX_RW_COUNT)); } EXPORT_SYMBOL(generic_copy_file_range); diff --git a/fs/splice.c b/fs/splice.c index 3fce5f6072dd..9007b2c8baa8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1170,6 +1170,34 @@ static void direct_file_splice_eof(struct splice_desc *sd) file->f_op->splice_eof(file); } +static long do_splice_direct_actor(struct file *in, loff_t *ppos, + struct file *out, loff_t *opos, + size_t len, unsigned int flags, + splice_direct_actor *actor) +{ + struct splice_desc sd = { + .len = len, + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + .splice_eof = direct_file_splice_eof, + .opos = opos, + }; + long ret; + + if (unlikely(!(out->f_mode & FMODE_WRITE))) + return -EBADF; + + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + + ret = splice_direct_to_actor(in, &sd, actor); + if (ret > 0) + *ppos = sd.pos; + + return ret; +} /** * do_splice_direct - splices data directly between two files * @in: file to splice from @@ -1190,31 +1218,34 @@ static void direct_file_splice_eof(struct splice_desc *sd) long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len, unsigned int flags) { - struct splice_desc sd = { - .len = len, - .total_len = len, - .flags = flags, - .pos = *ppos, - .u.file = out, - .splice_eof = direct_file_splice_eof, - .opos = opos, - }; - long ret; - - if (unlikely(!(out->f_mode & FMODE_WRITE))) - return -EBADF; - - if (unlikely(out->f_flags & O_APPEND)) - return -EINVAL; - - ret = splice_direct_to_actor(in, &sd, direct_splice_actor); - if (ret > 0) - *ppos = sd.pos; - - return ret; + return do_splice_direct_actor(in, ppos, out, opos, len, flags, + direct_splice_actor); } EXPORT_SYMBOL(do_splice_direct); +/** + * splice_file_range - splices data between two files for copy_file_range() + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @opos: output file offset + * @len: number of bytes to splice + * + * Description: + * For use by generic_copy_file_range() and ->copy_file_range() methods. + * + * Callers already called rw_verify_area() on the entire range. + */ +long splice_file_range(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len) +{ + lockdep_assert(file_write_started(out)); + + return do_splice_direct_actor(in, ppos, out, opos, len, 0, + direct_splice_actor); +} +EXPORT_SYMBOL(splice_file_range); + static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) { for (;;) { diff --git a/include/linux/fs.h b/include/linux/fs.h index ae0e2fb7bcea..04422a0eccdd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3052,8 +3052,6 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, size_t len, unsigned int flags); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); -extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags); extern void diff --git a/include/linux/splice.h b/include/linux/splice.h index 6c461573434d..49532d5dda52 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -80,11 +80,14 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *, long vfs_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, - splice_direct_actor *); -extern long do_splice(struct file *in, loff_t *off_in, - struct file *out, loff_t *off_out, - size_t len, unsigned int flags); +ssize_t splice_direct_to_actor(struct file *file, struct splice_desc *sd, + splice_direct_actor *actor); +long do_splice(struct file *in, loff_t *off_in, struct file *out, + loff_t *off_out, size_t len, unsigned int flags); +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); +long splice_file_range(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len); extern long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags); From da40448ce4eb4de18eb7b0db61dddece32677939 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 30 Nov 2023 16:16:23 +0200 Subject: [PATCH 19/30] fs: move file_start_write() into direct_splice_actor() The callers of do_splice_direct() hold file_start_write() on the output file. This may cause file permission hooks to be called indirectly on an overlayfs lower layer, which is on the same filesystem of the output file and could lead to deadlock with fanotify permission events. To fix this potential deadlock, move file_start_write() from the callers into the direct_splice_actor(), so file_start_write() will not be held while splicing from the input file. Suggested-by: Josef Bacik Link: https://lore.kernel.org/r/20231128214258.GA2398475@perftesting/ Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231130141624.3338942-3-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/overlayfs/copy_up.c | 2 -- fs/read_write.c | 2 -- fs/splice.c | 19 ++++++++++++++++--- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 106f8643af3b..2f587ee0b334 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -333,11 +333,9 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, if (error) break; - ovl_start_write(dentry); bytes = do_splice_direct(old_file, &old_pos, new_file, &new_pos, this_len, SPLICE_F_MOVE); - ovl_end_write(dentry); if (bytes <= 0) { error = bytes; break; diff --git a/fs/read_write.c b/fs/read_write.c index 642c7ce1ced1..0bc99f38e623 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1286,10 +1286,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; - file_start_write(out.file); retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); - file_end_write(out.file); } else { if (out.file->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; diff --git a/fs/splice.c b/fs/splice.c index 9007b2c8baa8..7cda013e5a1e 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1157,9 +1157,20 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { struct file *file = sd->u.file; + long ret; - return do_splice_from(pipe, file, sd->opos, sd->total_len, - sd->flags); + file_start_write(file); + ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); + file_end_write(file); + return ret; +} + +static int splice_file_range_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + struct file *file = sd->u.file; + + return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } static void direct_file_splice_eof(struct splice_desc *sd) @@ -1233,6 +1244,8 @@ EXPORT_SYMBOL(do_splice_direct); * * Description: * For use by generic_copy_file_range() and ->copy_file_range() methods. + * Like do_splice_direct(), but vfs_copy_file_range() already holds + * start_file_write() on @out file. * * Callers already called rw_verify_area() on the entire range. */ @@ -1242,7 +1255,7 @@ long splice_file_range(struct file *in, loff_t *ppos, struct file *out, lockdep_assert(file_write_started(out)); return do_splice_direct_actor(in, ppos, out, opos, len, 0, - direct_splice_actor); + splice_file_range_actor); } EXPORT_SYMBOL(splice_file_range); From 730651268664070dbd582d7d0338b47d066d6323 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 30 Nov 2023 16:16:24 +0200 Subject: [PATCH 20/30] fs: use do_splice_direct() for nfsd/ksmbd server-side-copy nfsd/ksmbd call vfs_copy_file_range() with flag COPY_FILE_SPLICE to perform kernel copy between two files on any two filesystems. Splicing input file, while holding file_start_write() on the output file which is on a different sb, posses a risk for fanotify related deadlocks. We only need to call splice_file_range() from within the context of ->copy_file_range() filesystem methods with file_start_write() held. To avoid the possible deadlocks, always use do_splice_direct() instead of splice_file_range() for the kernel copy fallback in vfs_copy_file_range() without holding file_start_write(). Reported-and-tested-by: Bert Karwatzki Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231130141624.3338942-4-amir73il@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/read_write.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 0bc99f38e623..01a14570015b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1421,6 +1421,10 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { + /* May only be called from within ->copy_file_range() methods */ + if (WARN_ON_ONCE(flags)) + return -EINVAL; + return splice_file_range(file_in, &pos_in, file_out, &pos_out, min_t(size_t, len, MAX_RW_COUNT)); } @@ -1510,6 +1514,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; + bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; @@ -1541,19 +1546,24 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); - goto done; - } - - if (!splice && file_in->f_op->remap_file_range && - file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) { + } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); - if (ret > 0) - goto done; + /* fallback to splice */ + if (ret <= 0) + splice = true; + } else if (samesb) { + /* Fallback to splice for same sb copy for backward compat */ + splice = true; } + file_end_write(file_out); + + if (!splice) + goto done; + /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in @@ -1565,11 +1575,16 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * - * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE. + * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE + * for server-side-copy between any two sb. + * + * In any case, we call do_splice_direct() and not splice_file_range(), + * without file_start_write() held, to avoid possible deadlocks related + * to splicing from input file, while file_start_write() is held on + * the output file on a different sb. */ - ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, - flags); - + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + min_t(size_t, len, MAX_RW_COUNT), 0); done: if (ret > 0) { fsnotify_access(file_in); @@ -1581,8 +1596,6 @@ done: inc_syscr(current); inc_syscw(current); - file_end_write(file_out); - return ret; } EXPORT_SYMBOL(vfs_copy_file_range); From 0f292086c22b43202daffc14b585d3b54b9a1206 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:36 +0200 Subject: [PATCH 21/30] splice: return type ssize_t from all helpers Not sure why some splice helpers return long, maybe historic reasons. Change them all to return ssize_t to conform to the splice methods and to the rest of the helpers. Suggested-by: Christian Brauner Link: https://lore.kernel.org/r/20231208-horchen-helium-d3ec1535ede5@brauner/ Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-2-amir73il@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/internal.h | 8 ++-- fs/overlayfs/copy_up.c | 2 +- fs/read_write.c | 2 +- fs/splice.c | 104 +++++++++++++++++++++-------------------- include/linux/splice.h | 43 +++++++++-------- io_uring/splice.c | 4 +- 6 files changed, 82 insertions(+), 81 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index 58e43341aebf..4eaa758e3bfd 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -243,10 +243,10 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags, /* * fs/splice.c: */ -long splice_file_to_pipe(struct file *in, - struct pipe_inode_info *opipe, - loff_t *offset, - size_t len, unsigned int flags); +ssize_t splice_file_to_pipe(struct file *in, + struct pipe_inode_info *opipe, + loff_t *offset, + size_t len, unsigned int flags); /* * fs/xattr.c: diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 2f587ee0b334..070d36ba0f09 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -285,7 +285,7 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, while (len) { size_t this_len = OVL_COPY_UP_CHUNK_SIZE; - long bytes; + ssize_t bytes; if (len < this_len) this_len = len; diff --git a/fs/read_write.c b/fs/read_write.c index 01a14570015b..7783b8522693 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1214,7 +1214,7 @@ COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, - size_t count, loff_t max) + size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; diff --git a/fs/splice.c b/fs/splice.c index 7cda013e5a1e..c39d1abf23c8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -201,7 +201,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, unsigned int tail = pipe->tail; unsigned int head = pipe->head; unsigned int mask = pipe->ring_size - 1; - int ret = 0, page_nr = 0; + ssize_t ret = 0; + int page_nr = 0; if (!spd_pages) return 0; @@ -932,8 +933,8 @@ static int warn_unsupported(struct file *file, const char *op) /* * Attempt to initiate a splice from pipe to file. */ -static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) +static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { if (unlikely(!out->f_op->splice_write)) return warn_unsupported(out, "write"); @@ -955,9 +956,9 @@ static void do_splice_eof(struct splice_desc *sd) * Callers already called rw_verify_area() on the entire range. * No need to call it for sub ranges. */ -static long do_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +static ssize_t do_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { unsigned int p_space; @@ -999,11 +1000,11 @@ static long do_splice_read(struct file *in, loff_t *ppos, * If successful, it returns the amount of data spliced, 0 if it hit the EOF or * a hole and a negative error code otherwise. */ -long vfs_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +ssize_t vfs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { - int ret; + ssize_t ret; ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) @@ -1030,7 +1031,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, splice_direct_actor *actor) { struct pipe_inode_info *pipe; - long ret, bytes; + ssize_t ret, bytes; size_t len; int i, flags, more; @@ -1181,10 +1182,10 @@ static void direct_file_splice_eof(struct splice_desc *sd) file->f_op->splice_eof(file); } -static long do_splice_direct_actor(struct file *in, loff_t *ppos, - struct file *out, loff_t *opos, - size_t len, unsigned int flags, - splice_direct_actor *actor) +static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos, + struct file *out, loff_t *opos, + size_t len, unsigned int flags, + splice_direct_actor *actor) { struct splice_desc sd = { .len = len, @@ -1195,7 +1196,7 @@ static long do_splice_direct_actor(struct file *in, loff_t *ppos, .splice_eof = direct_file_splice_eof, .opos = opos, }; - long ret; + ssize_t ret; if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; @@ -1226,8 +1227,8 @@ static long do_splice_direct_actor(struct file *in, loff_t *ppos, * * Callers already called rw_verify_area() on the entire range. */ -long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags) +ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags) { return do_splice_direct_actor(in, ppos, out, opos, len, flags, direct_splice_actor); @@ -1249,8 +1250,8 @@ EXPORT_SYMBOL(do_splice_direct); * * Callers already called rw_verify_area() on the entire range. */ -long splice_file_range(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len) +ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len) { lockdep_assert(file_write_started(out)); @@ -1280,12 +1281,12 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); -long splice_file_to_pipe(struct file *in, - struct pipe_inode_info *opipe, - loff_t *offset, - size_t len, unsigned int flags) +ssize_t splice_file_to_pipe(struct file *in, + struct pipe_inode_info *opipe, + loff_t *offset, + size_t len, unsigned int flags) { - long ret; + ssize_t ret; pipe_lock(opipe); ret = wait_for_space(opipe, flags); @@ -1300,13 +1301,13 @@ long splice_file_to_pipe(struct file *in, /* * Determine where to splice to/from. */ -long do_splice(struct file *in, loff_t *off_in, struct file *out, - loff_t *off_out, size_t len, unsigned int flags) +ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out, + loff_t *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset; - long ret; + ssize_t ret; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) @@ -1397,14 +1398,14 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, return ret; } -static long __do_splice(struct file *in, loff_t __user *off_in, - struct file *out, loff_t __user *off_out, - size_t len, unsigned int flags) +static ssize_t __do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset, *__off_in = NULL, *__off_out = NULL; - long ret; + ssize_t ret; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); @@ -1443,16 +1444,16 @@ static long __do_splice(struct file *in, loff_t __user *off_in, return ret; } -static int iter_to_pipe(struct iov_iter *from, - struct pipe_inode_info *pipe, - unsigned flags) +static ssize_t iter_to_pipe(struct iov_iter *from, + struct pipe_inode_info *pipe, + unsigned int flags) { struct pipe_buffer buf = { .ops = &user_page_pipe_buf_ops, .flags = flags }; size_t total = 0; - int ret = 0; + ssize_t ret = 0; while (iov_iter_count(from)) { struct page *pages[16]; @@ -1501,8 +1502,8 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ -static long vmsplice_to_user(struct file *file, struct iov_iter *iter, - unsigned int flags) +static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, + unsigned int flags) { struct pipe_inode_info *pipe = get_pipe_info(file, true); struct splice_desc sd = { @@ -1510,7 +1511,7 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, .flags = flags, .u.data = iter }; - long ret = 0; + ssize_t ret = 0; if (!pipe) return -EBADF; @@ -1534,11 +1535,11 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. */ -static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, - unsigned int flags) +static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter, + unsigned int flags) { struct pipe_inode_info *pipe; - long ret = 0; + ssize_t ret = 0; unsigned buf_flag = 0; if (flags & SPLICE_F_GIFT) @@ -1634,7 +1635,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, size_t, len, unsigned int, flags) { struct fd in, out; - long error; + ssize_t error; if (unlikely(!len)) return 0; @@ -1648,7 +1649,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, out = fdget(fd_out); if (out.file) { error = __do_splice(in.file, off_in, out.file, off_out, - len, flags); + len, flags); fdput(out); } fdput(in); @@ -1871,15 +1872,15 @@ retry: /* * Link contents of ipipe to opipe. */ -static int link_pipe(struct pipe_inode_info *ipipe, - struct pipe_inode_info *opipe, - size_t len, unsigned int flags) +static ssize_t link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; unsigned int i_mask, o_mask; - int ret = 0; + ssize_t ret = 0; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1962,11 +1963,12 @@ static int link_pipe(struct pipe_inode_info *ipipe, * The 'flags' used are the SPLICE_F_* variants, currently the only * applicable one is SPLICE_F_NONBLOCK. */ -long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) +ssize_t do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) { struct pipe_inode_info *ipipe = get_pipe_info(in, true); struct pipe_inode_info *opipe = get_pipe_info(out, true); - int ret = -EINVAL; + ssize_t ret = -EINVAL; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) @@ -2003,7 +2005,7 @@ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { struct fd in, out; - int error; + ssize_t error; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; diff --git a/include/linux/splice.h b/include/linux/splice.h index 49532d5dda52..068a8e8ffd73 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -68,31 +68,30 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, typedef int (splice_direct_actor)(struct pipe_inode_info *, struct splice_desc *); -extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, - loff_t *, size_t, unsigned int, - splice_actor *); -extern ssize_t __splice_from_pipe(struct pipe_inode_info *, - struct splice_desc *, splice_actor *); -extern ssize_t splice_to_pipe(struct pipe_inode_info *, - struct splice_pipe_desc *); -extern ssize_t add_to_pipe(struct pipe_inode_info *, - struct pipe_buffer *); -long vfs_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags); +ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor); +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, + struct splice_desc *sd, splice_actor *actor); +ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd); +ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf); +ssize_t vfs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); ssize_t splice_direct_to_actor(struct file *file, struct splice_desc *sd, splice_direct_actor *actor); -long do_splice(struct file *in, loff_t *off_in, struct file *out, - loff_t *off_out, size_t len, unsigned int flags); -long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags); -long splice_file_range(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len); +ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out, + loff_t *off_out, size_t len, unsigned int flags); +ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); +ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len); -extern long do_tee(struct file *in, struct file *out, size_t len, - unsigned int flags); -extern ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags); +ssize_t do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags); +ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags); /* * for dynamic pipe sizing diff --git a/io_uring/splice.c b/io_uring/splice.c index 7c4469e9540e..3b659cd23e9d 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -51,7 +51,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags) struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; struct file *in; - long ret = 0; + ssize_t ret = 0; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); @@ -92,7 +92,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags) unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; loff_t *poff_in, *poff_out; struct file *in; - long ret = 0; + ssize_t ret = 0; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); From 705bcfcbde38b9dd4db00cd3deb0b98bddb0dd4a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:37 +0200 Subject: [PATCH 22/30] fs: use splice_copy_file_range() inline helper generic_copy_file_range() is just a wrapper around splice_file_range(), which caps the maximum copy length. The only caller of splice_file_range(), namely __ceph_copy_file_range() is already ready to cope with short copy. Move the length capping into splice_file_range() and replace the exported symbol generic_copy_file_range() with a simple inline helper. Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/linux-fsdevel/20231204083849.GC32438@lst.de/ Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-3-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/ceph/file.c | 4 ++-- fs/fuse/file.c | 5 +++-- fs/nfs/nfs4file.c | 5 +++-- fs/read_write.c | 34 ---------------------------------- fs/smb/client/cifsfs.c | 5 +++-- fs/splice.c | 7 ++++--- include/linux/fs.h | 3 --- include/linux/splice.h | 7 +++++++ 8 files changed, 22 insertions(+), 48 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f11de6e1f1c1..d380d9dad0e0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -3090,8 +3090,8 @@ static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, len, flags); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src_file, src_off, dst_file, - dst_off, len, flags); + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); return ret; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1cdb6327511e..7d0860ef63d0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,6 +19,7 @@ #include #include #include +#include static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -3193,8 +3194,8 @@ static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off, len, flags); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src_file, src_off, dst_file, - dst_off, len, flags); + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); return ret; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 02788c3c85e5..e238abc78a13 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "delegation.h" #include "internal.h" #include "iostat.h" @@ -195,8 +196,8 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count, flags); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(file_in, pos_in, file_out, - pos_out, count, flags); + ret = splice_copy_file_range(file_in, pos_in, file_out, + pos_out, count); return ret; } diff --git a/fs/read_write.c b/fs/read_write.c index 7783b8522693..e3abf603eaaf 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1396,40 +1396,6 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, } #endif -/** - * generic_copy_file_range - copy data between two files - * @file_in: file structure to read from - * @pos_in: file offset to read from - * @file_out: file structure to write data to - * @pos_out: file offset to write data to - * @len: amount of data to copy - * @flags: copy flags - * - * This is a generic filesystem helper to copy data from one file to another. - * It has no constraints on the source or destination file owners - the files - * can belong to different superblocks and different filesystem types. Short - * copies are allowed. - * - * This should be called from the @file_out filesystem, as per the - * ->copy_file_range() method. - * - * Returns the number of bytes copied or a negative error indicating the - * failure. - */ - -ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags) -{ - /* May only be called from within ->copy_file_range() methods */ - if (WARN_ON_ONCE(flags)) - return -EINVAL; - - return splice_file_range(file_in, &pos_in, file_out, &pos_out, - min_t(size_t, len, MAX_RW_COUNT)); -} -EXPORT_SYMBOL(generic_copy_file_range); - /* * Performs necessary checks before doing a file copy * diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index ea3a7a668b45..ee461bf0ef63 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -1362,8 +1363,8 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off, free_xid(xid); if (rc == -EOPNOTSUPP || rc == -EXDEV) - rc = generic_copy_file_range(src_file, off, dst_file, - destoff, len, flags); + rc = splice_copy_file_range(src_file, off, dst_file, + destoff, len); return rc; } diff --git a/fs/splice.c b/fs/splice.c index c39d1abf23c8..218e24b1ac40 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1244,7 +1244,7 @@ EXPORT_SYMBOL(do_splice_direct); * @len: number of bytes to splice * * Description: - * For use by generic_copy_file_range() and ->copy_file_range() methods. + * For use by ->copy_file_range() methods. * Like do_splice_direct(), but vfs_copy_file_range() already holds * start_file_write() on @out file. * @@ -1255,8 +1255,9 @@ ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, { lockdep_assert(file_write_started(out)); - return do_splice_direct_actor(in, ppos, out, opos, len, 0, - splice_file_range_actor); + return do_splice_direct_actor(in, ppos, out, opos, + min_t(size_t, len, MAX_RW_COUNT), + 0, splice_file_range_actor); } EXPORT_SYMBOL(splice_file_range); diff --git a/include/linux/fs.h b/include/linux/fs.h index 04422a0eccdd..900d0cd55b50 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2090,9 +2090,6 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); -extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, diff --git a/include/linux/splice.h b/include/linux/splice.h index 068a8e8ffd73..9dec4861d09f 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -88,6 +88,13 @@ ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len); +static inline long splice_copy_file_range(struct file *in, loff_t pos_in, + struct file *out, loff_t pos_out, + size_t len) +{ + return splice_file_range(in, &pos_in, out, &pos_out, len); +} + ssize_t do_tee(struct file *in, struct file *out, size_t len, unsigned int flags); ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, From 36e28c42187c95eb148873ffb059bfdcb8cdb75b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:38 +0200 Subject: [PATCH 23/30] fsnotify: split fsnotify_perm() into two hooks We would like to make changes to the fsnotify access permission hook - add file range arguments and add the pre modify event. In preparation for these changes, split the fsnotify_perm() hook into fsnotify_open_perm() and fsnotify_file_perm(). This is needed for fanotify "pre content" events. Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-4-amir73il@gmail.com Signed-off-by: Christian Brauner --- include/linux/fsnotify.h | 34 +++++++++++++++++++--------------- security/security.c | 4 ++-- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index bcb6609b54b3..926bb4461b9e 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -100,29 +100,33 @@ static inline int fsnotify_file(struct file *file, __u32 mask) return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } -/* Simple call site for access decisions */ -static inline int fsnotify_perm(struct file *file, int mask) +/* + * fsnotify_file_perm - permission hook before file access + */ +static inline int fsnotify_file_perm(struct file *file, int perm_mask) { - int ret; - __u32 fsnotify_mask = 0; + __u32 fsnotify_mask = FS_ACCESS_PERM; - if (!(mask & (MAY_READ | MAY_OPEN))) + if (!(perm_mask & MAY_READ)) return 0; - if (mask & MAY_OPEN) { - fsnotify_mask = FS_OPEN_PERM; + return fsnotify_file(file, fsnotify_mask); +} - if (file->f_flags & __FMODE_EXEC) { - ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); +/* + * fsnotify_open_perm - permission hook before file open + */ +static inline int fsnotify_open_perm(struct file *file) +{ + int ret; - if (ret) - return ret; - } - } else if (mask & MAY_READ) { - fsnotify_mask = FS_ACCESS_PERM; + if (file->f_flags & __FMODE_EXEC) { + ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); + if (ret) + return ret; } - return fsnotify_file(file, fsnotify_mask); + return fsnotify_file(file, FS_OPEN_PERM); } /* diff --git a/security/security.c b/security/security.c index dcb3e7014f9b..d7f3703c5905 100644 --- a/security/security.c +++ b/security/security.c @@ -2586,7 +2586,7 @@ int security_file_permission(struct file *file, int mask) if (ret) return ret; - return fsnotify_perm(file, mask); + return fsnotify_file_perm(file, mask); } /** @@ -2837,7 +2837,7 @@ int security_file_open(struct file *file) if (ret) return ret; - return fsnotify_perm(file, MAY_OPEN); + return fsnotify_open_perm(file); } /** From cb383f06686734ef04daf63a4369566800717b7b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:39 +0200 Subject: [PATCH 24/30] fsnotify: assert that file_start_write() is not held in permission hooks filesystem may be modified in the context of fanotify permission events (e.g. by HSM service), so assert that sb freeze protection is not held. If the assertion fails, then the following deadlock would be possible: CPU0 CPU1 CPU2 ------------------------------------------------------------------------- file_start_write()#0 ... fsnotify_perm() fanotify_get_response() => (read event and fill file) ... ... freeze_super() ... sb_wait_write() ... vfs_write() file_start_write()#1 This example demonstrates a use case of an hierarchical storage management (HSM) service that uses fanotify permission events to fill the content of a file before access, while a 3rd process starts fsfreeze. This creates a circular dependeny: file_start_write()#0 => fanotify_get_response => file_start_write()#1 => sb_wait_write() => file_end_write()#0 Where file_end_write()#0 can never be called and none of the threads can make progress. The assertion is checked for both MAY_READ and MAY_WRITE permission hooks in preparation for a pre-modify permission event. The assertion is not checked for an open permission event, because do_open() takes mnt_want_write() in O_TRUNC case, meaning that it is not safe to write to filesystem in the content of an open permission event. Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-5-amir73il@gmail.com Signed-off-by: Christian Brauner --- include/linux/fsnotify.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 926bb4461b9e..0a9d6a8a747a 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -107,6 +107,13 @@ static inline int fsnotify_file_perm(struct file *file, int perm_mask) { __u32 fsnotify_mask = FS_ACCESS_PERM; + /* + * filesystem may be modified in the context of permission events + * (e.g. by HSM filling a file on access), so sb freeze protection + * must not be held. + */ + lockdep_assert_once(file_write_not_started(file)); + if (!(perm_mask & MAY_READ)) return 0; From d9e5d31084b024734e64307521414ef0ae1d5333 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:40 +0200 Subject: [PATCH 25/30] fsnotify: optionally pass access range in file permission hooks In preparation for pre-content permission events with file access range, move fsnotify_file_perm() hook out of security_file_permission() and into the callers. Callers that have the access range information call the new hook fsnotify_file_area_perm() with the access range. Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-6-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/open.c | 4 ++++ fs/read_write.c | 10 ++++++++-- fs/readdir.c | 4 ++++ fs/remap_range.c | 8 +++++++- include/linux/fsnotify.h | 13 +++++++++++-- security/security.c | 8 +------- 6 files changed, 35 insertions(+), 12 deletions(-) diff --git a/fs/open.c b/fs/open.c index 02dc608d40d8..d877228d5939 100644 --- a/fs/open.c +++ b/fs/open.c @@ -304,6 +304,10 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; + ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); + if (ret) + return ret; + if (S_ISFIFO(inode->i_mode)) return -ESPIPE; diff --git a/fs/read_write.c b/fs/read_write.c index e3abf603eaaf..d4c036e82b6c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -354,6 +354,9 @@ out_putf: int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { + int mask = read_write == READ ? MAY_READ : MAY_WRITE; + int ret; + if (unlikely((ssize_t) count < 0)) return -EINVAL; @@ -371,8 +374,11 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t } } - return security_file_permission(file, - read_write == READ ? MAY_READ : MAY_WRITE); + ret = security_file_permission(file, mask); + if (ret) + return ret; + + return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); diff --git a/fs/readdir.c b/fs/readdir.c index c8c46e294431..278bc0254732 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -96,6 +96,10 @@ int iterate_dir(struct file *file, struct dir_context *ctx) if (res) goto out; + res = fsnotify_file_perm(file, MAY_READ); + if (res) + goto out; + res = down_read_killable(&inode->i_rwsem); if (res) goto out; diff --git a/fs/remap_range.c b/fs/remap_range.c index 12131f2a6c9e..f8c1120b8311 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -102,7 +102,9 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, static int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { + int mask = write ? MAY_WRITE : MAY_READ; loff_t tmp; + int ret; if (unlikely(pos < 0 || len < 0)) return -EINVAL; @@ -110,7 +112,11 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len, if (unlikely(check_add_overflow(pos, len, &tmp))) return -EINVAL; - return security_file_permission(file, write ? MAY_WRITE : MAY_READ); + ret = security_file_permission(file, mask); + if (ret) + return ret; + + return fsnotify_file_area_perm(file, mask, &pos, len); } /* diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 0a9d6a8a747a..11e6434b8e71 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -101,9 +101,10 @@ static inline int fsnotify_file(struct file *file, __u32 mask) } /* - * fsnotify_file_perm - permission hook before file access + * fsnotify_file_area_perm - permission hook before access to file range */ -static inline int fsnotify_file_perm(struct file *file, int perm_mask) +static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, + const loff_t *ppos, size_t count) { __u32 fsnotify_mask = FS_ACCESS_PERM; @@ -120,6 +121,14 @@ static inline int fsnotify_file_perm(struct file *file, int perm_mask) return fsnotify_file(file, fsnotify_mask); } +/* + * fsnotify_file_perm - permission hook before file access + */ +static inline int fsnotify_file_perm(struct file *file, int perm_mask) +{ + return fsnotify_file_area_perm(file, perm_mask, NULL, 0); +} + /* * fsnotify_open_perm - permission hook before file open */ diff --git a/security/security.c b/security/security.c index d7f3703c5905..2a7fc7881cbc 100644 --- a/security/security.c +++ b/security/security.c @@ -2580,13 +2580,7 @@ int security_kernfs_init_security(struct kernfs_node *kn_dir, */ int security_file_permission(struct file *file, int mask) { - int ret; - - ret = call_int_hook(file_permission, 0, file, mask); - if (ret) - return ret; - - return fsnotify_file_perm(file, mask); + return call_int_hook(file_permission, 0, file, mask); } /** From f91a704f7161c2cf0fcd41fa9fbec4355b813fff Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 2 Oct 2023 17:19:46 +0300 Subject: [PATCH 26/30] fs: prepare for stackable filesystems backing file helpers In preparation for factoring out some backing file io helpers from overlayfs, move backing_file_open() into a new file fs/backing-file.c and header. Add a MAINTAINERS entry for stackable filesystems and add a Kconfig FS_STACK which stackable filesystems need to select. For now, the backing_file struct, the backing_file alloc/free functions and the backing_file_real_path() accessor remain internal to file_table.c. We may change that in the future. Signed-off-by: Amir Goldstein --- MAINTAINERS | 9 +++++++ fs/Kconfig | 4 +++ fs/Makefile | 1 + fs/backing-file.c | 48 ++++++++++++++++++++++++++++++++++++ fs/open.c | 38 ---------------------------- fs/overlayfs/Kconfig | 1 + fs/overlayfs/file.c | 1 + include/linux/backing-file.h | 17 +++++++++++++ include/linux/fs.h | 3 --- 9 files changed, 81 insertions(+), 41 deletions(-) create mode 100644 fs/backing-file.c create mode 100644 include/linux/backing-file.h diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cf..d9ae2688f5c8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8186,6 +8186,15 @@ S: Supported F: fs/iomap/ F: include/linux/iomap.h +FILESYSTEMS [STACKABLE] +M: Miklos Szeredi +M: Amir Goldstein +L: linux-fsdevel@vger.kernel.org +L: linux-unionfs@vger.kernel.org +S: Maintained +F: fs/backing-file.c +F: include/linux/backing-file.h + FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER M: Riku Voipio L: linux-hwmon@vger.kernel.org diff --git a/fs/Kconfig b/fs/Kconfig index fd1f655b4f1f..c47fa4eb9282 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -18,6 +18,10 @@ config VALIDATE_FS_PARSER config FS_IOMAP bool +# Stackable filesystems +config FS_STACK + bool + config BUFFER_HEAD bool diff --git a/fs/Makefile b/fs/Makefile index 75522f88e763..a6962c588962 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o +obj-$(CONFIG_FS_STACK) += backing-file.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ diff --git a/fs/backing-file.c b/fs/backing-file.c new file mode 100644 index 000000000000..04b33036f709 --- /dev/null +++ b/fs/backing-file.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Common helpers for stackable filesystems and backing files. + * + * Copyright (C) 2023 CTERA Networks. + */ + +#include +#include + +#include "internal.h" + +/** + * backing_file_open - open a backing file for kernel internal use + * @user_path: path that the user reuqested to open + * @flags: open flags + * @real_path: path of the backing file + * @cred: credentials for open + * + * Open a backing file for a stackable filesystem (e.g., overlayfs). + * @user_path may be on the stackable filesystem and @real_path on the + * underlying filesystem. In this case, we want to be able to return the + * @user_path of the stackable filesystem. This is done by embedding the + * returned file into a container structure that also stores the stacked + * file's path, which can be retrieved using backing_file_user_path(). + */ +struct file *backing_file_open(const struct path *user_path, int flags, + const struct path *real_path, + const struct cred *cred) +{ + struct file *f; + int error; + + f = alloc_empty_backing_file(flags, cred); + if (IS_ERR(f)) + return f; + + path_get(user_path); + *backing_file_user_path(f) = *user_path; + error = vfs_open(real_path, f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + + return f; +} +EXPORT_SYMBOL_GPL(backing_file_open); diff --git a/fs/open.c b/fs/open.c index d877228d5939..a75054237437 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1184,44 +1184,6 @@ struct file *kernel_file_open(const struct path *path, int flags, } EXPORT_SYMBOL_GPL(kernel_file_open); -/** - * backing_file_open - open a backing file for kernel internal use - * @user_path: path that the user reuqested to open - * @flags: open flags - * @real_path: path of the backing file - * @cred: credentials for open - * - * Open a backing file for a stackable filesystem (e.g., overlayfs). - * @user_path may be on the stackable filesystem and @real_path on the - * underlying filesystem. In this case, we want to be able to return the - * @user_path of the stackable filesystem. This is done by embedding the - * returned file into a container structure that also stores the stacked - * file's path, which can be retrieved using backing_file_user_path(). - */ -struct file *backing_file_open(const struct path *user_path, int flags, - const struct path *real_path, - const struct cred *cred) -{ - struct file *f; - int error; - - f = alloc_empty_backing_file(flags, cred); - if (IS_ERR(f)) - return f; - - path_get(user_path); - *backing_file_user_path(f) = *user_path; - f->f_path = *real_path; - error = do_dentry_open(f, d_inode(real_path->dentry), NULL); - if (error) { - fput(f); - f = ERR_PTR(error); - } - - return f; -} -EXPORT_SYMBOL_GPL(backing_file_open); - #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index fec5020c3495..2ac67e04a6fb 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config OVERLAY_FS tristate "Overlay filesystem support" + select FS_STACK select EXPORTFS help An overlay filesystem combines two filesystems - an 'upper' filesystem diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 4e46420c8fdd..a6da3eaf6d4f 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "overlayfs.h" #include "../internal.h" /* for sb_init_dio_done_wq */ diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h new file mode 100644 index 000000000000..55c9e804f780 --- /dev/null +++ b/include/linux/backing-file.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common helpers for stackable filesystems and backing files. + * + * Copyright (C) 2023 CTERA Networks. + */ + +#ifndef _LINUX_BACKING_FILE_H +#define _LINUX_BACKING_FILE_H + +#include + +struct file *backing_file_open(const struct path *user_path, int flags, + const struct path *real_path, + const struct cred *cred); + +#endif /* _LINUX_BACKING_FILE_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 900d0cd55b50..db5d07e6e02e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2575,9 +2575,6 @@ struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); -struct file *backing_file_open(const struct path *user_path, int flags, - const struct path *real_path, - const struct cred *cred); struct path *backing_file_user_path(struct file *f); /* From a6293b3e285cd0d7692141d7981a5f144f0e2f0b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 17:48:52 +0200 Subject: [PATCH 27/30] fs: factor out backing_file_{read,write}_iter() helpers Overlayfs submits files io to backing files on other filesystems. Factor out some common helpers to perform io to backing files, into fs/backing-file.c. Suggested-by: Miklos Szeredi Link: https://lore.kernel.org/r/CAJfpeguhmZbjP3JLqtUy0AdWaHOkAPWeP827BBWwRFEAUgnUcQ@mail.gmail.com Signed-off-by: Amir Goldstein --- fs/backing-file.c | 210 +++++++++++++++++++++++++++++++++++ fs/overlayfs/file.c | 188 +++---------------------------- fs/overlayfs/overlayfs.h | 8 +- fs/overlayfs/super.c | 11 +- include/linux/backing-file.h | 15 +++ 5 files changed, 247 insertions(+), 185 deletions(-) diff --git a/fs/backing-file.c b/fs/backing-file.c index 04b33036f709..323187a49da3 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -2,6 +2,9 @@ /* * Common helpers for stackable filesystems and backing files. * + * Forked from fs/overlayfs/file.c. + * + * Copyright (C) 2017 Red Hat, Inc. * Copyright (C) 2023 CTERA Networks. */ @@ -46,3 +49,210 @@ struct file *backing_file_open(const struct path *user_path, int flags, return f; } EXPORT_SYMBOL_GPL(backing_file_open); + +struct backing_aio { + struct kiocb iocb; + refcount_t ref; + struct kiocb *orig_iocb; + /* used for aio completion */ + void (*end_write)(struct file *); + struct work_struct work; + long res; +}; + +static struct kmem_cache *backing_aio_cachep; + +#define BACKING_IOCB_MASK \ + (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) + +static rwf_t iocb_to_rw_flags(int flags) +{ + return (__force rwf_t)(flags & BACKING_IOCB_MASK); +} + +static void backing_aio_put(struct backing_aio *aio) +{ + if (refcount_dec_and_test(&aio->ref)) { + fput(aio->iocb.ki_filp); + kmem_cache_free(backing_aio_cachep, aio); + } +} + +static void backing_aio_cleanup(struct backing_aio *aio, long res) +{ + struct kiocb *iocb = &aio->iocb; + struct kiocb *orig_iocb = aio->orig_iocb; + + if (aio->end_write) + aio->end_write(orig_iocb->ki_filp); + + orig_iocb->ki_pos = iocb->ki_pos; + backing_aio_put(aio); +} + +static void backing_aio_rw_complete(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + struct kiocb *orig_iocb = aio->orig_iocb; + + if (iocb->ki_flags & IOCB_WRITE) + kiocb_end_write(iocb); + + backing_aio_cleanup(aio, res); + orig_iocb->ki_complete(orig_iocb, res); +} + +static void backing_aio_complete_work(struct work_struct *work) +{ + struct backing_aio *aio = container_of(work, struct backing_aio, work); + + backing_aio_rw_complete(&aio->iocb, aio->res); +} + +static void backing_aio_queue_completion(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + + /* + * Punt to a work queue to serialize updates of mtime/size. + */ + aio->res = res; + INIT_WORK(&aio->work, backing_aio_complete_work); + queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq, + &aio->work); +} + +static int backing_aio_init_wq(struct kiocb *iocb) +{ + struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; + + if (sb->s_dio_done_wq) + return 0; + + return sb_init_dio_done_wq(sb); +} + + +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + struct backing_aio *aio = NULL; + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf); + } else { + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_complete = backing_aio_rw_complete; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_read(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_read_iter); + +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + ret = file_remove_privs(ctx->user_file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + /* + * Stacked filesystems don't support deferred completions, don't copy + * this property in case it is set by the issuer. + */ + flags &= ~IOCB_DIO_CALLER_COMP; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); + if (ctx->end_write) + ctx->end_write(ctx->user_file); + } else { + struct backing_aio *aio; + + ret = backing_aio_init_wq(iocb); + if (ret) + goto out; + + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + aio->end_write = ctx->end_write; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_flags = flags; + aio->iocb.ki_complete = backing_aio_queue_completion; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_write(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_write_iter); + +static int __init backing_aio_init(void) +{ + backing_aio_cachep = kmem_cache_create("backing_aio", + sizeof(struct backing_aio), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!backing_aio_cachep) + return -ENOMEM; + + return 0; +} +fs_initcall(backing_aio_init); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index a6da3eaf6d4f..1b578cb27a26 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -16,19 +16,6 @@ #include #include "overlayfs.h" -#include "../internal.h" /* for sb_init_dio_done_wq */ - -struct ovl_aio_req { - struct kiocb iocb; - refcount_t ref; - struct kiocb *orig_iocb; - /* used for aio completion */ - struct work_struct work; - long res; -}; - -static struct kmem_cache *ovl_aio_request_cachep; - static char ovl_whatisit(struct inode *inode, struct inode *realinode) { if (realinode != ovl_inode_upper(inode)) @@ -275,84 +262,16 @@ static void ovl_file_accessed(struct file *file) touch_atime(&file->f_path); } -#define OVL_IOCB_MASK \ - (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) - -static rwf_t iocb_to_rw_flags(int flags) -{ - return (__force rwf_t)(flags & OVL_IOCB_MASK); -} - -static inline void ovl_aio_put(struct ovl_aio_req *aio_req) -{ - if (refcount_dec_and_test(&aio_req->ref)) { - fput(aio_req->iocb.ki_filp); - kmem_cache_free(ovl_aio_request_cachep, aio_req); - } -} - -static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) -{ - struct kiocb *iocb = &aio_req->iocb; - struct kiocb *orig_iocb = aio_req->orig_iocb; - - if (iocb->ki_flags & IOCB_WRITE) - ovl_file_modified(orig_iocb->ki_filp); - - orig_iocb->ki_pos = iocb->ki_pos; - ovl_aio_put(aio_req); -} - -static void ovl_aio_rw_complete(struct kiocb *iocb, long res) -{ - struct ovl_aio_req *aio_req = container_of(iocb, - struct ovl_aio_req, iocb); - struct kiocb *orig_iocb = aio_req->orig_iocb; - - if (iocb->ki_flags & IOCB_WRITE) - kiocb_end_write(iocb); - - ovl_aio_cleanup_handler(aio_req); - orig_iocb->ki_complete(orig_iocb, res); -} - -static void ovl_aio_complete_work(struct work_struct *work) -{ - struct ovl_aio_req *aio_req = container_of(work, - struct ovl_aio_req, work); - - ovl_aio_rw_complete(&aio_req->iocb, aio_req->res); -} - -static void ovl_aio_queue_completion(struct kiocb *iocb, long res) -{ - struct ovl_aio_req *aio_req = container_of(iocb, - struct ovl_aio_req, iocb); - struct kiocb *orig_iocb = aio_req->orig_iocb; - - /* - * Punt to a work queue to serialize updates of mtime/size. - */ - aio_req->res = res; - INIT_WORK(&aio_req->work, ovl_aio_complete_work); - queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq, - &aio_req->work); -} - -static int ovl_init_aio_done_wq(struct super_block *sb) -{ - if (sb->s_dio_done_wq) - return 0; - - return sb_init_dio_done_wq(sb); -} - static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct fd real; - const struct cred *old_cred; ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(file)->i_sb), + .user_file = file, + .accessed = ovl_file_accessed, + }; if (!iov_iter_count(iter)) return 0; @@ -361,37 +280,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) return ret; - ret = -EINVAL; - if (iocb->ki_flags & IOCB_DIRECT && - !(real.file->f_mode & FMODE_CAN_ODIRECT)) - goto out_fdput; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - if (is_sync_kiocb(iocb)) { - rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags); - - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf); - } else { - struct ovl_aio_req *aio_req; - - ret = -ENOMEM; - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); - if (!aio_req) - goto out; - - aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); - aio_req->iocb.ki_complete = ovl_aio_rw_complete; - refcount_set(&aio_req->ref, 2); - ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); - ovl_aio_put(aio_req); - if (ret != -EIOCBQUEUED) - ovl_aio_cleanup_handler(aio_req); - } -out: - revert_creds(old_cred); - ovl_file_accessed(file); -out_fdput: + ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags, + &ctx); fdput(real); return ret; @@ -402,9 +292,13 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct fd real; - const struct cred *old_cred; ssize_t ret; int ifl = iocb->ki_flags; + struct backing_file_ctx ctx = { + .cred = ovl_creds(inode->i_sb), + .user_file = file, + .end_write = ovl_file_modified, + }; if (!iov_iter_count(iter)) return 0; @@ -412,19 +306,11 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = file_remove_privs(file); - if (ret) - goto out_unlock; ret = ovl_real_fdget(file, &real); if (ret) goto out_unlock; - ret = -EINVAL; - if (iocb->ki_flags & IOCB_DIRECT && - !(real.file->f_mode & FMODE_CAN_ODIRECT)) - goto out_fdput; - if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); @@ -433,39 +319,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) * this property in case it is set by the issuer. */ ifl &= ~IOCB_DIO_CALLER_COMP; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - if (is_sync_kiocb(iocb)) { - rwf_t rwf = iocb_to_rw_flags(ifl); - - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf); - /* Update size */ - ovl_file_modified(file); - } else { - struct ovl_aio_req *aio_req; - - ret = ovl_init_aio_done_wq(inode->i_sb); - if (ret) - goto out; - - ret = -ENOMEM; - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); - if (!aio_req) - goto out; - - aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); - aio_req->iocb.ki_flags = ifl; - aio_req->iocb.ki_complete = ovl_aio_queue_completion; - refcount_set(&aio_req->ref, 2); - ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); - ovl_aio_put(aio_req); - if (ret != -EIOCBQUEUED) - ovl_aio_cleanup_handler(aio_req); - } -out: - revert_creds(old_cred); -out_fdput: + ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx); fdput(real); out_unlock: @@ -777,19 +631,3 @@ const struct file_operations ovl_file_operations = { .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, }; - -int __init ovl_aio_request_cache_init(void) -{ - ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req", - sizeof(struct ovl_aio_req), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ovl_aio_request_cachep) - return -ENOMEM; - - return 0; -} - -void ovl_aio_request_cache_destroy(void) -{ - kmem_cache_destroy(ovl_aio_request_cachep); -} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index ca88b2636a57..509a57c85fae 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -417,6 +417,12 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); + +static inline const struct cred *ovl_creds(struct super_block *sb) +{ + return OVL_FS(sb)->creator_cred; +} + int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); @@ -829,8 +835,6 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, /* file.c */ extern const struct file_operations ovl_file_operations; -int __init ovl_aio_request_cache_init(void); -void ovl_aio_request_cache_destroy(void); int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a0967bb25003..bcd4c314a7eb 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1501,14 +1501,10 @@ static int __init ovl_init(void) if (ovl_inode_cachep == NULL) return -ENOMEM; - err = ovl_aio_request_cache_init(); - if (!err) { - err = register_filesystem(&ovl_fs_type); - if (!err) - return 0; + err = register_filesystem(&ovl_fs_type); + if (!err) + return 0; - ovl_aio_request_cache_destroy(); - } kmem_cache_destroy(ovl_inode_cachep); return err; @@ -1524,7 +1520,6 @@ static void __exit ovl_exit(void) */ rcu_barrier(); kmem_cache_destroy(ovl_inode_cachep); - ovl_aio_request_cache_destroy(); } module_init(ovl_init); diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 55c9e804f780..0648d548a418 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -9,9 +9,24 @@ #define _LINUX_BACKING_FILE_H #include +#include +#include + +struct backing_file_ctx { + const struct cred *cred; + struct file *user_file; + void (*accessed)(struct file *); + void (*end_write)(struct file *); +}; struct file *backing_file_open(const struct path *user_path, int flags, const struct path *real_path, const struct cred *cred); +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx); #endif /* _LINUX_BACKING_FILE_H */ From 9b7e9e2f5d5c3d079ec46bc71b114012e362ea6e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 13 Oct 2023 12:13:12 +0300 Subject: [PATCH 28/30] fs: factor out backing_file_splice_{read,write}() helpers There is not much in those helpers, but it makes sense to have them logically next to the backing_file_{read,write}_iter() helpers as they may grow more common logic in the future. Signed-off-by: Amir Goldstein --- fs/backing-file.c | 51 ++++++++++++++++++++++++++++++++++++ fs/overlayfs/file.c | 33 +++++++++-------------- include/linux/backing-file.h | 8 ++++++ 3 files changed, 72 insertions(+), 20 deletions(-) diff --git a/fs/backing-file.c b/fs/backing-file.c index 323187a49da3..ddd35c1d6c71 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -10,6 +10,7 @@ #include #include +#include #include "internal.h" @@ -245,6 +246,56 @@ out: } EXPORT_SYMBOL_GPL(backing_file_write_iter); +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) + return -EIO; + + old_cred = override_creds(ctx->cred); + ret = vfs_splice_read(in, ppos, pipe, len, flags); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_read); + +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) + return -EIO; + + ret = file_remove_privs(ctx->user_file); + if (ret) + return ret; + + old_cred = override_creds(ctx->cred); + file_start_write(out); + ret = iter_file_splice_write(pipe, out, ppos, len, flags); + file_end_write(out); + revert_creds(old_cred); + + if (ctx->end_write) + ctx->end_write(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_write); + static int __init backing_aio_init(void) { backing_aio_cachep = kmem_cache_create("backing_aio", diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 1b578cb27a26..69b52d2f9c74 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -332,20 +331,21 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - const struct cred *old_cred; struct fd real; ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(in)->i_sb), + .user_file = in, + .accessed = ovl_file_accessed, + }; ret = ovl_real_fdget(in, &real); if (ret) return ret; - old_cred = ovl_override_creds(file_inode(in)->i_sb); - ret = vfs_splice_read(real.file, ppos, pipe, len, flags); - revert_creds(old_cred); - ovl_file_accessed(in); - + ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx); fdput(real); + return ret; } @@ -361,30 +361,23 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct fd real; - const struct cred *old_cred; struct inode *inode = file_inode(out); ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(inode->i_sb), + .user_file = out, + .end_write = ovl_file_modified, + }; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = file_remove_privs(out); - if (ret) - goto out_unlock; ret = ovl_real_fdget(out, &real); if (ret) goto out_unlock; - old_cred = ovl_override_creds(inode->i_sb); - file_start_write(real.file); - - ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); - - file_end_write(real.file); - /* Update size */ - ovl_file_modified(out); - revert_creds(old_cred); + ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx); fdput(real); out_unlock: diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 0648d548a418..0546d5b1c9f5 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -28,5 +28,13 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, struct kiocb *iocb, int flags, struct backing_file_ctx *ctx); +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx); #endif /* _LINUX_BACKING_FILE_H */ From f567377e406c032fff0799bde4fdf4a977529b84 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 13 Oct 2023 12:49:37 +0300 Subject: [PATCH 29/30] fs: factor out backing_file_mmap() helper Assert that the file object is allocated in a backing_file container so that file_user_path() could be used to display the user path and not the backing file's path in /proc//maps. Signed-off-by: Amir Goldstein --- fs/backing-file.c | 27 +++++++++++++++++++++++++++ fs/overlayfs/file.c | 23 ++++++----------------- include/linux/backing-file.h | 2 ++ 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/fs/backing-file.c b/fs/backing-file.c index ddd35c1d6c71..a681f38d84d8 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" @@ -296,6 +297,32 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, } EXPORT_SYMBOL_GPL(backing_file_splice_write); +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + int ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) || + WARN_ON_ONCE(ctx->user_file != vma->vm_file)) + return -EIO; + + if (!file->f_op->mmap) + return -ENODEV; + + vma_set_file(vma, file); + + old_cred = override_creds(ctx->cred); + ret = call_mmap(vma->vm_file, vma); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_mmap); + static int __init backing_aio_init(void) { backing_aio_cachep = kmem_cache_create("backing_aio", diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 69b52d2f9c74..05536964d37f 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include "overlayfs.h" @@ -415,23 +414,13 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) static int ovl_mmap(struct file *file, struct vm_area_struct *vma) { struct file *realfile = file->private_data; - const struct cred *old_cred; - int ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(file)->i_sb), + .user_file = file, + .accessed = ovl_file_accessed, + }; - if (!realfile->f_op->mmap) - return -ENODEV; - - if (WARN_ON(file != vma->vm_file)) - return -EIO; - - vma_set_file(vma, realfile); - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = call_mmap(vma->vm_file, vma); - revert_creds(old_cred); - ovl_file_accessed(file); - - return ret; + return backing_file_mmap(realfile, vma, &ctx); } static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 0546d5b1c9f5..3f1fe1774f1b 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -36,5 +36,7 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, struct backing_file_ctx *ctx); +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx); #endif /* _LINUX_BACKING_FILE_H */ From c39e2ae3943d4ee278af4e1b1dcfd5946da1089b Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 28 Dec 2023 11:06:08 +0100 Subject: [PATCH 30/30] fs: fix __sb_write_started() kerneldoc formatting When running 'make htmldocs', I see the following warning: Documentation/filesystems/api-summary:14: ./include/linux/fs.h:1659: WARNING: Definition list ends without a blank line; unexpected unindent. The official guidance [1] seems to be to use lists, which will prevent both the "unexpected unindent" warning as well as ensure that each line is formatted on a separate line in the HTML output instead of being all considered a single paragraph. [1]: https://docs.kernel.org/doc-guide/kernel-doc.html#return-values Fixes: 8802e580ee64 ("fs: create __sb_write_started() helper") Cc: Amir Goldstein Cc: Josef Bacik Cc: Jan Kara Signed-off-by: Vegard Nossum Link: https://lore.kernel.org/r/20231228100608.3123987-1-vegard.nossum@oracle.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index db5d07e6e02e..473063f385e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1650,9 +1650,9 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level) * @sb: the super we write to * @level: the freeze level * - * > 0 sb freeze level is held - * 0 sb freeze level is not held - * < 0 !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN + * * > 0 - sb freeze level is held + * * 0 - sb freeze level is not held + * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN */ static inline int __sb_write_started(const struct super_block *sb, int level) {