linux/fs/netfs/buffered_read.c

634 lines
19 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/* Network filesystem high-level buffered read support.
*
* Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/export.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
/*
* Unlock the folios in a read operation. We need to set PG_fscache on any
* folios we're going to write back before we unlock them.
*/
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
struct netfs_folio *finfo;
struct folio *folio;
pgoff_t start_page = rreq->start / PAGE_SIZE;
pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
size_t account = 0;
bool subreq_failed = false;
XA_STATE(xas, &rreq->mapping->i_pages, start_page);
if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
__clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
}
}
/* Walk through the pagecache and the I/O request lists simultaneously.
* We may have a mixture of cached and uncached sections and we only
* really want to write out the uncached sections. This is slightly
* complicated by the possibility that we might have huge pages with a
* mixture inside.
*/
subreq = list_first_entry(&rreq->subrequests,
struct netfs_io_subrequest, rreq_link);
subreq_failed = (subreq->error < 0);
trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
rcu_read_lock();
xas_for_each(&xas, folio, last_page) {
loff_t pg_end;
bool pg_failed = false;
netfs: Only call folio_start_fscache() one time for each folio If a network filesystem using netfs implements a clamp_length() function, it can set subrequest lengths smaller than a page size. When we loop through the folios in netfs_rreq_unlock_folios() to set any folios to be written back, we need to make sure we only call folio_start_fscache() once for each folio. Otherwise, this simple testcase: mount -o fsc,rsize=1024,wsize=1024 127.0.0.1:/export /mnt/nfs dd if=/dev/zero of=/mnt/nfs/file.bin bs=4096 count=1 1+0 records in 1+0 records out 4096 bytes (4.1 kB, 4.0 KiB) copied, 0.0126359 s, 324 kB/s echo 3 > /proc/sys/vm/drop_caches cat /mnt/nfs/file.bin > /dev/null will trigger an oops similar to the following: page dumped because: VM_BUG_ON_FOLIO(folio_test_private_2(folio)) ------------[ cut here ]------------ kernel BUG at include/linux/netfs.h:44! ... CPU: 5 PID: 134 Comm: kworker/u16:5 Kdump: loaded Not tainted 6.4.0-rc5 ... RIP: 0010:netfs_rreq_unlock_folios+0x68e/0x730 [netfs] ... Call Trace: netfs_rreq_assess+0x497/0x660 [netfs] netfs_subreq_terminated+0x32b/0x610 [netfs] nfs_netfs_read_completion+0x14e/0x1a0 [nfs] nfs_read_completion+0x2f9/0x330 [nfs] rpc_free_task+0x72/0xa0 [sunrpc] rpc_async_release+0x46/0x70 [sunrpc] process_one_work+0x3bd/0x710 worker_thread+0x89/0x610 kthread+0x181/0x1c0 ret_from_fork+0x29/0x50 Fixes: 3d3c95046742 ("netfs: Provide readahead and readpage netfs helpers" Link: https://bugzilla.redhat.com/show_bug.cgi?id=2210612 Signed-off-by: Dave Wysochanski <dwysocha@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Link: https://lore.kernel.org/r/20230608214137.856006-1-dwysocha@redhat.com/ # v1 Link: https://lore.kernel.org/r/20230915185704.1082982-1-dwysocha@redhat.com/ # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-09-18 16:17:11 +03:00
bool folio_started;
netfs: Fix missing xas_retry() calls in xarray iteration netfslib has a number of places in which it performs iteration of an xarray whilst being under the RCU read lock. It *should* call xas_retry() as the first thing inside of the loop and do "continue" if it returns true in case the xarray walker passed out a special value indicating that the walk needs to be redone from the root[*]. Fix this by adding the missing retry checks. [*] I wonder if this should be done inside xas_find(), xas_next_node() and suchlike, but I'm told that's not an simple change to effect. This can cause an oops like that below. Note the faulting address - this is an internal value (|0x2) returned from xarray. BUG: kernel NULL pointer dereference, address: 0000000000000402 ... RIP: 0010:netfs_rreq_unlock+0xef/0x380 [netfs] ... Call Trace: netfs_rreq_assess+0xa6/0x240 [netfs] netfs_readpage+0x173/0x3b0 [netfs] ? init_wait_var_entry+0x50/0x50 filemap_read_page+0x33/0xf0 filemap_get_pages+0x2f2/0x3f0 filemap_read+0xaa/0x320 ? do_filp_open+0xb2/0x150 ? rmqueue+0x3be/0xe10 ceph_read_iter+0x1fe/0x680 [ceph] ? new_sync_read+0x115/0x1a0 new_sync_read+0x115/0x1a0 vfs_read+0xf3/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Changes: ======== ver #2) - Changed an unsigned int to a size_t to reduce the likelihood of an overflow as per Willy's suggestion. - Added an additional patch to fix the maths. Fixes: 3d3c95046742 ("netfs: Provide readahead and readpage netfs helpers") Reported-by: George Law <glaw@redhat.com> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com> cc: Matthew Wilcox <willy@infradead.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/166749229733.107206.17482609105741691452.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/166757987929.950645.12595273010425381286.stgit@warthog.procyon.org.uk/ # v2
2022-11-03 19:08:14 +03:00
if (xas_retry(&xas, folio))
continue;
pg_end = folio_pos(folio) + folio_size(folio) - 1;
netfs: Fix missing xas_retry() calls in xarray iteration netfslib has a number of places in which it performs iteration of an xarray whilst being under the RCU read lock. It *should* call xas_retry() as the first thing inside of the loop and do "continue" if it returns true in case the xarray walker passed out a special value indicating that the walk needs to be redone from the root[*]. Fix this by adding the missing retry checks. [*] I wonder if this should be done inside xas_find(), xas_next_node() and suchlike, but I'm told that's not an simple change to effect. This can cause an oops like that below. Note the faulting address - this is an internal value (|0x2) returned from xarray. BUG: kernel NULL pointer dereference, address: 0000000000000402 ... RIP: 0010:netfs_rreq_unlock+0xef/0x380 [netfs] ... Call Trace: netfs_rreq_assess+0xa6/0x240 [netfs] netfs_readpage+0x173/0x3b0 [netfs] ? init_wait_var_entry+0x50/0x50 filemap_read_page+0x33/0xf0 filemap_get_pages+0x2f2/0x3f0 filemap_read+0xaa/0x320 ? do_filp_open+0xb2/0x150 ? rmqueue+0x3be/0xe10 ceph_read_iter+0x1fe/0x680 [ceph] ? new_sync_read+0x115/0x1a0 new_sync_read+0x115/0x1a0 vfs_read+0xf3/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Changes: ======== ver #2) - Changed an unsigned int to a size_t to reduce the likelihood of an overflow as per Willy's suggestion. - Added an additional patch to fix the maths. Fixes: 3d3c95046742 ("netfs: Provide readahead and readpage netfs helpers") Reported-by: George Law <glaw@redhat.com> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com> cc: Matthew Wilcox <willy@infradead.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/166749229733.107206.17482609105741691452.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/166757987929.950645.12595273010425381286.stgit@warthog.procyon.org.uk/ # v2
2022-11-03 19:08:14 +03:00
netfs: Only call folio_start_fscache() one time for each folio If a network filesystem using netfs implements a clamp_length() function, it can set subrequest lengths smaller than a page size. When we loop through the folios in netfs_rreq_unlock_folios() to set any folios to be written back, we need to make sure we only call folio_start_fscache() once for each folio. Otherwise, this simple testcase: mount -o fsc,rsize=1024,wsize=1024 127.0.0.1:/export /mnt/nfs dd if=/dev/zero of=/mnt/nfs/file.bin bs=4096 count=1 1+0 records in 1+0 records out 4096 bytes (4.1 kB, 4.0 KiB) copied, 0.0126359 s, 324 kB/s echo 3 > /proc/sys/vm/drop_caches cat /mnt/nfs/file.bin > /dev/null will trigger an oops similar to the following: page dumped because: VM_BUG_ON_FOLIO(folio_test_private_2(folio)) ------------[ cut here ]------------ kernel BUG at include/linux/netfs.h:44! ... CPU: 5 PID: 134 Comm: kworker/u16:5 Kdump: loaded Not tainted 6.4.0-rc5 ... RIP: 0010:netfs_rreq_unlock_folios+0x68e/0x730 [netfs] ... Call Trace: netfs_rreq_assess+0x497/0x660 [netfs] netfs_subreq_terminated+0x32b/0x610 [netfs] nfs_netfs_read_completion+0x14e/0x1a0 [nfs] nfs_read_completion+0x2f9/0x330 [nfs] rpc_free_task+0x72/0xa0 [sunrpc] rpc_async_release+0x46/0x70 [sunrpc] process_one_work+0x3bd/0x710 worker_thread+0x89/0x610 kthread+0x181/0x1c0 ret_from_fork+0x29/0x50 Fixes: 3d3c95046742 ("netfs: Provide readahead and readpage netfs helpers" Link: https://bugzilla.redhat.com/show_bug.cgi?id=2210612 Signed-off-by: Dave Wysochanski <dwysocha@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Link: https://lore.kernel.org/r/20230608214137.856006-1-dwysocha@redhat.com/ # v1 Link: https://lore.kernel.org/r/20230915185704.1082982-1-dwysocha@redhat.com/ # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-09-18 16:17:11 +03:00
folio_started = false;
for (;;) {
loff_t sreq_end;
if (!subreq) {
pg_failed = true;
break;
}
netfs: Only call folio_start_fscache() one time for each folio If a network filesystem using netfs implements a clamp_length() function, it can set subrequest lengths smaller than a page size. When we loop through the folios in netfs_rreq_unlock_folios() to set any folios to be written back, we need to make sure we only call folio_start_fscache() once for each folio. Otherwise, this simple testcase: mount -o fsc,rsize=1024,wsize=1024 127.0.0.1:/export /mnt/nfs dd if=/dev/zero of=/mnt/nfs/file.bin bs=4096 count=1 1+0 records in 1+0 records out 4096 bytes (4.1 kB, 4.0 KiB) copied, 0.0126359 s, 324 kB/s echo 3 > /proc/sys/vm/drop_caches cat /mnt/nfs/file.bin > /dev/null will trigger an oops similar to the following: page dumped because: VM_BUG_ON_FOLIO(folio_test_private_2(folio)) ------------[ cut here ]------------ kernel BUG at include/linux/netfs.h:44! ... CPU: 5 PID: 134 Comm: kworker/u16:5 Kdump: loaded Not tainted 6.4.0-rc5 ... RIP: 0010:netfs_rreq_unlock_folios+0x68e/0x730 [netfs] ... Call Trace: netfs_rreq_assess+0x497/0x660 [netfs] netfs_subreq_terminated+0x32b/0x610 [netfs] nfs_netfs_read_completion+0x14e/0x1a0 [nfs] nfs_read_completion+0x2f9/0x330 [nfs] rpc_free_task+0x72/0xa0 [sunrpc] rpc_async_release+0x46/0x70 [sunrpc] process_one_work+0x3bd/0x710 worker_thread+0x89/0x610 kthread+0x181/0x1c0 ret_from_fork+0x29/0x50 Fixes: 3d3c95046742 ("netfs: Provide readahead and readpage netfs helpers" Link: https://bugzilla.redhat.com/show_bug.cgi?id=2210612 Signed-off-by: Dave Wysochanski <dwysocha@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Link: https://lore.kernel.org/r/20230608214137.856006-1-dwysocha@redhat.com/ # v1 Link: https://lore.kernel.org/r/20230915185704.1082982-1-dwysocha@redhat.com/ # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-09-18 16:17:11 +03:00
if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
folio_start_fscache(folio);
netfs: Only call folio_start_fscache() one time for each folio If a network filesystem using netfs implements a clamp_length() function, it can set subrequest lengths smaller than a page size. When we loop through the folios in netfs_rreq_unlock_folios() to set any folios to be written back, we need to make sure we only call folio_start_fscache() once for each folio. Otherwise, this simple testcase: mount -o fsc,rsize=1024,wsize=1024 127.0.0.1:/export /mnt/nfs dd if=/dev/zero of=/mnt/nfs/file.bin bs=4096 count=1 1+0 records in 1+0 records out 4096 bytes (4.1 kB, 4.0 KiB) copied, 0.0126359 s, 324 kB/s echo 3 > /proc/sys/vm/drop_caches cat /mnt/nfs/file.bin > /dev/null will trigger an oops similar to the following: page dumped because: VM_BUG_ON_FOLIO(folio_test_private_2(folio)) ------------[ cut here ]------------ kernel BUG at include/linux/netfs.h:44! ... CPU: 5 PID: 134 Comm: kworker/u16:5 Kdump: loaded Not tainted 6.4.0-rc5 ... RIP: 0010:netfs_rreq_unlock_folios+0x68e/0x730 [netfs] ... Call Trace: netfs_rreq_assess+0x497/0x660 [netfs] netfs_subreq_terminated+0x32b/0x610 [netfs] nfs_netfs_read_completion+0x14e/0x1a0 [nfs] nfs_read_completion+0x2f9/0x330 [nfs] rpc_free_task+0x72/0xa0 [sunrpc] rpc_async_release+0x46/0x70 [sunrpc] process_one_work+0x3bd/0x710 worker_thread+0x89/0x610 kthread+0x181/0x1c0 ret_from_fork+0x29/0x50 Fixes: 3d3c95046742 ("netfs: Provide readahead and readpage netfs helpers" Link: https://bugzilla.redhat.com/show_bug.cgi?id=2210612 Signed-off-by: Dave Wysochanski <dwysocha@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Link: https://lore.kernel.org/r/20230608214137.856006-1-dwysocha@redhat.com/ # v1 Link: https://lore.kernel.org/r/20230915185704.1082982-1-dwysocha@redhat.com/ # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-09-18 16:17:11 +03:00
folio_started = true;
}
pg_failed |= subreq_failed;
sreq_end = subreq->start + subreq->len - 1;
if (pg_end < sreq_end)
break;
account += subreq->transferred;
if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
subreq = list_next_entry(subreq, rreq_link);
subreq_failed = (subreq->error < 0);
} else {
subreq = NULL;
subreq_failed = false;
}
if (pg_end == sreq_end)
break;
}
if (!pg_failed) {
flush_dcache_folio(folio);
finfo = netfs_folio_info(folio);
if (finfo) {
trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
if (finfo->netfs_group)
folio_change_private(folio, finfo->netfs_group);
else
folio_detach_private(folio);
kfree(finfo);
}
folio_mark_uptodate(folio);
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
_debug("no unlock");
else
folio_unlock(folio);
}
}
rcu_read_unlock();
task_io_account_read(account);
if (rreq->netfs_ops->done)
rreq->netfs_ops->done(rreq);
}
static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
loff_t *_start, size_t *_len, loff_t i_size)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
if (cres->ops && cres->ops->expand_readahead)
cres->ops->expand_readahead(cres, _start, _len, i_size);
}
static void netfs_rreq_expand(struct netfs_io_request *rreq,
struct readahead_control *ractl)
{
/* Give the cache a chance to change the request parameters. The
* resultant request must contain the original region.
*/
netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
/* Give the netfs a chance to change the request parameters. The
* resultant request must contain the original region.
*/
if (rreq->netfs_ops->expand_readahead)
rreq->netfs_ops->expand_readahead(rreq);
/* Expand the request if the cache wants it to start earlier. Note
* that the expansion may get further extended if the VM wishes to
* insert THPs and the preferred start and/or end wind up in the middle
* of THPs.
*
* If this is the case, however, the THP size should be an integer
* multiple of the cache granule size, so we get a whole number of
* granules to deal with.
*/
if (rreq->start != readahead_pos(ractl) ||
rreq->len != readahead_length(ractl)) {
readahead_expand(ractl, rreq->start, rreq->len);
rreq->start = readahead_pos(ractl);
rreq->len = readahead_length(ractl);
trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
netfs_read_trace_expanded);
}
}
/*
* Begin an operation, and fetch the stored zero point value from the cookie if
* available.
*/
static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
{
return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
}
/**
* netfs_readahead - Helper to manage a read request
* @ractl: The description of the readahead request
*
* Fulfil a readahead request by drawing data from the cache if possible, or
* the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O
* requests from different sources will get munged together. If necessary, the
* readahead window can be expanded in either direction to a more convenient
* alighment for RPC efficiency or to make storage in the cache feasible.
*
* The calling netfs must initialise a netfs context contiguous to the vfs
* inode before calling this.
*
* This is usable whether or not caching is enabled.
*/
void netfs_readahead(struct readahead_control *ractl)
{
struct netfs_io_request *rreq;
netfs: Fix gcc-12 warning by embedding vfs inode in netfs_i_context While randstruct was satisfied with using an open-coded "void *" offset cast for the netfs_i_context <-> inode casting, __builtin_object_size() as used by FORTIFY_SOURCE was not as easily fooled. This was causing the following complaint[1] from gcc v12: In file included from include/linux/string.h:253, from include/linux/ceph/ceph_debug.h:7, from fs/ceph/inode.c:2: In function 'fortify_memset_chk', inlined from 'netfs_i_context_init' at include/linux/netfs.h:326:2, inlined from 'ceph_alloc_inode' at fs/ceph/inode.c:463:2: include/linux/fortify-string.h:242:25: warning: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by embedding a struct inode into struct netfs_i_context (which should perhaps be renamed to struct netfs_inode). The struct inode vfs_inode fields are then removed from the 9p, afs, ceph and cifs inode structs and vfs_inode is then simply changed to "netfs.inode" in those filesystems. Further, rename netfs_i_context to netfs_inode, get rid of the netfs_inode() function that converted a netfs_i_context pointer to an inode pointer (that can now be done with &ctx->inode) and rename the netfs_i_context() function to netfs_inode() (which is now a wrapper around container_of()). Most of the changes were done with: perl -p -i -e 's/vfs_inode/netfs.inode/'g \ `git grep -l 'vfs_inode' -- fs/{9p,afs,ceph,cifs}/*.[ch]` Kees suggested doing it with a pair structure[2] and a special declarator to insert that into the network filesystem's inode wrapper[3], but I think it's cleaner to embed it - and then it doesn't matter if struct randomisation reorders things. Dave Chinner suggested using a filesystem-specific VFS_I() function in each filesystem to convert that filesystem's own inode wrapper struct into the VFS inode struct[4]. Version #2: - Fix a couple of missed name changes due to a disabled cifs option. - Rename nfs_i_context to nfs_inode - Use "netfs" instead of "nic" as the member name in per-fs inode wrapper structs. [ This also undoes commit 507160f46c55 ("netfs: gcc-12: temporarily disable '-Wattribute-warning' for now") that is no longer needed ] Fixes: bc899ee1c898 ("netfs: Add a netfs inode context") Reported-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Xiubo Li <xiubli@redhat.com> cc: Jonathan Corbet <corbet@lwn.net> cc: Eric Van Hensbergen <ericvh@gmail.com> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: Ilya Dryomov <idryomov@gmail.com> cc: Steve French <smfrench@gmail.com> cc: William Kucharski <william.kucharski@oracle.com> cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> cc: Dave Chinner <david@fromorbit.com> cc: linux-doc@vger.kernel.org cc: v9fs-developer@lists.sourceforge.net cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: samba-technical@lists.samba.org cc: linux-fsdevel@vger.kernel.org cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/r/d2ad3a3d7bdd794c6efb562d2f2b655fb67756b9.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/20220517210230.864239-1-keescook@chromium.org/ [2] Link: https://lore.kernel.org/r/20220518202212.2322058-1-keescook@chromium.org/ [3] Link: https://lore.kernel.org/r/20220524101205.GI2306852@dread.disaster.area/ [4] Link: https://lore.kernel.org/r/165296786831.3591209.12111293034669289733.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165305805651.4094995.7763502506786714216.stgit@warthog.procyon.org.uk # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-06-09 23:46:04 +03:00
struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
int ret;
_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
if (readahead_count(ractl) == 0)
return;
rreq = netfs_alloc_request(ractl->mapping, ractl->file,
readahead_pos(ractl),
readahead_length(ractl),
NETFS_READAHEAD);
if (IS_ERR(rreq))
return;
ret = netfs_begin_cache_read(rreq, ctx);
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto cleanup_free;
netfs_stat(&netfs_n_rh_readahead);
trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
netfs_read_trace_readahead);
netfs_rreq_expand(rreq, ractl);
/* Set up the output buffer */
iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
rreq->start, rreq->len);
/* Drop the refs on the folios here rather than in the cache or
* filesystem. The locks will be dropped in netfs_rreq_unlock().
*/
while (readahead_folio(ractl))
;
netfs_begin_read(rreq, false);
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return;
cleanup_free:
netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
return;
}
EXPORT_SYMBOL(netfs_readahead);
/**
* netfs_read_folio - Helper to manage a read_folio request
* @file: The file to read from
* @folio: The folio to read
*
* Fulfil a read_folio request by drawing data from the cache if
* possible, or the netfs if not. Space beyond the EOF is zero-filled.
* Multiple I/O requests from different sources will get munged together.
*
* The calling netfs must initialise a netfs context contiguous to the vfs
* inode before calling this.
*
* This is usable whether or not caching is enabled.
*/
int netfs_read_folio(struct file *file, struct folio *folio)
{
struct address_space *mapping = folio->mapping;
struct netfs_io_request *rreq;
netfs: Fix gcc-12 warning by embedding vfs inode in netfs_i_context While randstruct was satisfied with using an open-coded "void *" offset cast for the netfs_i_context <-> inode casting, __builtin_object_size() as used by FORTIFY_SOURCE was not as easily fooled. This was causing the following complaint[1] from gcc v12: In file included from include/linux/string.h:253, from include/linux/ceph/ceph_debug.h:7, from fs/ceph/inode.c:2: In function 'fortify_memset_chk', inlined from 'netfs_i_context_init' at include/linux/netfs.h:326:2, inlined from 'ceph_alloc_inode' at fs/ceph/inode.c:463:2: include/linux/fortify-string.h:242:25: warning: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by embedding a struct inode into struct netfs_i_context (which should perhaps be renamed to struct netfs_inode). The struct inode vfs_inode fields are then removed from the 9p, afs, ceph and cifs inode structs and vfs_inode is then simply changed to "netfs.inode" in those filesystems. Further, rename netfs_i_context to netfs_inode, get rid of the netfs_inode() function that converted a netfs_i_context pointer to an inode pointer (that can now be done with &ctx->inode) and rename the netfs_i_context() function to netfs_inode() (which is now a wrapper around container_of()). Most of the changes were done with: perl -p -i -e 's/vfs_inode/netfs.inode/'g \ `git grep -l 'vfs_inode' -- fs/{9p,afs,ceph,cifs}/*.[ch]` Kees suggested doing it with a pair structure[2] and a special declarator to insert that into the network filesystem's inode wrapper[3], but I think it's cleaner to embed it - and then it doesn't matter if struct randomisation reorders things. Dave Chinner suggested using a filesystem-specific VFS_I() function in each filesystem to convert that filesystem's own inode wrapper struct into the VFS inode struct[4]. Version #2: - Fix a couple of missed name changes due to a disabled cifs option. - Rename nfs_i_context to nfs_inode - Use "netfs" instead of "nic" as the member name in per-fs inode wrapper structs. [ This also undoes commit 507160f46c55 ("netfs: gcc-12: temporarily disable '-Wattribute-warning' for now") that is no longer needed ] Fixes: bc899ee1c898 ("netfs: Add a netfs inode context") Reported-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Xiubo Li <xiubli@redhat.com> cc: Jonathan Corbet <corbet@lwn.net> cc: Eric Van Hensbergen <ericvh@gmail.com> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: Ilya Dryomov <idryomov@gmail.com> cc: Steve French <smfrench@gmail.com> cc: William Kucharski <william.kucharski@oracle.com> cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> cc: Dave Chinner <david@fromorbit.com> cc: linux-doc@vger.kernel.org cc: v9fs-developer@lists.sourceforge.net cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: samba-technical@lists.samba.org cc: linux-fsdevel@vger.kernel.org cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/r/d2ad3a3d7bdd794c6efb562d2f2b655fb67756b9.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/20220517210230.864239-1-keescook@chromium.org/ [2] Link: https://lore.kernel.org/r/20220518202212.2322058-1-keescook@chromium.org/ [3] Link: https://lore.kernel.org/r/20220524101205.GI2306852@dread.disaster.area/ [4] Link: https://lore.kernel.org/r/165296786831.3591209.12111293034669289733.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165305805651.4094995.7763502506786714216.stgit@warthog.procyon.org.uk # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-06-09 23:46:04 +03:00
struct netfs_inode *ctx = netfs_inode(mapping->host);
struct folio *sink = NULL;
int ret;
_enter("%lx", folio->index);
rreq = netfs_alloc_request(mapping, file,
folio_file_pos(folio), folio_size(folio),
NETFS_READPAGE);
if (IS_ERR(rreq)) {
ret = PTR_ERR(rreq);
goto alloc_error;
}
ret = netfs_begin_cache_read(rreq, ctx);
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto discard;
netfs_stat(&netfs_n_rh_readpage);
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
/* Set up the output buffer */
if (folio_test_dirty(folio)) {
/* Handle someone trying to read from an unflushed streaming
* write. We fiddle the buffer so that a gap at the beginning
* and/or a gap at the end get copied to, but the middle is
* discarded.
*/
struct netfs_folio *finfo = netfs_folio_info(folio);
struct bio_vec *bvec;
unsigned int from = finfo->dirty_offset;
unsigned int to = from + finfo->dirty_len;
unsigned int off = 0, i = 0;
size_t flen = folio_size(folio);
size_t nr_bvec = flen / PAGE_SIZE + 2;
size_t part;
ret = -ENOMEM;
bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
if (!bvec)
goto discard;
sink = folio_alloc(GFP_KERNEL, 0);
if (!sink)
goto discard;
trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
rreq->direct_bv = bvec;
rreq->direct_bv_count = nr_bvec;
if (from > 0) {
bvec_set_folio(&bvec[i++], folio, from, 0);
off = from;
}
while (off < to) {
part = min_t(size_t, to - off, PAGE_SIZE);
bvec_set_folio(&bvec[i++], sink, part, 0);
off += part;
}
if (to < flen)
bvec_set_folio(&bvec[i++], folio, flen - to, to);
iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
} else {
iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
rreq->start, rreq->len);
}
ret = netfs_begin_read(rreq, true);
if (sink)
folio_put(sink);
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return ret < 0 ? ret : 0;
discard:
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
alloc_error:
folio_unlock(folio);
return ret;
}
EXPORT_SYMBOL(netfs_read_folio);
/*
* Prepare a folio for writing without reading first
* @folio: The folio being prepared
* @pos: starting position for the write
* @len: length of write
* @always_fill: T if the folio should always be completely filled/cleared
*
* In some cases, write_begin doesn't need to read at all:
* - full folio write
* - write that lies in a folio that is completely beyond EOF
* - write that covers the folio from start to EOF or beyond it
*
* If any of these criteria are met, then zero out the unwritten parts
* of the folio and return true. Otherwise, return false.
*/
static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
bool always_fill)
{
struct inode *inode = folio_inode(folio);
loff_t i_size = i_size_read(inode);
size_t offset = offset_in_folio(folio, pos);
size_t plen = folio_size(folio);
if (unlikely(always_fill)) {
if (pos - offset + len <= i_size)
return false; /* Page entirely before EOF */
zero_user_segment(&folio->page, 0, plen);
folio_mark_uptodate(folio);
return true;
}
/* Full folio write */
if (offset == 0 && len >= plen)
return true;
/* Page entirely beyond the end of the file */
if (pos - offset >= i_size)
goto zero_out;
/* Write that covers from the start of the folio to EOF or beyond */
if (offset == 0 && (pos + len) >= i_size)
goto zero_out;
return false;
zero_out:
zero_user_segments(&folio->page, 0, offset, offset + len, plen);
return true;
}
/**
* netfs_write_begin - Helper to prepare for writing
* @ctx: The netfs context
* @file: The file to read from
* @mapping: The mapping to read from
* @pos: File position at which the write will begin
* @len: The length of the write (may extend beyond the end of the folio chosen)
* @_folio: Where to put the resultant folio
* @_fsdata: Place for the netfs to store a cookie
*
* Pre-read data for a write-begin request by drawing data from the cache if
* possible, or the netfs if not. Space beyond the EOF is zero-filled.
* Multiple I/O requests from different sources will get munged together. If
* necessary, the readahead window can be expanded in either direction to a
* more convenient alighment for RPC efficiency or to make storage in the cache
* feasible.
*
* The calling netfs must provide a table of operations, only one of which,
* issue_op, is mandatory.
*
* The check_write_begin() operation can be provided to check for and flush
* conflicting writes once the folio is grabbed and locked. It is passed a
* pointer to the fsdata cookie that gets returned to the VM to be passed to
* write_end. It is permitted to sleep. It should return 0 if the request
* should go ahead or it may return an error. It may also unlock and put the
* folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
* will cause the folio to be re-got and the process to be retried.
*
* The calling netfs must initialise a netfs context contiguous to the vfs
* inode before calling this.
*
* This is usable whether or not caching is enabled.
*/
int netfs_write_begin(struct netfs_inode *ctx,
struct file *file, struct address_space *mapping,
loff_t pos, unsigned int len, struct folio **_folio,
void **_fsdata)
{
struct netfs_io_request *rreq;
struct folio *folio;
pgoff_t index = pos >> PAGE_SHIFT;
int ret;
DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
retry:
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
if (ctx->ops->check_write_begin) {
/* Allow the netfs (eg. ceph) to flush conflicts. */
ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
if (ret < 0) {
trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
goto error;
}
if (!folio)
goto retry;
}
if (folio_test_uptodate(folio))
goto have_folio;
/* If the page is beyond the EOF, we want to clear it - unless it's
* within the cache granule containing the EOF, in which case we need
* to preload the granule.
*/
if (!netfs_is_cache_enabled(ctx) &&
netfs_skip_folio_read(folio, pos, len, false)) {
netfs_stat(&netfs_n_rh_write_zskip);
goto have_folio_no_wait;
}
rreq = netfs_alloc_request(mapping, file,
folio_file_pos(folio), folio_size(folio),
NETFS_READ_FOR_WRITE);
if (IS_ERR(rreq)) {
ret = PTR_ERR(rreq);
goto error;
}
rreq->no_unlock_folio = folio->index;
__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
ret = netfs_begin_cache_read(rreq, ctx);
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto error_put;
netfs_stat(&netfs_n_rh_write_begin);
trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
/* Expand the request to meet caching requirements and download
* preferences.
*/
ractl._nr_pages = folio_nr_pages(folio);
netfs_rreq_expand(rreq, &ractl);
/* Set up the output buffer */
iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
rreq->start, rreq->len);
/* We hold the folio locks, so we can drop the references */
folio_get(folio);
while (readahead_folio(&ractl))
;
ret = netfs_begin_read(rreq, true);
if (ret < 0)
goto error;
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio:
ret = folio_wait_fscache_killable(folio);
if (ret < 0)
goto error;
have_folio_no_wait:
*_folio = folio;
_leave(" = 0");
return 0;
error_put:
netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
if (folio) {
folio_unlock(folio);
folio_put(folio);
}
_leave(" = %d", ret);
return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
/*
* Preload the data into a page we're proposing to write into.
*/
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t offset, size_t len)
{
struct netfs_io_request *rreq;
struct address_space *mapping = folio->mapping;
struct netfs_inode *ctx = netfs_inode(mapping->host);
unsigned long long start = folio_pos(folio);
size_t flen = folio_size(folio);
int ret;
_enter("%zx @%llx", flen, start);
ret = -ENOMEM;
rreq = netfs_alloc_request(mapping, file, start, flen,
NETFS_READ_FOR_WRITE);
if (IS_ERR(rreq)) {
ret = PTR_ERR(rreq);
goto error;
}
rreq->no_unlock_folio = folio->index;
__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
ret = netfs_begin_cache_read(rreq, ctx);
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto error_put;
netfs_stat(&netfs_n_rh_write_begin);
trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);
/* Set up the output buffer */
iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
rreq->start, rreq->len);
ret = netfs_begin_read(rreq, true);
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return ret;
error_put:
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
error:
_leave(" = %d", ret);
return ret;
}
/**
* netfs_buffered_read_iter - Filesystem buffered I/O read routine
* @iocb: kernel I/O control block
* @iter: destination for the data read
*
* This is the ->read_iter() routine for all filesystems that can use the page
* cache directly.
*
* The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
* returned when no data can be read without waiting for I/O requests to
* complete; it doesn't prevent readahead.
*
* The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
* shall be made for the read or for readahead. When no data can be read,
* -EAGAIN shall be returned. When readahead would be triggered, a partial,
* possibly empty read shall be returned.
*
* Return:
* * number of bytes copied, even for partial reads
* * negative error code (or 0 if IOCB_NOIO) if nothing was read
*/
ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct netfs_inode *ictx = netfs_inode(inode);
ssize_t ret;
if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
return -EINVAL;
ret = netfs_start_io_read(inode);
if (ret == 0) {
ret = filemap_read(iocb, iter, 0);
netfs_end_io_read(inode);
}
return ret;
}
EXPORT_SYMBOL(netfs_buffered_read_iter);
/**
* netfs_file_read_iter - Generic filesystem read routine
* @iocb: kernel I/O control block
* @iter: destination for the data read
*
* This is the ->read_iter() routine for all filesystems that can use the page
* cache directly.
*
* The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
* returned when no data can be read without waiting for I/O requests to
* complete; it doesn't prevent readahead.
*
* The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
* shall be made for the read or for readahead. When no data can be read,
* -EAGAIN shall be returned. When readahead would be triggered, a partial,
* possibly empty read shall be returned.
*
* Return:
* * number of bytes copied, even for partial reads
* * negative error code (or 0 if IOCB_NOIO) if nothing was read
*/
ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);
if ((iocb->ki_flags & IOCB_DIRECT) ||
test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
return netfs_unbuffered_read_iter(iocb, iter);
return netfs_buffered_read_iter(iocb, iter);
}
EXPORT_SYMBOL(netfs_file_read_iter);