2019-05-19 15:08:55 +03:00
// SPDX-License-Identifier: GPL-2.0-only
2005-04-17 02:20:36 +04:00
/*
 *  linux/fs/nfs/file.c
 *
 *  Copyright (C) 1992  Rick Sladkey
 *
 *  Changes Copyright (C) 1994 by Florian La Roche
 *   - Do not copy data too often around in the kernel.
 *   - In nfs_file_read the return value of kmalloc wasn't checked.
 *   - Put in a better version of read look-ahead buffering. Original idea
 *     and implementation by Wai S Kok elekokws@ee.nus.sg.
 *
 *  Expire cache on write to a file by Wai S Kok (Oct 1994).
 *
 *  Total rewrite of read side for new NFS buffer cache.. Linus.
 *
 *  nfs regular file handling functions
 */
2012-07-31 00:05:23 +04:00
# include <linux/module.h>
2005-04-17 02:20:36 +04:00
# include <linux/time.h>
# include <linux/kernel.h>
# include <linux/errno.h>
# include <linux/fcntl.h>
# include <linux/stat.h>
# include <linux/nfs_fs.h>
# include <linux/nfs_mount.h>
# include <linux/mm.h>
# include <linux/pagemap.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
wildly available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build test were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 11:04:11 +03:00
# include <linux/gfp.h>
2010-07-30 23:31:54 +04:00
# include <linux/swap.h>
2005-04-17 02:20:36 +04:00
2016-12-24 22:46:01 +03:00
# include <linux/uaccess.h>
2022-11-20 17:15:34 +03:00
# include <linux/filelock.h>
2005-04-17 02:20:36 +04:00
# include "delegation.h"
2007-07-23 01:09:05 +04:00
# include "internal.h"
2006-03-20 21:44:14 +03:00
# include "iostat.h"
2009-04-03 19:42:44 +04:00
# include "fscache.h"
2014-09-10 19:23:30 +04:00
# include "pnfs.h"
2005-04-17 02:20:36 +04:00
2013-08-20 02:59:33 +04:00
# include "nfstrace.h"
2005-04-17 02:20:36 +04:00
# define NFSDBG_FACILITY NFSDBG_FILE
2009-09-27 22:29:37 +04:00
/* Forward declaration; nfs_file_mmap() installs this table as vma->vm_ops. */
static const struct vm_operations_struct nfs_file_vm_ops ;
2007-07-23 01:09:05 +04:00
2012-07-17 00:39:15 +04:00
/*
 * Validate open(2) flags for an NFS file.
 *
 * The combination O_APPEND | O_DIRECT is rejected with -EINVAL; every
 * other combination of flags is accepted and 0 is returned.
 */
int nfs_check_flags(int flags)
{
	const int forbidden = O_APPEND | O_DIRECT;

	if ((flags & forbidden) == forbidden)
		return -EINVAL;
	return 0;
}
2012-07-31 00:05:25 +04:00
EXPORT_SYMBOL_GPL ( nfs_check_flags ) ;
2005-04-17 02:20:36 +04:00
/*
* Open file
*/
static int
nfs_file_open ( struct inode * inode , struct file * filp )
{
int res ;
2013-09-16 18:53:17 +04:00
dprintk ( " NFS: open file(%pD2) \n " , filp ) ;
2008-06-12 01:55:42 +04:00
2010-02-01 22:17:14 +03:00
nfs_inc_stats ( inode , NFSIOS_VFSOPEN ) ;
2005-04-17 02:20:36 +04:00
res = nfs_check_flags ( filp - > f_flags ) ;
if ( res )
return res ;
2008-06-12 00:32:46 +04:00
res = nfs_open ( inode , filp ) ;
2022-05-10 04:20:49 +03:00
if ( res = = 0 )
filp - > f_mode | = FMODE_CAN_ODIRECT ;
2005-04-17 02:20:36 +04:00
return res ;
}
2012-07-17 00:39:15 +04:00
int
2005-04-17 02:20:36 +04:00
nfs_file_release ( struct inode * inode , struct file * filp )
{
2013-09-16 18:53:17 +04:00
dprintk ( " NFS: release(%pD2) \n " , filp ) ;
2008-06-12 01:55:58 +04:00
2006-03-20 21:44:14 +03:00
nfs_inc_stats ( inode , NFSIOS_VFSRELEASE ) ;
2015-07-13 21:01:33 +03:00
nfs_file_clear_open_context ( filp ) ;
nfs: Convert to new fscache volume/cookie API
Change the nfs filesystem to support fscache's indexing rewrite and
reenable caching in nfs.
The following changes have been made:
(1) The fscache_netfs struct is no more, and there's no need to register
the filesystem as a whole.
(2) The session cookie is now an fscache_volume cookie, allocated with
fscache_acquire_volume(). That takes three parameters: a string
representing the "volume" in the index, a string naming the cache to
use (or NULL) and a u64 that conveys coherency metadata for the
volume.
For nfs, I've made it render the volume name string as:
"nfs,<ver>,<family>,<address>,<port>,<fsidH>,<fsidL>*<,param>[,<uniq>]"
(3) The fscache_cookie_def is no more and needed information is passed
directly to fscache_acquire_cookie(). The cache no longer calls back
into the filesystem, but rather metadata changes are indicated at
other times.
fscache_acquire_cookie() is passed the same keying and coherency
information as before.
(4) fscache_enable/disable_cookie() have been removed.
Call fscache_use_cookie() and fscache_unuse_cookie() when a file is
opened or closed to prevent a cache file from being culled and to keep
resources to hand that are needed to do I/O.
If a file is opened for writing, we invalidate it with
FSCACHE_INVAL_DIO_WRITE in lieu of doing writeback to the cache,
thereby making it cease caching until all currently open files are
closed. This should give the same behaviour as the uptream code.
Making the cache store local modifications isn't straightforward for
NFS, so that's left for future patches.
(5) fscache_invalidate() now needs to be given uptodate auxiliary data and
a file size. It also takes a flag to indicate if this was due to a
DIO write.
(6) Call nfs_fscache_invalidate() with FSCACHE_INVAL_DIO_WRITE on a file
to which a DIO write is made.
(7) Call fscache_note_page_release() from nfs_release_page().
(8) Use a killable wait in nfs_vm_page_mkwrite() when waiting for
PG_fscache to be cleared.
(9) The functions to read and write data to/from the cache are stubbed out
pending a conversion to use netfslib.
Changes
=======
ver #3:
- Added missing =n fallback for nfs_fscache_release_file()[1][2].
ver #2:
- Use gfpflags_allow_blocking() rather than using flag directly.
- fscache_acquire_volume() now returns errors.
- Remove NFS_INO_FSCACHE as it's no longer used.
- Need to unuse a cookie on file-release, not inode-clear.
Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Co-developed-by: David Howells <dhowells@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Acked-by: Jeff Layton <jlayton@kernel.org>
cc: Trond Myklebust <trond.myklebust@hammerspace.com>
cc: Anna Schumaker <anna.schumaker@netapp.com>
cc: linux-nfs@vger.kernel.org
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/202112100804.nksO8K4u-lkp@intel.com/ [1]
Link: https://lore.kernel.org/r/202112100957.2oEDT20W-lkp@intel.com/ [2]
Link: https://lore.kernel.org/r/163819668938.215744.14448852181937731615.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906979003.143852.2601189243864854724.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967182112.1823006.7791504655391213379.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021575950.640689.12069642327533368467.stgit@warthog.procyon.org.uk/ # v4
2020-11-14 21:43:54 +03:00
nfs_fscache_release_file ( inode , filp ) ;
2015-07-13 21:01:33 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
2012-07-31 00:05:25 +04:00
EXPORT_SYMBOL_GPL ( nfs_file_release ) ;
2005-04-17 02:20:36 +04:00
2005-06-13 19:14:01 +04:00
/**
2021-02-08 16:55:45 +03:00
* nfs_revalidate_file_size - Revalidate the file size
2019-02-18 21:32:38 +03:00
* @ inode : pointer to inode struct
* @ filp : pointer to struct file
2005-06-13 19:14:01 +04:00
*
* Revalidates the file length . This is basically a wrapper around
* nfs_revalidate_inode ( ) that takes into account the fact that we may
* have cached writes ( in which case we don ' t care about the server ' s
* idea of what the file length is ) , or O_DIRECT ( in which case we
* shouldn ' t trust the cache ) .
*/
static int nfs_revalidate_file_size ( struct inode * inode , struct file * filp )
{
struct nfs_server * server = NFS_SERVER ( inode ) ;
2010-04-17 00:42:46 +04:00
2005-06-13 19:14:01 +04:00
if ( filp - > f_flags & O_DIRECT )
goto force_reval ;
2021-03-26 04:07:21 +03:00
if ( nfs_check_cache_invalid ( inode , NFS_INO_INVALID_SIZE ) )
2010-04-17 00:42:46 +04:00
goto force_reval ;
return 0 ;
2005-06-13 19:14:01 +04:00
force_reval :
return __nfs_revalidate_inode ( server , inode ) ;
}
2012-12-18 03:59:39 +04:00
/*
 * Seek within an NFS file.
 *
 * For any @whence other than SEEK_SET and SEEK_CUR the cached file length
 * is revalidated first; a negative result from that revalidation is
 * returned to the caller.  The seek itself is done by
 * generic_file_llseek().
 */
loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence)
{
	dprintk("NFS: llseek file(%pD2, %lld, %d)\n",
			filp, offset, whence);

	/*
	 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
	 * the cached file length
	 */
	if (whence != SEEK_SET && whence != SEEK_CUR) {
		struct inode *inode = filp->f_mapping->host;
		int retval = nfs_revalidate_file_size(inode, filp);

		if (retval < 0)
			return (loff_t)retval;
	}

	return generic_file_llseek(filp, offset, whence);
}
EXPORT_SYMBOL_GPL(nfs_file_llseek);
2005-06-13 19:14:01 +04:00
2005-04-17 02:20:36 +04:00
/*
* Flush all dirty pages , and check for write errors .
*/
2015-09-06 02:06:58 +03:00
static int
2006-06-23 13:05:12 +04:00
nfs_file_flush ( struct file * file , fl_owner_t id )
2005-04-17 02:20:36 +04:00
{
2013-09-16 18:53:17 +04:00
struct inode * inode = file_inode ( file ) ;
2020-08-01 14:10:38 +03:00
errseq_t since ;
2005-04-17 02:20:36 +04:00
2013-09-16 18:53:17 +04:00
dprintk ( " NFS: flush(%pD2) \n " , file ) ;
2005-04-17 02:20:36 +04:00
2010-02-01 22:17:14 +03:00
nfs_inc_stats ( inode , NFSIOS_VFSFLUSH ) ;
2005-04-17 02:20:36 +04:00
if ( ( file - > f_mode & FMODE_WRITE ) = = 0 )
return 0 ;
2007-07-25 22:09:54 +04:00
2009-03-19 22:35:50 +03:00
/* Flush writes to the server and return any errors */
2020-08-01 14:10:38 +03:00
since = filemap_sample_wb_err ( file - > f_mapping ) ;
nfs_wb_all ( inode ) ;
return filemap_check_wb_err ( file - > f_mapping , since ) ;
2005-04-17 02:20:36 +04:00
}
2012-07-17 00:39:15 +04:00
ssize_t
2014-04-03 04:14:12 +04:00
nfs_file_read ( struct kiocb * iocb , struct iov_iter * to )
2005-04-17 02:20:36 +04:00
{
2013-09-16 18:53:17 +04:00
struct inode * inode = file_inode ( iocb - > ki_filp ) ;
2005-04-17 02:20:36 +04:00
ssize_t result ;
2015-04-09 20:52:01 +03:00
if ( iocb - > ki_flags & IOCB_DIRECT )
2022-03-07 02:41:44 +03:00
return nfs_file_direct_read ( iocb , to , false ) ;
2005-04-17 02:20:36 +04:00
2014-03-05 06:53:33 +04:00
dprintk ( " NFS: read(%pD2, %zu@%lu) \n " ,
2013-09-16 18:53:17 +04:00
iocb - > ki_filp ,
2014-04-03 04:14:12 +04:00
iov_iter_count ( to ) , ( unsigned long ) iocb - > ki_pos ) ;
2005-04-17 02:20:36 +04:00
2016-06-04 00:07:19 +03:00
nfs_start_io_read ( inode ) ;
result = nfs_revalidate_mapping ( inode , iocb - > ki_filp - > f_mapping ) ;
2010-02-01 22:17:23 +03:00
if ( ! result ) {
2014-04-03 04:14:12 +04:00
result = generic_file_read_iter ( iocb , to ) ;
2010-02-01 22:17:23 +03:00
if ( result > 0 )
nfs_add_stats ( inode , NFSIOS_NORMALREADBYTES , result ) ;
}
2016-06-04 00:07:19 +03:00
nfs_end_io_read ( inode ) ;
2005-04-17 02:20:36 +04:00
return result ;
}
2012-07-31 00:05:25 +04:00
EXPORT_SYMBOL_GPL ( nfs_file_read ) ;
2005-04-17 02:20:36 +04:00
2023-05-22 16:50:07 +03:00
/*
 * Splice data from an NFS file into a pipe.
 *
 * Mirrors nfs_file_read(): the mapping is revalidated and, if that
 * succeeds, filemap_splice_read() does the transfer, all between
 * nfs_start_io_read()/nfs_end_io_read().  A positive byte count is added
 * to the NFSIOS_NORMALREADBYTES statistic.
 *
 * Returns the result of the splice, or the error from
 * nfs_revalidate_mapping().
 */
ssize_t
nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe,
		     size_t len, unsigned int flags)
{
	struct inode *inode = file_inode(in);
	ssize_t result;

	dprintk("NFS: splice_read(%pD2, %zu@%llu)\n", in, len, *ppos);

	nfs_start_io_read(inode);
	result = nfs_revalidate_mapping(inode, in->f_mapping);
	if (!result) {
		result = filemap_splice_read(in, ppos, pipe, len, flags);
		if (result > 0)
			nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
	}
	nfs_end_io_read(inode);
	return result;
}
EXPORT_SYMBOL_GPL(nfs_file_splice_read);
2012-07-17 00:39:15 +04:00
int
2023-07-19 14:00:38 +03:00
nfs_file_mmap ( struct file * file , struct vm_area_struct * vma )
2005-04-17 02:20:36 +04:00
{
2013-09-16 18:53:17 +04:00
struct inode * inode = file_inode ( file ) ;
2005-04-17 02:20:36 +04:00
int status ;
2013-09-16 18:53:17 +04:00
dprintk ( " NFS: mmap(%pD2) \n " , file ) ;
2005-04-17 02:20:36 +04:00
2009-03-11 21:37:54 +03:00
/* Note: generic_file_mmap() returns ENOSYS on nommu systems
* so we call that before revalidating the mapping
*/
status = generic_file_mmap ( file , vma ) ;
2007-07-23 01:09:05 +04:00
if ( ! status ) {
vma - > vm_ops = & nfs_file_vm_ops ;
2009-03-11 21:37:54 +03:00
status = nfs_revalidate_mapping ( inode , file - > f_mapping ) ;
2007-07-23 01:09:05 +04:00
}
2005-04-17 02:20:36 +04:00
return status ;
}
2012-07-31 00:05:25 +04:00
EXPORT_SYMBOL_GPL ( nfs_file_mmap ) ;
2005-04-17 02:20:36 +04:00
/*
* Flush any dirty pages for this process , and check for write errors .
* The return status from this call provides a reliable indication of
* whether any write errors occurred for this process .
*/
2016-03-02 19:35:54 +03:00
static int
2017-09-11 06:15:50 +03:00
nfs_file_fsync_commit ( struct file * file , int datasync )
2005-04-17 02:20:36 +04:00
{
2013-09-16 18:53:17 +04:00
struct inode * inode = file_inode ( file ) ;
2022-05-14 17:27:01 +03:00
int ret , ret2 ;
2010-07-31 22:29:06 +04:00
2013-09-16 18:53:17 +04:00
dprintk ( " NFS: fsync file(%pD2) datasync %d \n " , file , datasync ) ;
2005-04-17 02:20:36 +04:00
2006-03-20 21:44:14 +03:00
nfs_inc_stats ( inode , NFSIOS_VFSFSYNC ) ;
2020-01-06 23:25:03 +03:00
ret = nfs_commit_inode ( inode , FLUSH_SYNC ) ;
2022-05-14 17:27:01 +03:00
ret2 = file_check_and_advance_wb_err ( file ) ;
if ( ret2 < 0 )
return ret2 ;
return ret ;
2012-06-20 23:53:42 +04:00
}
2016-03-02 19:35:54 +03:00
int
2012-06-20 23:53:42 +04:00
nfs_file_fsync ( struct file * file , loff_t start , loff_t end , int datasync )
{
2013-01-24 02:07:38 +04:00
struct inode * inode = file_inode ( file ) ;
2022-08-13 15:22:25 +03:00
struct nfs_inode * nfsi = NFS_I ( inode ) ;
long save_nredirtied = atomic_long_read ( & nfsi - > redirtied_pages ) ;
long nredirtied ;
2020-01-06 23:25:03 +03:00
int ret ;
2012-06-20 23:53:42 +04:00
2013-08-20 02:59:33 +04:00
trace_nfs_fsync_enter ( inode ) ;
2020-01-06 23:25:03 +03:00
for ( ; ; ) {
2019-04-07 20:59:05 +03:00
ret = file_write_and_wait_range ( file , start , end ) ;
2012-09-12 00:01:22 +04:00
if ( ret ! = 0 )
break ;
2017-09-11 06:15:50 +03:00
ret = nfs_file_fsync_commit ( file , datasync ) ;
2020-01-06 23:25:03 +03:00
if ( ret ! = 0 )
break ;
ret = pnfs_sync_inode ( inode , ! ! datasync ) ;
if ( ret ! = 0 )
break ;
2022-08-13 15:22:25 +03:00
nredirtied = atomic_long_read ( & nfsi - > redirtied_pages ) ;
if ( nredirtied = = save_nredirtied )
2020-01-06 23:25:03 +03:00
break ;
2022-08-13 15:22:25 +03:00
save_nredirtied = nredirtied ;
2020-01-06 23:25:03 +03:00
}
2012-09-12 00:01:22 +04:00
2013-08-20 02:59:33 +04:00
trace_nfs_fsync_exit ( inode , ret ) ;
2010-07-31 22:29:06 +04:00
return ret ;
2005-04-17 02:20:36 +04:00
}
2016-03-02 19:35:54 +03:00
EXPORT_SYMBOL_GPL ( nfs_file_fsync ) ;
2005-04-17 02:20:36 +04:00
NFS: read-modify-write page updating
Hi.
I have a proposal for possibly resolving this issue.
I believe that this situation occurs due to the way that the
Linux NFS client handles writes which modify partial pages.
The Linux NFS client handles partial page modifications by
allocating a page from the page cache, copying the data from
the user level into the page, and then keeping track of the
offset and length of the modified portions of the page. The
page is not marked as up to date because there are portions
of the page which do not contain valid file contents.
When a read call comes in for a portion of the page, the
contents of the page must be read in from the server.
However, since the page may already contain some modified
data, that modified data must be written to the server
before the file contents can be read back in from the server.
And, since the writing and reading can not be done atomically,
the data must be written and committed to stable storage on
the server for safety purposes. This means either a
FILE_SYNC WRITE or a UNSTABLE WRITE followed by a COMMIT.
This has been discussed at length previously.
This algorithm could be described as modify-write-read. It
is most efficient when the application only updates pages
and does not read them.
My proposed solution is to add a heuristic to decide whether
to do this modify-write-read algorithm or switch to a read-
modify-write algorithm when initially allocating the page
in the write system call path. The heuristic uses the modes
that the file was opened with, the offset in the page to
read from, and the size of the region to read.
If the file was opened for reading in addition to writing
and the page would not be filled completely with data from
the user level, then read in the old contents of the page
and mark it as Uptodate before copying in the new data. If
the page would be completely filled with data from the user
level, then there would be no reason to read in the old
contents because they would just be copied over.
This would optimize for applications which randomly access
and update portions of files. The linkage editor for the
C compiler is an example of such a thing.
I tested the attached patch by using rpmbuild to build the
current Fedora rawhide kernel. The kernel without the
patch generated about 269,500 WRITE requests. The modified
kernel containing the patch generated about 261,000 WRITE
requests. Thus, about 8,500 fewer WRITE requests were
generated. I suspect that many of these additional
WRITE requests were probably FILE_SYNC requests to WRITE
a single page, but I didn't test this theory.
The difference between this patch and the previous one was
to remove the unneeded PageDirty() test. I then retested to
ensure that the resulting system continued to behave as
desired.
Thanx...
ps
Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2009-08-10 16:54:16 +04:00
/*
* Decide whether a read/modify/write cycle may be more efficient
* than a modify/write/read cycle when writing to a page in the
* page cache.
*
2019-02-14 12:39:03 +03:00
* Some pNFS layout drivers can only read/write at a certain block
* granularity like all block devices and therefore we must perform
* read/modify/write whenever a page hasn't been read yet and the data
* to be written there is not aligned to a block boundary and/or
* smaller than the block size.
*
NFS: read-modify-write page updating
Hi.
I have a proposal for possibly resolving this issue.
I believe that this situation occurs due to the way that the
Linux NFS client handles writes which modify partial pages.
The Linux NFS client handles partial page modifications by
allocating a page from the page cache, copying the data from
the user level into the page, and then keeping track of the
offset and length of the modified portions of the page. The
page is not marked as up to date because there are portions
of the page which do not contain valid file contents.
When a read call comes in for a portion of the page, the
contents of the page must be read in from the server.
However, since the page may already contain some modified
data, that modified data must be written to the server
before the file contents can be read back in from the server.
And, since the writing and reading can not be done atomically,
the data must be written and committed to stable storage on
the server for safety purposes. This means either a
FILE_SYNC WRITE or a UNSTABLE WRITE followed by a COMMIT.
This has been discussed at length previously.
This algorithm could be described as modify-write-read. It
is most efficient when the application only updates pages
and does not read them.
My proposed solution is to add a heuristic to decide whether
to do this modify-write-read algorithm or switch to a read-
modify-write algorithm when initially allocating the page
in the write system call path. The heuristic uses the modes
that the file was opened with, the offset in the page to
read from, and the size of the region to read.
If the file was opened for reading in addition to writing
and the page would not be filled completely with data from
the user level, then read in the old contents of the page
and mark it as Uptodate before copying in the new data. If
the page would be completely filled with data from the user
level, then there would be no reason to read in the old
contents because they would just be copied over.
This would optimize for applications which randomly access
and update portions of files. The linkage editor for the
C compiler is an example of such a thing.
I tested the attached patch by using rpmbuild to build the
current Fedora rawhide kernel. The kernel without the
patch generated about 269,500 WRITE requests. The modified
kernel containing the patch generated about 261,000 WRITE
requests. Thus, about 8,500 fewer WRITE requests were
generated. I suspect that many of these additional
WRITE requests were probably FILE_SYNC requests to WRITE
a single page, but I didn't test this theory.
The difference between this patch and the previous one was
to remove the unneeded PageDirty() test. I then retested to
ensure that the resulting system continued to behave as
desired.
Thanx...
ps
Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2009-08-10 16:54:16 +04:00
* The modify / write / read cycle may occur if a page is read before
* being completely filled by the writer . In this situation , the
* page must be completely written to stable storage on the server
* before it can be refilled by reading in the page from the server .
* This can lead to expensive , small , FILE_SYNC mode writes being
* done .
*
* It may be more efficient to read the page first if the file is
* open for reading in addition to writing , the page is not marked
* as Uptodate , it is not dirty or waiting to be committed ,
* indicating that it was previously allocated and then modified ,
* that there were valid bytes of data in that range of the file ,
* and that the new data won ' t completely replace the old data in
* that range of the file .
*/
2023-01-20 00:33:45 +03:00
/*
 * Report whether a write of @len bytes at file position @pos replaces all
 * of the data in @folio.
 *
 * True when nfs_folio_length() is zero, or when the write starts at
 * offset 0 within the folio and extends to at least that length.
 */
static bool nfs_folio_is_full_write(struct folio *folio, loff_t pos,
				    unsigned int len)
{
	unsigned int pglen = nfs_folio_length(folio);
	unsigned int offset = offset_in_folio(folio, pos);
	unsigned int end = offset + len;

	return !pglen || (end >= pglen && !offset);
}
2014-09-10 19:23:30 +04:00
2023-01-20 00:33:45 +03:00
/*
 * Decide whether the folio should be read in before a write of
 * len bytes at pos is copied into it (nfs_write_begin() calls
 * nfs_read_folio() when this returns true).
 *
 * Returns false when the folio is already up to date, has its private
 * flag set, or nfs_folio_is_full_write() reports a full write.
 * Otherwise returns true when pnfs_ld_read_whole_page() asks for it or
 * when the file was opened with FMODE_READ, and false in all other cases.
 */
static bool nfs_want_read_modify_write ( struct file * file , struct folio * folio ,
loff_t pos , unsigned int len )
2019-02-14 12:39:03 +03:00
{
/*
* Up - to - date pages , those with ongoing or full - page write
* don ' t need read / modify / write
*/
2023-01-20 00:33:45 +03:00
if ( folio_test_uptodate ( folio ) | | folio_test_private ( folio ) | |
nfs_folio_is_full_write ( folio , pos , len ) )
2019-02-14 12:39:03 +03:00
return false ;
2023-01-20 00:33:45 +03:00
/* NOTE(review): presumably a pNFS layout driver preference - confirm. */
if ( pnfs_ld_read_whole_page ( file_inode ( file ) ) )
2019-02-14 12:39:03 +03:00
return true ;
/* Open for reading too? */
if ( file - > f_mode & FMODE_READ )
return true ;
/* Not opened for reading: skip the read and write the partial range. */
return false ;
NFS: read-modify-write page updating
Hi.
I have a proposal for possibly resolving this issue.
I believe that this situation occurs due to the way that the
Linux NFS client handles writes which modify partial pages.
The Linux NFS client handles partial page modifications by
allocating a page from the page cache, copying the data from
the user level into the page, and then keeping track of the
offset and length of the modified portions of the page. The
page is not marked as up to date because there are portions
of the page which do not contain valid file contents.
When a read call comes in for a portion of the page, the
contents of the page must be read in the from the server.
However, since the page may already contain some modified
data, that modified data must be written to the server
before the file contents can be read back in the from server.
And, since the writing and reading can not be done atomically,
the data must be written and committed to stable storage on
the server for safety purposes. This means either a
FILE_SYNC WRITE or a UNSTABLE WRITE followed by a COMMIT.
This has been discussed at length previously.
This algorithm could be described as modify-write-read. It
is most efficient when the application only updates pages
and does not read them.
My proposed solution is to add a heuristic to decide whether
to do this modify-write-read algorithm or switch to a read-
modify-write algorithm when initially allocating the page
in the write system call path. The heuristic uses the modes
that the file was opened with, the offset in the page to
read from, and the size of the region to read.
If the file was opened for reading in addition to writing
and the page would not be filled completely with data from
the user level, then read in the old contents of the page
and mark it as Uptodate before copying in the new data. If
the page would be completely filled with data from the user
level, then there would be no reason to read in the old
contents because they would just be copied over.
This would optimize for applications which randomly access
and update portions of files. The linkage editor for the
C compiler is an example of such a thing.
I tested the attached patch by using rpmbuild to build the
current Fedora rawhide kernel. The kernel without the
patch generated about 269,500 WRITE requests. The modified
kernel containing the patch generated about 261,000 WRITE
requests. Thus, about 8,500 fewer WRITE requests were
generated. I suspect that many of these additional
WRITE requests were probably FILE_SYNC requests to WRITE
a single page, but I didn't test this theory.
The difference between this patch and the previous one was
to remove the unneeded PageDirty() test. I then retested to
ensure that the resulting system continued to behave as
desired.
Thanx...
ps
Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2009-08-10 16:54:16 +04:00
}
2005-04-17 02:20:36 +04:00
/*
2007-10-16 12:25:16 +04:00
* This does the " real " work of the write . We must allocate and lock the
* page to be sent back to the generic routine , which then copies the
* data from user space .
2005-04-17 02:20:36 +04:00
*
* If the writer ends up delaying the write , the writer needs to
* increment the page use counts until he is done with the page .
*/
2007-10-16 12:25:16 +04:00
static int nfs_write_begin ( struct file * file , struct address_space * mapping ,
2023-01-20 00:33:45 +03:00
loff_t pos , unsigned len , struct page * * pagep ,
void * * fsdata )
2005-04-17 02:20:36 +04:00
{
/*
 * Parameters: pos/len describe the range about to be written; the page
 * of the folio that was looked up is stored through pagep. fsdata is
 * not used in this function.
 *
 * Returns 0 on success, otherwise the error from the folio lookup,
 * nfs_flush_incompatible() or nfs_read_folio().
 */
2023-01-20 00:33:43 +03:00
struct folio * folio ;
NFS: read-modify-write page updating
Hi.
I have a proposal for possibly resolving this issue.
I believe that this situation occurs due to the way that the
Linux NFS client handles writes which modify partial pages.
The Linux NFS client handles partial page modifications by
allocating a page from the page cache, copying the data from
the user level into the page, and then keeping track of the
offset and length of the modified portions of the page. The
page is not marked as up to date because there are portions
of the page which do not contain valid file contents.
When a read call comes in for a portion of the page, the
contents of the page must be read in the from the server.
However, since the page may already contain some modified
data, that modified data must be written to the server
before the file contents can be read back in the from server.
And, since the writing and reading can not be done atomically,
the data must be written and committed to stable storage on
the server for safety purposes. This means either a
FILE_SYNC WRITE or a UNSTABLE WRITE followed by a COMMIT.
This has been discussed at length previously.
This algorithm could be described as modify-write-read. It
is most efficient when the application only updates pages
and does not read them.
My proposed solution is to add a heuristic to decide whether
to do this modify-write-read algorithm or switch to a read-
modify-write algorithm when initially allocating the page
in the write system call path. The heuristic uses the modes
that the file was opened with, the offset in the page to
read from, and the size of the region to read.
If the file was opened for reading in addition to writing
and the page would not be filled completely with data from
the user level, then read in the old contents of the page
and mark it as Uptodate before copying in the new data. If
the page would be completely filled with data from the user
level, then there would be no reason to read in the old
contents because they would just be copied over.
This would optimize for applications which randomly access
and update portions of files. The linkage editor for the
C compiler is an example of such a thing.
I tested the attached patch by using rpmbuild to build the
current Fedora rawhide kernel. The kernel without the
patch generated about 269,500 WRITE requests. The modified
kernel containing the patch generated about 261,000 WRITE
requests. Thus, about 8,500 fewer WRITE requests were
generated. I suspect that many of these additional
WRITE requests were probably FILE_SYNC requests to WRITE
a single page, but I didn't test this theory.
The difference between this patch and the previous one was
to remove the unneeded PageDirty() test. I then retested to
ensure that the resulting system continued to behave as
desired.
Thanx...
ps
Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2009-08-10 16:54:16 +04:00
/* Set once the read-modify-write read has been attempted, so it runs at most once. */
int once_thru = 0 ;
2023-01-20 00:33:45 +03:00
int ret ;
2007-10-16 12:25:16 +04:00
2013-12-17 21:20:16 +04:00
dfprintk ( PAGECACHE , " NFS: write_begin(%pD2(%lu), %u@%lld) \n " ,
2013-09-16 18:53:17 +04:00
file , mapping - > host - > i_ino , len , ( long long ) pos ) ;
2008-06-12 01:55:50 +04:00
NFS: read-modify-write page updating
Hi.
I have a proposal for possibly resolving this issue.
I believe that this situation occurs due to the way that the
Linux NFS client handles writes which modify partial pages.
The Linux NFS client handles partial page modifications by
allocating a page from the page cache, copying the data from
the user level into the page, and then keeping track of the
offset and length of the modified portions of the page. The
page is not marked as up to date because there are portions
of the page which do not contain valid file contents.
When a read call comes in for a portion of the page, the
contents of the page must be read in the from the server.
However, since the page may already contain some modified
data, that modified data must be written to the server
before the file contents can be read back in the from server.
And, since the writing and reading can not be done atomically,
the data must be written and committed to stable storage on
the server for safety purposes. This means either a
FILE_SYNC WRITE or a UNSTABLE WRITE followed by a COMMIT.
This has been discussed at length previously.
This algorithm could be described as modify-write-read. It
is most efficient when the application only updates pages
and does not read them.
My proposed solution is to add a heuristic to decide whether
to do this modify-write-read algorithm or switch to a read-
modify-write algorithm when initially allocating the page
in the write system call path. The heuristic uses the modes
that the file was opened with, the offset in the page to
read from, and the size of the region to read.
If the file was opened for reading in addition to writing
and the page would not be filled completely with data from
the user level, then read in the old contents of the page
and mark it as Uptodate before copying in the new data. If
the page would be completely filled with data from the user
level, then there would be no reason to read in the old
contents because they would just be copied over.
This would optimize for applications which randomly access
and update portions of files. The linkage editor for the
C compiler is an example of such a thing.
I tested the attached patch by using rpmbuild to build the
current Fedora rawhide kernel. The kernel without the
patch generated about 269,500 WRITE requests. The modified
kernel containing the patch generated about 261,000 WRITE
requests. Thus, about 8,500 fewer WRITE requests were
generated. I suspect that many of these additional
WRITE requests were probably FILE_SYNC requests to WRITE
a single page, but I didn't test this theory.
The difference between this patch and the previous one was
to remove the unneeded PageDirty() test. I then retested to
ensure that the resulting system continued to behave as
desired.
Thanx...
ps
Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2009-08-10 16:54:16 +04:00
/* Retry point: re-entered once after a successful read-modify-write read. */
start :
2023-03-24 21:01:01 +03:00
/*
 * Look up the folio at the page index containing pos.
 * NOTE(review): FGP_WRITEBEGIN presumably returns the folio locked and
 * referenced (the error path below unlocks and puts it) - confirm.
 */
folio = __filemap_get_folio ( mapping , pos > > PAGE_SHIFT , FGP_WRITEBEGIN ,
mapping_gfp_mask ( mapping ) ) ;
2023-03-07 17:34:10 +03:00
if ( IS_ERR ( folio ) )
return PTR_ERR ( folio ) ;
2023-01-20 00:33:45 +03:00
/* Hand the page back to the caller before any further checks. */
* pagep = & folio - > page ;
2007-10-16 12:25:16 +04:00
2023-01-20 00:33:43 +03:00
ret = nfs_flush_incompatible ( file , folio ) ;
2007-10-16 12:25:16 +04:00
if ( ret ) {
2023-01-20 00:33:45 +03:00
/* Flush failed: unlock the folio, drop the reference and return the error. */
folio_unlock ( folio ) ;
folio_put ( folio ) ;
NFS: read-modify-write page updating
Hi.
I have a proposal for possibly resolving this issue.
I believe that this situation occurs due to the way that the
Linux NFS client handles writes which modify partial pages.
The Linux NFS client handles partial page modifications by
allocating a page from the page cache, copying the data from
the user level into the page, and then keeping track of the
offset and length of the modified portions of the page. The
page is not marked as up to date because there are portions
of the page which do not contain valid file contents.
When a read call comes in for a portion of the page, the
contents of the page must be read in the from the server.
However, since the page may already contain some modified
data, that modified data must be written to the server
before the file contents can be read back in the from server.
And, since the writing and reading can not be done atomically,
the data must be written and committed to stable storage on
the server for safety purposes. This means either a
FILE_SYNC WRITE or a UNSTABLE WRITE followed by a COMMIT.
This has been discussed at length previously.
This algorithm could be described as modify-write-read. It
is most efficient when the application only updates pages
and does not read them.
My proposed solution is to add a heuristic to decide whether
to do this modify-write-read algorithm or switch to a read-
modify-write algorithm when initially allocating the page
in the write system call path. The heuristic uses the modes
that the file was opened with, the offset in the page to
read from, and the size of the region to read.
If the file was opened for reading in addition to writing
and the page would not be filled completely with data from
the user level, then read in the old contents of the page
and mark it as Uptodate before copying in the new data. If
the page would be completely filled with data from the user
level, then there would be no reason to read in the old
contents because they would just be copied over.
This would optimize for applications which randomly access
and update portions of files. The linkage editor for the
C compiler is an example of such a thing.
I tested the attached patch by using rpmbuild to build the
current Fedora rawhide kernel. The kernel without the
patch generated about 269,500 WRITE requests. The modified
kernel containing the patch generated about 261,000 WRITE
requests. Thus, about 8,500 fewer WRITE requests were
generated. I suspect that many of these additional
WRITE requests were probably FILE_SYNC requests to WRITE
a single page, but I didn't test this theory.
The difference between this patch and the previous one was
to remove the unneeded PageDirty() test. I then retested to
ensure that the resulting system continued to behave as
desired.
Thanx...
ps
Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2009-08-10 16:54:16 +04:00
} else if ( ! once_thru & &
2023-01-20 00:33:45 +03:00
nfs_want_read_modify_write ( file , folio , pos , len ) ) {
NFS: read-modify-write page updating
Hi.
I have a proposal for possibly resolving this issue.
I believe that this situation occurs due to the way that the
Linux NFS client handles writes which modify partial pages.
The Linux NFS client handles partial page modifications by
allocating a page from the page cache, copying the data from
the user level into the page, and then keeping track of the
offset and length of the modified portions of the page. The
page is not marked as up to date because there are portions
of the page which do not contain valid file contents.
When a read call comes in for a portion of the page, the
contents of the page must be read in the from the server.
However, since the page may already contain some modified
data, that modified data must be written to the server
before the file contents can be read back in the from server.
And, since the writing and reading can not be done atomically,
the data must be written and committed to stable storage on
the server for safety purposes. This means either a
FILE_SYNC WRITE or a UNSTABLE WRITE followed by a COMMIT.
This has been discussed at length previously.
This algorithm could be described as modify-write-read. It
is most efficient when the application only updates pages
and does not read them.
My proposed solution is to add a heuristic to decide whether
to do this modify-write-read algorithm or switch to a read-
modify-write algorithm when initially allocating the page
in the write system call path. The heuristic uses the modes
that the file was opened with, the offset in the page to
read from, and the size of the region to read.
If the file was opened for reading in addition to writing
and the page would not be filled completely with data from
the user level, then read in the old contents of the page
and mark it as Uptodate before copying in the new data. If
the page would be completely filled with data from the user
level, then there would be no reason to read in the old
contents because they would just be copied over.
This would optimize for applications which randomly access
and update portions of files. The linkage editor for the
C compiler is an example of such a thing.
I tested the attached patch by using rpmbuild to build the
current Fedora rawhide kernel. The kernel without the
patch generated about 269,500 WRITE requests. The modified
kernel containing the patch generated about 261,000 WRITE
requests. Thus, about 8,500 fewer WRITE requests were
generated. I suspect that many of these additional
WRITE requests were probably FILE_SYNC requests to WRITE
a single page, but I didn't test this theory.
The difference between this patch and the previous one was
to remove the unneeded PageDirty() test. I then retested to
ensure that the resulting system continued to behave as
desired.
Thanx...
ps
Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2009-08-10 16:54:16 +04:00
once_thru = 1 ;
2023-01-20 00:33:43 +03:00
/*
 * Read the old contents in first. Only the reference is dropped here;
 * NOTE(review): the folio is not unlocked in this branch, so
 * nfs_read_folio() presumably unlocks it - confirm.
 */
ret = nfs_read_folio ( file , folio ) ;
2023-01-20 00:33:45 +03:00
folio_put ( folio ) ;
NFS: read-modify-write page updating
Hi.
I have a proposal for possibly resolving this issue.
I believe that this situation occurs due to the way that the
Linux NFS client handles writes which modify partial pages.
The Linux NFS client handles partial page modifications by
allocating a page from the page cache, copying the data from
the user level into the page, and then keeping track of the
offset and length of the modified portions of the page. The
page is not marked as up to date because there are portions
of the page which do not contain valid file contents.
When a read call comes in for a portion of the page, the
contents of the page must be read in the from the server.
However, since the page may already contain some modified
data, that modified data must be written to the server
before the file contents can be read back in the from server.
And, since the writing and reading can not be done atomically,
the data must be written and committed to stable storage on
the server for safety purposes. This means either a
FILE_SYNC WRITE or a UNSTABLE WRITE followed by a COMMIT.
This has been discussed at length previously.
This algorithm could be described as modify-write-read. It
is most efficient when the application only updates pages
and does not read them.
My proposed solution is to add a heuristic to decide whether
to do this modify-write-read algorithm or switch to a read-
modify-write algorithm when initially allocating the page
in the write system call path. The heuristic uses the modes
that the file was opened with, the offset in the page to
read from, and the size of the region to read.
If the file was opened for reading in addition to writing
and the page would not be filled completely with data from
the user level, then read in the old contents of the page
and mark it as Uptodate before copying in the new data. If
the page would be completely filled with data from the user
level, then there would be no reason to read in the old
contents because they would just be copied over.
This would optimize for applications which randomly access
and update portions of files. The linkage editor for the
C compiler is an example of such a thing.
I tested the attached patch by using rpmbuild to build the
current Fedora rawhide kernel. The kernel without the
patch generated about 269,500 WRITE requests. The modified
kernel containing the patch generated about 261,000 WRITE
requests. Thus, about 8,500 fewer WRITE requests were
generated. I suspect that many of these additional
WRITE requests were probably FILE_SYNC requests to WRITE
a single page, but I didn't test this theory.
The difference between this patch and the previous one was
to remove the unneeded PageDirty() test. I then retested to
ensure that the resulting system continued to behave as
desired.
Thanx...
ps
Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2009-08-10 16:54:16 +04:00
/* Read succeeded: look the folio up again and redo the checks. */
if ( ! ret )
goto start ;
2007-10-16 12:25:16 +04:00
}
return ret ;
2005-04-17 02:20:36 +04:00
}
2007-10-16 12:25:16 +04:00
/*
 * Finish a buffered write of copied bytes at pos into page: zero the
 * parts of a not-yet-uptodate folio that hold no data, pass the copied
 * range to nfs_update_folio(), then unlock and release the folio.
 *
 * Returns copied on success, or the negative status from
 * nfs_update_folio(). fsdata is not used in this function.
 */
static int nfs_write_end ( struct file * file , struct address_space * mapping ,
2023-01-20 00:33:45 +03:00
loff_t pos , unsigned len , unsigned copied ,
struct page * page , void * fsdata )
2005-04-17 02:20:36 +04:00
{
2013-08-14 19:59:16 +04:00
struct nfs_open_context * ctx = nfs_file_open_context ( file ) ;
2023-01-20 00:33:43 +03:00
struct folio * folio = page_folio ( page ) ;
2023-01-20 00:33:45 +03:00
/* Byte offset of pos within the folio. */
unsigned offset = offset_in_folio ( folio , pos ) ;
2007-10-16 12:25:16 +04:00
int status ;
2005-04-17 02:20:36 +04:00
2013-12-17 21:20:16 +04:00
dfprintk ( PAGECACHE , " NFS: write_end(%pD2(%lu), %u@%lld) \n " ,
2013-09-16 18:53:17 +04:00
file , mapping - > host - > i_ino , len , ( long long ) pos ) ;
2008-06-12 01:55:50 +04:00
2008-06-11 02:31:00 +04:00
/*
* Zero any uninitialised parts of the page , and then mark the page
* as up to date if it turns out that we ' re extending the file .
*/
2023-01-20 00:33:45 +03:00
if ( ! folio_test_uptodate ( folio ) ) {
size_t fsize = folio_size ( folio ) ;
/* NOTE(review): pglen is presumably the number of bytes of the folio inside the file - confirm. */
unsigned pglen = nfs_folio_length ( folio ) ;
2016-09-06 04:42:32 +03:00
unsigned end = offset + copied ;
2008-06-11 02:31:00 +04:00
if ( pglen = = 0 ) {
2023-01-20 00:33:45 +03:00
/* pglen is zero: zero both sides of the copied range, then mark the folio uptodate. */
folio_zero_segments ( folio , 0 , offset , end , fsize ) ;
folio_mark_uptodate ( folio ) ;
2008-06-11 02:31:00 +04:00
} else if ( end > = pglen ) {
2023-01-20 00:33:45 +03:00
/* The copy reaches or passes pglen: zero the tail after the copied data. */
folio_zero_segment ( folio , end , fsize ) ;
2008-06-11 02:31:00 +04:00
/* A copy that also started at offset 0 leaves the whole folio initialised. */
if ( offset = = 0 )
2023-01-20 00:33:45 +03:00
folio_mark_uptodate ( folio ) ;
2008-06-11 02:31:00 +04:00
} else
2023-01-20 00:33:45 +03:00
/* The copy ends before pglen: only zero from pglen to the end of the folio. */
folio_zero_segment ( folio , pglen , fsize ) ;
2008-06-11 02:31:00 +04:00
}
2023-01-20 00:33:43 +03:00
status = nfs_update_folio ( file , folio , offset , copied ) ;
2007-10-16 12:25:16 +04:00
2023-01-20 00:33:45 +03:00
/* The folio is unlocked and released whether or not the update succeeded. */
folio_unlock ( folio ) ;
folio_put ( folio ) ;
2007-10-16 12:25:16 +04:00
2007-12-20 22:55:04 +03:00
if ( status < 0 )
return status ;
2012-05-24 21:13:24 +04:00
/* Add the copied bytes to the per-inode write_io counter. */
NFS_I ( mapping - > host ) - > write_io + = copied ;
2013-08-14 19:59:16 +04:00
2022-05-14 17:27:03 +03:00
/* If the check on the open context reports the key is about to expire, flush the whole inode now; the result of nfs_wb_all() is ignored. */
if ( nfs_ctx_key_to_expire ( ctx , mapping - > host ) )
nfs_wb_all ( mapping - > host ) ;
2013-08-14 19:59:16 +04:00
2007-12-20 22:55:04 +03:00
return copied ;
2005-04-17 02:20:36 +04:00
}
2009-04-03 19:42:41 +04:00
/*
* Partially or wholly invalidate a page
* - Release the private state associated with a page if undergoing complete
* page invalidation
2009-04-03 19:42:44 +04:00
* - Called if either PG_private or PG_fscache is set on the page
2009-04-03 19:42:41 +04:00
* - Caller holds page lock
*/
2022-02-09 23:21:47 +03:00
static void nfs_invalidate_folio ( struct folio * folio , size_t offset ,
size_t length )
2006-03-20 21:44:04 +03:00
{
2023-01-20 00:33:49 +03:00
/* Host inode of the folio mapping; used for write cancellation and tracing. */
struct inode * inode = folio_file_mapping ( folio ) - > host ;
2022-02-09 23:21:47 +03:00
dfprintk ( PAGECACHE , " NFS: invalidate_folio(%lu, %zu, %zu) \n " ,
folio - > index , offset , length ) ;
2008-06-12 01:55:50 +04:00
2022-02-09 23:21:47 +03:00
/*
 * Only a whole-folio invalidation (offset 0 and length covering the
 * folio) is acted on; a partial invalidation returns without doing
 * anything.
 */
if ( offset ! = 0 | | length < folio_size ( folio ) )
2006-10-10 00:18:38 +04:00
return ;
2006-05-31 09:13:38 +04:00
/* Cancel any unstarted writes on this page */
2023-01-20 00:33:49 +03:00
nfs_wb_folio_cancel ( inode , folio ) ;
2022-02-09 23:21:47 +03:00
/* NOTE(review): presumably waits for any fscache activity on the folio to finish - confirm. */
folio_wait_fscache ( folio ) ;
2023-01-20 00:33:49 +03:00
trace_nfs_invalidate_folio ( inode , folio ) ;
2006-03-20 21:44:04 +03:00
}
2009-04-03 19:42:41 +04:00
/*
2022-05-01 06:53:28 +03:00
* Attempt to release the private state associated with a folio
* - Called if either private or fscache flags are set on the folio
* - Caller holds folio lock
* - Return true ( may release folio ) or false ( may not )
2009-04-03 19:42:41 +04:00
*/
2022-05-01 06:53:28 +03:00
static bool nfs_release_folio ( struct folio * folio , gfp_t gfp )
2006-03-20 21:44:04 +03:00
{
2022-05-01 06:53:28 +03:00
dfprintk ( PAGECACHE , " NFS: release_folio(%p) \n " , folio ) ;
2008-06-12 01:55:50 +04:00
2022-05-01 06:53:28 +03:00
/* If the private flag is set, then the folio is not freeable */
2023-01-20 00:33:48 +03:00
if ( folio_test_private ( folio ) ) {
/*
 * Writeback is only attempted when the effective gfp context has
 * every GFP_KERNEL bit set and the caller is not kswapd; otherwise
 * the release is refused.
 */
if ( ( current_gfp_context ( gfp ) & GFP_KERNEL ) ! = GFP_KERNEL | |
current_is_kswapd ( ) )
return false ;
/* Write the folio back; a negative result means it may not be released. */
if ( nfs_wb_folio ( folio_file_mapping ( folio ) - > host , folio ) < 0 )
return false ;
}
2022-05-01 06:53:28 +03:00
/* Otherwise the decision is delegated to nfs_fscache_release_folio(). */
return nfs_fscache_release_folio ( folio , gfp ) ;
2007-01-11 10:15:39 +03:00
}
2022-01-17 22:35:22 +03:00
/*
 * Tell the caller how to treat the folio through the two output flags.
 *
 * Sets *writeback to true when the inode has commit RPCs outstanding
 * (commit_info.rpcs_out is non-zero); otherwise sets *dirty to true when
 * the folio has its private flag set. Neither flag is ever cleared here,
 * so the caller is expected to initialise both.
 */
static void nfs_check_dirty_writeback ( struct folio * folio ,
2013-07-04 02:02:06 +04:00
bool * dirty , bool * writeback )
{
struct nfs_inode * nfsi ;
2022-01-17 22:35:22 +03:00
struct address_space * mapping = folio - > mapping ;
2013-07-04 02:02:06 +04:00
/*
2022-01-17 22:35:22 +03:00
* Check if an unstable folio is currently being committed and
* if so , have the VM treat it as if the folio is under writeback
* so it will not block due to folios that will shortly be freeable .
2013-07-04 02:02:06 +04:00
*/
nfsi = NFS_I ( mapping - > host ) ;
2015-09-30 03:34:05 +03:00
/* The counter is per inode, so this test does not look at the folio itself. */
if ( atomic_read ( & nfsi - > commit_info . rpcs_out ) ) {
2013-07-04 02:02:06 +04:00
* writeback = true ;
return ;
}
/*
2022-01-17 22:35:22 +03:00
* If the private flag is set , then the folio is not freeable
* and as the inode is not being committed , it ' s not going to
* be cleaned in the near future so treat it as dirty
2013-07-04 02:02:06 +04:00
*/
2022-01-17 22:35:22 +03:00
if ( folio_test_private ( folio ) )
2013-07-04 02:02:06 +04:00
* dirty = true ;
}
2009-04-03 19:42:41 +04:00
/*
* Attempt to clear the private state associated with a page when an error
* occurs that requires the cached contents of an inode to be written back or
* destroyed
2009-04-03 19:42:44 +04:00
* - Called if either PG_private or fscache is set on the page
2009-04-03 19:42:41 +04:00
* - Caller holds page lock
* - Return 0 if successful , - error otherwise
*/
2022-02-09 23:21:57 +03:00
static int nfs_launder_folio ( struct folio * folio )
2007-01-11 10:15:39 +03:00
{
2022-02-09 23:21:57 +03:00
struct inode * inode = folio - > mapping - > host ;
2023-01-20 00:33:49 +03:00
int ret ;
2008-06-12 01:55:50 +04:00
2022-02-09 23:21:57 +03:00
dfprintk ( PAGECACHE , " NFS: launder_folio(%ld, %llu) \n " ,
inode - > i_ino , folio_pos ( folio ) ) ;
2008-06-12 01:55:50 +04:00
2022-02-09 23:21:57 +03:00
/* NOTE(review): presumably waits for any fscache activity on the folio before writing it back - confirm. */
folio_wait_fscache ( folio ) ;
2023-01-20 00:33:49 +03:00
/* Write the folio back and pass the result to both the tracepoint and the caller. */
ret = nfs_wb_folio ( inode , folio ) ;
trace_nfs_launder_folio_done ( inode , folio , ret ) ;
return ret ;
2006-03-20 21:44:04 +03:00
}
2012-08-01 03:45:12 +04:00
/*
 * Set up file as a swap file described by sis.
 *
 * Rejects the file with -EINVAL when its block count cannot cover its
 * size, activates swap on the RPC client, registers a single swap extent
 * covering sis->max pages, stores sis->pages in *span and flags the
 * swap area with SWP_FS_OPS.
 *
 * Returns the non-negative result of add_swap_extent() on success, or a
 * negative error from the checks and calls above.
 */
static int nfs_swap_activate ( struct swap_info_struct * sis , struct file * file ,
sector_t * span )
{
2020-01-02 11:04:26 +03:00
unsigned long blocks ;
long long isize ;
2022-05-10 04:20:48 +03:00
int ret ;
2022-03-07 02:41:44 +03:00
struct inode * inode = file_inode ( file ) ;
struct rpc_clnt * clnt = NFS_CLIENT ( inode ) ;
struct nfs_client * cl = NFS_SERVER ( inode ) - > nfs_client ;
2020-01-02 11:04:26 +03:00
/* Sample i_blocks and i_size together under i_lock. */
spin_lock ( & inode - > i_lock ) ;
blocks = inode - > i_blocks ;
isize = inode - > i_size ;
spin_unlock ( & inode - > i_lock ) ;
/*
 * Fewer 512-byte blocks than the size requires is reported as holes.
 * NOTE(review): blocks is unsigned long, so blocks * 512 may wrap where
 * unsigned long is 32 bits wide - confirm whether that matters here.
 */
if ( blocks * 512 < isize ) {
pr_warn ( " swap activate: swapfile has holes \n " ) ;
return - EINVAL ;
}
2014-09-10 17:03:55 +04:00
2022-05-10 04:20:48 +03:00
ret = rpc_clnt_swap_activate ( clnt ) ;
if ( ret )
return ret ;
ret = add_swap_extent ( sis , 0 , sis - > max , 0 ) ;
if ( ret < 0 ) {
/* Undo the RPC client activation before reporting the failure. */
rpc_clnt_swap_deactivate ( clnt ) ;
return ret ;
}
2014-09-10 17:03:55 +04:00
2022-05-10 04:20:48 +03:00
* span = sis - > pages ;
2022-03-07 02:41:44 +03:00
/* The enable_swap hook is optional; call it only when the rpc_ops table provides one. */
if ( cl - > rpc_ops - > enable_swap )
cl - > rpc_ops - > enable_swap ( inode ) ;
2022-05-10 04:20:48 +03:00
sis - > flags | = SWP_FS_OPS ;
return ret ;
2012-08-01 03:45:12 +04:00
}
/*
 * Counterpart of nfs_swap_activate(): deactivate swap on the RPC client
 * of the file and call the optional disable_swap hook for its inode.
 */
static void nfs_swap_deactivate ( struct file * file )
{
2022-03-07 02:41:44 +03:00
struct inode * inode = file_inode ( file ) ;
struct rpc_clnt * clnt = NFS_CLIENT ( inode ) ;
struct nfs_client * cl = NFS_SERVER ( inode ) - > nfs_client ;
2014-09-10 17:03:55 +04:00
2015-06-03 23:14:25 +03:00
rpc_clnt_swap_deactivate ( clnt ) ;
2022-03-07 02:41:44 +03:00
/* The hook is optional; file_inode(file) is the same inode as the local above. */
if ( cl - > rpc_ops - > disable_swap )
cl - > rpc_ops - > disable_swap ( file_inode ( file ) ) ;
2012-08-01 03:45:12 +04:00
}
2006-06-28 15:26:44 +04:00
/*
 * Address space operations table named nfs_file_aops.
 *
 * The write_begin, write_end, invalidate_folio, release_folio,
 * launder_folio, is_dirty_writeback and swap (de)activation entries are
 * the static functions defined above in this file; the remaining entries
 * refer to functions defined elsewhere.
 */
const struct address_space_operations nfs_file_aops = {
2022-04-29 18:12:16 +03:00
. read_folio = nfs_read_folio ,
2022-01-22 23:54:52 +03:00
. readahead = nfs_readahead ,
2022-02-09 23:22:03 +03:00
. dirty_folio = filemap_dirty_folio ,
2005-04-17 02:20:36 +04:00
. writepage = nfs_writepage ,
. writepages = nfs_writepages ,
2007-10-16 12:25:16 +04:00
. write_begin = nfs_write_begin ,
. write_end = nfs_write_end ,
2022-02-09 23:21:47 +03:00
. invalidate_folio = nfs_invalidate_folio ,
2022-05-01 06:53:28 +03:00
. release_folio = nfs_release_folio ,
2022-06-06 16:22:19 +03:00
. migrate_folio = nfs_migrate_folio ,
2022-02-09 23:21:57 +03:00
. launder_folio = nfs_launder_folio ,
2013-07-04 02:02:06 +04:00
. is_dirty_writeback = nfs_check_dirty_writeback ,
2009-09-16 13:50:17 +04:00
. error_remove_page = generic_error_remove_page ,
2012-08-01 03:45:12 +04:00
. swap_activate = nfs_swap_activate ,
. swap_deactivate = nfs_swap_deactivate ,
2022-05-10 04:20:48 +03:00
. swap_rw = nfs_swap_rw ,
2005-04-17 02:20:36 +04:00
} ;
2009-04-03 19:42:41 +04:00
/*
* Notification that a PTE pointing to an NFS page is about to be made
* writable, implying that someone is about to modify the page through a
* shared-writable mapping.
*
* Return values, as produced by the code below:
*   VM_FAULT_LOCKED - nfs_flush_incompatible() and nfs_update_folio() both
*                     returned 0; the folio is returned still locked.
*   VM_FAULT_RETRY  - the killable wait for the folio's fscache flag failed.
*   VM_FAULT_NOPAGE - the folio no longer belongs to this inode's mapping,
*                     or nfs_folio_length() reported a length of zero.
*   VM_FAULT_SIGBUS - the flush or the update returned non-zero.
*
* The whole operation is bracketed by sb_start_pagefault() and
* sb_end_pagefault() on the inode's superblock.
*/
2018-07-02 18:27:09 +03:00
static vm_fault_t nfs_vm_page_mkwrite ( struct vm_fault * vmf )
2007-07-23 01:09:05 +04:00
{
2017-02-25 01:56:41 +03:00
struct file * filp = vmf - > vma - > vm_file ;
2013-09-16 18:53:17 +04:00
struct inode * inode = file_inode ( filp ) ;
2007-07-23 01:09:05 +04:00
unsigned pagelen ;
2018-07-02 18:27:09 +03:00
/* Result reported by the out_unlock exits taken before ret is reassigned */
vm_fault_t ret = VM_FAULT_NOPAGE ;
2007-10-16 12:25:16 +04:00
struct address_space * mapping ;
2023-01-20 00:33:46 +03:00
struct folio * folio = page_folio ( vmf - > page ) ;
2007-07-23 01:09:05 +04:00
2013-12-17 21:20:16 +04:00
dfprintk ( PAGECACHE , " NFS: vm_page_mkwrite(%pD2(%lu), offset %lld) \n " ,
2023-01-20 00:33:46 +03:00
filp , filp - > f_mapping - > host - > i_ino ,
( long long ) folio_file_pos ( folio ) ) ;
2008-06-12 01:55:50 +04:00
2016-06-23 18:09:04 +03:00
sb_start_pagefault ( inode - > i_sb ) ;
2009-04-03 19:42:44 +04:00
/* make sure the cache has finished storing the page */
2023-01-20 00:33:46 +03:00
if ( folio_test_fscache ( folio ) & &
folio_wait_fscache_killable ( folio ) < 0 ) {
nfs: Convert to new fscache volume/cookie API
Change the nfs filesystem to support fscache's indexing rewrite and
reenable caching in nfs.
The following changes have been made:
(1) The fscache_netfs struct is no more, and there's no need to register
the filesystem as a whole.
(2) The session cookie is now an fscache_volume cookie, allocated with
fscache_acquire_volume(). That takes three parameters: a string
representing the "volume" in the index, a string naming the cache to
use (or NULL) and a u64 that conveys coherency metadata for the
volume.
For nfs, I've made it render the volume name string as:
"nfs,<ver>,<family>,<address>,<port>,<fsidH>,<fsidL>*<,param>[,<uniq>]"
(3) The fscache_cookie_def is no more and needed information is passed
directly to fscache_acquire_cookie(). The cache no longer calls back
into the filesystem, but rather metadata changes are indicated at
other times.
fscache_acquire_cookie() is passed the same keying and coherency
information as before.
(4) fscache_enable/disable_cookie() have been removed.
Call fscache_use_cookie() and fscache_unuse_cookie() when a file is
opened or closed to prevent a cache file from being culled and to keep
resources to hand that are needed to do I/O.
If a file is opened for writing, we invalidate it with
FSCACHE_INVAL_DIO_WRITE in lieu of doing writeback to the cache,
thereby making it cease caching until all currently open files are
closed. This should give the same behaviour as the uptream code.
Making the cache store local modifications isn't straightforward for
NFS, so that's left for future patches.
(5) fscache_invalidate() now needs to be given uptodate auxiliary data and
a file size. It also takes a flag to indicate if this was due to a
DIO write.
(6) Call nfs_fscache_invalidate() with FSCACHE_INVAL_DIO_WRITE on a file
to which a DIO write is made.
(7) Call fscache_note_page_release() from nfs_release_page().
(8) Use a killable wait in nfs_vm_page_mkwrite() when waiting for
PG_fscache to be cleared.
(9) The functions to read and write data to/from the cache are stubbed out
pending a conversion to use netfslib.
Changes
=======
ver #3:
- Added missing =n fallback for nfs_fscache_release_file()[1][2].
ver #2:
- Use gfpflags_allow_blocking() rather than using flag directly.
- fscache_acquire_volume() now returns errors.
- Remove NFS_INO_FSCACHE as it's no longer used.
- Need to unuse a cookie on file-release, not inode-clear.
Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Co-developed-by: David Howells <dhowells@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Acked-by: Jeff Layton <jlayton@kernel.org>
cc: Trond Myklebust <trond.myklebust@hammerspace.com>
cc: Anna Schumaker <anna.schumaker@netapp.com>
cc: linux-nfs@vger.kernel.org
cc: linux-cachefs@redhat.com
Link: https://lore.kernel.org/r/202112100804.nksO8K4u-lkp@intel.com/ [1]
Link: https://lore.kernel.org/r/202112100957.2oEDT20W-lkp@intel.com/ [2]
Link: https://lore.kernel.org/r/163819668938.215744.14448852181937731615.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/163906979003.143852.2601189243864854724.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/163967182112.1823006.7791504655391213379.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/164021575950.640689.12069642327533368467.stgit@warthog.procyon.org.uk/ # v4
2020-11-14 21:43:54 +03:00
ret = VM_FAULT_RETRY ;
goto out ;
}
2009-04-03 19:42:44 +04:00
2015-03-03 08:06:35 +03:00
/*
* Wait (killable) for NFS_INO_INVALIDATING to clear in the inode flags.
* The return value of the wait is not checked here.
*/
wait_on_bit_action ( & NFS_I ( inode ) - > flags , NFS_INO_INVALIDATING ,
freezer,sched: Rewrite core freezer logic
Rewrite the core freezer to behave better wrt thawing and be simpler
in general.
By replacing PF_FROZEN with TASK_FROZEN, a special block state, it is
ensured frozen tasks stay frozen until thawed and don't randomly wake
up early, as is currently possible.
As such, it does away with PF_FROZEN and PF_FREEZER_SKIP, freeing up
two PF_flags (yay!).
Specifically; the current scheme works a little like:
freezer_do_not_count();
schedule();
freezer_count();
And either the task is blocked, or it lands in try_to_freezer()
through freezer_count(). Now, when it is blocked, the freezer
considers it frozen and continues.
However, on thawing, once pm_freezing is cleared, freezer_count()
stops working, and any random/spurious wakeup will let a task run
before its time.
That is, thawing tries to thaw things in explicit order; kernel
threads and workqueues before doing bringing SMP back before userspace
etc.. However due to the above mentioned races it is entirely possible
for userspace tasks to thaw (by accident) before SMP is back.
This can be a fatal problem in asymmetric ISA architectures (eg ARMv9)
where the userspace task requires a special CPU to run.
As said; replace this with a special task state TASK_FROZEN and add
the following state transitions:
TASK_FREEZABLE -> TASK_FROZEN
__TASK_STOPPED -> TASK_FROZEN
__TASK_TRACED -> TASK_FROZEN
The new TASK_FREEZABLE can be set on any state part of TASK_NORMAL
(IOW. TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE) -- any such state
is already required to deal with spurious wakeups and the freezer
causes one such when thawing the task (since the original state is
lost).
The special __TASK_{STOPPED,TRACED} states *can* be restored since
their canonical state is in ->jobctl.
With this, frozen tasks need an explicit TASK_FROZEN wakeup and are
free of undue (early / spurious) wakeups.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/20220822114649.055452969@infradead.org
2022-08-22 14:18:22 +03:00
nfs_wait_bit_killable ,
TASK_KILLABLE | TASK_FREEZABLE_UNSAFE ) ;
2015-03-03 08:06:35 +03:00
2023-01-20 00:33:46 +03:00
/*
* With the folio locked, give up (VM_FAULT_NOPAGE) if it no longer
* belongs to this inode's mapping.
*/
folio_lock ( folio ) ;
mapping = folio_file_mapping ( folio ) ;
2013-09-16 18:53:17 +04:00
if ( mapping ! = inode - > i_mapping )
2008-01-23 01:13:06 +03:00
goto out_unlock ;
2023-01-20 00:33:46 +03:00
folio_wait_writeback ( folio ) ;
2012-01-18 07:04:26 +04:00
2023-01-20 00:33:46 +03:00
/* A zero length from nfs_folio_length() also ends in VM_FAULT_NOPAGE */
pagelen = nfs_folio_length ( folio ) ;
2008-01-23 01:13:06 +03:00
if ( pagelen = = 0 )
goto out_unlock ;
2007-10-16 12:25:16 +04:00
2010-10-05 01:59:08 +04:00
/*
* Success path: jump straight to "out", skipping out_unlock, so the
* folio is handed back still locked together with VM_FAULT_LOCKED.
*/
ret = VM_FAULT_LOCKED ;
2023-01-20 00:33:43 +03:00
if ( nfs_flush_incompatible ( filp , folio ) = = 0 & &
nfs_update_folio ( filp , folio , 0 , pagelen ) = = 0 )
2010-10-05 01:59:08 +04:00
goto out ;
2008-01-23 01:13:06 +03:00
2010-10-05 01:59:08 +04:00
/* Either the flush or the update returned non-zero */
ret = VM_FAULT_SIGBUS ;
2008-01-23 01:13:06 +03:00
out_unlock :
2023-01-20 00:33:46 +03:00
folio_unlock ( folio ) ;
2010-10-05 01:59:08 +04:00
out :
2016-06-23 18:09:04 +03:00
sb_end_pagefault ( inode - > i_sb ) ;
2010-10-05 01:59:08 +04:00
return ret ;
2007-07-23 01:09:05 +04:00
}
2009-09-27 22:29:37 +04:00
/*
* VM operations table: page faults and map_pages use the generic filemap
* handlers, while write-enabling a page goes through nfs_vm_page_mkwrite().
*/
static const struct vm_operations_struct nfs_file_vm_ops = {
2007-07-23 01:09:05 +04:00
. fault = filemap_fault ,
2014-04-08 02:37:19 +04:00
. map_pages = filemap_map_pages ,
2007-07-23 01:09:05 +04:00
. page_mkwrite = nfs_vm_page_mkwrite ,
} ;
2014-04-03 22:07:25 +04:00
/*
* Write the data described by @from to the file behind @iocb.
*
* IOCB_DIRECT requests are handed to nfs_file_direct_write(). All other
* requests go through generic_write_checks() and generic_perform_write()
* between nfs_start_io_write() and nfs_end_io_write(). Depending on the
* NFS_MOUNT_WRITE_EAGER and NFS_MOUNT_WRITE_WAIT mount flags, writeback of
* the affected range is then started and/or waited for before
* generic_write_sync() is called.
*
* Returns a negative errno if nfs_key_timeout_notify(),
* nfs_revalidate_file_size() or generic_write_sync() fails, and -ETXTBSY
* when the inode is an active swap file. Otherwise returns the value left
* in "result" by the last step that ran, unless a writeback error of
* -EDQUOT, -EFBIG or -ENOSPC recorded since the write began replaces it
* (see the "out" label).
*/
ssize_t nfs_file_write ( struct kiocb * iocb , struct iov_iter * from )
2005-04-17 02:20:36 +04:00
{
2013-09-16 18:53:17 +04:00
struct file * file = iocb - > ki_filp ;
struct inode * inode = file_inode ( file ) ;
2021-02-13 00:49:48 +03:00
unsigned int mntflags = NFS_SERVER ( inode ) - > flags ;
ssize_t result , written ;
2020-08-01 14:10:39 +03:00
errseq_t since ;
int error ;
2005-04-17 02:20:36 +04:00
2013-09-16 18:53:17 +04:00
result = nfs_key_timeout_notify ( file , inode ) ;
2013-08-14 19:59:16 +04:00
if ( result )
return result ;
2016-06-23 17:35:48 +03:00
if ( iocb - > ki_flags & IOCB_DIRECT )
2022-03-07 02:41:44 +03:00
return nfs_file_direct_write ( iocb , from , false ) ;
2005-04-17 02:20:36 +04:00
2014-03-05 06:53:33 +04:00
dprintk ( " NFS: write(%pD2, %zu@%Ld) \n " ,
2016-06-23 22:00:42 +03:00
file , iov_iter_count ( from ) , ( long long ) iocb - > ki_pos ) ;
2005-04-17 02:20:36 +04:00
if ( IS_SWAPFILE ( inode ) )
goto out_swapfile ;
2005-06-22 21:16:30 +04:00
/*
* O_APPEND implies that we must revalidate the file length .
* The same is done when the write starts beyond the cached file size.
*/
2021-02-08 16:55:46 +03:00
if ( iocb - > ki_flags & IOCB_APPEND | | iocb - > ki_pos > i_size_read ( inode ) ) {
2013-09-16 18:53:17 +04:00
result = nfs_revalidate_file_size ( inode , file ) ;
2005-06-22 21:16:30 +04:00
if ( result )
2022-05-14 17:27:02 +03:00
return result ;
2005-06-22 21:16:30 +04:00
}
2005-04-17 02:20:36 +04:00
2021-02-08 16:55:47 +03:00
nfs_clear_invalid_mapping ( file - > f_mapping ) ;
2020-08-01 14:10:39 +03:00
/*
* Sample the mapping's writeback error state now so that the check at
* "out" only sees errors recorded after this point.
*/
since = filemap_sample_wb_err ( file - > f_mapping ) ;
2016-06-04 00:07:19 +03:00
nfs_start_io_write ( inode ) ;
2016-06-23 22:00:42 +03:00
result = generic_write_checks ( iocb , from ) ;
2023-06-01 17:58:53 +03:00
if ( result > 0 )
2022-02-20 07:19:49 +03:00
result = generic_perform_write ( iocb , from ) ;
2016-06-04 00:07:19 +03:00
nfs_end_io_write ( inode ) ;
2016-06-23 22:00:42 +03:00
if ( result < = 0 )
2005-04-17 02:20:36 +04:00
goto out ;
2016-09-03 19:05:31 +03:00
written = result ;
2022-05-14 17:27:02 +03:00
nfs_add_stats ( inode , NFSIOS_NORMALWRITTENBYTES , written ) ;
2021-02-13 00:49:48 +03:00
/*
* Start writeback of the "written" bytes that end at the current ki_pos.
* NOTE(review): presumably ki_pos was advanced by the write above, making
* this exactly the range just written - confirm.
*/
if ( mntflags & NFS_MOUNT_WRITE_EAGER ) {
result = filemap_fdatawrite_range ( file - > f_mapping ,
iocb - > ki_pos - written ,
iocb - > ki_pos - 1 ) ;
if ( result < 0 )
goto out ;
}
/* Wait on the same range; the return value of the wait is not used */
if ( mntflags & NFS_MOUNT_WRITE_WAIT ) {
2022-08-10 14:40:01 +03:00
filemap_fdatawait_range ( file - > f_mapping ,
iocb - > ki_pos - written ,
iocb - > ki_pos - 1 ) ;
2021-02-13 00:49:48 +03:00
}
2017-09-07 16:29:23 +03:00
/* A sync failure returns directly, bypassing the error check at "out" */
result = generic_write_sync ( iocb , written ) ;
if ( result < 0 )
2022-05-14 17:27:02 +03:00
return result ;
2010-02-01 22:17:41 +03:00
2022-05-14 17:27:02 +03:00
out :
2015-08-18 00:55:18 +03:00
/* Return error values */
2020-08-01 14:10:39 +03:00
error = filemap_check_wb_err ( file - > f_mapping , since ) ;
2022-05-14 17:27:02 +03:00
/*
* Only -EDQUOT, -EFBIG and -ENOSPC are acted on: flush all of the inode's
* pending writes, then report the file's writeback error (if any) in
* place of the result computed above.
*/
switch ( error ) {
default :
break ;
case - EDQUOT :
case - EFBIG :
case - ENOSPC :
nfs_wb_all ( inode ) ;
error = file_check_and_advance_wb_err ( file ) ;
if ( error < 0 )
result = error ;
2006-12-05 08:35:40 +03:00
}
2005-04-17 02:20:36 +04:00
return result ;
out_swapfile :
printk ( KERN_INFO " NFS: attempt to write to active swap file! \n " ) ;
2019-11-09 00:02:24 +03:00
return - ETXTBSY ;
2005-04-17 02:20:36 +04:00
}
2012-07-31 00:05:25 +04:00
EXPORT_SYMBOL_GPL ( nfs_file_write ) ;
2005-04-17 02:20:36 +04:00
2010-09-23 16:55:58 +04:00
/*
* Test whether the lock described by @fl would conflict with an existing
* lock (called from nfs_lock() for IS_GETLK() commands).
*
* The local lock state is consulted first through posix_test_lock(). If it
* reports a conflict, the function returns 0 with @fl as filled in by that
* call. Otherwise the protocol's ->lock() operation is called, unless a
* read delegation is held on the inode or @is_local is set; in those two
* cases fl->fl_type is set to F_UNLCK and 0 is returned.
*
* Returns 0, or the status of the protocol's ->lock() call.
*/
static int
do_getlk ( struct file * filp , int cmd , struct file_lock * fl , int is_local )
2005-04-17 02:20:36 +04:00
{
struct inode * inode = filp - > f_mapping - > host ;
int status = 0 ;
2010-11-29 00:04:05 +03:00
unsigned int saved_type = fl - > fl_type ;
2005-04-17 02:20:36 +04:00
2005-10-19 01:20:16 +04:00
/* Try local locking first */
2007-05-12 00:09:32 +04:00
posix_test_lock ( filp , fl ) ;
if ( fl - > fl_type ! = F_UNLCK ) {
/* found a conflict */
2005-10-19 01:20:16 +04:00
goto out ;
2005-04-17 02:20:36 +04:00
}
2010-11-29 00:04:05 +03:00
/*
* No local conflict: fl->fl_type is F_UNLCK at this point, so put back
* the type the caller asked about before going any further.
*/
fl - > fl_type = saved_type ;
2005-10-19 01:20:16 +04:00
2012-06-20 23:53:43 +04:00
if ( NFS_PROTO ( inode ) - > have_delegation ( inode , FMODE_READ ) )
2005-10-19 01:20:16 +04:00
goto out_noconflict ;
2010-09-23 16:55:58 +04:00
if ( is_local )
2005-10-19 01:20:16 +04:00
goto out_noconflict ;
status = NFS_PROTO ( inode ) - > lock ( filp , cmd , fl ) ;
out :
2005-04-17 02:20:36 +04:00
return status ;
2005-10-19 01:20:16 +04:00
out_noconflict :
fl - > fl_type = F_UNLCK ;
goto out ;
2005-04-17 02:20:36 +04:00
}
2010-09-23 16:55:58 +04:00
/*
* Release the lock described by @fl.
*
* All pending writes on the inode are flushed first (the result of
* nfs_wb_all() is not checked), then outstanding I/O on the lock context
* is waited for. If the lock context cannot be obtained, that wait is
* skipped. The unlock itself goes to the protocol's ->lock() operation, or
* to locks_lock_file_wait() when @is_local is set.
*
* Returns the status of the unlock call, or the negative status of an
* interrupted I/O wait when the request does not carry FL_CLOSE.
*/
static int
do_unlk ( struct file * filp , int cmd , struct file_lock * fl , int is_local )
2005-04-17 02:20:36 +04:00
{
struct inode * inode = filp - > f_mapping - > host ;
2013-04-09 05:49:53 +04:00
struct nfs_lock_context * l_ctx ;
2005-04-17 02:20:36 +04:00
int status ;
/*
* Flush all pending writes before doing anything
* with locks . .
*/
2019-04-07 20:59:04 +03:00
nfs_wb_all ( inode ) ;
2005-04-17 02:20:36 +04:00
2013-04-09 05:49:53 +04:00
l_ctx = nfs_get_lock_context ( nfs_file_open_context ( filp ) ) ;
if ( ! IS_ERR ( l_ctx ) ) {
2016-01-06 18:40:18 +03:00
status = nfs_iocounter_wait ( l_ctx ) ;
2013-04-09 05:49:53 +04:00
nfs_put_lock_context ( l_ctx ) ;
2017-04-11 19:50:12 +03:00
/* NOTE: special case
* If we ' re signalled while cleaning up locks on process exit , we
* still need to complete the unlock .
*/
if ( status < 0 & & ! ( fl - > fl_flags & FL_CLOSE ) )
2013-04-09 05:49:53 +04:00
return status ;
}
2010-09-23 16:55:58 +04:00
/*
* Use local locking if mounted with " -onolock " or with appropriate
* " -olocal_lock= "
*/
if ( ! is_local )
2005-04-17 02:20:36 +04:00
status = NFS_PROTO ( inode ) - > lock ( filp , cmd , fl ) ;
else
2016-09-18 01:17:32 +03:00
status = locks_lock_file_wait ( filp , fl ) ;
2005-04-17 02:20:36 +04:00
return status ;
}
2010-09-23 16:55:58 +04:00
/*
* Acquire the lock described by @fl.
*
* The mapping is synced before the lock is requested; a non-zero result
* from that sync is returned without attempting the lock. The lock request
* goes to the protocol's ->lock() operation, or to locks_lock_file_wait()
* when @is_local is set. After a successful lock the mapping is synced
* again (result ignored) and, unless a read delegation is held, the
* inode's caches are zapped; if the file is currently mapped, the mapping
* is revalidated as well.
*
* Returns the non-zero status of the initial sync, otherwise the status
* of the lock call.
*/
static int
do_setlk ( struct file * filp , int cmd , struct file_lock * fl , int is_local )
2005-04-17 02:20:36 +04:00
{
struct inode * inode = filp - > f_mapping - > host ;
int status ;
/*
* Flush all pending writes before doing anything
* with locks . .
*/
2005-12-14 00:13:54 +03:00
status = nfs_sync_mapping ( filp - > f_mapping ) ;
if ( status ! = 0 )
2005-04-17 02:20:36 +04:00
goto out ;
2010-09-23 16:55:58 +04:00
/*
* Use local locking if mounted with " -onolock " or with appropriate
* " -olocal_lock= "
*/
if ( ! is_local )
2005-04-17 02:20:36 +04:00
status = NFS_PROTO ( inode ) - > lock ( filp , cmd , fl ) ;
2008-04-02 04:26:52 +04:00
else
2016-09-18 01:17:32 +03:00
status = locks_lock_file_wait ( filp , fl ) ;
2005-04-17 02:20:36 +04:00
if ( status < 0 )
goto out ;
2010-10-13 03:30:05 +04:00
2005-04-17 02:20:36 +04:00
/*
2017-08-18 10:12:52 +03:00
* Invalidate cache to prevent missing any changes . If
* the file is mapped , clear the page cache as well so
* those mappings will be loaded .
2010-10-13 03:30:05 +04:00
*
2005-04-17 02:20:36 +04:00
* This makes locking act as a cache coherency point .
*/
2005-12-14 00:13:54 +03:00
/* Second sync, after the lock is held; its result is deliberately unused */
nfs_sync_mapping ( filp - > f_mapping ) ;
2017-08-18 10:12:52 +03:00
/* The invalidation is skipped entirely while a read delegation is held */
if ( ! NFS_PROTO ( inode ) - > have_delegation ( inode , FMODE_READ ) ) {
NFS: invalidate file size when taking a lock.
Prior to commit ca0daa277aca ("NFS: Cache aggressively when file is open
for writing"), NFS would revalidate, or invalidate, the file size when
taking a lock. Since that commit it only invalidates the file content.
If the file size is changed on the server while wait for the lock, the
client will have an incorrect understanding of the file size and could
corrupt data. This particularly happens when writing beyond the
(supposed) end of file and can be easily be demonstrated with
posix_fallocate().
If an application opens an empty file, waits for a write lock, and then
calls posix_fallocate(), glibc will determine that the underlying
filesystem doesn't support fallocate (assuming version 4.1 or earlier)
and will write out a '0' byte at the end of each 4K page in the region
being fallocated that is after the end of the file.
NFS will (usually) detect that these writes are beyond EOF and will
expand them to cover the whole page, and then will merge the pages.
Consequently, NFS will write out large blocks of zeroes beyond where it
thought EOF was. If EOF had moved, the pre-existing part of the file
will be over-written. Locking should have protected against this,
but it doesn't.
This patch restores the use of nfs_zap_caches() which invalidated the
cached attributes. When posix_fallocate() asks for the file size, the
request will go to the server and get a correct answer.
cc: stable@vger.kernel.org (v4.8+)
Fixes: ca0daa277aca ("NFS: Cache aggressively when file is open for writing")
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2017-07-24 06:18:50 +03:00
nfs_zap_caches ( inode ) ;
2017-08-18 10:12:52 +03:00
if ( mapping_mapped ( filp - > f_mapping ) )
nfs_revalidate_mapping ( inode , filp - > f_mapping ) ;
}
2005-04-17 02:20:36 +04:00
out :
return status ;
}
/*
* Lock a (portion of) a file.
*
* Entry point for the ->lock file operation. Requests carrying FL_RECLAIM
* are refused with -ENOGRACE. Local lock handling is selected when the
* mount has NFS_MOUNT_LOCAL_FCNTL set. If the protocol provides
* ->lock_check_bounds(), a negative result from it is returned as is.
* The request is then dispatched:
*   - IS_GETLK(cmd)           -> do_getlk()
*   - fl->fl_type == F_UNLCK  -> do_unlk()
*   - anything else           -> do_setlk()
*
* Returns the status of the helper that handled the request, or one of
* the early errors described above.
*/
2012-07-17 00:39:15 +04:00
int nfs_lock ( struct file * filp , int cmd , struct file_lock * fl )
2005-04-17 02:20:36 +04:00
{
2008-06-12 01:55:58 +04:00
struct inode * inode = filp - > f_mapping - > host ;
2008-05-21 03:34:39 +04:00
/*
* NOTE(review): every path below assigns ret before it is returned, so
* this -ENOLCK initializer does not appear to be observable.
*/
int ret = - ENOLCK ;
2010-09-23 16:55:58 +04:00
int is_local = 0 ;
2005-04-17 02:20:36 +04:00
2013-09-16 18:53:17 +04:00
dprintk ( " NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld) \n " ,
filp , fl - > fl_type , fl - > fl_flags ,
2005-04-17 02:20:36 +04:00
( long long ) fl - > fl_start , ( long long ) fl - > fl_end ) ;
2008-06-12 01:55:58 +04:00
2006-03-20 21:44:14 +03:00
nfs_inc_stats ( inode , NFSIOS_VFSLOCK ) ;
2005-04-17 02:20:36 +04:00
2021-08-21 00:02:06 +03:00
if ( fl - > fl_flags & FL_RECLAIM )
return - ENOGRACE ;
2010-09-23 16:55:58 +04:00
if ( NFS_SERVER ( inode ) - > flags & NFS_MOUNT_LOCAL_FCNTL )
is_local = 1 ;
2008-05-21 03:34:39 +04:00
/* The bounds check is optional: not every protocol version provides it */
if ( NFS_PROTO ( inode ) - > lock_check_bounds ! = NULL ) {
ret = NFS_PROTO ( inode ) - > lock_check_bounds ( fl ) ;
if ( ret < 0 )
goto out_err ;
}
2005-04-17 02:20:36 +04:00
if ( IS_GETLK ( cmd ) )
2010-09-23 16:55:58 +04:00
ret = do_getlk ( filp , cmd , fl , is_local ) ;
2008-05-21 03:34:39 +04:00
else if ( fl - > fl_type = = F_UNLCK )
2010-09-23 16:55:58 +04:00
ret = do_unlk ( filp , cmd , fl , is_local ) ;
2008-05-21 03:34:39 +04:00
else
2010-09-23 16:55:58 +04:00
ret = do_setlk ( filp , cmd , fl , is_local ) ;
2008-05-21 03:34:39 +04:00
out_err :
return ret ;
2005-04-17 02:20:36 +04:00
}
2012-07-31 00:05:25 +04:00
EXPORT_SYMBOL_GPL ( nfs_lock ) ;
2005-04-17 02:20:36 +04:00
/*
* Apply or remove a flock()-style lock on a file.
*
* Entry point for the ->flock file operation. Requests without FL_FLOCK
* set are rejected with -ENOLCK. Local lock handling is selected when the
* mount has NFS_MOUNT_LOCAL_FLOCK set. An F_UNLCK request is passed to
* do_unlk(), every other request to do_setlk().
*
* Returns -ENOLCK, or the status of the helper that handled the request.
*/
2012-07-17 00:39:15 +04:00
int nfs_flock ( struct file * filp , int cmd , struct file_lock * fl )
2005-04-17 02:20:36 +04:00
{
2010-09-23 16:55:58 +04:00
struct inode * inode = filp - > f_mapping - > host ;
int is_local = 0 ;
2013-09-16 18:53:17 +04:00
dprintk ( " NFS: flock(%pD2, t=%x, fl=%x) \n " ,
filp , fl - > fl_type , fl - > fl_flags ) ;
2005-04-17 02:20:36 +04:00
if ( ! ( fl - > fl_flags & FL_FLOCK ) )
return - ENOLCK ;
2010-09-23 16:55:58 +04:00
if ( NFS_SERVER ( inode ) - > flags & NFS_MOUNT_LOCAL_FLOCK )
is_local = 1 ;
2017-11-10 14:27:49 +03:00
/* We're simulating flock() locks using posix locks on the server */
if ( fl - > fl_type = = F_UNLCK )
2010-09-23 16:55:58 +04:00
return do_unlk ( filp , cmd , fl , is_local ) ;
return do_setlk ( filp , cmd , fl , is_local ) ;
2005-04-17 02:20:36 +04:00
}
2012-07-31 00:05:25 +04:00
EXPORT_SYMBOL_GPL ( nfs_flock ) ;
2007-06-08 23:23:34 +04:00
2011-11-04 21:31:22 +04:00
/*
* File operations table for NFS regular files. Most entries are NFS-specific
* handlers; ->splice_write and ->setlease use the generic
* iter_file_splice_write() and simple_nosetlease() helpers.
*/
const struct file_operations nfs_file_operations = {
. llseek = nfs_file_llseek ,
2014-04-03 04:14:12 +04:00
. read_iter = nfs_file_read ,
2014-04-03 22:07:25 +04:00
. write_iter = nfs_file_write ,
2011-11-04 21:31:22 +04:00
. mmap = nfs_file_mmap ,
. open = nfs_file_open ,
. flush = nfs_file_flush ,
. release = nfs_file_release ,
. fsync = nfs_file_fsync ,
. lock = nfs_lock ,
. flock = nfs_flock ,
2023-05-22 16:50:07 +03:00
. splice_read = nfs_file_splice_read ,
2014-04-05 12:37:17 +04:00
. splice_write = iter_file_splice_write ,
2011-11-04 21:31:22 +04:00
. check_flags = nfs_check_flags ,
2014-08-27 14:49:41 +04:00
. setlease = simple_nosetlease ,
2011-11-04 21:31:22 +04:00
} ;
2012-07-31 00:05:23 +04:00
EXPORT_SYMBOL_GPL ( nfs_file_operations ) ;