2022-02-21 11:38:17 +00:00
// SPDX-License-Identifier: GPL-2.0-or-later
/* Unbuffered and direct write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
# include <linux/export.h>
# include <linux/uio.h>
# include "internal.h"
static void netfs_cleanup_dio_write ( struct netfs_io_request * wreq )
{
struct inode * inode = wreq - > inode ;
2024-05-21 14:37:43 +01:00
unsigned long long end = wreq - > start + wreq - > transferred ;
2022-02-21 11:38:17 +00:00
if ( ! wreq - > error & &
i_size_read ( inode ) < end ) {
if ( wreq - > netfs_ops - > update_i_size )
wreq - > netfs_ops - > update_i_size ( inode , end ) ;
else
i_size_write ( inode , end ) ;
}
}
/*
* Perform an unbuffered write where we may have to do an RMW operation on an
* encrypted file . This can also be used for direct I / O writes .
*/
2024-05-15 18:06:03 -05:00
ssize_t netfs_unbuffered_write_iter_locked ( struct kiocb * iocb , struct iov_iter * iter ,
2024-01-05 14:57:14 +00:00
struct netfs_group * netfs_group )
2022-02-21 11:38:17 +00:00
{
struct netfs_io_request * wreq ;
unsigned long long start = iocb - > ki_pos ;
unsigned long long end = start + iov_iter_count ( iter ) ;
ssize_t ret , n ;
2024-03-08 12:36:05 +00:00
size_t len = iov_iter_count ( iter ) ;
2022-02-21 11:38:17 +00:00
bool async = ! is_sync_kiocb ( iocb ) ;
2024-07-18 21:07:32 +01:00
_enter ( " " ) ;
2022-02-21 11:38:17 +00:00
/* We're going to need a bounce buffer if what we transmit is going to
* be different in some way to the source buffer , e . g . because it gets
* encrypted / compressed or because it needs expanding to a block size .
*/
// TODO
2024-07-18 21:07:32 +01:00
_debug ( " uw %llx-%llx " , start , end ) ;
2022-02-21 11:38:17 +00:00
2024-03-08 12:36:05 +00:00
wreq = netfs_create_write_req ( iocb - > ki_filp - > f_mapping , iocb - > ki_filp , start ,
iocb - > ki_flags & IOCB_DIRECT ?
NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE ) ;
2022-02-21 11:38:17 +00:00
if ( IS_ERR ( wreq ) )
return PTR_ERR ( wreq ) ;
2024-03-08 12:36:05 +00:00
wreq - > io_streams [ 0 ] . avail = true ;
trace_netfs_write ( wreq , ( iocb - > ki_flags & IOCB_DIRECT ?
netfs_write_trace_dio_write :
netfs_write_trace_unbuffered_write ) ) ;
2022-02-21 11:38:17 +00:00
{
/* If this is an async op and we're not using a bounce buffer,
* we have to save the source buffer as the iterator is only
* good until we return . In such a case , extract an iterator
* to represent as much of the the output buffer as we can
* manage . Note that the extraction might not be able to
* allocate a sufficiently large bvec array and may shorten the
* request .
*/
if ( async | | user_backed_iter ( iter ) ) {
2024-03-08 12:36:05 +00:00
n = netfs_extract_user_iter ( iter , len , & wreq - > iter , 0 ) ;
2022-02-21 11:38:17 +00:00
if ( n < 0 ) {
ret = n ;
goto out ;
}
wreq - > direct_bv = ( struct bio_vec * ) wreq - > iter . bvec ;
wreq - > direct_bv_count = n ;
wreq - > direct_bv_unpin = iov_iter_extract_will_pin ( iter ) ;
} else {
wreq - > iter = * iter ;
}
wreq - > io_iter = wreq - > iter ;
}
2024-03-08 12:36:05 +00:00
__set_bit ( NETFS_RREQ_USE_IO_ITER , & wreq - > flags ) ;
2022-02-21 11:38:17 +00:00
/* Copy the data into the bounce buffer and encrypt it. */
// TODO
/* Dispatch the write. */
__set_bit ( NETFS_RREQ_UPLOAD_TO_SERVER , & wreq - > flags ) ;
if ( async )
wreq - > iocb = iocb ;
2024-05-20 16:12:56 +01:00
wreq - > len = iov_iter_count ( & wreq - > io_iter ) ;
2022-02-21 11:38:17 +00:00
wreq - > cleanup = netfs_cleanup_dio_write ;
2024-05-20 16:12:56 +01:00
ret = netfs_unbuffered_write ( wreq , is_sync_kiocb ( iocb ) , wreq - > len ) ;
2022-02-21 11:38:17 +00:00
if ( ret < 0 ) {
2024-07-18 21:07:32 +01:00
_debug ( " begin = %zd " , ret ) ;
2022-02-21 11:38:17 +00:00
goto out ;
}
if ( ! async ) {
trace_netfs_rreq ( wreq , netfs_rreq_trace_wait_ip ) ;
wait_on_bit ( & wreq - > flags , NETFS_RREQ_IN_PROGRESS ,
TASK_UNINTERRUPTIBLE ) ;
2024-03-08 12:36:05 +00:00
smp_rmb ( ) ; /* Read error/transferred after RIP flag */
2022-02-21 11:38:17 +00:00
ret = wreq - > error ;
if ( ret = = 0 ) {
ret = wreq - > transferred ;
iocb - > ki_pos + = ret ;
}
} else {
ret = - EIOCBQUEUED ;
}
out :
netfs_put_request ( wreq , false , netfs_rreq_trace_put_return ) ;
return ret ;
}
2024-05-15 18:06:03 -05:00
EXPORT_SYMBOL ( netfs_unbuffered_write_iter_locked ) ;
2022-02-21 11:38:17 +00:00
/**
* netfs_unbuffered_write_iter - Unbuffered write to a file
* @ iocb : IO state structure
* @ from : iov_iter with data to write
*
* Do an unbuffered write to a file , writing the data directly to the server
* and not lodging the data in the pagecache .
*
* Return :
* * Negative error code if no data has been written at all of
* vfs_fsync_range ( ) failed for a synchronous write
* * Number of bytes written , even for truncated writes
*/
ssize_t netfs_unbuffered_write_iter ( struct kiocb * iocb , struct iov_iter * from )
{
struct file * file = iocb - > ki_filp ;
mm: Provide a means of invalidation without using launder_folio
Implement a replacement for launder_folio. The key feature of
invalidate_inode_pages2() is that it locks each folio individually, unmaps
it to prevent mmap'd accesses interfering and calls the ->launder_folio()
address_space op to flush it. This has problems: firstly, each folio is
written individually as one or more small writes; secondly, adjacent folios
cannot be added so easily into the laundry; thirdly, it's yet another op to
implement.
Instead, use the invalidate lock to cause anyone wanting to add a folio to
the inode to wait, then unmap all the folios if we have mmaps, then,
conditionally, use ->writepages() to flush any dirty data back and then
discard all pages.
The invalidate lock prevents ->read_iter(), ->write_iter() and faulting
through mmap all from adding pages for the duration.
This is then used from netfslib to handle the flusing in unbuffered and
direct writes.
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Miklos Szeredi <miklos@szeredi.hu>
cc: Trond Myklebust <trond.myklebust@hammerspace.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Andrew Morton <akpm@linux-foundation.org>
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Christian Brauner <brauner@kernel.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-mm@kvack.org
cc: linux-fsdevel@vger.kernel.org
cc: netfs@lists.linux.dev
cc: v9fs@lists.linux.dev
cc: linux-afs@lists.infradead.org
cc: ceph-devel@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: linux-nfs@vger.kernel.org
cc: devel@lists.orangefs.org
2024-03-27 08:51:38 +00:00
struct address_space * mapping = file - > f_mapping ;
struct inode * inode = mapping - > host ;
2022-02-21 11:38:17 +00:00
struct netfs_inode * ictx = netfs_inode ( inode ) ;
ssize_t ret ;
mm: Provide a means of invalidation without using launder_folio
Implement a replacement for launder_folio. The key feature of
invalidate_inode_pages2() is that it locks each folio individually, unmaps
it to prevent mmap'd accesses interfering and calls the ->launder_folio()
address_space op to flush it. This has problems: firstly, each folio is
written individually as one or more small writes; secondly, adjacent folios
cannot be added so easily into the laundry; thirdly, it's yet another op to
implement.
Instead, use the invalidate lock to cause anyone wanting to add a folio to
the inode to wait, then unmap all the folios if we have mmaps, then,
conditionally, use ->writepages() to flush any dirty data back and then
discard all pages.
The invalidate lock prevents ->read_iter(), ->write_iter() and faulting
through mmap all from adding pages for the duration.
This is then used from netfslib to handle the flusing in unbuffered and
direct writes.
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Miklos Szeredi <miklos@szeredi.hu>
cc: Trond Myklebust <trond.myklebust@hammerspace.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Andrew Morton <akpm@linux-foundation.org>
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Christian Brauner <brauner@kernel.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-mm@kvack.org
cc: linux-fsdevel@vger.kernel.org
cc: netfs@lists.linux.dev
cc: v9fs@lists.linux.dev
cc: linux-afs@lists.infradead.org
cc: ceph-devel@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: linux-nfs@vger.kernel.org
cc: devel@lists.orangefs.org
2024-03-27 08:51:38 +00:00
loff_t pos = iocb - > ki_pos ;
unsigned long long end = pos + iov_iter_count ( from ) - 1 ;
2022-02-21 11:38:17 +00:00
2024-07-18 21:07:32 +01:00
_enter ( " %llx,%zx,%llx " , pos , iov_iter_count ( from ) , i_size_read ( inode ) ) ;
2022-02-21 11:38:17 +00:00
2024-01-29 09:49:19 +00:00
if ( ! iov_iter_count ( from ) )
return 0 ;
2022-02-21 11:38:17 +00:00
trace_netfs_write_iter ( iocb , from ) ;
2024-03-26 08:48:44 +00:00
netfs_stat ( & netfs_n_wh_dio_write ) ;
2022-02-21 11:38:17 +00:00
ret = netfs_start_io_direct ( inode ) ;
if ( ret < 0 )
return ret ;
ret = generic_write_checks ( iocb , from ) ;
2024-01-29 09:49:19 +00:00
if ( ret < = 0 )
2022-02-21 11:38:17 +00:00
goto out ;
ret = file_remove_privs ( file ) ;
if ( ret < 0 )
goto out ;
ret = file_update_time ( file ) ;
if ( ret < 0 )
goto out ;
mm: Provide a means of invalidation without using launder_folio
Implement a replacement for launder_folio. The key feature of
invalidate_inode_pages2() is that it locks each folio individually, unmaps
it to prevent mmap'd accesses interfering and calls the ->launder_folio()
address_space op to flush it. This has problems: firstly, each folio is
written individually as one or more small writes; secondly, adjacent folios
cannot be added so easily into the laundry; thirdly, it's yet another op to
implement.
Instead, use the invalidate lock to cause anyone wanting to add a folio to
the inode to wait, then unmap all the folios if we have mmaps, then,
conditionally, use ->writepages() to flush any dirty data back and then
discard all pages.
The invalidate lock prevents ->read_iter(), ->write_iter() and faulting
through mmap all from adding pages for the duration.
This is then used from netfslib to handle the flusing in unbuffered and
direct writes.
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Miklos Szeredi <miklos@szeredi.hu>
cc: Trond Myklebust <trond.myklebust@hammerspace.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Andrew Morton <akpm@linux-foundation.org>
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Christian Brauner <brauner@kernel.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-mm@kvack.org
cc: linux-fsdevel@vger.kernel.org
cc: netfs@lists.linux.dev
cc: v9fs@lists.linux.dev
cc: linux-afs@lists.infradead.org
cc: ceph-devel@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: linux-nfs@vger.kernel.org
cc: devel@lists.orangefs.org
2024-03-27 08:51:38 +00:00
if ( iocb - > ki_flags & IOCB_NOWAIT ) {
/* We could block if there are any pages in the range. */
ret = - EAGAIN ;
if ( filemap_range_has_page ( mapping , pos , end ) )
if ( filemap_invalidate_inode ( inode , true , pos , end ) )
goto out ;
} else {
ret = filemap_write_and_wait_range ( mapping , pos , end ) ;
if ( ret < 0 )
goto out ;
}
/*
* After a write we want buffered reads to be sure to go to disk to get
* the new data . We invalidate clean cached page from the region we ' re
* about to write . We do this * before * the write so that we can return
* without clobbering - EIOCBQUEUED from - > direct_IO ( ) .
*/
ret = filemap_invalidate_inode ( inode , true , pos , end ) ;
2022-02-21 11:38:17 +00:00
if ( ret < 0 )
goto out ;
netfs: Optimise away reads above the point at which there can be no data
Track the file position above which the server is not expected to have any
data (the "zero point") and preemptively assume that we can satisfy
requests by filling them with zeroes locally rather than attempting to
download them if they're over that line - even if we've written data back
to the server. Assume that any data that was written back above that
position is held in the local cache. Note that we have to split requests
that straddle the line.
Make use of this to optimise away some reads from the server. We need to
set the zero point in the following circumstances:
(1) When we see an extant remote inode and have no cache for it, we set
the zero_point to i_size.
(2) On local inode creation, we set zero_point to 0.
(3) On local truncation down, we reduce zero_point to the new i_size if
the new i_size is lower.
(4) On local truncation up, we don't change zero_point.
(5) On local modification, we don't change zero_point.
(6) On remote invalidation, we set zero_point to the new i_size.
(7) If stored data is discarded from the pagecache or culled from fscache,
we must set zero_point above that if the data also got written to the
server.
(8) If dirty data is written back to the server, but not fscache, we must
set zero_point above that.
(9) If a direct I/O write is made, set zero_point above that.
Assuming the above, any read from the server at or above the zero_point
position will return all zeroes.
The zero_point value can be stored in the cache, provided the above rules
are applied to it by any code that culls part of the local cache.
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
2023-11-24 13:39:02 +00:00
end = iocb - > ki_pos + iov_iter_count ( from ) ;
if ( end > ictx - > zero_point )
ictx - > zero_point = end ;
2022-02-21 11:38:17 +00:00
fscache_invalidate ( netfs_i_cookie ( ictx ) , NULL , i_size_read ( inode ) ,
FSCACHE_INVAL_DIO_WRITE ) ;
ret = netfs_unbuffered_write_iter_locked ( iocb , from , NULL ) ;
out :
netfs_end_io_direct ( inode ) ;
return ret ;
}
EXPORT_SYMBOL ( netfs_unbuffered_write_iter ) ;