// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
#include "fs.h"
#include "fs-io.h"
#include "fsck.h"
#include "inode.h"
#include "journal.h"
#include "io.h"
#include "keylist.h"
#include "quota.h"
#include "reflink.h"
#include "trace.h"

#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uio.h>
#include <linux/writeback.h>

#include <trace/events/writeback.h>
struct folio_vec {
	struct folio	*fv_folio;
	size_t		fv_offset;
	size_t		fv_len;
};

static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
{
	struct folio *folio	= page_folio(bv.bv_page);
	size_t offset		= (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
		bv.bv_offset;
	size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);

	return (struct folio_vec) {
		.fv_folio	= folio,
		.fv_offset	= offset,
		.fv_len		= len,
	};
}

static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
						    struct bvec_iter iter)
{
	return biovec_to_foliovec(bio_iter_iovec(bio, iter));
}

#define __bio_for_each_folio(bvl, bio, iter, start)			\
	for (iter = (start);						\
	     (iter).bi_size &&						\
		((bvl = bio_iter_iovec_folio((bio), (iter))), 1);	\
	     bio_advance_iter_single((bio), &(iter), (bvl).fv_len))

/**
 * bio_for_each_folio - iterate over folios within a bio
 *
 * Like other non-_all versions, this iterates over what bio->bi_iter currently
 * points to. This version is for drivers, where the bio may have previously
 * been split or cloned.
 */
#define bio_for_each_folio(bvl, bio, iter)				\
	__bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
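/*
 * Example usage (illustrative sketch only, not called from this file):
 * walking the folios covered by a bio's current iterator, e.g. from a
 * completion handler that wants per-folio rather than per-page granularity:
 *
 *	struct folio_vec fv;
 *	struct bvec_iter iter;
 *
 *	bio_for_each_folio(fv, bio, iter)
 *		folio_mark_uptodate(fv.fv_folio);
 *
 * fv.fv_offset/fv.fv_len describe the byte range of fv.fv_folio covered by
 * the current biovec.
 */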
/*
 * Use u64 for the end pos and sector helpers because if the folio covers the
 * max supported range of the mapping, the start offset of the next folio
 * overflows loff_t. This breaks much of the range based processing in the
 * buffered write path.
 */
static inline u64 folio_end_pos(struct folio *folio)
{
	return folio_pos(folio) + folio_size(folio);
}

static inline size_t folio_sectors(struct folio *folio)
{
	return PAGE_SECTORS << folio_order(folio);
}

static inline loff_t folio_sector(struct folio *folio)
{
	return folio_pos(folio) >> 9;
}

static inline u64 folio_end_sector(struct folio *folio)
{
	return folio_end_pos(folio) >> 9;
}
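/*
 * Illustration (hypothetical numbers): if a folio ends exactly at the maximum
 * byte offset the mapping supports, folio_pos() + folio_size() is one past
 * that maximum; interpreted as a signed loff_t the sum can wrap negative, so
 * these helpers return u64 to keep "end of folio" values usable in range
 * comparisons.
 */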
typedef DARRAY(struct folio *) folios;

static int filemap_get_contig_folios_d(struct address_space *mapping,
				       loff_t start, u64 end,
				       int fgp_flags, gfp_t gfp,
				       folios *folios)
{
	struct folio *f;
	u64 pos = start;
	int ret = 0;

	while (pos < end) {
		if ((u64) pos >= (u64) start + (1ULL << 20))
			fgp_flags &= ~FGP_CREAT;

		ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL);
		if (ret)
			break;

		f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
		if (IS_ERR_OR_NULL(f))
			break;

		BUG_ON(folios->nr && folio_pos(f) != pos);

		pos = folio_end_pos(f);
		darray_push(folios, f);
	}

	if (!folios->nr && !ret && (fgp_flags & FGP_CREAT))
		ret = -ENOMEM;

	return folios->nr ? 0 : ret;
}
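/*
 * Usage sketch (illustrative; the flags and names shown are hypothetical for
 * this example): grab the folios backing a byte range, creating them if they
 * aren't already in the page cache:
 *
 *	folios fs;
 *	darray_init(&fs);
 *
 *	ret = filemap_get_contig_folios_d(mapping, pos, pos + len,
 *					  FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
 *					  GFP_KERNEL, &fs);
 *
 * On success the darray holds a contiguous run of referenced folios starting
 * at pos; the run may stop short of the requested end (e.g. once FGP_CREAT is
 * dropped after the first 1MB), so callers must check how far it got.
 */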
/*
 * Nocow support: in nocow mode, writes are done in-place when possible.
 *
 * - nocow is a boolean filesystem and inode option; when nocow is enabled,
 *   data checksumming and compression are implicitly disabled.
 *
 * - To prevent in-place writes from racing with data moves (data_update.c)
 *   or bucket reuse (i.e. a bucket being reused and re-allocated while a
 *   nocow write is in flight), buckets can be locked for either data update
 *   or data move, using a fixed size hash table of two_state_shared locks.
 *   There is no chaining, so updates and moves to different buckets that
 *   hash to the same lock will wait unnecessarily - watch for this becoming
 *   an issue.
 *
 * - The allocator path also needs to check for in-place writes in flight to
 *   a given bucket before giving it out: a counter in bucket_alloc_state
 *   tracks this.
 *
 * - Fsync may now need to issue cache flushes to block devices instead of
 *   flushing the journal. A device bitmask in bch_inode_info,
 *   ei_devs_need_flush, tracks devices that need flushes issued - note that
 *   this leads to unnecessary flushes when other codepaths have already
 *   issued flushes; we may want to replace it with a sequence number.
 *
 * - Nocow write path: look up extents, and if they're writable, write to
 *   them in place - otherwise fall back to the normal COW write path.
 *
 * XXX: switch to sequence numbers instead of a bitmask for devs needing
 * journal flush
 *
 * XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
 * run in process context - see if we can improve this
 */
struct nocow_flush {
	struct closure	*cl;
	struct bch_dev	*ca;
	struct bio	bio;
};

static void nocow_flush_endio(struct bio *_bio)
{
	struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);

	closure_put(bio->cl);
	percpu_ref_put(&bio->ca->io_ref);
	bio_put(&bio->bio);
}

static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
						 struct bch_inode_info *inode,
						 struct closure *cl)
{
	struct nocow_flush *bio;
	struct bch_dev *ca;
	struct bch_devs_mask devs;
	unsigned dev;

	dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
	if (dev == BCH_SB_MEMBERS_MAX)
		return;

	devs = inode->ei_devs_need_flush;
	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));

	for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
		rcu_read_lock();
		ca = rcu_dereference(c->devs[dev]);
		if (ca && !percpu_ref_tryget(&ca->io_ref))
			ca = NULL;
		rcu_read_unlock();

		if (!ca)
			continue;

		bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
						    REQ_OP_FLUSH,
						    GFP_KERNEL,
						    &c->nocow_flush_bioset),
				   struct nocow_flush, bio);
		bio->cl			= cl;
		bio->ca			= ca;
		bio->bio.bi_end_io	= nocow_flush_endio;
		closure_bio_submit(&bio->bio, cl);
	}
}

static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
					 struct bch_inode_info *inode)
{
	struct closure cl;

	closure_init_stack(&cl);
	bch2_inode_flush_nocow_writes_async(c, inode, &cl);
	closure_sync(&cl);

	return 0;
}
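/*
 * Usage sketch (illustrative only): an fsync-style path that has done nocow
 * writes would flush the devices recorded in ei_devs_need_flush rather than
 * relying on a journal flush:
 *
 *	ret = bch2_inode_flush_nocow_writes(c, inode);
 *
 * The _async variant takes a caller-owned closure so multiple flushes can be
 * issued and waited on together.
 */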
static inline bool bio_full(struct bio *bio, unsigned len)
{
	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return true;
	if (bio->bi_iter.bi_size > UINT_MAX - len)
		return true;
	return false;
}

static inline struct address_space *faults_disabled_mapping(void)
{
	return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
}

static inline void set_fdm_dropped_locks(void)
{
	current->faults_disabled_mapping =
		(void *) (((unsigned long) current->faults_disabled_mapping) | 1);
}

static inline bool fdm_dropped_locks(void)
{
	return ((unsigned long) current->faults_disabled_mapping) & 1;
}
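/*
 * Note (explanatory, added for clarity): faults_disabled_mapping stores an
 * address_space pointer with bit 0 borrowed as a "locks were dropped" flag,
 * relying on the pointer being at least 2-byte aligned. E.g. if the stored
 * value is mapping | 1, faults_disabled_mapping() still returns mapping and
 * fdm_dropped_locks() returns true.
 */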
struct quota_res {
	u64				sectors;
};

struct bch_writepage_io {
	struct bch_inode_info		*inode;

	/* must be last: */
	struct bch_write_op		op;
};

struct dio_write {
	struct kiocb			*req;
	struct address_space		*mapping;
	struct bch_inode_info		*inode;
	struct mm_struct		*mm;
	unsigned			loop:1,
					extending:1,
					sync:1,
					flush:1,
					free_iov:1;
	struct quota_res		quota_res;
	u64				written;

	struct iov_iter			iter;
	struct iovec			inline_vecs[2];

	/* must be last: */
	struct bch_write_op		op;
};

struct dio_read {
	struct closure			cl;
	struct kiocb			*req;
	long				ret;
	bool				should_dirty;
	struct bch_read_bio		rbio;
};
/* pagecache_block must be held */
static noinline int write_invalidate_inode_pages_range(struct address_space *mapping,
						       loff_t start, loff_t end)
{
	int ret;

	/*
	 * XXX: the way this is currently implemented, we can spin if a process
	 * is continually redirtying a specific page
	 */
	do {
		if (!mapping->nrpages)
			return 0;

		ret = filemap_write_and_wait_range(mapping, start, end);
		if (ret)
			break;

		if (!mapping->nrpages)
			return 0;

		ret = invalidate_inode_pages2_range(mapping,
				start >> PAGE_SHIFT,
				end >> PAGE_SHIFT);
	} while (ret == -EBUSY);

	return ret;
}
/* quotas */

#ifdef CONFIG_BCACHEFS_QUOTA

static void __bch2_quota_reservation_put(struct bch_fs *c,
					 struct bch_inode_info *inode,
					 struct quota_res *res)
{
	BUG_ON(res->sectors > inode->ei_quota_reserved);

	bch2_quota_acct(c, inode->ei_qid, Q_SPC,
			-((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
	inode->ei_quota_reserved -= res->sectors;
	res->sectors = 0;
}

static void bch2_quota_reservation_put(struct bch_fs *c,
				       struct bch_inode_info *inode,
				       struct quota_res *res)
{
	if (res->sectors) {
		mutex_lock(&inode->ei_quota_lock);
		__bch2_quota_reservation_put(c, inode, res);
		mutex_unlock(&inode->ei_quota_lock);
	}
}

static int bch2_quota_reservation_add(struct bch_fs *c,
				      struct bch_inode_info *inode,
				      struct quota_res *res,
				      u64 sectors,
				      bool check_enospc)
{
	int ret;

	if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
		return 0;

	mutex_lock(&inode->ei_quota_lock);
	ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
			      check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
	if (likely(!ret)) {
		inode->ei_quota_reserved += sectors;
		res->sectors += sectors;
	}
	mutex_unlock(&inode->ei_quota_lock);

	return ret;
}

#else

static void __bch2_quota_reservation_put(struct bch_fs *c,
					 struct bch_inode_info *inode,
					 struct quota_res *res) {}

static void bch2_quota_reservation_put(struct bch_fs *c,
				       struct bch_inode_info *inode,
				       struct quota_res *res) {}

static int bch2_quota_reservation_add(struct bch_fs *c,
				      struct bch_inode_info *inode,
				      struct quota_res *res,
				      unsigned sectors,
				      bool check_enospc)
{
	return 0;
}

#endif
/* i_size updates: */

struct inode_new_size {
	loff_t		new_size;
	u64		now;
	unsigned	fields;
};

static int inode_set_size(struct bch_inode_info *inode,
			  struct bch_inode_unpacked *bi,
			  void *p)
{
	struct inode_new_size *s = p;

	bi->bi_size = s->new_size;
	if (s->fields & ATTR_ATIME)
		bi->bi_atime = s->now;
	if (s->fields & ATTR_MTIME)
		bi->bi_mtime = s->now;
	if (s->fields & ATTR_CTIME)
		bi->bi_ctime = s->now;

	return 0;
}

int __must_check bch2_write_inode_size(struct bch_fs *c,
				       struct bch_inode_info *inode,
				       loff_t new_size, unsigned fields)
{
	struct inode_new_size s = {
		.new_size	= new_size,
		.now		= bch2_current_time(c),
		.fields		= fields,
	};

	return bch2_write_inode(c, inode, inode_set_size, &s, fields);
}
static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
			     struct quota_res *quota_res, s64 sectors)
{
	bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
				"inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
				inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
				inode->ei_inode.bi_sectors);
	inode->v.i_blocks += sectors;

#ifdef CONFIG_BCACHEFS_QUOTA
	if (quota_res &&
	    !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
	    sectors > 0) {
		BUG_ON(sectors > quota_res->sectors);
		BUG_ON(sectors > inode->ei_quota_reserved);

		quota_res->sectors -= sectors;
		inode->ei_quota_reserved -= sectors;
	} else {
		bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
	}
#endif
}

static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
			   struct quota_res *quota_res, s64 sectors)
{
	if (sectors) {
		mutex_lock(&inode->ei_quota_lock);
		__i_sectors_acct(c, inode, quota_res, sectors);
		mutex_unlock(&inode->ei_quota_lock);
	}
}
/* page state: */

/* stored in page->private: */

#define BCH_FOLIO_SECTOR_STATE()	\
	x(unallocated)			\
	x(reserved)			\
	x(dirty)			\
	x(dirty_reserved)		\
	x(allocated)

enum bch_folio_sector_state {
#define x(n)	SECTOR_##n,
	BCH_FOLIO_SECTOR_STATE()
#undef x
};

static const char * const bch2_folio_sector_states[] = {
#define x(n)	#n,
	BCH_FOLIO_SECTOR_STATE()
#undef x
	NULL
};

static inline enum bch_folio_sector_state
folio_sector_dirty(enum bch_folio_sector_state state)
{
	switch (state) {
	case SECTOR_unallocated:
		return SECTOR_dirty;
	case SECTOR_reserved:
		return SECTOR_dirty_reserved;
	default:
		return state;
	}
}

static inline enum bch_folio_sector_state
folio_sector_undirty(enum bch_folio_sector_state state)
{
	switch (state) {
	case SECTOR_dirty:
		return SECTOR_unallocated;
	case SECTOR_dirty_reserved:
		return SECTOR_reserved;
	default:
		return state;
	}
}

static inline enum bch_folio_sector_state
folio_sector_reserve(enum bch_folio_sector_state state)
{
	switch (state) {
	case SECTOR_unallocated:
		return SECTOR_reserved;
	case SECTOR_dirty:
		return SECTOR_dirty_reserved;
	default:
		return state;
	}
}
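/*
 * Example transitions (for illustration): a sector that starts out
 * SECTOR_unallocated and gets a reservation via folio_sector_reserve()
 * becomes SECTOR_reserved; dirtying it (folio_sector_dirty()) moves it to
 * SECTOR_dirty_reserved; undirtying (folio_sector_undirty()) drops it back
 * to SECTOR_reserved. SECTOR_allocated is unchanged by all three helpers.
 */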
struct bch_folio_sector {
	/* Uncompressed, fully allocated replicas (or on disk reservation): */
	unsigned		nr_replicas:4;

	/* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
	unsigned		replicas_reserved:4;

	/* i_sectors: */
	enum bch_folio_sector_state state:8;
};

struct bch_folio {
	spinlock_t		lock;
	atomic_t		write_count;
	/*
	 * Is the sector state up to date with the btree?
	 * (Not the data itself)
	 */
	bool			uptodate;
	struct bch_folio_sector	s[];
};

static inline void folio_sector_set(struct folio *folio,
				    struct bch_folio *s,
				    unsigned i, unsigned n)
{
	s->s[i].state = n;
}

/* file offset (to folio offset) to bch_folio_sector index */
static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
{
	u64 f_offset = pos - folio_pos(folio);

	BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
	return f_offset >> SECTOR_SHIFT;
}
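/*
 * Worked example (hypothetical numbers): for a folio at file position 8192
 * covering 16KiB, pos 10240 gives f_offset 2048 and thus sector index 4
 * (2048 >> SECTOR_SHIFT with 512-byte sectors), i.e. the fifth
 * bch_folio_sector in bch_folio->s[].
 */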
static inline struct bch_folio *__bch2_folio(struct folio *folio)
{
	return folio_has_private(folio)
		? (struct bch_folio *) folio_get_private(folio)
		: NULL;
}

static inline struct bch_folio *bch2_folio(struct folio *folio)
{
	EBUG_ON(!folio_test_locked(folio));

	return __bch2_folio(folio);
}

/* for newly allocated folios: */
static void __bch2_folio_release(struct folio *folio)
{
	kfree(folio_detach_private(folio));
}

static void bch2_folio_release(struct folio *folio)
{
	EBUG_ON(!folio_test_locked(folio));
	__bch2_folio_release(folio);
}

/* for newly allocated folios: */
static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
{
	struct bch_folio *s;

	s = kzalloc(sizeof(*s) +
		    sizeof(struct bch_folio_sector) *
		    folio_sectors(folio), gfp);
	if (!s)
		return NULL;

	spin_lock_init(&s->lock);
	folio_attach_private(folio, s);

	return s;
}

static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
{
	return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
}

static unsigned bkey_to_sector_state(struct bkey_s_c k)
{
	if (bkey_extent_is_reservation(k))
		return SECTOR_reserved;
	if (bkey_extent_is_allocation(k.k))
		return SECTOR_allocated;
	return SECTOR_unallocated;
}
static void __bch2_folio_set(struct folio *folio,
			     unsigned pg_offset, unsigned pg_len,
			     unsigned nr_ptrs, unsigned state)
{
	struct bch_folio *s = bch2_folio(folio);
	unsigned i, sectors = folio_sectors(folio);

	BUG_ON(pg_offset >= sectors);
	BUG_ON(pg_offset + pg_len > sectors);

	spin_lock(&s->lock);

	for (i = pg_offset; i < pg_offset + pg_len; i++) {
		s->s[i].nr_replicas = nr_ptrs;
		folio_sector_set(folio, s, i, state);
	}

	if (i == sectors)
		s->uptodate = true;

	spin_unlock(&s->lock);
}

/*
 * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
 * extents btree:
 */
static int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
			  struct folio **folios, unsigned nr_folios)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_folio *s;
	u64 offset = folio_sector(folios[0]);
	unsigned folio_idx;
	u32 snapshot;
	bool need_set = false;
	int ret;

	for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
		s = bch2_folio_create(folios[folio_idx], GFP_KERNEL);
		if (!s)
			return -ENOMEM;

		need_set |= !s->uptodate;
	}

	if (!need_set)
		return 0;

	folio_idx = 0;
	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
				     SPOS(inum.inum, offset, snapshot),
				     BTREE_ITER_SLOTS, k, ret) {
		unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
		unsigned state = bkey_to_sector_state(k);

		while (folio_idx < nr_folios) {
			struct folio *folio = folios[folio_idx];
			u64 folio_start	= folio_sector(folio);
			u64 folio_end	= folio_end_sector(folio);
			unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start;
			unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start;

			BUG_ON(k.k->p.offset < folio_start);
			BUG_ON(bkey_start_offset(k.k) > folio_end);

			if (!bch2_folio(folio)->uptodate)
				__bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);

			if (k.k->p.offset < folio_end)
				break;

			folio_idx++;
		}

		if (folio_idx == nr_folios)
			break;
	}

	offset = iter.pos.offset;
	bch2_trans_iter_exit(&trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;
	bch2_trans_exit(&trans);

	return ret;
}

static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
{
	struct bvec_iter iter;
	struct folio_vec fv;
	unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
		? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
	unsigned state = bkey_to_sector_state(k);

	bio_for_each_folio(fv, bio, iter)
		__bch2_folio_set(fv.fv_folio,
				 fv.fv_offset >> 9,
				 fv.fv_len >> 9,
				 nr_ptrs, state);
}
static void mark_pagecache_unallocated(struct bch_inode_info *inode,
				       u64 start, u64 end)
{
	pgoff_t index = start >> PAGE_SECTORS_SHIFT;
	pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
	struct folio_batch fbatch;
	unsigned i, j;

	if (end <= start)
		return;

	folio_batch_init(&fbatch);

	while (filemap_get_folios(inode->v.i_mapping,
				  &index, end_index, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];
			u64 folio_start = folio_sector(folio);
			u64 folio_end = folio_end_sector(folio);
			unsigned folio_offset = max(start, folio_start) - folio_start;
			unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
			struct bch_folio *s;

			BUG_ON(end <= folio_start);

			folio_lock(folio);
			s = bch2_folio(folio);

			if (s) {
				spin_lock(&s->lock);
				for (j = folio_offset; j < folio_offset + folio_len; j++)
					s->s[j].nr_replicas = 0;
				spin_unlock(&s->lock);
			}

			folio_unlock(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
}

static void mark_pagecache_reserved(struct bch_inode_info *inode,
				    u64 start, u64 end)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	pgoff_t index = start >> PAGE_SECTORS_SHIFT;
	pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
	struct folio_batch fbatch;
	s64 i_sectors_delta = 0;
	unsigned i, j;

	if (end <= start)
		return;

	folio_batch_init(&fbatch);

	while (filemap_get_folios(inode->v.i_mapping,
				  &index, end_index, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];
			u64 folio_start = folio_sector(folio);
			u64 folio_end = folio_end_sector(folio);
			unsigned folio_offset = max(start, folio_start) - folio_start;
			unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
			struct bch_folio *s;

			BUG_ON(end <= folio_start);

			folio_lock(folio);
			s = bch2_folio(folio);

			if (s) {
				spin_lock(&s->lock);
				for (j = folio_offset; j < folio_offset + folio_len; j++) {
					i_sectors_delta -= s->s[j].state == SECTOR_dirty;
					folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state));
				}
				spin_unlock(&s->lock);
			}

			folio_unlock(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}

	i_sectors_acct(c, inode, NULL, i_sectors_delta);
}
static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
{
	/* XXX: this should not be open coded */
	return inode->ei_inode.bi_data_replicas
		? inode->ei_inode.bi_data_replicas - 1
		: c->opts.data_replicas;
}

static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
					  unsigned nr_replicas)
{
	return max(0, (int) nr_replicas -
		   s->nr_replicas -
		   s->replicas_reserved);
}
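/*
 * Worked example (hypothetical values): with nr_replicas = 2, a sector that
 * already has 1 fully allocated replica and 0 reserved needs 1 more
 * (2 - 1 - 0); a sector with 2 allocated replicas needs 0; the max(0, ...)
 * guards against a negative result when the sector already has more replicas
 * than the inode currently requires.
 */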
static int bch2_get_folio_disk_reservation(struct bch_fs *c,
				struct bch_inode_info *inode,
				struct folio *folio, bool check_enospc)
{
	struct bch_folio *s = bch2_folio_create(folio, 0);
	unsigned nr_replicas = inode_nr_replicas(c, inode);
	struct disk_reservation disk_res = { 0 };
	unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
	int ret;

	if (!s)
		return -ENOMEM;

	for (i = 0; i < sectors; i++)
		disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);

	if (!disk_res_sectors)
		return 0;

	ret = bch2_disk_reservation_get(c, &disk_res,
					disk_res_sectors, 1,
					!check_enospc
					? BCH_DISK_RESERVATION_NOFAIL
					: 0);
	if (unlikely(ret))
		return ret;

	for (i = 0; i < sectors; i++)
		s->s[i].replicas_reserved +=
			sectors_to_reserve(&s->s[i], nr_replicas);

	return 0;
}

struct bch2_folio_reservation {
	struct disk_reservation	disk;
	struct quota_res	quota;
};
static void bch2_folio_reservation_init(struct bch_fs *c,
			struct bch_inode_info *inode,
			struct bch2_folio_reservation *res)
{
	memset(res, 0, sizeof(*res));

	res->disk.nr_replicas = inode_nr_replicas(c, inode);
}

static void bch2_folio_reservation_put(struct bch_fs *c,
			struct bch_inode_info *inode,
			struct bch2_folio_reservation *res)
{
	bch2_disk_reservation_put(c, &res->disk);
	bch2_quota_reservation_put(c, inode, &res->quota);
}

static int bch2_folio_reservation_get(struct bch_fs *c,
			struct bch_inode_info *inode,
			struct folio *folio,
			struct bch2_folio_reservation *res,
			unsigned offset, unsigned len)
{
	struct bch_folio *s = bch2_folio_create(folio, 0);
	unsigned i, disk_sectors = 0, quota_sectors = 0;
	int ret;

	if (!s)
		return -ENOMEM;

	BUG_ON(!s->uptodate);

	for (i = round_down(offset, block_bytes(c)) >> 9;
	     i < round_up(offset + len, block_bytes(c)) >> 9;
	     i++) {
		disk_sectors += sectors_to_reserve(&s->s[i],
						res->disk.nr_replicas);
		quota_sectors += s->s[i].state == SECTOR_unallocated;
	}

	if (disk_sectors) {
		ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
		if (unlikely(ret))
			return ret;
	}

	if (quota_sectors) {
		ret = bch2_quota_reservation_add(c, inode, &res->quota,
						 quota_sectors, true);
		if (unlikely(ret)) {
			struct disk_reservation tmp = {
				.sectors = disk_sectors
			};

			bch2_disk_reservation_put(c, &tmp);
			res->disk.sectors -= disk_sectors;
			return ret;
		}
	}

	return 0;
}
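/*
 * Typical reservation lifecycle (sketch; see bch2_page_mkwrite() below,
 * which additionally calls bch2_folio_set() first so the sector state is up
 * to date before reserving):
 *
 *	struct bch2_folio_reservation res;
 *
 *	bch2_folio_reservation_init(c, inode, &res);
 *	ret = bch2_folio_reservation_get(c, inode, folio, &res, offset, len);
 *	if (!ret)
 *		bch2_set_folio_dirty(c, inode, folio, &res, offset, len);
 *	bch2_folio_reservation_put(c, inode, &res);
 *
 * bch2_set_folio_dirty() consumes from res as sectors become dirty/reserved;
 * the final put releases whatever was not consumed.
 */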
static void bch2_clear_folio_bits(struct folio *folio)
{
	struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_folio *s = bch2_folio(folio);
	struct disk_reservation disk_res = { 0 };
	int i, sectors = folio_sectors(folio), dirty_sectors = 0;

	if (!s)
		return;

	EBUG_ON(!folio_test_locked(folio));
	EBUG_ON(folio_test_writeback(folio));

	for (i = 0; i < sectors; i++) {
		disk_res.sectors += s->s[i].replicas_reserved;
		s->s[i].replicas_reserved = 0;

		dirty_sectors -= s->s[i].state == SECTOR_dirty;
		folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
	}

	bch2_disk_reservation_put(c, &disk_res);

	i_sectors_acct(c, inode, NULL, dirty_sectors);

	bch2_folio_release(folio);
}
static void bch2_set_folio_dirty(struct bch_fs *c,
				 struct bch_inode_info *inode,
				 struct folio *folio,
				 struct bch2_folio_reservation *res,
				 unsigned offset, unsigned len)
{
	struct bch_folio *s = bch2_folio(folio);
	unsigned i, dirty_sectors = 0;

	WARN_ON((u64) folio_pos(folio) + offset + len >
		round_up((u64) i_size_read(&inode->v), block_bytes(c)));

	BUG_ON(!s->uptodate);

	spin_lock(&s->lock);

	for (i = round_down(offset, block_bytes(c)) >> 9;
	     i < round_up(offset + len, block_bytes(c)) >> 9;
	     i++) {
		unsigned sectors = sectors_to_reserve(&s->s[i],
						res->disk.nr_replicas);

		/*
		 * This can happen if we race with the error path in
		 * bch2_writepage_io_done():
		 */
		sectors = min_t(unsigned, sectors, res->disk.sectors);

		s->s[i].replicas_reserved += sectors;
		res->disk.sectors -= sectors;

		dirty_sectors += s->s[i].state == SECTOR_unallocated;

		folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
	}

	spin_unlock(&s->lock);

	i_sectors_acct(c, inode, &res->quota, dirty_sectors);

	if (!folio_test_dirty(folio))
		filemap_dirty_folio(inode->v.i_mapping, folio);
}
vm_fault_t bch2_page_fault(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct address_space *fdm = faults_disabled_mapping();
	struct bch_inode_info *inode = file_bch_inode(file);
	vm_fault_t ret;

	if (fdm == mapping)
		return VM_FAULT_SIGBUS;

	/* Lock ordering: */
	if (fdm > mapping) {
		struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);

		if (bch2_pagecache_add_tryget(inode))
			goto got_lock;

		bch2_pagecache_block_put(fdm_host);

		bch2_pagecache_add_get(inode);
		bch2_pagecache_add_put(inode);

		bch2_pagecache_block_get(fdm_host);

		/* Signal that lock has been dropped: */
		set_fdm_dropped_locks();
		return VM_FAULT_SIGBUS;
	}

	bch2_pagecache_add_get(inode);
got_lock:
	ret = filemap_fault(vmf);
	bch2_pagecache_add_put(inode);

	return ret;
}
vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);
	struct file *file = vmf->vma->vm_file;
	struct bch_inode_info *inode = file_bch_inode(file);
	struct address_space *mapping = file->f_mapping;
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch2_folio_reservation res;
	unsigned len;
	loff_t isize;
	vm_fault_t ret;

	bch2_folio_reservation_init(c, inode, &res);

	sb_start_pagefault(inode->v.i_sb);
	file_update_time(file);

	/*
	 * Not strictly necessary, but helps avoid dio writes livelocking in
	 * write_invalidate_inode_pages_range() - can drop this if/when we get
	 * a write_invalidate_inode_pages_range() that works without dropping
	 * page lock before invalidating page
	 */
	bch2_pagecache_add_get(inode);

	folio_lock(folio);
	isize = i_size_read(&inode->v);

	if (folio->mapping != mapping || folio_pos(folio) >= isize) {
		folio_unlock(folio);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));

	if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
	    bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
		folio_unlock(folio);
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
	bch2_folio_reservation_put(c, inode, &res);

	folio_wait_stable(folio);
	ret = VM_FAULT_LOCKED;
out:
	bch2_pagecache_add_put(inode);
	sb_end_pagefault(inode->v.i_sb);

	return ret;
}
void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
	if (offset || length < folio_size(folio))
		return;

	bch2_clear_folio_bits(folio);
}

bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
{
	if (folio_test_dirty(folio) || folio_test_writeback(folio))
		return false;

	bch2_clear_folio_bits(folio);
	return true;
}
/* readpage(s): */

static void bch2_readpages_end_io(struct bio *bio)
{
	struct folio_iter fi;

	bio_for_each_folio_all(fi, bio) {
		if (!bio->bi_status) {
			folio_mark_uptodate(fi.folio);
		} else {
			folio_clear_uptodate(fi.folio);
			folio_set_error(fi.folio);
		}
		folio_unlock(fi.folio);
	}

	bio_put(bio);
}

struct readpages_iter {
	struct address_space	*mapping;
	unsigned		idx;
	folios			folios;
};

static int readpages_iter_init(struct readpages_iter *iter,
			       struct readahead_control *ractl)
{
	struct folio **fi;
	int ret;

	memset(iter, 0, sizeof(*iter));

	iter->mapping = ractl->mapping;

	ret = filemap_get_contig_folios_d(iter->mapping,
				ractl->_index << PAGE_SHIFT,
				(ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
				0, mapping_gfp_mask(iter->mapping),
				&iter->folios);
	if (ret)
		return ret;

	darray_for_each(iter->folios, fi) {
		ractl->_nr_pages -= 1U << folio_order(*fi);
		__bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
		folio_put(*fi);
		folio_put(*fi);
	}

	return 0;
}

static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
{
	if (iter->idx >= iter->folios.nr)
		return NULL;
	return iter->folios.data[iter->idx];
}

static inline void readpage_iter_advance(struct readpages_iter *iter)
{
	iter->idx++;
}
static bool extent_partial_reads_expensive(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (crc.csum_type || crc.compression_type)
			return true;
	return false;
}
static int readpage_bio_extend(struct btree_trans *trans,
			       struct readpages_iter *iter,
			       struct bio *bio,
			       unsigned sectors_this_extent,
			       bool get_more)
{
	/* Don't hold btree locks while allocating memory: */
	bch2_trans_unlock(trans);

	while (bio_sectors(bio) < sectors_this_extent &&
	       bio->bi_vcnt < bio->bi_max_vecs) {
		struct folio *folio = readpage_iter_peek(iter);
		int ret;

		if (folio) {
			readpage_iter_advance(iter);
		} else {
			pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;

			if (!get_more)
				break;

			folio = xa_load(&iter->mapping->i_pages, folio_offset);
			if (folio && !xa_is_value(folio))
				break;

			folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
			if (!folio)
				break;

			if (!__bch2_folio_create(folio, GFP_KERNEL)) {
				folio_put(folio);
				break;
			}

			ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
			if (ret) {
				__bch2_folio_release(folio);
				folio_put(folio);
				break;
			}

			folio_put(folio);
		}

		BUG_ON(folio_sector(folio) != bio_end_sector(bio));

		BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
	}

	return bch2_trans_relock(trans);
}
2021-03-13 04:30:39 +03:00
static void bchfs_read ( struct btree_trans * trans ,
struct bch_read_bio * rbio ,
subvol_inum inum ,
2017-03-17 09:18:50 +03:00
struct readpages_iter * readpages_iter )
{
2019-03-28 05:03:30 +03:00
struct bch_fs * c = trans - > c ;
2021-03-13 04:30:39 +03:00
struct btree_iter iter ;
2020-12-17 23:08:58 +03:00
struct bkey_buf sk ;
2017-03-17 09:18:50 +03:00
int flags = BCH_READ_RETRY_IF_STALE |
BCH_READ_MAY_PROMOTE ;
2021-03-13 04:30:39 +03:00
u32 snapshot ;
2019-08-16 16:59:56 +03:00
int ret = 0 ;
2017-03-17 09:18:50 +03:00
rbio - > c = c ;
rbio - > start_time = local_clock ( ) ;
2021-03-13 04:30:39 +03:00
rbio - > subvol = inum . subvol ;
2019-11-10 00:01:15 +03:00
2020-12-17 23:08:58 +03:00
bch2_bkey_buf_init ( & sk ) ;
2019-08-16 16:59:56 +03:00
retry :
2021-07-25 03:24:10 +03:00
bch2_trans_begin ( trans ) ;
2021-03-13 04:30:39 +03:00
iter = ( struct btree_iter ) { NULL } ;
2021-07-25 03:24:10 +03:00
2021-03-13 04:30:39 +03:00
ret = bch2_subvolume_get_snapshot ( trans , inum . subvol , & snapshot ) ;
if ( ret )
goto err ;
bch2_trans_iter_init ( trans , & iter , BTREE_ID_extents ,
SPOS ( inum . inum , rbio - > bio . bi_iter . bi_sector , snapshot ) ,
2022-01-05 02:24:55 +03:00
BTREE_ITER_SLOTS ) ;
2017-03-17 09:18:50 +03:00
	while (1) {
		struct bkey_s_c k;
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		/*
		 * read_extent -> io_time_reset may cause a transaction restart
		 * without returning an error, we need to check for that here:
		 */
		ret = bch2_trans_relock(trans);
		if (ret)
			break;

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, rbio->bio.bi_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			break;

		k = bkey_i_to_s_c(sk.k);

		sectors = min(sectors, k.k->size - offset_into_extent);
2023-05-28 07:35:35 +03:00
		if (readpages_iter) {
			ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
						  extent_partial_reads_expensive(k));
			if (ret)
				break;
		}

		bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
		swap(rbio->bio.bi_iter.bi_size, bytes);

		if (rbio->bio.bi_iter.bi_size == bytes)
			flags |= BCH_READ_LAST_FRAGMENT;

		bch2_bio_page_state_set(&rbio->bio, k);

		bch2_read_extent(trans, rbio, iter.pos,
				 data_btree, k, offset_into_extent, flags);

		if (flags & BCH_READ_LAST_FRAGMENT)
			break;

		swap(rbio->bio.bi_iter.bi_size, bytes);
		bio_advance(&rbio->bio, bytes);

		ret = btree_trans_too_many_iters(trans);
		if (ret)
			break;
	}
2021-03-13 04:30:39 +03:00
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	if (ret) {
		bch_err_inum_offset_ratelimited(c,
				iter.pos.inode,
				iter.pos.offset << 9,
				"read error %i from btree lookup", ret);
		rbio->bio.bi_status = BLK_STS_IOERR;
		bio_endio(&rbio->bio);
	}

	bch2_bkey_buf_exit(&sk, c);
}
void bch2_readahead(struct readahead_control *ractl)
{
	struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_io_opts opts;
	struct btree_trans trans;
	struct folio *folio;
	struct readpages_iter readpages_iter;
	int ret;

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

	ret = readpages_iter_init(&readpages_iter, ractl);
	BUG_ON(ret);

	bch2_trans_init(&trans, c, 0, 0);

	bch2_pagecache_add_get(inode);

	while ((folio = readpage_iter_peek(&readpages_iter))) {
		unsigned n = min_t(unsigned,
				   readpages_iter.folios.nr -
				   readpages_iter.idx,
				   BIO_MAX_VECS);
		struct bch_read_bio *rbio =
			rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
						   GFP_KERNEL, &c->bio_read),
				  opts);

		readpage_iter_advance(&readpages_iter);

		rbio->bio.bi_iter.bi_sector = folio_sector(folio);
		rbio->bio.bi_end_io = bch2_readpages_end_io;
		BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));

		bchfs_read(&trans, rbio, inode_inum(inode),
			   &readpages_iter);
		bch2_trans_unlock(&trans);
	}

	bch2_pagecache_add_put(inode);

	bch2_trans_exit(&trans);
	darray_exit(&readpages_iter.folios);
}
2023-03-17 21:55:53 +03:00
static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
			      subvol_inum inum, struct folio *folio)
{
	struct btree_trans trans;

	bch2_folio_create(folio, __GFP_NOFAIL);

	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));

	bch2_trans_init(&trans, c, 0, 0);
	bchfs_read(&trans, rbio, inum, NULL);
	bch2_trans_exit(&trans);
}
2023-03-17 21:55:53 +03:00
static void bch2_read_single_folio_end_io(struct bio *bio)
{
	complete(bio->bi_private);
}

static int bch2_read_single_folio(struct folio *folio,
				  struct address_space *mapping)
{
	struct bch_inode_info *inode = to_bch_ei(mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_read_bio *rbio;
	struct bch_io_opts opts;
	int ret;
	DECLARE_COMPLETION_ONSTACK(done);

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

	rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
			 opts);
	rbio->bio.bi_private = &done;
	rbio->bio.bi_end_io = bch2_read_single_folio_end_io;

	__bchfs_readfolio(c, rbio, inode_inum(inode), folio);
	wait_for_completion(&done);

	ret = blk_status_to_errno(rbio->bio.bi_status);
	bio_put(&rbio->bio);

	if (ret < 0)
		return ret;

	folio_mark_uptodate(folio);
	return 0;
}
int bch2_read_folio(struct file *file, struct folio *folio)
{
	int ret;

	ret = bch2_read_single_folio(folio, folio->mapping);
	folio_unlock(folio);
	return bch2_err_class(ret);
}
/* writepages: */

struct bch_writepage_state {
	struct bch_writepage_io	*io;
	struct bch_io_opts	opts;
	struct bch_folio_sector	*tmp;
	unsigned		tmp_sectors;
};

static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
								  struct bch_inode_info *inode)
{
	struct bch_writepage_state ret = { 0 };

	bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
	return ret;
}
2022-10-29 09:47:33 +03:00
static void bch2_writepage_io_done(struct bch_write_op *op)
{
	struct bch_writepage_io *io =
		container_of(op, struct bch_writepage_io, op);
	struct bch_fs *c = io->op.c;
	struct bio *bio = &io->op.wbio.bio;
	struct folio_iter fi;
	unsigned i;

	if (io->op.error) {
		set_bit(EI_INODE_ERROR, &io->inode->ei_flags);

		bio_for_each_folio_all(fi, bio) {
			struct bch_folio *s;

			folio_set_error(fi.folio);
			mapping_set_error(fi.folio->mapping, -EIO);

			s = __bch2_folio(fi.folio);
			spin_lock(&s->lock);
			for (i = 0; i < folio_sectors(fi.folio); i++)
				s->s[i].nr_replicas = 0;
			spin_unlock(&s->lock);
		}
	}

	if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
		bio_for_each_folio_all(fi, bio) {
			struct bch_folio *s;

			s = __bch2_folio(fi.folio);
			spin_lock(&s->lock);
			for (i = 0; i < folio_sectors(fi.folio); i++)
				s->s[i].nr_replicas = 0;
			spin_unlock(&s->lock);
		}
	}

	/*
	 * racing with fallocate can cause us to add fewer sectors than
	 * expected - but we shouldn't add more sectors than expected:
	 */
	WARN_ON_ONCE(io->op.i_sectors_delta > 0);

	/*
	 * (error (due to going RO) halfway through a page can screw that up
	 * slightly)
	 * XXX wtf?
	 BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
	 */

	/*
	 * PageWriteback is effectively our ref on the inode - fixup i_blocks
	 * before calling end_page_writeback:
	 */
	i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);

	bio_for_each_folio_all(fi, bio) {
		struct bch_folio *s = __bch2_folio(fi.folio);

		if (atomic_dec_and_test(&s->write_count))
			folio_end_writeback(fi.folio);
	}

	bio_put(&io->op.wbio.bio);
}
static void bch2_writepage_do_io(struct bch_writepage_state *w)
{
	struct bch_writepage_io *io = w->io;

	w->io = NULL;
	closure_call(&io->op.cl, bch2_write, NULL, NULL);
}
/*
 * Get a bch_writepage_io and add @page to it - appending to an existing one if
 * possible, else allocating a new one:
 */
static void bch2_writepage_io_alloc(struct bch_fs *c,
				    struct writeback_control *wbc,
				    struct bch_writepage_state *w,
				    struct bch_inode_info *inode,
				    u64 sector,
				    unsigned nr_replicas)
{
	struct bch_write_op *op;

	w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
					      REQ_OP_WRITE,
					      GFP_KERNEL,
					      &c->writepage_bioset),
			     struct bch_writepage_io, op.wbio.bio);

	w->io->inode		= inode;
	op			= &w->io->op;
	bch2_write_op_init(op, c, w->opts);
	op->target		= w->opts.foreground_target;
	op->nr_replicas		= nr_replicas;
	op->res.nr_replicas	= nr_replicas;
	op->write_point		= writepoint_hashed(inode->ei_last_dirtied);
	op->subvol		= inode->ei_subvol;
	op->pos			= POS(inode->v.i_ino, sector);
	op->end_io		= bch2_writepage_io_done;
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled.
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight), we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed-size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync may now need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes; we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable, write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of a bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-03 00:12:00 +03:00
	op->devs_need_flush	= &inode->ei_devs_need_flush;
	op->wbio.bio.bi_iter.bi_sector = sector;
	op->wbio.bio.bi_opf	= wbc_to_write_flags(wbc);
}
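A sketch of the bucket locking scheme described in the "Nocow support"
note above - hypothetical and simplified, not the actual bcachefs
nocow-lock code; the table size, hash choice and helper names are
assumptions for illustration only:

/*
 * Fixed-size hash table of two-state shared locks, indexed by bucket.
 * There is no chaining: two unrelated buckets that hash to the same slot
 * simply share a lock, trading some false contention for a bounded,
 * allocation-free structure.
 */
#define BUCKET_NOCOW_LOCKS_BITS_SKETCH	10
#define BUCKET_NOCOW_LOCKS_SKETCH	(1U << BUCKET_NOCOW_LOCKS_BITS_SKETCH)

static two_state_lock_t bucket_nocow_locks_sketch[BUCKET_NOCOW_LOCKS_SKETCH];

static two_state_lock_t *bucket_nocow_lock_sketch(struct bpos bucket)
{
	u64 h = hash_64(bucket.inode ^ bucket.offset,
			BUCKET_NOCOW_LOCKS_BITS_SKETCH);

	return bucket_nocow_locks_sketch + h;
}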
static int __bch2_writepage ( struct folio * folio ,
struct writeback_control * wbc ,
void * data )
{
2023-03-17 21:55:53 +03:00
struct bch_inode_info * inode = to_bch_ei ( folio - > mapping - > host ) ;
2017-03-17 09:18:50 +03:00
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
struct bch_writepage_state * w = data ;
2023-03-18 02:46:25 +03:00
struct bch_folio * s ;
2023-03-17 21:55:53 +03:00
unsigned i , offset , f_sectors , nr_replicas_this_write = U32_MAX ;
2017-03-17 09:18:50 +03:00
loff_t i_size = i_size_read ( & inode - > v ) ;
2019-07-02 21:59:15 +03:00
int ret ;
2017-03-17 09:18:50 +03:00
2023-03-17 21:55:53 +03:00
EBUG_ON ( ! folio_test_uptodate ( folio ) ) ;
2017-03-17 09:18:50 +03:00
2023-03-17 21:55:53 +03:00
/* Is the folio fully inside i_size? */
2023-03-17 22:37:34 +03:00
	if (folio_end_pos(folio) <= i_size)
		goto do_io;
2023-03-17 21:55:53 +03:00
/* Is the folio fully outside i_size? (truncate in progress) */
2023-03-17 22:37:34 +03:00
	if (folio_pos(folio) >= i_size) {
		folio_unlock(folio);
2017-03-17 09:18:50 +03:00
return 0 ;
}
/*
2023-03-17 21:55:53 +03:00
* The folio straddles i_size . It must be zeroed out on each and every
2017-03-17 09:18:50 +03:00
* writepage invocation because it may be mmapped . " A file is mapped
2023-03-17 21:55:53 +03:00
* in multiples of the folio size . For a file that is not a multiple of
* the folio size , the remaining memory is zeroed when mapped , and
2017-03-17 09:18:50 +03:00
* writes to that region are not written out to the file . "
*/
2023-03-17 22:37:34 +03:00
folio_zero_segment ( folio ,
i_size - folio_pos ( folio ) ,
folio_size ( folio ) ) ;
2017-03-17 09:18:50 +03:00
do_io :
2023-03-17 21:55:53 +03:00
f_sectors = folio_sectors ( folio ) ;
2023-05-28 07:35:35 +03:00
s = bch2_folio ( folio ) ;
2018-11-15 05:53:40 +03:00
2023-03-18 02:46:25 +03:00
if ( f_sectors > w - > tmp_sectors ) {
kfree ( w - > tmp ) ;
w - > tmp = kzalloc ( sizeof ( struct bch_folio_sector ) *
f_sectors , __GFP_NOFAIL ) ;
w - > tmp_sectors = f_sectors ;
}
2021-11-11 21:02:03 +03:00
/*
* Things get really hairy with errors during writeback :
*/
2023-03-17 21:55:53 +03:00
ret = bch2_get_folio_disk_reservation ( c , inode , folio , false ) ;
2021-11-11 21:02:03 +03:00
BUG_ON ( ret ) ;
2018-11-15 05:53:40 +03:00
2019-07-29 19:24:36 +03:00
/* Before unlocking the page, get copy of reservations: */
2021-11-11 21:02:03 +03:00
spin_lock ( & s - > lock ) ;
2023-03-18 02:46:25 +03:00
memcpy ( w - > tmp , s - > s , sizeof ( struct bch_folio_sector ) * f_sectors ) ;
2019-07-29 19:24:36 +03:00
2023-03-17 21:55:53 +03:00
for ( i = 0 ; i < f_sectors ; i + + ) {
2023-03-23 18:08:04 +03:00
if ( s - > s [ i ] . state < SECTOR_dirty )
2019-07-29 19:24:36 +03:00
continue ;
2019-07-03 04:41:35 +03:00
nr_replicas_this_write =
min_t ( unsigned , nr_replicas_this_write ,
s - > s [ i ] . nr_replicas +
s - > s [ i ] . replicas_reserved ) ;
2019-07-29 19:24:36 +03:00
}
2019-07-02 21:59:15 +03:00
2023-03-17 21:55:53 +03:00
for ( i = 0 ; i < f_sectors ; i + + ) {
2023-03-23 18:08:04 +03:00
if ( s - > s [ i ] . state < SECTOR_dirty )
2019-07-29 19:24:36 +03:00
continue ;
2019-07-03 04:41:35 +03:00
s - > s [ i ] . nr_replicas = w - > opts . compression
? 0 : nr_replicas_this_write ;
2019-07-02 21:59:15 +03:00
2019-07-03 04:41:35 +03:00
s - > s [ i ] . replicas_reserved = 0 ;
2023-03-23 18:08:04 +03:00
folio_sector_set ( folio , s , i , SECTOR_allocated ) ;
2019-07-03 04:41:35 +03:00
}
2023-03-23 18:08:04 +03:00
spin_unlock ( & s - > lock ) ;
2017-03-17 09:18:50 +03:00
2019-07-29 19:24:36 +03:00
BUG_ON ( atomic_read ( & s - > write_count ) ) ;
atomic_set ( & s - > write_count , 1 ) ;
2023-03-17 21:55:53 +03:00
BUG_ON ( folio_test_writeback ( folio ) ) ;
folio_start_writeback ( folio ) ;
2019-07-29 19:24:36 +03:00
2023-03-17 21:55:53 +03:00
folio_unlock ( folio ) ;
2017-03-17 09:18:50 +03:00
2019-07-29 19:24:36 +03:00
offset = 0 ;
while ( 1 ) {
2021-11-11 21:02:03 +03:00
unsigned sectors = 0 , dirty_sectors = 0 , reserved_sectors = 0 ;
2019-07-29 19:24:36 +03:00
u64 sector ;
2023-03-17 21:55:53 +03:00
		while (offset < f_sectors &&
		       w->tmp[offset].state < SECTOR_dirty)
2019-07-29 19:24:36 +03:00
offset + + ;
2017-03-17 09:18:50 +03:00
2023-03-17 21:55:53 +03:00
if ( offset = = f_sectors )
2019-07-29 19:24:36 +03:00
break ;
2023-03-17 21:55:53 +03:00
		while (offset + sectors < f_sectors &&
		       w->tmp[offset + sectors].state >= SECTOR_dirty) {
			reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
			dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
2019-07-29 19:24:36 +03:00
sectors + + ;
}
2021-11-11 21:02:03 +03:00
BUG_ON ( ! sectors ) ;
2023-03-17 21:55:53 +03:00
sector = folio_sector ( folio ) + offset ;
2019-07-29 19:24:36 +03:00
		if (w->io &&
		    (w->io->op.res.nr_replicas != nr_replicas_this_write ||
		     bio_full(&w->io->op.wbio.bio, sectors << 9) ||
		     w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
		     (BIO_MAX_VECS * PAGE_SIZE) ||
		     bio_end_sector(&w->io->op.wbio.bio) != sector))
2019-07-29 19:24:36 +03:00
bch2_writepage_do_io ( w ) ;
2017-03-17 09:18:50 +03:00
2019-07-29 19:24:36 +03:00
if ( ! w - > io )
2019-11-14 03:45:48 +03:00
bch2_writepage_io_alloc ( c , wbc , w , inode , sector ,
2019-07-29 19:24:36 +03:00
nr_replicas_this_write ) ;
2017-03-17 09:18:50 +03:00
2019-07-29 19:24:36 +03:00
atomic_inc ( & s - > write_count ) ;
2019-10-09 19:50:39 +03:00
BUG_ON ( inode ! = w - > io - > inode ) ;
2023-03-17 21:55:53 +03:00
		BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
				      sectors << 9, offset << 9));
2019-09-20 01:05:04 +03:00
/* Check for writing past i_size: */
2022-11-15 23:57:07 +03:00
WARN_ONCE ( ( bio_end_sector ( & w - > io - > op . wbio . bio ) < < 9 ) >
round_up ( i_size , block_bytes ( c ) ) & &
! test_bit ( BCH_FS_EMERGENCY_RO , & c - > flags ) ,
" writing past i_size: %llu > %llu (unrounded %llu) \n " ,
bio_end_sector ( & w - > io - > op . wbio . bio ) < < 9 ,
round_up ( i_size , block_bytes ( c ) ) ,
i_size ) ;
2019-09-20 01:05:04 +03:00
2019-10-09 19:50:39 +03:00
w - > io - > op . res . sectors + = reserved_sectors ;
w - > io - > op . i_sectors_delta - = dirty_sectors ;
2019-07-29 19:24:36 +03:00
w - > io - > op . new_i_size = i_size ;
offset + = sectors ;
}
2017-03-17 09:18:50 +03:00
2019-07-29 19:24:36 +03:00
if ( atomic_dec_and_test ( & s - > write_count ) )
2023-03-17 21:55:53 +03:00
folio_end_writeback ( folio ) ;
2017-03-17 09:18:50 +03:00
return 0 ;
}
int bch2_writepages ( struct address_space * mapping , struct writeback_control * wbc )
{
struct bch_fs * c = mapping - > host - > i_sb - > s_fs_info ;
struct bch_writepage_state w =
bch_writepage_state_init ( c , to_bch_ei ( mapping - > host ) ) ;
struct blk_plug plug ;
int ret ;
blk_start_plug ( & plug ) ;
ret = write_cache_pages ( mapping , wbc , __bch2_writepage , & w ) ;
if ( w . io )
bch2_writepage_do_io ( & w ) ;
blk_finish_plug ( & plug ) ;
2023-03-18 02:46:25 +03:00
kfree ( w . tmp ) ;
2022-09-18 22:43:50 +03:00
return bch2_err_class ( ret ) ;
2017-03-17 09:18:50 +03:00
}
/* buffered writes: */
int bch2_write_begin ( struct file * file , struct address_space * mapping ,
loff_t pos , unsigned len ,
struct page * * pagep , void * * fsdata )
{
struct bch_inode_info * inode = to_bch_ei ( mapping - > host ) ;
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
2023-03-17 21:55:53 +03:00
struct bch2_folio_reservation * res ;
struct folio * folio ;
2023-03-17 22:37:34 +03:00
unsigned offset ;
2017-03-17 09:18:50 +03:00
int ret = - ENOMEM ;
2019-07-29 20:38:38 +03:00
res = kmalloc ( sizeof ( * res ) , GFP_KERNEL ) ;
if ( ! res )
return - ENOMEM ;
2023-03-17 21:55:53 +03:00
bch2_folio_reservation_init ( c , inode , res ) ;
2019-07-29 20:38:38 +03:00
* fsdata = res ;
2017-03-17 09:18:50 +03:00
2022-11-04 20:25:57 +03:00
bch2_pagecache_add_get ( inode ) ;
2017-03-17 09:18:50 +03:00
2023-03-17 22:37:34 +03:00
	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
				FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
				mapping_gfp_mask(mapping));
2023-06-21 07:31:49 +03:00
if ( IS_ERR_OR_NULL ( folio ) )
2017-03-17 09:18:50 +03:00
goto err_unlock ;
2023-03-17 21:55:53 +03:00
if ( folio_test_uptodate ( folio ) )
2017-03-17 09:18:50 +03:00
goto out ;
2023-03-17 22:37:34 +03:00
offset = pos - folio_pos ( folio ) ;
len = min_t ( size_t , len , folio_end_pos ( folio ) - pos ) ;
2023-03-17 21:55:53 +03:00
/* If we're writing entire folio, don't need to read it in first: */
2023-03-17 22:37:34 +03:00
if ( ! offset & & len = = folio_size ( folio ) )
2017-03-17 09:18:50 +03:00
goto out ;
	if (!offset && pos + len >= inode->v.i_size) {
		folio_zero_segment(folio, len, folio_size(folio));
		flush_dcache_folio(folio);
2017-03-17 09:18:50 +03:00
goto out ;
}
2023-03-17 22:37:34 +03:00
	if (folio_pos(folio) >= inode->v.i_size) {
		folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
		flush_dcache_folio(folio);
2017-03-17 09:18:50 +03:00
goto out ;
}
readpage :
2023-03-17 21:55:53 +03:00
ret = bch2_read_single_folio ( folio , mapping ) ;
2017-03-17 09:18:50 +03:00
if ( ret )
goto err ;
out :
2023-05-28 07:35:35 +03:00
ret = bch2_folio_set ( c , inode_inum ( inode ) , & folio , 1 ) ;
if ( ret )
goto err ;
2021-11-24 02:17:04 +03:00
2023-03-17 21:55:53 +03:00
ret = bch2_folio_reservation_get ( c , inode , folio , res , offset , len ) ;
2017-03-17 09:18:50 +03:00
if ( ret ) {
2023-03-17 21:55:53 +03:00
if ( ! folio_test_uptodate ( folio ) ) {
2017-03-17 09:18:50 +03:00
/*
2023-03-17 21:55:53 +03:00
* If the folio hasn ' t been read in , we won ' t know if we
2017-03-17 09:18:50 +03:00
* actually need a reservation - we don ' t actually need
2023-03-17 21:55:53 +03:00
* to read here , we just need to check if the folio is
2017-03-17 09:18:50 +03:00
* fully backed by uncompressed data :
*/
goto readpage ;
}
goto err ;
}
2023-03-17 21:55:53 +03:00
* pagep = & folio - > page ;
2017-03-17 09:18:50 +03:00
return 0 ;
err :
2023-03-17 21:55:53 +03:00
folio_unlock ( folio ) ;
folio_put ( folio ) ;
2017-03-17 09:18:50 +03:00
* pagep = NULL ;
err_unlock :
2022-11-04 20:25:57 +03:00
bch2_pagecache_add_put ( inode ) ;
2019-07-29 20:38:38 +03:00
kfree ( res ) ;
* fsdata = NULL ;
2022-09-18 22:43:50 +03:00
return bch2_err_class ( ret ) ;
2017-03-17 09:18:50 +03:00
}
int bch2_write_end ( struct file * file , struct address_space * mapping ,
loff_t pos , unsigned len , unsigned copied ,
struct page * page , void * fsdata )
{
struct bch_inode_info * inode = to_bch_ei ( mapping - > host ) ;
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
2023-03-17 21:55:53 +03:00
struct bch2_folio_reservation * res = fsdata ;
struct folio * folio = page_folio ( page ) ;
2023-03-17 22:37:34 +03:00
unsigned offset = pos - folio_pos ( folio ) ;
2017-03-17 09:18:50 +03:00
lockdep_assert_held ( & inode - > v . i_rwsem ) ;
2023-03-17 22:37:34 +03:00
BUG_ON ( offset + copied > folio_size ( folio ) ) ;
2017-03-17 09:18:50 +03:00
2023-03-17 21:55:53 +03:00
if ( unlikely ( copied < len & & ! folio_test_uptodate ( folio ) ) ) {
2017-03-17 09:18:50 +03:00
/*
2023-03-17 21:55:53 +03:00
* The folio needs to be read in , but that would destroy
2017-03-17 09:18:50 +03:00
* our partial write - simplest thing is to just force
* userspace to redo the write :
*/
2023-03-17 21:55:53 +03:00
folio_zero_range ( folio , 0 , folio_size ( folio ) ) ;
flush_dcache_folio ( folio ) ;
2017-03-17 09:18:50 +03:00
copied = 0 ;
}
spin_lock ( & inode - > v . i_lock ) ;
if ( pos + copied > inode - > v . i_size )
i_size_write ( & inode - > v , pos + copied ) ;
spin_unlock ( & inode - > v . i_lock ) ;
if ( copied ) {
2023-03-17 21:55:53 +03:00
if ( ! folio_test_uptodate ( folio ) )
folio_mark_uptodate ( folio ) ;
2019-07-29 20:38:38 +03:00
2023-03-17 21:55:53 +03:00
bch2_set_folio_dirty ( c , inode , folio , res , offset , copied ) ;
2017-03-17 09:18:50 +03:00
inode - > ei_last_dirtied = ( unsigned long ) current ;
}
2023-03-17 21:55:53 +03:00
folio_unlock ( folio ) ;
folio_put ( folio ) ;
2022-11-04 20:25:57 +03:00
bch2_pagecache_add_put ( inode ) ;
2017-03-17 09:18:50 +03:00
2023-03-17 21:55:53 +03:00
bch2_folio_reservation_put ( c , inode , res ) ;
2019-07-29 20:38:38 +03:00
kfree ( res ) ;
2017-03-17 09:18:50 +03:00
return copied ;
}
2023-03-19 04:37:43 +03:00
static noinline void folios_trunc(folios *folios, struct folio **fi)
{
	while (folios->data + folios->nr > fi) {
		struct folio *f = darray_pop(folios);

		folio_unlock(f);
		folio_put(f);
	}
}
2017-03-17 09:18:50 +03:00
static int __bch2_buffered_write ( struct bch_inode_info * inode ,
struct address_space * mapping ,
struct iov_iter * iter ,
loff_t pos , unsigned len )
{
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
2023-03-17 21:55:53 +03:00
struct bch2_folio_reservation res ;
2023-03-19 04:37:43 +03:00
folios folios ;
struct folio * * fi , * f ;
unsigned copied = 0 , f_offset ;
2023-03-29 17:43:23 +03:00
u64 end = pos + len , f_pos ;
2023-03-29 18:23:15 +03:00
loff_t last_folio_pos = inode - > v . i_size ;
2017-03-17 09:18:50 +03:00
int ret = 0 ;
BUG_ON ( ! len ) ;
2023-03-17 21:55:53 +03:00
bch2_folio_reservation_init ( c , inode , & res ) ;
2023-03-19 04:37:43 +03:00
darray_init ( & folios ) ;
2019-07-29 20:38:38 +03:00
2023-03-23 19:51:47 +03:00
ret = filemap_get_contig_folios_d ( mapping , pos , end ,
FGP_LOCK | FGP_WRITE | FGP_STABLE | FGP_CREAT ,
mapping_gfp_mask ( mapping ) ,
& folios ) ;
if ( ret )
goto out ;
2023-03-19 04:37:43 +03:00
2023-03-23 19:51:47 +03:00
BUG_ON ( ! folios . nr ) ;
2017-03-17 09:18:50 +03:00
2023-03-19 04:37:43 +03:00
f = darray_first ( folios ) ;
if ( pos ! = folio_pos ( f ) & & ! folio_test_uptodate ( f ) ) {
ret = bch2_read_single_folio ( f , mapping ) ;
2017-03-17 09:18:50 +03:00
if ( ret )
goto out ;
}
2023-03-19 04:37:43 +03:00
f = darray_last ( folios ) ;
2023-03-29 18:23:15 +03:00
end = min ( end , folio_end_pos ( f ) ) ;
last_folio_pos = folio_pos ( f ) ;
2023-03-19 04:37:43 +03:00
if ( end ! = folio_end_pos ( f ) & & ! folio_test_uptodate ( f ) ) {
if ( end > = inode - > v . i_size ) {
folio_zero_range ( f , 0 , folio_size ( f ) ) ;
2017-03-17 09:18:50 +03:00
} else {
2023-03-19 04:37:43 +03:00
ret = bch2_read_single_folio ( f , mapping ) ;
2017-03-17 09:18:50 +03:00
if ( ret )
goto out ;
}
}
2023-05-28 07:35:35 +03:00
ret = bch2_folio_set ( c , inode_inum ( inode ) , folios . data , folios . nr ) ;
if ( ret )
goto out ;
2023-03-19 04:37:43 +03:00
f_pos = pos ;
f_offset = pos - folio_pos ( darray_first ( folios ) ) ;
darray_for_each ( folios , fi ) {
struct folio * f = * fi ;
2023-03-29 17:43:23 +03:00
u64 f_len = min ( end , folio_end_pos ( f ) ) - f_pos ;
2019-07-29 20:38:38 +03:00
2022-10-24 05:01:50 +03:00
/*
* XXX : per POSIX and fstests generic / 275 , on - ENOSPC we ' re
* supposed to write as much as we have disk space for .
*
* On failure here we should still write out a partial page if
* we aren ' t completely out of disk space - we don ' t do that
* yet :
*/
2023-03-19 04:37:43 +03:00
ret = bch2_folio_reservation_get ( c , inode , f , & res , f_offset , f_len ) ;
2022-10-24 05:01:50 +03:00
if ( unlikely ( ret ) ) {
2023-03-19 04:37:43 +03:00
			folios_trunc(&folios, fi);
			if (!folios.nr)
				goto out;
2023-03-19 04:37:43 +03:00
end = min ( end , folio_end_pos ( darray_last ( folios ) ) ) ;
2022-10-24 05:01:50 +03:00
break ;
}
2019-07-29 20:38:38 +03:00
2023-03-19 04:37:43 +03:00
f_pos = folio_end_pos ( f ) ;
f_offset = 0 ;
2017-03-17 09:18:50 +03:00
}
if ( mapping_writably_mapped ( mapping ) )
2023-03-19 04:37:43 +03:00
darray_for_each ( folios , fi )
flush_dcache_folio ( * fi ) ;
f_pos = pos ;
f_offset = pos - folio_pos ( darray_first ( folios ) ) ;
darray_for_each ( folios , fi ) {
struct folio * f = * fi ;
2023-03-29 17:43:23 +03:00
u64 f_len = min ( end , folio_end_pos ( f ) ) - f_pos ;
2023-03-19 04:37:43 +03:00
unsigned f_copied = copy_page_from_iter_atomic ( & f - > page , f_offset , f_len , iter ) ;
if ( ! f_copied ) {
folios_trunc ( & folios , fi ) ;
2019-07-29 20:38:38 +03:00
break ;
2023-03-19 04:37:43 +03:00
}
2017-03-17 09:18:50 +03:00
2023-03-19 04:37:43 +03:00
if ( ! folio_test_uptodate ( f ) & &
f_copied ! = folio_size ( f ) & &
pos + copied + f_copied < inode - > v . i_size ) {
folio_zero_range ( f , 0 , folio_size ( f ) ) ;
folios_trunc ( & folios , fi ) ;
2020-07-09 20:54:58 +03:00
break ;
}
2023-03-19 04:37:43 +03:00
flush_dcache_folio ( f ) ;
copied + = f_copied ;
2020-07-09 20:54:58 +03:00
2023-03-19 04:37:43 +03:00
if ( f_copied ! = f_len ) {
folios_trunc ( & folios , fi + 1 ) ;
2020-07-09 20:54:58 +03:00
break ;
2023-03-19 04:37:43 +03:00
}
f_pos = folio_end_pos ( f ) ;
f_offset = 0 ;
2017-03-17 09:18:50 +03:00
}
if ( ! copied )
goto out ;
2023-03-19 04:37:43 +03:00
end = pos + copied ;
2019-09-27 02:09:08 +03:00
spin_lock ( & inode - > v . i_lock ) ;
2023-03-19 04:37:43 +03:00
if ( end > inode - > v . i_size )
i_size_write ( & inode - > v , end ) ;
2019-09-27 02:09:08 +03:00
spin_unlock ( & inode - > v . i_lock ) ;
2023-03-19 04:37:43 +03:00
f_pos = pos ;
f_offset = pos - folio_pos ( darray_first ( folios ) ) ;
darray_for_each ( folios , fi ) {
struct folio * f = * fi ;
2023-03-29 17:43:23 +03:00
u64 f_len = min ( end , folio_end_pos ( f ) ) - f_pos ;
2019-07-29 20:38:38 +03:00
2023-03-19 04:37:43 +03:00
if ( ! folio_test_uptodate ( f ) )
folio_mark_uptodate ( f ) ;
2019-07-29 20:38:38 +03:00
2023-03-19 04:37:43 +03:00
bch2_set_folio_dirty ( c , inode , f , & res , f_offset , f_len ) ;
2019-07-29 20:38:38 +03:00
2023-03-19 04:37:43 +03:00
f_pos = folio_end_pos ( f ) ;
f_offset = 0 ;
2019-07-29 20:38:38 +03:00
}
2019-09-27 02:09:08 +03:00
inode - > ei_last_dirtied = ( unsigned long ) current ;
2019-07-29 20:38:38 +03:00
out :
2023-03-19 04:37:43 +03:00
darray_for_each ( folios , fi ) {
folio_unlock ( * fi ) ;
folio_put ( * fi ) ;
2017-03-17 09:18:50 +03:00
}
2023-03-29 18:23:15 +03:00
/*
* If the last folio added to the mapping starts beyond current EOF , we
* performed a short write but left around at least one post - EOF folio .
* Clean up the mapping before we return .
*/
if ( last_folio_pos > = inode - > v . i_size )
truncate_pagecache ( & inode - > v , inode - > v . i_size ) ;
2023-03-19 04:37:43 +03:00
darray_exit ( & folios ) ;
2023-03-17 21:55:53 +03:00
bch2_folio_reservation_put ( c , inode , & res ) ;
2019-07-29 20:38:38 +03:00
2017-03-17 09:18:50 +03:00
return copied ? : ret ;
}
static ssize_t bch2_buffered_write ( struct kiocb * iocb , struct iov_iter * iter )
{
struct file * file = iocb - > ki_filp ;
struct address_space * mapping = file - > f_mapping ;
struct bch_inode_info * inode = file_bch_inode ( file ) ;
loff_t pos = iocb - > ki_pos ;
ssize_t written = 0 ;
int ret = 0 ;
2022-11-04 20:25:57 +03:00
bch2_pagecache_add_get ( inode ) ;
2017-03-17 09:18:50 +03:00
do {
unsigned offset = pos & ( PAGE_SIZE - 1 ) ;
2023-03-19 04:37:43 +03:00
unsigned bytes = iov_iter_count ( iter ) ;
2017-03-17 09:18:50 +03:00
again :
/*
* Bring in the user page that we will copy from _first_ .
* Otherwise there ' s a nasty deadlock on copying from the
* same page as we ' re writing to , without it being marked
* up - to - date .
*
* Not only is this an optimisation , but it is also required
* to check that the address is actually valid , when atomic
* usercopies are used , below .
*/
if ( unlikely ( fault_in_iov_iter_readable ( iter , bytes ) ) ) {
bytes = min_t ( unsigned long , iov_iter_count ( iter ) ,
PAGE_SIZE - offset ) ;
if ( unlikely ( fault_in_iov_iter_readable ( iter , bytes ) ) ) {
ret = - EFAULT ;
break ;
}
}
if ( unlikely ( fatal_signal_pending ( current ) ) ) {
ret = - EINTR ;
break ;
}
ret = __bch2_buffered_write ( inode , mapping , iter , pos , bytes ) ;
if ( unlikely ( ret < 0 ) )
break ;
cond_resched ( ) ;
if ( unlikely ( ret = = 0 ) ) {
/*
* If we were unable to copy any data at all , we must
* fall back to a single segment length write .
*
* If we didn ' t fallback here , we could livelock
* because not all segments in the iov can be copied at
* once without a pagefault .
*/
bytes = min_t ( unsigned long , PAGE_SIZE - offset ,
iov_iter_single_seg_count ( iter ) ) ;
goto again ;
}
pos + = ret ;
written + = ret ;
2020-07-09 20:54:58 +03:00
ret = 0 ;
2017-03-17 09:18:50 +03:00
balance_dirty_pages_ratelimited ( mapping ) ;
} while ( iov_iter_count ( iter ) ) ;
2022-11-04 20:25:57 +03:00
bch2_pagecache_add_put ( inode ) ;
2017-03-17 09:18:50 +03:00
return written ? written : ret ;
}
/* O_DIRECT reads */
2021-01-21 22:42:23 +03:00
static void bio_check_or_release ( struct bio * bio , bool check_dirty )
{
if ( check_dirty ) {
bio_check_pages_dirty ( bio ) ;
} else {
bio_release_pages ( bio , false ) ;
bio_put ( bio ) ;
}
}
2017-03-17 09:18:50 +03:00
static void bch2_dio_read_complete ( struct closure * cl )
{
struct dio_read * dio = container_of ( cl , struct dio_read , cl ) ;
dio - > req - > ki_complete ( dio - > req , dio - > ret ) ;
2021-01-21 22:42:23 +03:00
bio_check_or_release ( & dio - > rbio . bio , dio - > should_dirty ) ;
2017-03-17 09:18:50 +03:00
}
static void bch2_direct_IO_read_endio ( struct bio * bio )
{
struct dio_read * dio = bio - > bi_private ;
if ( bio - > bi_status )
dio - > ret = blk_status_to_errno ( bio - > bi_status ) ;
closure_put ( & dio - > cl ) ;
}
static void bch2_direct_IO_read_split_endio ( struct bio * bio )
{
2021-01-21 22:42:23 +03:00
struct dio_read * dio = bio - > bi_private ;
bool should_dirty = dio - > should_dirty ;
2017-03-17 09:18:50 +03:00
bch2_direct_IO_read_endio ( bio ) ;
2021-01-21 22:42:23 +03:00
bio_check_or_release ( bio , should_dirty ) ;
2017-03-17 09:18:50 +03:00
}
static int bch2_direct_IO_read ( struct kiocb * req , struct iov_iter * iter )
{
struct file * file = req - > ki_filp ;
struct bch_inode_info * inode = file_bch_inode ( file ) ;
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
2022-11-24 04:14:55 +03:00
struct bch_io_opts opts ;
2017-03-17 09:18:50 +03:00
struct dio_read * dio ;
struct bio * bio ;
loff_t offset = req - > ki_pos ;
bool sync = is_sync_kiocb ( req ) ;
size_t shorten ;
ssize_t ret ;
2022-11-24 04:14:55 +03:00
bch2_inode_opts_get ( & opts , c , & inode - > ei_inode ) ;
2017-03-17 09:18:50 +03:00
if ( ( offset | iter - > count ) & ( block_bytes ( c ) - 1 ) )
return - EINVAL ;
ret = min_t ( loff_t , iter - > count ,
max_t ( loff_t , 0 , i_size_read ( & inode - > v ) - offset ) ) ;
if ( ! ret )
return ret ;
shorten = iov_iter_count ( iter ) - round_up ( ret , block_bytes ( c ) ) ;
iter - > count - = shorten ;
bio = bio_alloc_bioset ( NULL ,
2022-03-08 21:52:58 +03:00
bio_iov_vecs_to_alloc ( iter , BIO_MAX_VECS ) ,
2017-03-17 09:18:50 +03:00
REQ_OP_READ ,
GFP_KERNEL ,
& c - > dio_read_bioset ) ;
bio - > bi_end_io = bch2_direct_IO_read_endio ;
dio = container_of ( bio , struct dio_read , rbio . bio ) ;
closure_init ( & dio - > cl , NULL ) ;
/*
* this is a _really_ horrible hack just to avoid an atomic sub at the
* end :
*/
if ( ! sync ) {
set_closure_fn ( & dio - > cl , bch2_dio_read_complete , NULL ) ;
atomic_set ( & dio - > cl . remaining ,
CLOSURE_REMAINING_INITIALIZER -
CLOSURE_RUNNING +
CLOSURE_DESTRUCTOR ) ;
} else {
atomic_set ( & dio - > cl . remaining ,
CLOSURE_REMAINING_INITIALIZER + 1 ) ;
}
dio - > req = req ;
dio - > ret = ret ;
2021-01-21 22:42:23 +03:00
/*
* This is one of the sketchier things I ' ve encountered : we have to skip
* the dirtying of requests that are internal from the kernel ( i . e . from
* loopback ) , because we ' ll deadlock on page_lock .
*/
dio - > should_dirty = iter_is_iovec ( iter ) ;
2017-03-17 09:18:50 +03:00
goto start ;
while ( iter - > count ) {
bio = bio_alloc_bioset ( NULL ,
2022-03-08 21:52:58 +03:00
bio_iov_vecs_to_alloc ( iter , BIO_MAX_VECS ) ,
2017-03-17 09:18:50 +03:00
REQ_OP_READ ,
GFP_KERNEL ,
& c - > bio_read ) ;
bio - > bi_end_io = bch2_direct_IO_read_split_endio ;
start :
bio - > bi_opf = REQ_OP_READ | REQ_SYNC ;
bio - > bi_iter . bi_sector = offset > > 9 ;
bio - > bi_private = dio ;
ret = bio_iov_iter_get_pages ( bio , iter ) ;
if ( ret < 0 ) {
/* XXX: fault inject this path */
bio - > bi_status = BLK_STS_RESOURCE ;
bio_endio ( bio ) ;
break ;
}
offset + = bio - > bi_iter . bi_size ;
2021-01-21 22:42:23 +03:00
if ( dio - > should_dirty )
bio_set_pages_dirty ( bio ) ;
2017-03-17 09:18:50 +03:00
if ( iter - > count )
closure_get ( & dio - > cl ) ;
2021-03-13 04:30:39 +03:00
bch2_read ( c , rbio_init ( bio , opts ) , inode_inum ( inode ) ) ;
2017-03-17 09:18:50 +03:00
}
iter - > count + = shorten ;
if ( sync ) {
closure_sync ( & dio - > cl ) ;
closure_debug_destroy ( & dio - > cl ) ;
ret = dio - > ret ;
2021-01-21 22:42:23 +03:00
bio_check_or_release ( & dio - > rbio . bio , dio - > should_dirty ) ;
2017-03-17 09:18:50 +03:00
return ret ;
} else {
return - EIOCBQUEUED ;
}
}
ssize_t bch2_read_iter ( struct kiocb * iocb , struct iov_iter * iter )
{
struct file * file = iocb - > ki_filp ;
struct bch_inode_info * inode = file_bch_inode ( file ) ;
struct address_space * mapping = file - > f_mapping ;
size_t count = iov_iter_count ( iter ) ;
ssize_t ret ;
if ( ! count )
return 0 ; /* skip atime */
if ( iocb - > ki_flags & IOCB_DIRECT ) {
struct blk_plug plug ;
2022-11-02 23:45:28 +03:00
		if (unlikely(mapping->nrpages)) {
			ret = filemap_write_and_wait_range(mapping,
						iocb->ki_pos,
						iocb->ki_pos + count - 1);
			if (ret < 0)
				goto out;
		}
2017-03-17 09:18:50 +03:00
file_accessed ( file ) ;
blk_start_plug ( & plug ) ;
ret = bch2_direct_IO_read ( iocb , iter ) ;
blk_finish_plug ( & plug ) ;
if ( ret > = 0 )
iocb - > ki_pos + = ret ;
} else {
2022-11-04 20:25:57 +03:00
bch2_pagecache_add_get ( inode ) ;
2017-03-17 09:18:50 +03:00
ret = generic_file_read_iter ( iocb , iter ) ;
2022-11-04 20:25:57 +03:00
bch2_pagecache_add_put ( inode ) ;
2017-03-17 09:18:50 +03:00
}
2022-09-18 22:43:50 +03:00
out :
return bch2_err_class ( ret ) ;
2017-03-17 09:18:50 +03:00
}
/* O_DIRECT writes */
2021-03-16 07:28:17 +03:00
static bool bch2_check_range_allocated ( struct bch_fs * c , subvol_inum inum ,
u64 offset , u64 size ,
unsigned nr_replicas , bool compressed )
{
struct btree_trans trans ;
struct btree_iter iter ;
struct bkey_s_c k ;
u64 end = offset + size ;
u32 snapshot ;
bool ret = true ;
int err ;
bch2_trans_init ( & trans , c , 0 , 0 ) ;
retry :
bch2_trans_begin ( & trans ) ;
err = bch2_subvolume_get_snapshot ( & trans , inum . subvol , & snapshot ) ;
if ( err )
goto err ;
2021-10-21 19:05:21 +03:00
	for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
			   SPOS(inum.inum, offset, snapshot),
			   BTREE_ITER_SLOTS, k, err) {
2022-11-24 11:12:22 +03:00
		if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
			break;
2021-03-13 04:30:39 +03:00
		if (k.k->p.snapshot != snapshot ||
		    nr_replicas > bch2_bkey_replicas(c, k) ||
		    (!compressed && bch2_bkey_sectors_compressed(k))) {
ret = false ;
break ;
}
}
offset = iter . pos . offset ;
bch2_trans_iter_exit ( & trans , & iter ) ;
err :
2022-07-18 06:06:38 +03:00
if ( bch2_err_matches ( err , BCH_ERR_transaction_restart ) )
2021-03-16 07:28:17 +03:00
goto retry ;
bch2_trans_exit ( & trans ) ;
return err ? false : ret ;
}
2022-11-01 03:30:27 +03:00
static noinline bool bch2_dio_write_check_allocated ( struct dio_write * dio )
{
struct bch_fs * c = dio - > op . c ;
struct bch_inode_info * inode = dio - > inode ;
struct bio * bio = & dio - > op . wbio . bio ;
return bch2_check_range_allocated ( c , inode_inum ( inode ) ,
dio - > op . pos . offset , bio_sectors ( bio ) ,
dio - > op . opts . data_replicas ,
dio - > op . opts . compression ! = 0 ) ;
}
2022-11-03 07:29:43 +03:00
static void bch2_dio_write_loop_async ( struct bch_write_op * ) ;
static __always_inline long bch2_dio_write_done ( struct dio_write * dio ) ;
2017-03-17 09:18:50 +03:00
/*
* We ' re going to return - EIOCBQUEUED , but we haven ' t finished consuming the
* iov_iter yet , so we need to stash a copy of the iovec : it might be on the
* caller ' s stack , we ' re not guaranteed that it will live for the duration of
* the IO :
*/
static noinline int bch2_dio_write_copy_iov ( struct dio_write * dio )
{
struct iovec * iov = dio - > inline_vecs ;
/*
* iov_iter has a single embedded iovec - nothing to do :
*/
if ( iter_is_ubuf ( & dio - > iter ) )
return 0 ;
/*
* We don ' t currently handle non - iovec iov_iters here - return an error ,
* and we ' ll fall back to doing the IO synchronously :
*/
if ( ! iter_is_iovec ( & dio - > iter ) )
return - 1 ;
if ( dio - > iter . nr_segs > ARRAY_SIZE ( dio - > inline_vecs ) ) {
iov = kmalloc_array ( dio - > iter . nr_segs , sizeof ( * iov ) ,
GFP_KERNEL ) ;
if ( unlikely ( ! iov ) )
return - ENOMEM ;
dio - > free_iov = true ;
}
memcpy ( iov , dio - > iter . __iov , dio - > iter . nr_segs * sizeof ( * iov ) ) ;
dio - > iter . __iov = iov ;
return 0 ;
}
2022-11-03 07:29:43 +03:00
static void bch2_dio_write_flush_done ( struct closure * cl )
{
struct dio_write * dio = container_of ( cl , struct dio_write , op . cl ) ;
struct bch_fs * c = dio - > op . c ;
closure_debug_destroy ( cl ) ;
dio - > op . error = bch2_journal_error ( & c - > journal ) ;
bch2_dio_write_done ( dio ) ;
}
static noinline void bch2_dio_write_flush ( struct dio_write * dio )
{
struct bch_fs * c = dio - > op . c ;
struct bch_inode_unpacked inode ;
int ret ;
dio - > flush = 0 ;
closure_init ( & dio - > op . cl , NULL ) ;
if ( ! dio - > op . error ) {
ret = bch2_inode_find_by_inum ( c , inode_inum ( dio - > inode ) , & inode ) ;
2022-11-03 00:12:00 +03:00
if ( ret ) {
2022-11-03 07:29:43 +03:00
dio - > op . error = ret ;
2022-11-03 00:12:00 +03:00
} else {
2022-11-03 07:29:43 +03:00
bch2_journal_flush_seq_async ( & c - > journal , inode . bi_journal_seq , & dio - > op . cl ) ;
2022-11-03 00:12:00 +03:00
bch2_inode_flush_nocow_writes_async ( c , dio - > inode , & dio - > op . cl ) ;
}
2022-11-03 07:29:43 +03:00
}
if ( dio - > sync ) {
closure_sync ( & dio - > op . cl ) ;
closure_debug_destroy ( & dio - > op . cl ) ;
} else {
continue_at ( & dio - > op . cl , bch2_dio_write_flush_done , NULL ) ;
}
}
2020-06-30 01:22:06 +03:00
2022-11-01 03:30:27 +03:00
static __always_inline long bch2_dio_write_done ( struct dio_write * dio )
{
struct kiocb * req = dio - > req ;
struct bch_inode_info * inode = dio - > inode ;
bool sync = dio - > sync ;
2022-11-03 07:29:43 +03:00
long ret ;
if ( unlikely ( dio - > flush ) ) {
bch2_dio_write_flush ( dio ) ;
if ( ! sync )
return - EIOCBQUEUED ;
}
2022-11-01 03:30:27 +03:00
2022-11-04 20:25:57 +03:00
bch2_pagecache_block_put ( inode ) ;
2022-11-01 03:30:27 +03:00
if ( dio - > free_iov )
kfree ( dio - > iter . __iov ) ;
2022-11-03 07:29:43 +03:00
ret = dio - > op . error ? : ( ( long ) dio - > written < < 9 ) ;
2022-11-01 03:30:27 +03:00
bio_put ( & dio - > op . wbio . bio ) ;
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end ( & inode - > v ) ;
if ( ret < 0 )
ret = bch2_err_class ( ret ) ;
if ( ! sync ) {
req - > ki_complete ( req , ret ) ;
ret = - EIOCBQUEUED ;
}
return ret ;
}
static __always_inline void bch2_dio_write_end ( struct dio_write * dio )
{
struct bch_fs * c = dio - > op . c ;
struct kiocb * req = dio - > req ;
struct bch_inode_info * inode = dio - > inode ;
struct bio * bio = & dio - > op . wbio . bio ;
2022-11-14 06:43:37 +03:00
req - > ki_pos + = ( u64 ) dio - > op . written < < 9 ;
dio - > written + = dio - > op . written ;
2022-11-01 03:30:27 +03:00
2022-11-14 06:43:37 +03:00
if ( dio - > extending ) {
spin_lock ( & inode - > v . i_lock ) ;
if ( req - > ki_pos > inode - > v . i_size )
i_size_write ( & inode - > v , req - > ki_pos ) ;
spin_unlock ( & inode - > v . i_lock ) ;
}
if ( dio - > op . i_sectors_delta | | dio - > quota_res . sectors ) {
mutex_lock ( & inode - > ei_quota_lock ) ;
__i_sectors_acct ( c , inode , & dio - > quota_res , dio - > op . i_sectors_delta ) ;
__bch2_quota_reservation_put ( c , inode , & dio - > quota_res ) ;
mutex_unlock ( & inode - > ei_quota_lock ) ;
}
2022-11-01 03:30:27 +03:00
bio_release_pages ( bio , false ) ;
if ( unlikely ( dio - > op . error ) )
set_bit ( EI_INODE_ERROR , & inode - > ei_flags ) ;
}
2022-11-25 07:52:28 +03:00
static __always_inline long bch2_dio_write_loop ( struct dio_write * dio )
2017-03-17 09:18:50 +03:00
{
2022-11-01 03:30:27 +03:00
struct bch_fs * c = dio - > op . c ;
2017-03-17 09:18:50 +03:00
struct kiocb * req = dio - > req ;
2022-11-01 03:30:27 +03:00
struct address_space * mapping = dio - > mapping ;
struct bch_inode_info * inode = dio - > inode ;
2022-11-24 04:14:55 +03:00
struct bch_io_opts opts ;
2019-10-09 19:50:39 +03:00
struct bio * bio = & dio - > op . wbio . bio ;
2020-11-11 20:33:12 +03:00
unsigned unaligned , iter_count ;
bool sync = dio - > sync , dropped_locks ;
2017-03-17 09:18:50 +03:00
long ret ;
2022-11-24 04:14:55 +03:00
bch2_inode_opts_get ( & opts , c , & inode - > ei_inode ) ;
2017-03-17 09:18:50 +03:00
while ( 1 ) {
2020-11-11 20:33:12 +03:00
iter_count = dio - > iter . count ;
2022-11-01 03:30:27 +03:00
EBUG_ON ( current - > faults_disabled_mapping ) ;
2017-03-17 09:18:50 +03:00
current - > faults_disabled_mapping = mapping ;
ret = bio_iov_iter_get_pages ( bio , & dio - > iter ) ;
2020-11-11 20:33:12 +03:00
dropped_locks = fdm_dropped_locks ( ) ;
2017-03-17 09:18:50 +03:00
current - > faults_disabled_mapping = NULL ;
2020-11-11 20:33:12 +03:00
/*
* If the fault handler returned an error but also signalled
* that it dropped & retook ei_pagecache_lock , we just need to
* re - shoot down the page cache and retry :
*/
if ( dropped_locks & & ret )
ret = 0 ;
2017-03-17 09:18:50 +03:00
if ( unlikely ( ret < 0 ) )
goto err ;
2020-11-11 20:33:12 +03:00
if ( unlikely ( dropped_locks ) ) {
ret = write_invalidate_inode_pages_range ( mapping ,
req - > ki_pos ,
req - > ki_pos + iter_count - 1 ) ;
if ( unlikely ( ret ) )
goto err ;
if ( ! bio - > bi_iter . bi_size )
continue ;
}
2019-09-22 22:02:05 +03:00
unaligned = bio - > bi_iter . bi_size & ( block_bytes ( c ) - 1 ) ;
bio - > bi_iter . bi_size - = unaligned ;
iov_iter_revert ( & dio - > iter , unaligned ) ;
if ( ! bio - > bi_iter . bi_size ) {
/*
* bio_iov_iter_get_pages was only able to get <
* blocksize worth of pages :
*/
ret = - EFAULT ;
goto err ;
}
2022-11-24 04:14:55 +03:00
bch2_write_op_init ( & dio - > op , c , opts ) ;
2022-11-01 03:30:27 +03:00
dio - > op . end_io = sync
? NULL
: bch2_dio_write_loop_async ;
2020-06-30 01:22:06 +03:00
dio - > op . target = dio - > op . opts . foreground_target ;
dio - > op . write_point = writepoint_hashed ( ( unsigned long ) current ) ;
dio - > op . nr_replicas = dio - > op . opts . data_replicas ;
2021-03-13 04:30:39 +03:00
dio - > op . subvol = inode - > ei_subvol ;
2020-06-30 01:22:06 +03:00
dio - > op . pos = POS ( inode - > v . i_ino , ( u64 ) req - > ki_pos > > 9 ) ;
2022-11-03 00:12:00 +03:00
dio - > op . devs_need_flush = & inode - > ei_devs_need_flush ;
2020-06-30 01:22:06 +03:00
2022-10-29 22:54:17 +03:00
if ( sync )
dio - > op . flags | = BCH_WRITE_SYNC ;
2021-05-20 22:49:23 +03:00
dio - > op . flags | = BCH_WRITE_CHECK_ENOSPC ;
2020-06-30 01:22:06 +03:00
2022-11-14 06:43:37 +03:00
ret = bch2_quota_reservation_add ( c , inode , & dio - > quota_res ,
bio_sectors ( bio ) , true ) ;
if ( unlikely ( ret ) )
goto err ;
2020-06-30 01:22:06 +03:00
ret = bch2_disk_reservation_get ( c , & dio - > op . res , bio_sectors ( bio ) ,
dio - > op . opts . data_replicas , 0 ) ;
if ( unlikely ( ret ) & &
2022-11-01 03:30:27 +03:00
! bch2_dio_write_check_allocated ( dio ) )
2020-06-30 01:22:06 +03:00
goto err ;
2017-03-17 09:18:50 +03:00
task_io_account_write ( bio - > bi_iter . bi_size ) ;
2022-11-01 03:30:27 +03:00
if ( unlikely ( dio - > iter . count ) & &
! dio - > sync & &
! dio - > loop & &
bch2_dio_write_copy_iov ( dio ) )
dio - > sync = sync = true ;
2017-03-17 09:18:50 +03:00
dio - > loop = true ;
2019-11-02 04:16:51 +03:00
closure_call ( & dio - > op . cl , bch2_write , NULL , NULL ) ;
2017-03-17 09:18:50 +03:00
2022-11-01 03:30:27 +03:00
if ( ! sync )
2017-03-17 09:18:50 +03:00
return - EIOCBQUEUED ;
2019-10-09 19:50:39 +03:00
2022-11-01 03:30:27 +03:00
bch2_dio_write_end ( dio ) ;
2019-10-09 19:50:39 +03:00
2022-11-01 03:30:27 +03:00
if ( likely ( ! dio - > iter . count ) | | dio - > op . error )
2017-03-17 09:18:50 +03:00
break ;
2019-11-02 04:16:51 +03:00
2017-03-17 09:18:50 +03:00
bio_reset ( bio , NULL , REQ_OP_WRITE ) ;
}
2022-11-01 03:30:27 +03:00
out :
return bch2_dio_write_done ( dio ) ;
2017-03-17 09:18:50 +03:00
err :
2022-11-01 03:30:27 +03:00
dio - > op . error = ret ;
2017-03-17 09:18:50 +03:00
2021-07-14 07:14:45 +03:00
bio_release_pages ( bio , false ) ;
2022-11-14 06:43:37 +03:00
bch2_quota_reservation_put ( c , inode , & dio - > quota_res ) ;
2022-11-01 03:30:27 +03:00
goto out ;
2017-03-17 09:18:50 +03:00
}
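/*
 * Continue an asynchronous direct write that still has data left to
 * submit: reset the bio and temporarily adopt the submitter's mm (we
 * are called from the write completion path, not the submitting task)
 * so the next loop iteration can pin more user pages.
 */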
2022-11-25 07:52:28 +03:00
static noinline __cold void bch2_dio_write_continue ( struct dio_write * dio )
2017-03-17 09:18:50 +03:00
{
2022-11-01 03:30:27 +03:00
struct mm_struct * mm = dio - > mm ;
bio_reset ( & dio - > op . wbio . bio , NULL , REQ_OP_WRITE ) ;
2017-03-17 09:18:50 +03:00
2022-11-01 03:30:27 +03:00
if ( mm )
kthread_use_mm ( mm ) ;
bch2_dio_write_loop ( dio ) ;
if ( mm )
kthread_unuse_mm ( mm ) ;
2017-03-17 09:18:50 +03:00
}
2022-11-25 07:52:28 +03:00
static void bch2_dio_write_loop_async ( struct bch_write_op * op )
{
struct dio_write * dio = container_of ( op , struct dio_write , op ) ;
bch2_dio_write_end ( dio ) ;
if ( likely ( ! dio - > iter . count ) | | dio - > op . error )
bch2_dio_write_done ( dio ) ;
else
bch2_dio_write_continue ( dio ) ;
}
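/*
 * O_DIRECT write entry point. The write must be block aligned - e.g.
 * with a 4096 byte block size, an iocb at offset 512 or with a 2048
 * byte count is rejected. Extending writes stay synchronous and hold
 * the inode lock throughout; non-extending writes drop it early so
 * they can proceed concurrently.
 */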
2017-03-17 09:18:50 +03:00
static noinline
ssize_t bch2_direct_write ( struct kiocb * req , struct iov_iter * iter )
{
struct file * file = req - > ki_filp ;
2019-11-04 22:11:53 +03:00
struct address_space * mapping = file - > f_mapping ;
2017-03-17 09:18:50 +03:00
struct bch_inode_info * inode = file_bch_inode ( file ) ;
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
struct dio_write * dio ;
struct bio * bio ;
2019-11-02 04:35:25 +03:00
bool locked = true , extending ;
2017-03-17 09:18:50 +03:00
ssize_t ret ;
2019-11-02 04:35:25 +03:00
prefetch ( & c - > opts ) ;
prefetch ( ( void * ) & c - > opts + 64 ) ;
prefetch ( & inode - > ei_inode ) ;
prefetch ( ( void * ) & inode - > ei_inode + 64 ) ;
2017-03-17 09:18:50 +03:00
2019-11-02 04:35:25 +03:00
inode_lock ( & inode - > v ) ;
ret = generic_write_checks ( req , iter ) ;
if ( unlikely ( ret < = 0 ) )
goto err ;
ret = file_remove_privs ( file ) ;
if ( unlikely ( ret ) )
goto err ;
ret = file_update_time ( file ) ;
if ( unlikely ( ret ) )
goto err ;
2017-03-17 09:18:50 +03:00
2019-01-20 02:12:24 +03:00
if ( unlikely ( ( req - > ki_pos | iter - > count ) & ( block_bytes ( c ) - 1 ) ) ) {
ret = - EINVAL ;
2019-11-02 04:35:25 +03:00
goto err ;
}
inode_dio_begin ( & inode - > v ) ;
2022-11-04 20:25:57 +03:00
bch2_pagecache_block_get ( inode ) ;
2019-11-02 04:35:25 +03:00
extending = req - > ki_pos + iter - > count > inode - > v . i_size ;
if ( ! extending ) {
inode_unlock ( & inode - > v ) ;
locked = false ;
}
2017-03-17 09:18:50 +03:00
bio = bio_alloc_bioset ( NULL ,
2022-03-08 21:52:58 +03:00
bio_iov_vecs_to_alloc ( iter , BIO_MAX_VECS ) ,
2017-03-17 09:18:50 +03:00
REQ_OP_WRITE ,
GFP_KERNEL ,
& c - > dio_write_bioset ) ;
2019-10-09 19:50:39 +03:00
dio = container_of ( bio , struct dio_write , op . wbio . bio ) ;
2017-03-17 09:18:50 +03:00
dio - > req = req ;
2022-11-01 03:30:27 +03:00
dio - > mapping = mapping ;
dio - > inode = inode ;
2019-01-14 05:36:14 +03:00
dio - > mm = current - > mm ;
2017-03-17 09:18:50 +03:00
dio - > loop = false ;
2022-11-14 06:43:37 +03:00
dio - > extending = extending ;
2019-11-02 04:35:25 +03:00
dio - > sync = is_sync_kiocb ( req ) | | extending ;
2022-11-03 07:29:43 +03:00
dio - > flush = iocb_is_dsync ( req ) & & ! c - > opts . journal_flush_disabled ;
2017-03-17 09:18:50 +03:00
dio - > free_iov = false ;
dio - > quota_res . sectors = 0 ;
2020-06-30 01:22:06 +03:00
dio - > written = 0 ;
2017-03-17 09:18:50 +03:00
dio - > iter = * iter ;
2022-11-01 03:30:27 +03:00
dio - > op . c = c ;
2019-10-09 19:50:39 +03:00
2022-11-02 23:45:28 +03:00
if ( unlikely ( mapping - > nrpages ) ) {
ret = write_invalidate_inode_pages_range ( mapping ,
req - > ki_pos ,
req - > ki_pos + iter - > count - 1 ) ;
if ( unlikely ( ret ) )
goto err_put_bio ;
}
2019-11-04 22:11:53 +03:00
2019-11-02 04:35:25 +03:00
ret = bch2_dio_write_loop ( dio ) ;
2017-03-17 09:18:50 +03:00
err :
2019-11-02 04:35:25 +03:00
if ( locked )
inode_unlock ( & inode - > v ) ;
return ret ;
err_put_bio :
2022-11-04 20:25:57 +03:00
bch2_pagecache_block_put ( inode ) ;
2017-03-17 09:18:50 +03:00
bio_put ( bio ) ;
2019-11-02 04:35:25 +03:00
inode_dio_end ( & inode - > v ) ;
goto err ;
2017-03-17 09:18:50 +03:00
}
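/*
 * ->write_iter: O_DIRECT writes are handled entirely by
 * bch2_direct_write(); buffered writes take the inode lock, do the
 * usual permission/timestamp updates, and go through
 * bch2_buffered_write() before a possible generic_write_sync().
 */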
2019-11-02 04:35:25 +03:00
ssize_t bch2_write_iter ( struct kiocb * iocb , struct iov_iter * from )
2017-03-17 09:18:50 +03:00
{
struct file * file = iocb - > ki_filp ;
2019-11-02 04:35:25 +03:00
struct bch_inode_info * inode = file_bch_inode ( file ) ;
2017-03-17 09:18:50 +03:00
ssize_t ret ;
2022-09-18 22:43:50 +03:00
if ( iocb - > ki_flags & IOCB_DIRECT ) {
ret = bch2_direct_write ( iocb , from ) ;
goto out ;
}
2017-03-17 09:18:50 +03:00
2019-11-02 04:35:25 +03:00
inode_lock ( & inode - > v ) ;
ret = generic_write_checks ( iocb , from ) ;
if ( ret < = 0 )
goto unlock ;
2017-03-17 09:18:50 +03:00
ret = file_remove_privs ( file ) ;
if ( ret )
2019-11-02 04:35:25 +03:00
goto unlock ;
2017-03-17 09:18:50 +03:00
ret = file_update_time ( file ) ;
if ( ret )
2019-11-02 04:35:25 +03:00
goto unlock ;
2017-03-17 09:18:50 +03:00
2019-11-02 04:35:25 +03:00
ret = bch2_buffered_write ( iocb , from ) ;
2017-03-17 09:18:50 +03:00
if ( likely ( ret > 0 ) )
iocb - > ki_pos + = ret ;
2019-11-02 04:35:25 +03:00
unlock :
2017-03-17 09:18:50 +03:00
inode_unlock ( & inode - > v ) ;
2019-11-02 04:35:25 +03:00
if ( ret > 0 )
2017-03-17 09:18:50 +03:00
ret = generic_write_sync ( iocb , ret ) ;
2022-09-18 22:43:50 +03:00
out :
return bch2_err_class ( ret ) ;
2017-03-17 09:18:50 +03:00
}
/* fsync: */
2021-11-05 22:17:13 +03:00
/*
* inode - > ei_inode . bi_journal_seq won ' t be up to date since it ' s set in an
* insert trigger : look up the btree inode instead
*/
2022-11-03 00:12:00 +03:00
static int bch2_flush_inode ( struct bch_fs * c ,
struct bch_inode_info * inode )
2017-03-17 09:18:50 +03:00
{
2022-11-03 00:12:00 +03:00
struct bch_inode_unpacked u ;
2021-11-05 22:17:13 +03:00
int ret ;
2017-03-17 09:18:50 +03:00
2021-11-05 22:17:13 +03:00
if ( c - > opts . journal_flush_disabled )
return 0 ;
2022-11-03 00:12:00 +03:00
ret = bch2_inode_find_by_inum ( c , inode_inum ( inode ) , & u ) ;
2017-03-17 09:18:50 +03:00
if ( ret )
return ret ;
2022-11-03 00:12:00 +03:00
return bch2_journal_flush_seq ( & c - > journal , u . bi_journal_seq ) ? :
bch2_inode_flush_nocow_writes ( c , inode ) ;
2021-11-05 22:17:13 +03:00
}
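/*
 * fsync: flush dirty pagecache, sync VFS inode metadata, then flush the
 * journal up to this inode's last journal sequence number (plus any
 * device writeback caches dirtied by nocow writes). All three error
 * codes are reported; the first one wins.
 */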
2017-03-17 09:18:50 +03:00
2021-11-05 22:17:13 +03:00
int bch2_fsync ( struct file * file , loff_t start , loff_t end , int datasync )
{
struct bch_inode_info * inode = file_bch_inode ( file ) ;
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
int ret , ret2 , ret3 ;
ret = file_write_and_wait_range ( file , start , end ) ;
ret2 = sync_inode_metadata ( & inode - > v , 1 ) ;
2022-11-03 00:12:00 +03:00
ret3 = bch2_flush_inode ( c , inode ) ;
2018-07-23 14:53:29 +03:00
2022-09-18 22:43:50 +03:00
return bch2_err_class ( ret ? : ret2 ? : ret3 ) ;
2017-03-17 09:18:50 +03:00
}
/* truncate: */
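/*
 * Returns 1 if any extent in [start, end) of the given subvolume
 * contains written data, 0 if the range is entirely holes or unwritten
 * extents, or a negative error code.
 */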
2021-03-16 07:28:17 +03:00
static inline int range_has_data ( struct bch_fs * c , u32 subvol ,
struct bpos start ,
struct bpos end )
2017-03-17 09:18:50 +03:00
{
2019-03-25 22:10:15 +03:00
struct btree_trans trans ;
2021-08-30 22:18:31 +03:00
struct btree_iter iter ;
2017-03-17 09:18:50 +03:00
struct bkey_s_c k ;
int ret = 0 ;
2019-05-15 17:54:43 +03:00
bch2_trans_init ( & trans , c , 0 , 0 ) ;
2021-03-16 07:28:17 +03:00
retry :
bch2_trans_begin ( & trans ) ;
ret = bch2_subvolume_get_snapshot ( & trans , subvol , & start . snapshot ) ;
if ( ret )
goto err ;
2019-03-25 22:10:15 +03:00
2022-10-11 11:32:41 +03:00
for_each_btree_key_upto_norestart ( & trans , iter , BTREE_ID_extents , start , end , 0 , k , ret )
2023-03-29 16:49:04 +03:00
if ( bkey_extent_is_data ( k . k ) & & ! bkey_extent_is_unwritten ( k ) ) {
2017-03-17 09:18:50 +03:00
ret = 1 ;
break ;
}
2021-03-16 07:28:17 +03:00
start = iter . pos ;
2021-08-30 22:18:31 +03:00
bch2_trans_iter_exit ( & trans , & iter ) ;
2021-03-16 07:28:17 +03:00
err :
2022-07-18 06:06:38 +03:00
if ( bch2_err_matches ( ret , BCH_ERR_transaction_restart ) )
2021-03-16 07:28:17 +03:00
goto retry ;
2017-03-17 09:18:50 +03:00
2021-10-19 22:08:00 +03:00
bch2_trans_exit ( & trans ) ;
return ret ;
2017-03-17 09:18:50 +03:00
}
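/*
 * Zero the part of a folio covered by [start, end): the fully covered
 * blocks are marked unallocated (dropping their i_sectors accounting),
 * the byte range is zeroed, and the folio is redirtied. Returns 1 if
 * writeback of this folio will handle any needed i_size update, 0 if
 * the caller is responsible for it, or a negative error code.
 */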
2023-03-20 01:03:22 +03:00
static int __bch2_truncate_folio ( struct bch_inode_info * inode ,
pgoff_t index , loff_t start , loff_t end )
2017-03-17 09:18:50 +03:00
{
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
struct address_space * mapping = inode - > v . i_mapping ;
2023-03-17 19:53:15 +03:00
struct bch_folio * s ;
2017-03-17 09:18:50 +03:00
unsigned start_offset = start & ( PAGE_SIZE - 1 ) ;
unsigned end_offset = ( ( end - 1 ) & ( PAGE_SIZE - 1 ) ) + 1 ;
2019-08-06 18:19:58 +03:00
unsigned i ;
2023-03-17 21:55:53 +03:00
struct folio * folio ;
2021-11-22 20:47:20 +03:00
s64 i_sectors_delta = 0 ;
2017-03-17 09:18:50 +03:00
int ret = 0 ;
2023-03-29 17:43:23 +03:00
u64 end_pos ;
2017-03-17 09:18:50 +03:00
2023-03-17 21:55:53 +03:00
folio = filemap_lock_folio ( mapping , index ) ;
2023-06-21 07:31:49 +03:00
if ( IS_ERR_OR_NULL ( folio ) ) {
2017-03-17 09:18:50 +03:00
/*
* XXX : we ' re doing two index lookups when we end up reading the
2023-03-17 21:55:53 +03:00
* folio
2017-03-17 09:18:50 +03:00
*/
2021-03-16 07:28:17 +03:00
ret = range_has_data ( c , inode - > ei_subvol ,
2022-10-11 11:32:41 +03:00
POS ( inode - > v . i_ino , ( index < < PAGE_SECTORS_SHIFT ) ) ,
POS ( inode - > v . i_ino , ( index < < PAGE_SECTORS_SHIFT ) + PAGE_SECTORS ) ) ;
2017-03-17 09:18:50 +03:00
if ( ret < = 0 )
return ret ;
2023-03-17 21:55:53 +03:00
folio = __filemap_get_folio ( mapping , index ,
FGP_LOCK | FGP_CREAT , GFP_KERNEL ) ;
2023-06-21 07:31:49 +03:00
if ( unlikely ( IS_ERR_OR_NULL ( folio ) ) ) {
2017-03-17 09:18:50 +03:00
ret = - ENOMEM ;
goto out ;
}
}
2023-03-20 01:03:22 +03:00
BUG_ON ( start > = folio_end_pos ( folio ) ) ;
BUG_ON ( end < = folio_pos ( folio ) ) ;
start_offset = max ( start , folio_pos ( folio ) ) - folio_pos ( folio ) ;
2023-03-29 17:43:23 +03:00
end_offset = min_t ( u64 , end , folio_end_pos ( folio ) ) - folio_pos ( folio ) ;
2023-03-20 01:03:22 +03:00
/* Folio boundary? Nothing to do */
if ( start_offset = = 0 & &
end_offset = = folio_size ( folio ) ) {
ret = 0 ;
goto unlock ;
}
2023-03-17 21:55:53 +03:00
s = bch2_folio_create ( folio , 0 ) ;
2019-08-06 18:19:58 +03:00
if ( ! s ) {
ret = - ENOMEM ;
goto unlock ;
}
2023-03-17 21:55:53 +03:00
if ( ! folio_test_uptodate ( folio ) ) {
ret = bch2_read_single_folio ( folio , mapping ) ;
2017-03-17 09:18:50 +03:00
if ( ret )
goto unlock ;
}
2023-05-28 07:35:35 +03:00
ret = bch2_folio_set ( c , inode_inum ( inode ) , & folio , 1 ) ;
if ( ret )
goto unlock ;
2023-03-27 22:16:24 +03:00
2019-08-06 18:19:58 +03:00
for ( i = round_up ( start_offset , block_bytes ( c ) ) > > 9 ;
i < round_down ( end_offset , block_bytes ( c ) ) > > 9 ;
i + + ) {
s - > s [ i ] . nr_replicas = 0 ;
2023-03-23 18:08:04 +03:00
i_sectors_delta - = s - > s [ i ] . state = = SECTOR_dirty ;
folio_sector_set ( folio , s , i , SECTOR_unallocated ) ;
2019-08-06 18:19:58 +03:00
}
2021-11-22 20:47:20 +03:00
i_sectors_acct ( c , inode , NULL , i_sectors_delta ) ;
2021-11-06 20:39:42 +03:00
/*
2023-03-17 21:55:53 +03:00
* Caller needs to know whether this folio will be written out by
2021-11-06 20:39:42 +03:00
* writeback - doing an i_size update if necessary - or whether it will
2023-03-29 16:49:04 +03:00
* be responsible for the i_size update .
*
* Note that we shouldn ' t ever see a folio beyond EOF , but check and
* warn if so . This has been observed when folios aren ' t cleaned up
* after a short write , and there ' s still a chance reclaim will fix
* things up .
2021-11-06 20:39:42 +03:00
*/
2023-03-29 16:49:04 +03:00
WARN_ON_ONCE ( folio_pos ( folio ) > = inode - > v . i_size ) ;
end_pos = folio_end_pos ( folio ) ;
if ( inode - > v . i_size > folio_pos ( folio ) )
2023-03-29 17:43:23 +03:00
end_pos = min_t ( u64 , inode - > v . i_size , end_pos ) ;
2023-04-03 15:17:26 +03:00
ret = s - > s [ folio_pos_to_s ( folio , end_pos - 1 ) ] . state > = SECTOR_dirty ;
2021-11-06 20:39:42 +03:00
2023-03-17 21:55:53 +03:00
folio_zero_segment ( folio , start_offset , end_offset ) ;
2019-08-06 18:19:58 +03:00
2017-03-17 09:18:50 +03:00
/*
* Bit of a hack - we don ' t want truncate to fail due to - ENOSPC .
*
2023-03-17 21:55:53 +03:00
* XXX : because we aren ' t currently tracking whether the folio has actual
2017-03-17 09:18:50 +03:00
* data in it ( vs . just 0 s , or only partially written ) this is wrong . Ick .
*/
2023-03-17 21:55:53 +03:00
BUG_ON ( bch2_get_folio_disk_reservation ( c , inode , folio , false ) ) ;
2017-03-17 09:18:50 +03:00
2020-10-09 07:09:20 +03:00
/*
* This removes any writeable userspace mappings ; we need to force
* . page_mkwrite to be called again before any mmapped writes , to
* redirty the full page :
*/
2023-03-17 21:55:53 +03:00
folio_mkclean ( folio ) ;
filemap_dirty_folio ( mapping , folio ) ;
2017-03-17 09:18:50 +03:00
unlock :
2023-03-17 21:55:53 +03:00
folio_unlock ( folio ) ;
folio_put ( folio ) ;
2017-03-17 09:18:50 +03:00
out :
return ret ;
}
2023-03-20 01:03:22 +03:00
static int bch2_truncate_folio ( struct bch_inode_info * inode , loff_t from )
2017-03-17 09:18:50 +03:00
{
2023-03-20 01:03:22 +03:00
return __bch2_truncate_folio ( inode , from > > PAGE_SHIFT ,
from , ANYSINT_MAX ( loff_t ) ) ;
2017-03-17 09:18:50 +03:00
}
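/*
 * Zero the partial folios at either end of [start, end); the return
 * value is that of the folio containing ( end - 1 ) - see
 * __bch2_truncate_folio().
 */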
2023-03-20 01:03:22 +03:00
static int bch2_truncate_folios ( struct bch_inode_info * inode ,
loff_t start , loff_t end )
2021-11-06 20:39:42 +03:00
{
2023-03-20 01:03:22 +03:00
int ret = __bch2_truncate_folio ( inode , start > > PAGE_SHIFT ,
start , end ) ;
2021-11-06 20:39:42 +03:00
if ( ret > = 0 & &
start > > PAGE_SHIFT ! = end > > PAGE_SHIFT )
2023-03-20 01:03:22 +03:00
ret = __bch2_truncate_folio ( inode ,
( end - 1 ) > > PAGE_SHIFT ,
start , end ) ;
2021-11-06 20:39:42 +03:00
return ret ;
}
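/*
 * Extending truncate: flush any dirty data above the old on-disk
 * i_size, then grow i_size and fall through to the non-size attribute
 * changes.
 */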
2021-06-15 05:29:54 +03:00
static int bch2_extend ( struct mnt_idmap * idmap ,
struct bch_inode_info * inode ,
2019-10-09 18:12:48 +03:00
struct bch_inode_unpacked * inode_u ,
struct iattr * iattr )
2017-03-17 09:18:50 +03:00
{
struct address_space * mapping = inode - > v . i_mapping ;
int ret ;
2019-10-09 18:12:48 +03:00
/*
* sync appends :
2019-10-09 19:11:00 +03:00
*
* this has to be done _before_ extending i_size :
2019-10-09 18:12:48 +03:00
*/
ret = filemap_write_and_wait_range ( mapping , inode_u - > bi_size , S64_MAX ) ;
2017-03-17 09:18:50 +03:00
if ( ret )
return ret ;
truncate_setsize ( & inode - > v , iattr - > ia_size ) ;
2021-06-15 05:29:54 +03:00
return bch2_setattr_nonsize ( idmap , inode , iattr ) ;
2017-03-17 09:18:50 +03:00
}
2018-08-09 04:09:31 +03:00
static int bch2_truncate_finish_fn ( struct bch_inode_info * inode ,
struct bch_inode_unpacked * bi ,
void * p )
{
bi - > bi_flags & = ~ BCH_INODE_I_SIZE_DIRTY ;
return 0 ;
}
static int bch2_truncate_start_fn ( struct bch_inode_info * inode ,
struct bch_inode_unpacked * bi , void * p )
{
u64 * new_i_size = p ;
bi - > bi_flags | = BCH_INODE_I_SIZE_DIRTY ;
bi - > bi_size = * new_i_size ;
return 0 ;
}
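/*
 * Truncate proper: for shrinking truncates the new size is first
 * written with BCH_INODE_I_SIZE_DIRTY set, then the pagecache is
 * truncated and extents past the new EOF are deleted, and the flag is
 * cleared only once that has succeeded.
 */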
2021-06-15 05:29:54 +03:00
int bch2_truncate ( struct mnt_idmap * idmap ,
struct bch_inode_info * inode , struct iattr * iattr )
2017-03-17 09:18:50 +03:00
{
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
struct address_space * mapping = inode - > v . i_mapping ;
2019-10-09 18:12:48 +03:00
struct bch_inode_unpacked inode_u ;
2018-08-09 04:09:31 +03:00
u64 new_i_size = iattr - > ia_size ;
2019-10-10 19:47:22 +03:00
s64 i_sectors_delta = 0 ;
2017-03-17 09:18:50 +03:00
int ret = 0 ;
2021-06-15 05:29:54 +03:00
/*
2021-06-28 03:54:34 +03:00
* If the truncate call will change the size of the file , the
* ctime and mtime should be updated . If the size will not change , we
* do not need to update them .
2021-06-15 05:29:54 +03:00
*/
2021-06-28 03:54:34 +03:00
if ( iattr - > ia_size ! = inode - > v . i_size ) {
if ( ! ( iattr - > ia_valid & ATTR_MTIME ) )
ktime_get_coarse_real_ts64 ( & iattr - > ia_mtime ) ;
if ( ! ( iattr - > ia_valid & ATTR_CTIME ) )
ktime_get_coarse_real_ts64 ( & iattr - > ia_ctime ) ;
iattr - > ia_valid | = ATTR_MTIME | ATTR_CTIME ;
}
2021-06-15 05:29:54 +03:00
2017-03-17 09:18:50 +03:00
inode_dio_wait ( & inode - > v ) ;
2022-11-04 20:25:57 +03:00
bch2_pagecache_block_get ( inode ) ;
2017-03-17 09:18:50 +03:00
2021-03-16 07:28:17 +03:00
ret = bch2_inode_find_by_inum ( c , inode_inum ( inode ) , & inode_u ) ;
2019-12-18 21:18:33 +03:00
if ( ret )
goto err ;
/*
* check this before next assertion ; on filesystem error our normal
* invariants are a bit broken ( truncate has to truncate the page cache
* before the inode ) .
*/
ret = bch2_journal_error ( & c - > journal ) ;
2019-10-09 18:12:48 +03:00
if ( ret )
goto err ;
2017-03-17 09:18:50 +03:00
2022-11-15 23:57:07 +03:00
WARN_ONCE ( ! test_bit ( EI_INODE_ERROR , & inode - > ei_flags ) & &
inode - > v . i_size < inode_u . bi_size ,
" truncate spotted in mem i_size < btree i_size: %llu < %llu \n " ,
( u64 ) inode - > v . i_size , inode_u . bi_size ) ;
2017-03-17 09:18:50 +03:00
2019-10-09 18:12:48 +03:00
if ( iattr - > ia_size > inode - > v . i_size ) {
2021-06-15 05:29:54 +03:00
ret = bch2_extend ( idmap , inode , & inode_u , iattr ) ;
2018-08-09 04:09:31 +03:00
goto err ;
2017-03-17 09:18:50 +03:00
}
2021-06-15 05:29:54 +03:00
iattr - > ia_valid & = ~ ATTR_SIZE ;
2023-03-20 01:03:22 +03:00
ret = bch2_truncate_folio ( inode , iattr - > ia_size ) ;
2021-11-06 20:39:42 +03:00
if ( unlikely ( ret < 0 ) )
2018-08-09 04:09:31 +03:00
goto err ;
2017-03-17 09:18:50 +03:00
2019-09-20 01:05:04 +03:00
/*
* When extending , we ' re going to write the new i_size to disk
* immediately so we need to flush anything above the current on disk
* i_size first :
*
* Also , when extending we need to flush the page that i_size currently
* straddles - if it ' s mapped to userspace , we need to ensure that
* userspace has to redirty it and call . mkwrite - > set_page_dirty
* again to allocate the part of the page that was extended .
*/
2019-10-09 18:12:48 +03:00
if ( iattr - > ia_size > inode_u . bi_size )
2017-03-17 09:18:50 +03:00
ret = filemap_write_and_wait_range ( mapping ,
2019-10-09 18:12:48 +03:00
inode_u . bi_size ,
2017-03-17 09:18:50 +03:00
iattr - > ia_size - 1 ) ;
else if ( iattr - > ia_size & ( PAGE_SIZE - 1 ) )
ret = filemap_write_and_wait_range ( mapping ,
round_down ( iattr - > ia_size , PAGE_SIZE ) ,
iattr - > ia_size - 1 ) ;
if ( ret )
2018-08-09 04:09:31 +03:00
goto err ;
2017-03-17 09:18:50 +03:00
2018-08-09 04:09:31 +03:00
mutex_lock ( & inode - > ei_update_lock ) ;
ret = bch2_write_inode ( c , inode , bch2_truncate_start_fn ,
& new_i_size , 0 ) ;
mutex_unlock ( & inode - > ei_update_lock ) ;
2017-03-17 09:18:50 +03:00
if ( unlikely ( ret ) )
2018-08-09 04:09:31 +03:00
goto err ;
2017-03-17 09:18:50 +03:00
truncate_setsize ( & inode - > v , iattr - > ia_size ) ;
2021-03-13 04:30:39 +03:00
ret = bch2_fpunch ( c , inode_inum ( inode ) ,
2019-08-06 18:19:58 +03:00
round_up ( iattr - > ia_size , block_bytes ( c ) ) > > 9 ,
2021-11-05 22:17:13 +03:00
U64_MAX , & i_sectors_delta ) ;
2019-10-10 19:47:22 +03:00
i_sectors_acct ( c , inode , NULL , i_sectors_delta ) ;
2022-04-16 23:06:59 +03:00
bch2_fs_inconsistent_on ( ! inode - > v . i_size & & inode - > v . i_blocks & &
! bch2_journal_error ( & c - > journal ) , c ,
" inode %lu truncated to 0 but i_blocks %llu (ondisk %lli) " ,
inode - > v . i_ino , ( u64 ) inode - > v . i_blocks ,
inode - > ei_inode . bi_sectors ) ;
2017-03-17 09:18:50 +03:00
if ( unlikely ( ret ) )
2018-08-09 04:09:31 +03:00
goto err ;
2017-03-17 09:18:50 +03:00
2018-08-09 04:09:31 +03:00
mutex_lock ( & inode - > ei_update_lock ) ;
2021-06-15 05:29:54 +03:00
ret = bch2_write_inode ( c , inode , bch2_truncate_finish_fn , NULL , 0 ) ;
2018-08-09 04:09:31 +03:00
mutex_unlock ( & inode - > ei_update_lock ) ;
2021-06-15 05:29:54 +03:00
ret = bch2_setattr_nonsize ( idmap , inode , iattr ) ;
2018-08-09 04:09:31 +03:00
err :
2022-11-04 20:25:57 +03:00
bch2_pagecache_block_put ( inode ) ;
2022-09-18 22:43:50 +03:00
return bch2_err_class ( ret ) ;
2017-03-17 09:18:50 +03:00
}
/* fallocate: */
2021-04-29 02:36:12 +03:00
static int inode_update_times_fn ( struct bch_inode_info * inode ,
struct bch_inode_unpacked * bi , void * p )
{
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
bi - > bi_mtime = bi - > bi_ctime = bch2_current_time ( c ) ;
return 0 ;
}
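/*
 * FALLOC_FL_PUNCH_HOLE: zero the partial folios at each end of the
 * range, drop the pagecache over it, delete the fully covered blocks
 * with bch2_fpunch(), then update the inode's ctime/mtime.
 */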
2019-10-10 19:47:22 +03:00
static long bchfs_fpunch ( struct bch_inode_info * inode , loff_t offset , loff_t len )
2017-03-17 09:18:50 +03:00
{
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
2021-11-06 20:39:42 +03:00
u64 end = offset + len ;
u64 block_start = round_up ( offset , block_bytes ( c ) ) ;
u64 block_end = round_down ( end , block_bytes ( c ) ) ;
bool truncated_last_page ;
2017-03-17 09:18:50 +03:00
int ret = 0 ;
2023-03-20 01:03:22 +03:00
ret = bch2_truncate_folios ( inode , offset , end ) ;
2021-11-06 20:39:42 +03:00
if ( unlikely ( ret < 0 ) )
2017-03-17 09:18:50 +03:00
goto err ;
2021-11-06 20:39:42 +03:00
truncated_last_page = ret ;
2017-03-17 09:18:50 +03:00
2021-11-06 20:39:42 +03:00
truncate_pagecache_range ( & inode - > v , offset , end - 1 ) ;
2017-03-17 09:18:50 +03:00
2022-10-20 01:31:33 +03:00
if ( block_start < block_end ) {
2019-10-10 19:47:22 +03:00
s64 i_sectors_delta = 0 ;
2021-03-13 04:30:39 +03:00
ret = bch2_fpunch ( c , inode_inum ( inode ) ,
2021-11-06 20:39:42 +03:00
block_start > > 9 , block_end > > 9 ,
2019-10-10 19:47:22 +03:00
& i_sectors_delta ) ;
i_sectors_acct ( c , inode , NULL , i_sectors_delta ) ;
}
2021-04-29 02:36:12 +03:00
mutex_lock ( & inode - > ei_update_lock ) ;
2021-11-06 20:39:42 +03:00
if ( end > = inode - > v . i_size & & ! truncated_last_page ) {
ret = bch2_write_inode_size ( c , inode , inode - > v . i_size ,
ATTR_MTIME | ATTR_CTIME ) ;
} else {
ret = bch2_write_inode ( c , inode , inode_update_times_fn , NULL ,
ATTR_MTIME | ATTR_CTIME ) ;
}
2021-04-29 02:36:12 +03:00
mutex_unlock ( & inode - > ei_update_lock ) ;
2017-03-17 09:18:50 +03:00
err :
return ret ;
}
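/*
 * FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE: shift every extent
 * past the target offset by +/- len, one btree transaction per extent
 * (copy to the new position, delete the original). Collapse punches the
 * range first and shrinks i_size afterwards; insert grows i_size up
 * front and walks extents backwards from EOF.
 */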
2019-10-10 19:47:22 +03:00
static long bchfs_fcollapse_finsert ( struct bch_inode_info * inode ,
2019-09-08 01:04:23 +03:00
loff_t offset , loff_t len ,
bool insert )
2017-03-17 09:18:50 +03:00
{
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
struct address_space * mapping = inode - > v . i_mapping ;
2020-12-17 23:08:58 +03:00
struct bkey_buf copy ;
2018-07-13 02:19:41 +03:00
struct btree_trans trans ;
2021-08-30 22:18:31 +03:00
struct btree_iter src , dst , del ;
2019-09-08 01:04:23 +03:00
loff_t shift , new_size ;
u64 src_start ;
2021-03-20 03:29:11 +03:00
int ret = 0 ;
2017-03-17 09:18:50 +03:00
if ( ( offset | len ) & ( block_bytes ( c ) - 1 ) )
return - EINVAL ;
2019-09-08 01:04:23 +03:00
if ( insert ) {
if ( inode - > v . i_sb - > s_maxbytes - inode - > v . i_size < len )
2021-11-06 20:39:42 +03:00
return - EFBIG ;
2017-03-17 09:18:50 +03:00
2019-09-08 01:04:23 +03:00
if ( offset > = inode - > v . i_size )
2021-11-06 20:39:42 +03:00
return - EINVAL ;
2017-03-17 09:18:50 +03:00
2019-09-08 01:04:23 +03:00
src_start = U64_MAX ;
shift = len ;
} else {
if ( offset + len > = inode - > v . i_size )
2021-11-06 20:39:42 +03:00
return - EINVAL ;
2017-03-17 09:18:50 +03:00
2019-09-08 01:04:23 +03:00
src_start = offset + len ;
shift = - len ;
}
new_size = inode - > v . i_size + shift ;
2017-03-17 09:18:50 +03:00
2019-09-08 01:04:23 +03:00
ret = write_invalidate_inode_pages_range ( mapping , offset , LLONG_MAX ) ;
2019-07-22 20:37:02 +03:00
if ( ret )
2021-11-06 20:39:42 +03:00
return ret ;
2019-07-22 20:37:02 +03:00
2019-09-08 01:04:23 +03:00
if ( insert ) {
i_size_write ( & inode - > v , new_size ) ;
mutex_lock ( & inode - > ei_update_lock ) ;
ret = bch2_write_inode_size ( c , inode , new_size ,
ATTR_MTIME | ATTR_CTIME ) ;
mutex_unlock ( & inode - > ei_update_lock ) ;
} else {
2019-10-10 19:47:22 +03:00
s64 i_sectors_delta = 0 ;
2021-03-13 04:30:39 +03:00
ret = bch2_fpunch ( c , inode_inum ( inode ) ,
2019-10-10 19:47:22 +03:00
offset > > 9 , ( offset + len ) > > 9 ,
& i_sectors_delta ) ;
i_sectors_acct ( c , inode , NULL , i_sectors_delta ) ;
2019-09-08 01:04:23 +03:00
if ( ret )
2021-11-06 20:39:42 +03:00
return ret ;
2019-09-08 01:04:23 +03:00
}
2018-08-12 00:26:11 +03:00
2021-03-20 03:29:11 +03:00
bch2_bkey_buf_init ( & copy ) ;
2021-06-03 06:31:42 +03:00
bch2_trans_init ( & trans , c , BTREE_ITER_MAX , 1024 ) ;
2021-08-30 22:18:31 +03:00
bch2_trans_iter_init ( & trans , & src , BTREE_ID_extents ,
2019-09-08 01:04:23 +03:00
POS ( inode - > v . i_ino , src_start > > 9 ) ,
2019-07-22 20:37:02 +03:00
BTREE_ITER_INTENT ) ;
2021-08-30 22:18:31 +03:00
bch2_trans_copy_iter ( & dst , & src ) ;
bch2_trans_copy_iter ( & del , & src ) ;
2019-09-08 01:04:23 +03:00
2022-07-18 06:06:38 +03:00
while ( ret = = 0 | |
bch2_err_matches ( ret , BCH_ERR_transaction_restart ) ) {
2019-07-22 20:37:02 +03:00
struct disk_reservation disk_res =
bch2_disk_reservation_init ( c , 0 ) ;
struct bkey_i delete ;
struct bkey_s_c k ;
struct bpos next_pos ;
2019-09-08 01:04:23 +03:00
struct bpos move_pos = POS ( inode - > v . i_ino , offset > > 9 ) ;
struct bpos atomic_end ;
2020-01-01 00:17:42 +03:00
unsigned trigger_flags = 0 ;
2021-03-16 07:28:17 +03:00
u32 snapshot ;
bch2_trans_begin ( & trans ) ;
ret = bch2_subvolume_get_snapshot ( & trans ,
inode - > ei_subvol , & snapshot ) ;
if ( ret )
continue ;
bch2_btree_iter_set_snapshot ( & src , snapshot ) ;
bch2_btree_iter_set_snapshot ( & dst , snapshot ) ;
bch2_btree_iter_set_snapshot ( & del , snapshot ) ;
2017-03-17 09:18:50 +03:00
2021-07-25 03:24:10 +03:00
bch2_trans_begin ( & trans ) ;
2019-09-08 01:04:23 +03:00
k = insert
2021-08-30 22:18:31 +03:00
? bch2_btree_iter_peek_prev ( & src )
2022-10-11 11:32:41 +03:00
: bch2_btree_iter_peek_upto ( & src , POS ( inode - > v . i_ino , U64_MAX ) ) ;
2019-07-22 20:37:02 +03:00
if ( ( ret = bkey_err ( k ) ) )
2021-03-20 03:29:11 +03:00
continue ;
2018-08-12 00:26:11 +03:00
2019-07-22 20:37:02 +03:00
if ( ! k . k | | k . k - > p . inode ! = inode - > v . i_ino )
break ;
2017-03-17 09:18:50 +03:00
2019-09-08 01:04:23 +03:00
if ( insert & &
2022-11-24 11:12:22 +03:00
bkey_le ( k . k - > p , POS ( inode - > v . i_ino , offset > > 9 ) ) )
2019-09-08 01:04:23 +03:00
break ;
reassemble :
2020-12-17 23:08:58 +03:00
bch2_bkey_buf_reassemble ( & copy , c , k ) ;
2019-09-08 01:04:23 +03:00
if ( insert & &
2022-11-24 11:12:22 +03:00
bkey_lt ( bkey_start_pos ( k . k ) , move_pos ) )
2019-11-10 00:01:15 +03:00
bch2_cut_front ( move_pos , copy . k ) ;
2017-03-17 09:18:50 +03:00
2019-11-10 00:01:15 +03:00
copy . k - > k . p . offset + = shift > > 9 ;
2021-08-30 22:18:31 +03:00
bch2_btree_iter_set_pos ( & dst , bkey_start_pos ( & copy . k - > k ) ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
ret = bch2_extent_atomic_end ( & trans , & dst , copy . k , & atomic_end ) ;
2019-08-16 16:58:07 +03:00
if ( ret )
2021-03-20 03:29:11 +03:00
continue ;
2018-08-06 00:46:41 +03:00
2022-11-24 11:12:22 +03:00
if ( ! bkey_eq ( atomic_end , copy . k - > k . p ) ) {
2019-09-08 01:04:23 +03:00
if ( insert ) {
move_pos = atomic_end ;
move_pos . offset - = shift > > 9 ;
goto reassemble ;
} else {
2019-11-10 03:02:48 +03:00
bch2_cut_back ( atomic_end , copy . k ) ;
2019-09-08 01:04:23 +03:00
}
}
2019-07-22 20:37:02 +03:00
bkey_init ( & delete . k ) ;
2020-04-01 23:07:57 +03:00
delete . k . p = copy . k - > k . p ;
delete . k . size = copy . k - > k . size ;
delete . k . p . offset - = shift > > 9 ;
2021-08-30 22:18:31 +03:00
bch2_btree_iter_set_pos ( & del , bkey_start_pos ( & delete . k ) ) ;
2017-03-17 09:18:50 +03:00
2019-09-08 01:04:23 +03:00
next_pos = insert ? bkey_start_pos ( & delete . k ) : delete . k . p ;
2017-03-17 09:18:50 +03:00
2022-04-09 06:54:14 +03:00
if ( copy . k - > k . size ! = k . k - > size ) {
2019-07-22 20:37:02 +03:00
/* We might end up splitting compressed extents: */
unsigned nr_ptrs =
2019-11-17 00:25:58 +03:00
bch2_bkey_nr_ptrs_allocated ( bkey_i_to_s_c ( copy . k ) ) ;
2019-07-22 20:37:02 +03:00
ret = bch2_disk_reservation_get ( c , & disk_res ,
2019-11-10 00:01:15 +03:00
copy . k - > k . size , nr_ptrs ,
2019-07-22 20:37:02 +03:00
BCH_DISK_RESERVATION_NOFAIL ) ;
BUG_ON ( ret ) ;
}
2021-08-30 22:18:31 +03:00
ret = bch2_btree_iter_traverse ( & del ) ? :
bch2_trans_update ( & trans , & del , & delete , trigger_flags ) ? :
bch2_trans_update ( & trans , & dst , copy . k , trigger_flags ) ? :
2021-11-05 22:17:13 +03:00
bch2_trans_commit ( & trans , & disk_res , NULL ,
2020-01-01 03:37:10 +03:00
BTREE_INSERT_NOFAIL ) ;
2017-03-17 09:18:50 +03:00
bch2_disk_reservation_put ( c , & disk_res ) ;
2021-03-20 03:29:11 +03:00
2019-07-22 20:37:02 +03:00
if ( ! ret )
2021-08-30 22:18:31 +03:00
bch2_btree_iter_set_pos ( & src , next_pos ) ;
2017-03-17 09:18:50 +03:00
}
2021-08-30 22:18:31 +03:00
bch2_trans_iter_exit ( & trans , & del ) ;
bch2_trans_iter_exit ( & trans , & dst ) ;
bch2_trans_iter_exit ( & trans , & src ) ;
2021-03-20 03:29:11 +03:00
bch2_trans_exit ( & trans ) ;
bch2_bkey_buf_exit ( & copy , c ) ;
if ( ret )
2021-11-06 20:39:42 +03:00
return ret ;
2017-03-17 09:18:50 +03:00
2021-11-06 20:39:42 +03:00
mutex_lock ( & inode - > ei_update_lock ) ;
2019-09-08 01:04:23 +03:00
if ( ! insert ) {
i_size_write ( & inode - > v , new_size ) ;
ret = bch2_write_inode_size ( c , inode , new_size ,
ATTR_MTIME | ATTR_CTIME ) ;
2021-11-06 20:39:42 +03:00
} else {
/* We need an inode update to update bi_journal_seq for fsync: */
ret = bch2_write_inode ( c , inode , inode_update_times_fn , NULL ,
ATTR_MTIME | ATTR_CTIME ) ;
2019-09-08 01:04:23 +03:00
}
2021-11-06 20:39:42 +03:00
mutex_unlock ( & inode - > ei_update_lock ) ;
2017-03-17 09:18:50 +03:00
return ret ;
}
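/*
 * Core of fallocate: walk the extent btree over [start_sector,
 * end_sector), skip slots that already have a sufficient reservation
 * (and, outside of ZERO_RANGE mode, existing data), and fill the rest
 * via bch2_extent_fallocate(), taking quota reservations for sectors
 * that weren't previously allocated.
 */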
2021-04-17 03:35:20 +03:00
static int __bchfs_fallocate ( struct bch_inode_info * inode , int mode ,
u64 start_sector , u64 end_sector )
2017-03-17 09:18:50 +03:00
{
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
2018-08-06 00:48:00 +03:00
struct btree_trans trans ;
2021-08-30 22:18:31 +03:00
struct btree_iter iter ;
2021-04-17 03:35:20 +03:00
struct bpos end_pos = POS ( inode - > v . i_ino , end_sector ) ;
2022-11-24 04:14:55 +03:00
struct bch_io_opts opts ;
2021-04-17 03:35:20 +03:00
int ret = 0 ;
2017-03-17 09:18:50 +03:00
2022-11-24 04:14:55 +03:00
bch2_inode_opts_get ( & opts , c , & inode - > ei_inode ) ;
2021-06-03 06:31:42 +03:00
bch2_trans_init ( & trans , c , BTREE_ITER_MAX , 512 ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
bch2_trans_iter_init ( & trans , & iter , BTREE_ID_extents ,
2021-04-17 03:35:20 +03:00
POS ( inode - > v . i_ino , start_sector ) ,
2018-08-06 00:48:00 +03:00
BTREE_ITER_SLOTS | BTREE_ITER_INTENT ) ;
2017-03-17 09:18:50 +03:00
2022-11-24 11:12:22 +03:00
while ( ! ret & & bkey_lt ( iter . pos , end_pos ) ) {
2019-10-10 19:47:22 +03:00
s64 i_sectors_delta = 0 ;
2018-08-06 00:48:00 +03:00
struct quota_res quota_res = { 0 } ;
2017-03-17 09:18:50 +03:00
struct bkey_s_c k ;
2021-04-17 03:35:20 +03:00
unsigned sectors ;
2021-03-16 07:28:17 +03:00
u32 snapshot ;
2017-03-17 09:18:50 +03:00
2020-02-26 23:39:46 +03:00
bch2_trans_begin ( & trans ) ;
2019-12-21 00:35:24 +03:00
2021-03-16 07:28:17 +03:00
ret = bch2_subvolume_get_snapshot ( & trans ,
inode - > ei_subvol , & snapshot ) ;
if ( ret )
goto bkey_err ;
bch2_btree_iter_set_snapshot ( & iter , snapshot ) ;
2021-08-30 22:18:31 +03:00
k = bch2_btree_iter_peek_slot ( & iter ) ;
2019-03-28 05:03:30 +03:00
if ( ( ret = bkey_err ( k ) ) )
goto bkey_err ;
2017-03-17 09:18:50 +03:00
/* already reserved */
2022-11-14 02:59:01 +03:00
if ( bkey_extent_is_reservation ( k ) & &
bch2_bkey_nr_ptrs_fully_allocated ( k ) > = opts . data_replicas ) {
2021-08-30 22:18:31 +03:00
bch2_btree_iter_advance ( & iter ) ;
2017-03-17 09:18:50 +03:00
continue ;
}
2018-08-06 00:48:00 +03:00
if ( bkey_extent_is_data ( k . k ) & &
! ( mode & FALLOC_FL_ZERO_RANGE ) ) {
2021-08-30 22:18:31 +03:00
bch2_btree_iter_advance ( & iter ) ;
2018-08-06 00:48:00 +03:00
continue ;
2017-03-17 09:18:50 +03:00
}
2022-11-03 00:12:00 +03:00
/*
* XXX : for nocow mode , we should promote shared extents to
* unshared here
*/
2022-11-14 02:54:37 +03:00
sectors = bpos_min ( k . k - > p , end_pos ) . offset - iter . pos . offset ;
2017-03-17 09:18:50 +03:00
if ( ! bkey_extent_is_allocation ( k . k ) ) {
ret = bch2_quota_reservation_add ( c , inode ,
2018-08-06 00:48:00 +03:00
& quota_res ,
2017-03-17 09:18:50 +03:00
sectors , true ) ;
if ( unlikely ( ret ) )
2019-03-28 05:03:30 +03:00
goto bkey_err ;
2017-03-17 09:18:50 +03:00
}
2022-11-14 02:54:37 +03:00
ret = bch2_extent_fallocate ( & trans , inode_inum ( inode ) , & iter ,
sectors , opts , & i_sectors_delta ,
writepoint_hashed ( ( unsigned long ) current ) ) ;
2021-11-22 06:34:26 +03:00
if ( ret )
goto bkey_err ;
2022-11-14 02:54:37 +03:00
2019-10-10 19:47:22 +03:00
i_sectors_acct ( c , inode , & quota_res , i_sectors_delta ) ;
2019-03-28 05:03:30 +03:00
bkey_err :
2018-08-06 00:48:00 +03:00
bch2_quota_reservation_put ( c , inode , & quota_res ) ;
2022-07-18 06:06:38 +03:00
if ( bch2_err_matches ( ret , BCH_ERR_transaction_restart ) )
2017-03-17 09:18:50 +03:00
ret = 0 ;
}
2021-11-06 20:39:42 +03:00
2021-11-24 02:21:09 +03:00
bch2_trans_unlock ( & trans ) ; /* lock ordering, before taking pagecache locks: */
mark_pagecache_reserved ( inode , start_sector , iter . pos . offset ) ;
2022-09-19 00:10:33 +03:00
if ( bch2_err_matches ( ret , ENOSPC ) & & ( mode & FALLOC_FL_ZERO_RANGE ) ) {
2021-11-06 20:39:42 +03:00
struct quota_res quota_res = { 0 } ;
s64 i_sectors_delta = 0 ;
bch2_fpunch_at ( & trans , & iter , inode_inum ( inode ) ,
end_sector , & i_sectors_delta ) ;
i_sectors_acct ( c , inode , & quota_res , i_sectors_delta ) ;
bch2_quota_reservation_put ( c , inode , & quota_res ) ;
}
2021-08-30 22:18:31 +03:00
bch2_trans_iter_exit ( & trans , & iter ) ;
2021-04-17 03:35:20 +03:00
bch2_trans_exit ( & trans ) ;
return ret ;
}
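/*
 * FALLOC_FL_ZERO_RANGE and plain preallocation: for ZERO_RANGE, first
 * zero the partial folios and drop the pagecache; then allocate or
 * reserve the block-aligned range and, unless FALLOC_FL_KEEP_SIZE was
 * given, extend i_size.
 */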
2021-03-20 03:29:11 +03:00
2021-04-17 03:35:20 +03:00
static long bchfs_fallocate ( struct bch_inode_info * inode , int mode ,
loff_t offset , loff_t len )
{
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
2021-11-06 20:39:42 +03:00
u64 end = offset + len ;
u64 block_start = round_down ( offset , block_bytes ( c ) ) ;
u64 block_end = round_up ( end , block_bytes ( c ) ) ;
bool truncated_last_page = false ;
int ret , ret2 = 0 ;
2021-04-17 03:35:20 +03:00
if ( ! ( mode & FALLOC_FL_KEEP_SIZE ) & & end > inode - > v . i_size ) {
ret = inode_newsize_ok ( & inode - > v , end ) ;
if ( ret )
2021-11-06 20:39:42 +03:00
return ret ;
2021-04-17 03:35:20 +03:00
}
if ( mode & FALLOC_FL_ZERO_RANGE ) {
2023-03-20 01:03:22 +03:00
ret = bch2_truncate_folios ( inode , offset , end ) ;
2021-11-06 20:39:42 +03:00
if ( unlikely ( ret < 0 ) )
return ret ;
2021-04-17 03:35:20 +03:00
2021-11-06 20:39:42 +03:00
truncated_last_page = ret ;
2021-04-17 03:35:20 +03:00
truncate_pagecache_range ( & inode - > v , offset , end - 1 ) ;
2021-11-06 20:39:42 +03:00
block_start = round_up ( offset , block_bytes ( c ) ) ;
block_end = round_down ( end , block_bytes ( c ) ) ;
2021-04-17 03:35:20 +03:00
}
ret = __bchfs_fallocate ( inode , mode , block_start > > 9 , block_end > > 9 ) ;
2017-03-17 09:18:50 +03:00
2019-10-09 18:12:48 +03:00
/*
2021-11-06 20:39:42 +03:00
* On - ENOSPC in ZERO_RANGE mode , we still want to do the inode update ,
* so that the VFS cache i_size is consistent with the btree i_size :
2019-10-09 18:12:48 +03:00
*/
2021-11-06 20:39:42 +03:00
if ( ret & &
2022-09-19 00:10:33 +03:00
! ( bch2_err_matches ( ret , ENOSPC ) & & ( mode & FALLOC_FL_ZERO_RANGE ) ) )
2021-11-06 20:39:42 +03:00
return ret ;
2017-03-17 09:18:50 +03:00
2021-11-06 20:39:42 +03:00
if ( mode & FALLOC_FL_KEEP_SIZE & & end > inode - > v . i_size )
end = inode - > v . i_size ;
2017-03-17 09:18:50 +03:00
2021-11-06 20:39:42 +03:00
if ( end > = inode - > v . i_size & &
( ( ( mode & FALLOC_FL_ZERO_RANGE ) & & ! truncated_last_page ) | |
! ( mode & FALLOC_FL_KEEP_SIZE ) ) ) {
spin_lock ( & inode - > v . i_lock ) ;
i_size_write ( & inode - > v , end ) ;
spin_unlock ( & inode - > v . i_lock ) ;
2019-10-09 18:12:48 +03:00
mutex_lock ( & inode - > ei_update_lock ) ;
2021-11-06 20:39:42 +03:00
ret2 = bch2_write_inode_size ( c , inode , end , 0 ) ;
2019-10-09 18:12:48 +03:00
mutex_unlock ( & inode - > ei_update_lock ) ;
2017-03-17 09:18:50 +03:00
}
2021-11-06 20:39:42 +03:00
return ret ? : ret2 ;
2017-03-17 09:18:50 +03:00
}
long bch2_fallocate_dispatch ( struct file * file , int mode ,
loff_t offset , loff_t len )
{
struct bch_inode_info * inode = file_bch_inode ( file ) ;
2019-10-20 02:03:23 +03:00
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
long ret ;
2017-03-17 09:18:50 +03:00
2023-02-09 20:21:45 +03:00
if ( ! bch2_write_ref_tryget ( c , BCH_WRITE_REF_fallocate ) )
2019-10-20 02:03:23 +03:00
return - EROFS ;
2019-09-08 01:04:23 +03:00
2021-11-06 20:39:42 +03:00
inode_lock ( & inode - > v ) ;
inode_dio_wait ( & inode - > v ) ;
2022-11-04 20:25:57 +03:00
bch2_pagecache_block_get ( inode ) ;
2021-11-06 20:39:42 +03:00
2022-10-13 07:44:34 +03:00
ret = file_modified ( file ) ;
if ( ret )
goto err ;
2019-10-20 02:03:23 +03:00
if ( ! ( mode & ~ ( FALLOC_FL_KEEP_SIZE | FALLOC_FL_ZERO_RANGE ) ) )
ret = bchfs_fallocate ( inode , mode , offset , len ) ;
else if ( mode = = ( FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE ) )
ret = bchfs_fpunch ( inode , offset , len ) ;
else if ( mode = = FALLOC_FL_INSERT_RANGE )
ret = bchfs_fcollapse_finsert ( inode , offset , len , true ) ;
else if ( mode = = FALLOC_FL_COLLAPSE_RANGE )
ret = bchfs_fcollapse_finsert ( inode , offset , len , false ) ;
else
ret = - EOPNOTSUPP ;
2022-10-13 07:44:34 +03:00
err :
2022-11-04 20:25:57 +03:00
bch2_pagecache_block_put ( inode ) ;
2021-11-06 20:39:42 +03:00
inode_unlock ( & inode - > v ) ;
2023-02-09 20:21:45 +03:00
bch2_write_ref_put ( c , BCH_WRITE_REF_fallocate ) ;
2017-03-17 09:18:50 +03:00
2022-09-18 22:43:50 +03:00
return bch2_err_class ( ret ) ;
2017-03-17 09:18:50 +03:00
}
2022-10-11 11:32:41 +03:00
/*
* Take a quota reservation for unallocated blocks in a given file range .
* Does not check the pagecache .
*/
2022-10-11 11:32:14 +03:00
static int quota_reserve_range ( struct bch_inode_info * inode ,
struct quota_res * res ,
u64 start , u64 end )
{
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
struct btree_trans trans ;
struct btree_iter iter ;
struct bkey_s_c k ;
u32 snapshot ;
u64 sectors = end - start ;
u64 pos = start ;
int ret ;
bch2_trans_init ( & trans , c , 0 , 0 ) ;
retry :
bch2_trans_begin ( & trans ) ;
ret = bch2_subvolume_get_snapshot ( & trans , inode - > ei_subvol , & snapshot ) ;
if ( ret )
goto err ;
bch2_trans_iter_init ( & trans , & iter , BTREE_ID_extents ,
SPOS ( inode - > v . i_ino , pos , snapshot ) , 0 ) ;
while ( ! ( ret = btree_trans_too_many_iters ( & trans ) ) & &
( k = bch2_btree_iter_peek_upto ( & iter , POS ( inode - > v . i_ino , end - 1 ) ) ) . k & &
! ( ret = bkey_err ( k ) ) ) {
if ( bkey_extent_is_allocation ( k . k ) ) {
u64 s = min ( end , k . k - > p . offset ) -
max ( start , bkey_start_offset ( k . k ) ) ;
BUG_ON ( s > sectors ) ;
sectors - = s ;
}
bch2_btree_iter_advance ( & iter ) ;
}
pos = iter . pos . offset ;
bch2_trans_iter_exit ( & trans , & iter ) ;
err :
if ( bch2_err_matches ( ret , BCH_ERR_transaction_restart ) )
goto retry ;
bch2_trans_exit ( & trans ) ;
if ( ret )
return ret ;
return bch2_quota_reservation_add ( c , inode , res , sectors , true ) ;
}
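/*
 * reflink: with both inodes locked and DIO drained, invalidate the
 * destination's pagecache, reserve quota for the destination range,
 * remap the (block aligned) extents, then clamp the result to the
 * number of bytes actually requested and extend the destination's
 * i_size if needed.
 */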
2019-08-16 16:59:56 +03:00
loff_t bch2_remap_file_range ( struct file * file_src , loff_t pos_src ,
struct file * file_dst , loff_t pos_dst ,
loff_t len , unsigned remap_flags )
{
struct bch_inode_info * src = file_bch_inode ( file_src ) ;
struct bch_inode_info * dst = file_bch_inode ( file_dst ) ;
struct bch_fs * c = src - > v . i_sb - > s_fs_info ;
2022-10-11 11:32:14 +03:00
struct quota_res quota_res = { 0 } ;
2019-10-10 19:47:22 +03:00
s64 i_sectors_delta = 0 ;
2019-11-05 06:22:13 +03:00
u64 aligned_len ;
2019-08-16 16:59:56 +03:00
loff_t ret = 0 ;
if ( remap_flags & ~ ( REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY ) )
return - EINVAL ;
if ( remap_flags & REMAP_FILE_DEDUP )
return - EOPNOTSUPP ;
if ( ( pos_src & ( block_bytes ( c ) - 1 ) ) | |
( pos_dst & ( block_bytes ( c ) - 1 ) ) )
return - EINVAL ;
if ( src = = dst & &
abs ( pos_src - pos_dst ) < len )
return - EINVAL ;
bch2_lock_inodes ( INODE_LOCK | INODE_PAGECACHE_BLOCK , src , dst ) ;
inode_dio_wait ( & src - > v ) ;
inode_dio_wait ( & dst - > v ) ;
ret = generic_remap_file_range_prep ( file_src , pos_src ,
file_dst , pos_dst ,
& len , remap_flags ) ;
if ( ret < 0 | | len = = 0 )
2019-10-10 19:47:22 +03:00
goto err ;
2019-08-16 16:59:56 +03:00
2019-11-05 06:22:13 +03:00
aligned_len = round_up ( ( u64 ) len , block_bytes ( c ) ) ;
2019-08-16 16:59:56 +03:00
ret = write_invalidate_inode_pages_range ( dst - > v . i_mapping ,
2019-11-05 06:22:13 +03:00
pos_dst , pos_dst + len - 1 ) ;
2019-08-16 16:59:56 +03:00
if ( ret )
2019-10-10 19:47:22 +03:00
goto err ;
2019-08-16 16:59:56 +03:00
2022-10-11 11:32:14 +03:00
ret = quota_reserve_range ( dst , & quota_res , pos_dst > > 9 ,
( pos_dst + aligned_len ) > > 9 ) ;
if ( ret )
goto err ;
file_update_time ( file_dst ) ;
2021-11-24 02:21:09 +03:00
mark_pagecache_unallocated ( src , pos_src > > 9 ,
( pos_src + aligned_len ) > > 9 ) ;
2019-08-16 16:59:56 +03:00
2019-10-10 19:47:22 +03:00
ret = bch2_remap_range ( c ,
2021-03-16 07:28:17 +03:00
inode_inum ( dst ) , pos_dst > > 9 ,
inode_inum ( src ) , pos_src > > 9 ,
2019-08-16 16:59:56 +03:00
aligned_len > > 9 ,
2019-10-10 19:47:22 +03:00
pos_dst + len , & i_sectors_delta ) ;
if ( ret < 0 )
goto err ;
2019-08-16 16:59:56 +03:00
2019-10-10 19:47:22 +03:00
/*
* due to alignment , we might have remapped slightly more than requested
*/
2019-11-05 06:22:13 +03:00
ret = min ( ( u64 ) ret < < 9 , ( u64 ) len ) ;
2019-10-10 19:47:22 +03:00
2022-10-11 11:32:14 +03:00
i_sectors_acct ( c , dst , & quota_res , i_sectors_delta ) ;
2019-10-10 19:47:22 +03:00
spin_lock ( & dst - > v . i_lock ) ;
2019-11-05 06:22:13 +03:00
if ( pos_dst + ret > dst - > v . i_size )
i_size_write ( & dst - > v , pos_dst + ret ) ;
2019-10-10 19:47:22 +03:00
spin_unlock ( & dst - > v . i_lock ) ;
2021-05-20 04:21:49 +03:00
2021-11-05 22:17:13 +03:00
if ( ( file_dst - > f_flags & ( __O_SYNC | O_DSYNC ) ) | |
IS_SYNC ( file_inode ( file_dst ) ) )
2022-11-03 00:12:00 +03:00
ret = bch2_flush_inode ( c , dst ) ;
2019-10-10 19:47:22 +03:00
err :
2022-10-11 11:32:14 +03:00
bch2_quota_reservation_put ( c , dst , & quota_res ) ;
2019-08-16 16:59:56 +03:00
bch2_unlock_inodes ( INODE_LOCK | INODE_PAGECACHE_BLOCK , src , dst ) ;
2022-09-18 22:43:50 +03:00
return bch2_err_class ( ret ) ;
2019-08-16 16:59:56 +03:00
}
2017-03-17 09:18:50 +03:00
/* fseek: */
2023-04-03 15:17:26 +03:00
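/*
 * Returns the byte offset within @folio of the first sector at or after @pos
 * whose state is at least SECTOR_dirty, or -1 if there is none (or if no
 * bch_folio state is attached).
 */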
static int folio_data_offset ( struct folio * folio , loff_t pos )
2017-03-17 09:18:50 +03:00
{
2023-03-17 21:55:53 +03:00
struct bch_folio * s = bch2_folio ( folio ) ;
2023-03-20 02:07:28 +03:00
unsigned i , sectors = folio_sectors ( folio ) ;
2019-07-03 04:41:35 +03:00
2019-07-30 20:49:17 +03:00
if ( s )
2023-04-03 15:17:26 +03:00
for ( i = folio_pos_to_s ( folio , pos ) ; i < sectors ; i + + )
2023-03-23 18:08:04 +03:00
if ( s - > s [ i ] . state > = SECTOR_dirty )
2023-04-03 15:17:26 +03:00
return i < < SECTOR_SHIFT ;
2018-11-15 05:53:40 +03:00
2019-07-30 20:49:17 +03:00
return - 1 ;
2017-03-17 09:18:50 +03:00
}
2019-07-30 20:49:17 +03:00
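/*
 * Scan the page cache in [start_offset, end_offset) for folios containing
 * data (sectors at or above SECTOR_dirty, i.e. data that may not be visible
 * in the extents btree yet). Returns the position of the first such data,
 * clamped to the range, or end_offset if none is found.
 */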
static loff_t bch2_seek_pagecache_data ( struct inode * vinode ,
2017-03-17 09:18:50 +03:00
loff_t start_offset ,
loff_t end_offset )
{
struct folio_batch fbatch ;
pgoff_t start_index = start_offset > > PAGE_SHIFT ;
pgoff_t end_index = end_offset > > PAGE_SHIFT ;
pgoff_t index = start_index ;
unsigned i ;
2019-07-30 20:49:17 +03:00
loff_t ret ;
int offset ;
2017-03-17 09:18:50 +03:00
folio_batch_init ( & fbatch ) ;
while ( filemap_get_folios ( vinode - > i_mapping ,
& index , end_index , & fbatch ) ) {
for ( i = 0 ; i < folio_batch_count ( & fbatch ) ; i + + ) {
struct folio * folio = fbatch . folios [ i ] ;
folio_lock ( folio ) ;
2019-07-30 20:49:17 +03:00
offset = folio_data_offset ( folio ,
2023-04-03 15:17:26 +03:00
max ( folio_pos ( folio ) , start_offset ) ) ;
2019-07-30 20:49:17 +03:00
if ( offset > = 0 ) {
2023-03-20 02:07:28 +03:00
ret = clamp ( folio_pos ( folio ) + offset ,
2019-07-30 20:49:17 +03:00
start_offset , end_offset ) ;
2017-03-17 09:18:50 +03:00
folio_unlock ( folio ) ;
folio_batch_release ( & fbatch ) ;
2019-07-30 20:49:17 +03:00
return ret ;
2017-03-17 09:18:50 +03:00
}
folio_unlock ( folio ) ;
}
folio_batch_release ( & fbatch ) ;
cond_resched ( ) ;
}
return end_offset ;
}
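/*
 * SEEK_DATA: find the next data extent at or after @offset in the extents
 * btree, then let bch2_seek_pagecache_data() check whether dirty pagecache
 * supplies data before that point. Returns -ENXIO if @offset is at or past
 * i_size, or if no data is found below i_size.
 */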
static loff_t bch2_seek_data ( struct file * file , u64 offset )
{
struct bch_inode_info * inode = file_bch_inode ( file ) ;
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
2019-03-25 22:10:15 +03:00
struct btree_trans trans ;
2021-08-30 22:18:31 +03:00
struct btree_iter iter ;
2017-03-17 09:18:50 +03:00
struct bkey_s_c k ;
2021-03-16 07:28:17 +03:00
subvol_inum inum = inode_inum ( inode ) ;
2017-03-17 09:18:50 +03:00
u64 isize , next_data = MAX_LFS_FILESIZE ;
2021-03-16 07:28:17 +03:00
u32 snapshot ;
2017-03-17 09:18:50 +03:00
int ret ;
isize = i_size_read ( & inode - > v ) ;
if ( offset > = isize )
return - ENXIO ;
2019-05-15 17:54:43 +03:00
bch2_trans_init ( & trans , c , 0 , 0 ) ;
2021-03-16 07:28:17 +03:00
retry :
bch2_trans_begin ( & trans ) ;
ret = bch2_subvolume_get_snapshot ( & trans , inum . subvol , & snapshot ) ;
if ( ret )
goto err ;
2019-03-25 22:10:15 +03:00
2022-10-11 11:32:41 +03:00
for_each_btree_key_upto_norestart ( & trans , iter , BTREE_ID_extents ,
SPOS ( inode - > v . i_ino , offset > > 9 , snapshot ) ,
POS ( inode - > v . i_ino , U64_MAX ) ,
0 , k , ret ) {
if ( bkey_extent_is_data ( k . k ) ) {
2017-03-17 09:18:50 +03:00
next_data = max ( offset , bkey_start_offset ( k . k ) < < 9 ) ;
break ;
} else if ( k . k - > p . offset > > 9 > isize )
break ;
}
2021-08-30 22:18:31 +03:00
bch2_trans_iter_exit ( & trans , & iter ) ;
2021-03-16 07:28:17 +03:00
err :
2022-07-18 06:06:38 +03:00
if ( bch2_err_matches ( ret , BCH_ERR_transaction_restart ) )
2021-03-16 07:28:17 +03:00
goto retry ;
2017-03-17 09:18:50 +03:00
2021-10-19 22:08:00 +03:00
bch2_trans_exit ( & trans ) ;
2017-03-17 09:18:50 +03:00
if ( ret )
return ret ;
if ( next_data > offset )
2019-07-30 20:49:17 +03:00
next_data = bch2_seek_pagecache_data ( & inode - > v ,
2017-03-17 09:18:50 +03:00
offset , next_data ) ;
2019-07-30 19:46:53 +03:00
if ( next_data > = isize )
2017-03-17 09:18:50 +03:00
return - ENXIO ;
return vfs_setpos ( file , next_data , MAX_LFS_FILESIZE ) ;
}
2023-03-20 02:06:42 +03:00
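/*
 * Check the folio covering *@offset for a hole: returns true if a hole is
 * found at or after *@offset within the folio (a missing folio, missing
 * bch_folio state, or a sector below SECTOR_dirty all count), leaving
 * *@offset at the hole; returns false and advances *@offset past the folio
 * when every remaining sector is dirty or allocated.
 */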
static bool folio_hole_offset ( struct address_space * mapping , loff_t * offset )
2017-03-17 09:18:50 +03:00
{
2023-03-20 02:06:42 +03:00
struct folio * folio ;
struct bch_folio * s ;
2023-04-03 15:17:26 +03:00
unsigned i , sectors ;
2023-03-20 02:06:42 +03:00
bool ret = true ;
2019-07-30 20:49:17 +03:00
2023-03-20 02:06:42 +03:00
folio = filemap_lock_folio ( mapping , * offset > > PAGE_SHIFT ) ;
2023-06-21 07:31:49 +03:00
if ( IS_ERR_OR_NULL ( folio ) )
2023-03-20 02:06:42 +03:00
return true ;
2019-07-30 20:49:17 +03:00
2023-03-20 02:06:42 +03:00
s = bch2_folio ( folio ) ;
if ( ! s )
goto unlock ;
2019-07-30 20:49:17 +03:00
2023-03-20 02:06:42 +03:00
sectors = folio_sectors ( folio ) ;
2023-04-03 15:17:26 +03:00
for ( i = folio_pos_to_s ( folio , * offset ) ; i < sectors ; i + + )
2023-03-23 18:08:04 +03:00
if ( s - > s [ i ] . state < SECTOR_dirty ) {
2023-04-03 15:17:26 +03:00
* offset = max ( * offset ,
folio_pos ( folio ) + ( i < < SECTOR_SHIFT ) ) ;
2023-03-20 02:06:42 +03:00
goto unlock ;
}
2019-07-30 20:49:17 +03:00
2023-03-20 02:06:42 +03:00
* offset = folio_end_pos ( folio ) ;
ret = false ;
unlock :
2023-03-17 21:55:53 +03:00
folio_unlock ( folio ) ;
2017-03-17 09:18:50 +03:00
return ret ;
}
2019-07-30 20:49:17 +03:00
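/*
 * Walk the page cache forward from @start_offset, folio by folio, until
 * folio_hole_offset() reports a hole; the result is clamped to @end_offset.
 */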
static loff_t bch2_seek_pagecache_hole ( struct inode * vinode ,
2017-03-17 09:18:50 +03:00
loff_t start_offset ,
loff_t end_offset )
{
struct address_space * mapping = vinode - > i_mapping ;
2023-03-20 02:06:42 +03:00
loff_t offset = start_offset ;
2017-03-17 09:18:50 +03:00
2023-03-20 02:06:42 +03:00
while ( offset < end_offset & &
! folio_hole_offset ( mapping , & offset ) )
;
2019-07-30 20:49:17 +03:00
2023-03-20 02:06:42 +03:00
return min ( offset , end_offset ) ;
2017-03-17 09:18:50 +03:00
}
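/*
 * SEEK_HOLE: walk the extents btree from @offset. Ranges not covered by a
 * data extent are only holes if the page cache agrees (dirty folios may hold
 * data that hasn't been written back yet), so candidate holes are confirmed
 * with bch2_seek_pagecache_hole(). The result is clamped to i_size.
 */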
static loff_t bch2_seek_hole ( struct file * file , u64 offset )
{
struct bch_inode_info * inode = file_bch_inode ( file ) ;
struct bch_fs * c = inode - > v . i_sb - > s_fs_info ;
2019-03-25 22:10:15 +03:00
struct btree_trans trans ;
2021-08-30 22:18:31 +03:00
struct btree_iter iter ;
2017-03-17 09:18:50 +03:00
struct bkey_s_c k ;
2021-03-16 07:28:17 +03:00
subvol_inum inum = inode_inum ( inode ) ;
2017-03-17 09:18:50 +03:00
u64 isize , next_hole = MAX_LFS_FILESIZE ;
2021-03-16 07:28:17 +03:00
u32 snapshot ;
2017-03-17 09:18:50 +03:00
int ret ;
isize = i_size_read ( & inode - > v ) ;
if ( offset > = isize )
return - ENXIO ;
2019-05-15 17:54:43 +03:00
bch2_trans_init ( & trans , c , 0 , 0 ) ;
2021-03-16 07:28:17 +03:00
retry :
bch2_trans_begin ( & trans ) ;
ret = bch2_subvolume_get_snapshot ( & trans , inum . subvol , & snapshot ) ;
if ( ret )
goto err ;
2019-03-25 22:10:15 +03:00
2021-10-21 19:05:21 +03:00
for_each_btree_key_norestart ( & trans , iter , BTREE_ID_extents ,
2021-03-16 07:28:17 +03:00
SPOS ( inode - > v . i_ino , offset > > 9 , snapshot ) ,
2019-04-17 22:49:28 +03:00
BTREE_ITER_SLOTS , k , ret ) {
2017-03-17 09:18:50 +03:00
if ( k . k - > p . inode ! = inode - > v . i_ino ) {
2019-07-30 20:49:17 +03:00
next_hole = bch2_seek_pagecache_hole ( & inode - > v ,
2017-03-17 09:18:50 +03:00
offset , MAX_LFS_FILESIZE ) ;
break ;
} else if ( ! bkey_extent_is_data ( k . k ) ) {
2019-07-30 20:49:17 +03:00
next_hole = bch2_seek_pagecache_hole ( & inode - > v ,
2017-03-17 09:18:50 +03:00
max ( offset , bkey_start_offset ( k . k ) < < 9 ) ,
k . k - > p . offset < < 9 ) ;
if ( next_hole < k . k - > p . offset < < 9 )
break ;
} else {
offset = max ( offset , bkey_start_offset ( k . k ) < < 9 ) ;
}
}
2021-08-30 22:18:31 +03:00
bch2_trans_iter_exit ( & trans , & iter ) ;
2021-03-16 07:28:17 +03:00
err :
2022-07-18 06:06:38 +03:00
if ( bch2_err_matches ( ret , BCH_ERR_transaction_restart ) )
2021-03-16 07:28:17 +03:00
goto retry ;
2017-03-17 09:18:50 +03:00
2021-10-19 22:08:00 +03:00
bch2_trans_exit ( & trans ) ;
2017-03-17 09:18:50 +03:00
if ( ret )
return ret ;
if ( next_hole > isize )
next_hole = isize ;
return vfs_setpos ( file , next_hole , MAX_LFS_FILESIZE ) ;
}
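/*
 * llseek dispatch: SEEK_SET/SEEK_CUR/SEEK_END go through
 * generic_file_llseek(), SEEK_DATA and SEEK_HOLE use the btree and pagecache
 * aware helpers above, and errors are converted to standard errnos with
 * bch2_err_class().
 *
 * Illustrative userspace usage:
 *
 *	off_t data = lseek(fd, 0, SEEK_DATA);
 *	off_t hole = lseek(fd, data, SEEK_HOLE);
 */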
loff_t bch2_llseek ( struct file * file , loff_t offset , int whence )
{
2022-09-18 22:43:50 +03:00
loff_t ret ;
2017-03-17 09:18:50 +03:00
switch ( whence ) {
case SEEK_SET :
case SEEK_CUR :
case SEEK_END :
2022-09-18 22:43:50 +03:00
ret = generic_file_llseek ( file , offset , whence ) ;
break ;
2017-03-17 09:18:50 +03:00
case SEEK_DATA :
2022-09-18 22:43:50 +03:00
ret = bch2_seek_data ( file , offset ) ;
break ;
2017-03-17 09:18:50 +03:00
case SEEK_HOLE :
2022-09-18 22:43:50 +03:00
ret = bch2_seek_hole ( file , offset ) ;
break ;
default :
ret = - EINVAL ;
break ;
2017-03-17 09:18:50 +03:00
}
2022-09-18 22:43:50 +03:00
return bch2_err_class ( ret ) ;
2017-03-17 09:18:50 +03:00
}
void bch2_fs_fsio_exit ( struct bch_fs * c )
{
2022-11-03 00:12:00 +03:00
bioset_exit ( & c - > nocow_flush_bioset ) ;
2017-03-17 09:18:50 +03:00
bioset_exit ( & c - > dio_write_bioset ) ;
bioset_exit ( & c - > dio_read_bioset ) ;
bioset_exit ( & c - > writepage_bioset ) ;
}
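/*
 * Allocate the biosets used by the buffered writeback, direct I/O and nocow
 * flush paths; bch2_fs_fsio_exit() above tears them down in reverse order.
 */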
int bch2_fs_fsio_init ( struct bch_fs * c )
{
int ret = 0 ;
pr_verbose_init ( c - > opts , " " ) ;
if ( bioset_init ( & c - > writepage_bioset ,
2019-10-09 19:50:39 +03:00
4 , offsetof ( struct bch_writepage_io , op . wbio . bio ) ,
2023-03-14 22:35:57 +03:00
BIOSET_NEED_BVECS ) )
return - BCH_ERR_ENOMEM_writepage_bioset_init ;
if ( bioset_init ( & c - > dio_read_bioset ,
2017-03-17 09:18:50 +03:00
4 , offsetof ( struct dio_read , rbio . bio ) ,
2023-03-14 22:35:57 +03:00
BIOSET_NEED_BVECS ) )
return - BCH_ERR_ENOMEM_dio_read_bioset_init ;
if ( bioset_init ( & c - > dio_write_bioset ,
2019-10-09 19:50:39 +03:00
4 , offsetof ( struct dio_write , op . wbio . bio ) ,
2023-03-14 22:35:57 +03:00
BIOSET_NEED_BVECS ) )
return - BCH_ERR_ENOMEM_dio_write_bioset_init ;
if ( bioset_init ( & c - > nocow_flush_bioset ,
2022-11-03 00:12:00 +03:00
1 , offsetof ( struct nocow_flush , bio ) , 0 ) )
2023-03-14 22:35:57 +03:00
return - BCH_ERR_ENOMEM_nocow_flush_bioset_init ;
2017-03-17 09:18:50 +03:00
pr_verbose_init ( c - > opts , " ret %i " , ret ) ;
return ret ;
}
# endif /* NO_BCACHEFS_FS */