2022-06-13 19:07:19 -04:00
// SPDX-License-Identifier: GPL-2.0
# include "bcachefs.h"
# include "alloc_foreground.h"
# include "bkey_buf.h"
# include "btree_update.h"
# include "buckets.h"
# include "data_update.h"
# include "ec.h"
2023-03-10 17:34:29 -05:00
# include "error.h"
2022-06-13 19:07:19 -04:00
# include "extents.h"
2023-09-10 18:05:17 -04:00
# include "io_write.h"
2022-06-13 19:07:19 -04:00
# include "keylist.h"
# include "move.h"
2022-12-14 20:52:11 -05:00
# include "nocow_locking.h"
bcachefs: rebalance_work
This adds a new btree, rebalance_work, to eliminate scanning required
for finding extents that need work done on them in the background - i.e.
for the background_target and background_compression options.
rebalance_work is a bitset btree, where a KEY_TYPE_set corresponds to an
extent in the extents or reflink btree at the same pos.
A new extent field is added, bch_extent_rebalance, which indicates that
this extent has work that needs to be done in the background - and which
options to use. This allows per-inode options to be propagated to
indirect extents - at least in some circumstances. In this patch,
changing IO options on a file will not propagate the new options to
indirect extents pointed to by that file.
Updating (setting/clearing) the rebalance_work btree is done by the
extent trigger, which looks at the bch_extent_rebalance field.
Scanning is still required after changing IO path options - either just
for a given inode, or for the whole filesystem. We indicate that
scanning is required by adding a KEY_TYPE_cookie key to the
rebalance_work btree: the cookie counter is so that we can detect that
scanning is still required when an option has been flipped mid-way
through an existing scan.
Future possible work:
- Propagate options to indirect extents when being changed
- Add other IO path options - nr_replicas, ec, to rebalance_work so
they can be applied in the background when they change
- Add a counter, for bcachefs fs usage output, showing the pending
amount of rebalance work: we'll probably want to do this after the
disk space accounting rewrite (moving it to a new btree)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-10-20 13:33:14 -04:00
# include "rebalance.h"
2024-03-22 16:29:23 -04:00
# include "snapshot.h"
2022-06-13 19:07:19 -04:00
# include "subvolume.h"
# include "trace.h"
2023-04-20 15:24:07 -04:00
static void trace_move_extent_finish2 ( struct bch_fs * c , struct bkey_s_c k )
{
if ( trace_move_extent_finish_enabled ( ) ) {
struct printbuf buf = PRINTBUF ;
bch2_bkey_val_to_text ( & buf , c , k ) ;
trace_move_extent_finish ( c , buf . buf ) ;
printbuf_exit ( & buf ) ;
}
}
2023-03-10 17:34:29 -05:00
static void trace_move_extent_fail2 ( struct data_update * m ,
struct bkey_s_c new ,
struct bkey_s_c wrote ,
struct bkey_i * insert ,
const char * msg )
{
struct bch_fs * c = m - > op . c ;
struct bkey_s_c old = bkey_i_to_s_c ( m - > k . k ) ;
const union bch_extent_entry * entry ;
struct bch_extent_ptr * ptr ;
struct extent_ptr_decoded p ;
struct printbuf buf = PRINTBUF ;
unsigned i , rewrites_found = 0 ;
if ( ! trace_move_extent_fail_enabled ( ) )
return ;
prt_str ( & buf , msg ) ;
if ( insert ) {
i = 0 ;
bkey_for_each_ptr_decode ( old . k , bch2_bkey_ptrs_c ( old ) , p , entry ) {
if ( ( ( 1U < < i ) & m - > data_opts . rewrite_ptrs ) & &
( ptr = bch2_extent_has_ptr ( old , p , bkey_i_to_s ( insert ) ) ) & &
! ptr - > cached )
rewrites_found | = 1U < < i ;
i + + ;
}
}
prt_printf ( & buf , " \n rewrite ptrs: %u%u%u%u " ,
( m - > data_opts . rewrite_ptrs & ( 1 < < 0 ) ) ! = 0 ,
( m - > data_opts . rewrite_ptrs & ( 1 < < 1 ) ) ! = 0 ,
( m - > data_opts . rewrite_ptrs & ( 1 < < 2 ) ) ! = 0 ,
( m - > data_opts . rewrite_ptrs & ( 1 < < 3 ) ) ! = 0 ) ;
prt_printf ( & buf , " \n rewrites found: %u%u%u%u " ,
( rewrites_found & ( 1 < < 0 ) ) ! = 0 ,
( rewrites_found & ( 1 < < 1 ) ) ! = 0 ,
( rewrites_found & ( 1 < < 2 ) ) ! = 0 ,
( rewrites_found & ( 1 < < 3 ) ) ! = 0 ) ;
prt_str ( & buf , " \n old: " ) ;
bch2_bkey_val_to_text ( & buf , c , old ) ;
prt_str ( & buf , " \n new: " ) ;
bch2_bkey_val_to_text ( & buf , c , new ) ;
prt_str ( & buf , " \n wrote: " ) ;
bch2_bkey_val_to_text ( & buf , c , wrote ) ;
if ( insert ) {
prt_str ( & buf , " \n insert: " ) ;
bch2_bkey_val_to_text ( & buf , c , bkey_i_to_s_c ( insert ) ) ;
}
trace_move_extent_fail ( c , buf . buf ) ;
printbuf_exit ( & buf ) ;
}
2022-11-14 01:31:10 -05:00
static int __bch2_data_update_index_update ( struct btree_trans * trans ,
struct bch_write_op * op )
2022-06-13 19:07:19 -04:00
{
struct bch_fs * c = op - > c ;
struct btree_iter iter ;
struct data_update * m =
container_of ( op , struct data_update , op ) ;
struct keylist * keys = & op - > insert_keys ;
struct bkey_buf _new , _insert ;
int ret = 0 ;
bch2_bkey_buf_init ( & _new ) ;
bch2_bkey_buf_init ( & _insert ) ;
bch2_bkey_buf_realloc ( & _insert , c , U8_MAX ) ;
2022-11-14 01:31:10 -05:00
bch2_trans_iter_init ( trans , & iter , m - > btree_id ,
2022-06-13 19:07:19 -04:00
bkey_start_pos ( & bch2_keylist_front ( keys ) - > k ) ,
BTREE_ITER_SLOTS | BTREE_ITER_INTENT ) ;
while ( 1 ) {
struct bkey_s_c k ;
2022-06-13 19:17:45 -04:00
struct bkey_s_c old = bkey_i_to_s_c ( m - > k . k ) ;
2023-03-04 03:21:34 -05:00
struct bkey_i * insert = NULL ;
2022-06-13 19:07:19 -04:00
struct bkey_i_extent * new ;
2023-03-04 03:21:34 -05:00
const union bch_extent_entry * entry_c ;
union bch_extent_entry * entry ;
2022-06-13 19:07:19 -04:00
struct extent_ptr_decoded p ;
2023-03-04 03:21:34 -05:00
struct bch_extent_ptr * ptr ;
const struct bch_extent_ptr * ptr_c ;
2022-06-13 19:07:19 -04:00
struct bpos next_pos ;
2021-11-08 12:30:47 -05:00
bool should_check_enospc ;
2022-06-13 19:07:19 -04:00
s64 i_sectors_delta = 0 , disk_sectors_delta = 0 ;
2023-03-04 03:21:34 -05:00
unsigned rewrites_found = 0 , durability , i ;
2022-06-13 19:07:19 -04:00
2022-11-14 01:31:10 -05:00
bch2_trans_begin ( trans ) ;
2022-06-13 19:07:19 -04:00
k = bch2_btree_iter_peek_slot ( & iter ) ;
ret = bkey_err ( k ) ;
if ( ret )
goto err ;
new = bkey_i_to_extent ( bch2_keylist_front ( keys ) ) ;
2023-03-10 17:34:29 -05:00
if ( ! bch2_extents_match ( k , old ) ) {
trace_move_extent_fail2 ( m , k , bkey_i_to_s_c ( & new - > k_i ) ,
NULL , " no match: " ) ;
2023-03-04 03:21:34 -05:00
goto nowork ;
2023-03-10 17:34:29 -05:00
}
2022-06-13 19:07:19 -04:00
bkey_reassemble ( _insert . k , k ) ;
insert = _insert . k ;
bch2_bkey_buf_copy ( & _new , c , bch2_keylist_front ( keys ) ) ;
new = bkey_i_to_extent ( _new . k ) ;
bch2_cut_front ( iter . pos , & new - > k_i ) ;
bch2_cut_front ( iter . pos , insert ) ;
bch2_cut_back ( new - > k . p , insert ) ;
bch2_cut_back ( insert - > k . p , & new - > k_i ) ;
2022-06-13 19:17:45 -04:00
/*
* @ old : extent that we read from
* @ insert : key that we ' re going to update , initialized from
* extent currently in btree - same as @ old unless we raced with
* other updates
* @ new : extent with new pointers that we ' ll be adding to @ insert
*
* Fist , drop rewrite_ptrs from @ new :
*/
i = 0 ;
2023-03-04 03:21:34 -05:00
bkey_for_each_ptr_decode ( old . k , bch2_bkey_ptrs_c ( old ) , p , entry_c ) {
2022-06-13 19:17:45 -04:00
if ( ( ( 1U < < i ) & m - > data_opts . rewrite_ptrs ) & &
2023-03-04 03:21:34 -05:00
( ptr = bch2_extent_has_ptr ( old , p , bkey_i_to_s ( insert ) ) ) & &
! ptr - > cached ) {
bch2_extent_ptr_set_cached ( bkey_i_to_s ( insert ) , ptr ) ;
rewrites_found | = 1U < < i ;
2022-06-13 19:17:45 -04:00
}
i + + ;
2022-06-13 19:07:19 -04:00
}
2023-03-04 03:21:34 -05:00
if ( m - > data_opts . rewrite_ptrs & &
! rewrites_found & &
2023-03-10 17:34:29 -05:00
bch2_bkey_durability ( c , k ) > = m - > op . opts . data_replicas ) {
trace_move_extent_fail2 ( m , k , bkey_i_to_s_c ( & new - > k_i ) , insert , " no rewrites found: " ) ;
2023-03-04 03:21:34 -05:00
goto nowork ;
2023-03-10 17:34:29 -05:00
}
2022-06-13 19:17:45 -04:00
2023-03-04 03:21:34 -05:00
/*
* A replica that we just wrote might conflict with a replica
* that we want to keep , due to racing with another move :
*/
restart_drop_conflicting_replicas :
extent_for_each_ptr ( extent_i_to_s ( new ) , ptr )
if ( ( ptr_c = bch2_bkey_has_device_c ( bkey_i_to_s_c ( insert ) , ptr - > dev ) ) & &
! ptr_c - > cached ) {
bch2_bkey_drop_ptr_noerror ( bkey_i_to_s ( & new - > k_i ) , ptr ) ;
goto restart_drop_conflicting_replicas ;
2022-06-13 19:07:19 -04:00
}
2023-03-10 17:34:29 -05:00
if ( ! bkey_val_u64s ( & new - > k ) ) {
trace_move_extent_fail2 ( m , k , bkey_i_to_s_c ( & new - > k_i ) , insert , " new replicas conflicted: " ) ;
2023-03-04 03:21:34 -05:00
goto nowork ;
2023-03-10 17:34:29 -05:00
}
2023-03-04 03:21:34 -05:00
/* Now, drop pointers that conflict with what we just wrote: */
extent_for_each_ptr_decode ( extent_i_to_s ( new ) , p , entry )
if ( ( ptr = bch2_bkey_has_device ( bkey_i_to_s ( insert ) , p . ptr . dev ) ) )
bch2_bkey_drop_ptr_noerror ( bkey_i_to_s ( insert ) , ptr ) ;
durability = bch2_bkey_durability ( c , bkey_i_to_s_c ( insert ) ) +
bch2_bkey_durability ( c , bkey_i_to_s_c ( & new - > k_i ) ) ;
/* Now, drop excess replicas: */
restart_drop_extra_replicas :
bkey_for_each_ptr_decode ( old . k , bch2_bkey_ptrs ( bkey_i_to_s ( insert ) ) , p , entry ) {
unsigned ptr_durability = bch2_extent_ptr_durability ( c , & p ) ;
if ( ! p . ptr . cached & &
durability - ptr_durability > = m - > op . opts . data_replicas ) {
durability - = ptr_durability ;
2023-10-27 15:23:46 -04:00
2023-03-04 03:21:34 -05:00
bch2_extent_ptr_set_cached ( bkey_i_to_s ( insert ) , & entry - > ptr ) ;
goto restart_drop_extra_replicas ;
}
2022-06-13 19:07:19 -04:00
}
2023-03-04 03:21:34 -05:00
/* Finally, add the pointers we just wrote: */
extent_for_each_ptr_decode ( extent_i_to_s ( new ) , p , entry )
bch2_extent_ptr_decoded_append ( insert , & p ) ;
2022-06-13 19:07:19 -04:00
2022-06-13 19:17:45 -04:00
bch2_bkey_narrow_crcs ( insert , ( struct bch_extent_crc_unpacked ) { 0 } ) ;
2022-06-13 19:07:19 -04:00
bch2_extent_normalize ( c , bkey_i_to_s ( insert ) ) ;
2022-11-14 01:31:10 -05:00
ret = bch2_sum_sector_overwrites ( trans , & iter , insert ,
2022-06-13 19:07:19 -04:00
& should_check_enospc ,
& i_sectors_delta ,
& disk_sectors_delta ) ;
if ( ret )
goto err ;
if ( disk_sectors_delta > ( s64 ) op - > res . sectors ) {
ret = bch2_disk_reservation_add ( c , & op - > res ,
disk_sectors_delta - op - > res . sectors ,
! should_check_enospc
? BCH_DISK_RESERVATION_NOFAIL : 0 ) ;
if ( ret )
goto out ;
}
next_pos = insert - > k . p ;
2023-11-03 18:38:35 -04:00
/*
* Check for nonce offset inconsistency :
* This is debug code - we ' ve been seeing this bug rarely , and
* it ' s been hard to reproduce , so this should give us some more
* information when it does occur :
*/
struct printbuf err = PRINTBUF ;
int invalid = bch2_bkey_invalid ( c , bkey_i_to_s_c ( insert ) , __btree_node_type ( 0 , m - > btree_id ) , 0 , & err ) ;
printbuf_exit ( & err ) ;
if ( invalid ) {
struct printbuf buf = PRINTBUF ;
prt_str ( & buf , " about to insert invalid key in data update path " ) ;
prt_str ( & buf , " \n old: " ) ;
bch2_bkey_val_to_text ( & buf , c , old ) ;
prt_str ( & buf , " \n k: " ) ;
bch2_bkey_val_to_text ( & buf , c , k ) ;
prt_str ( & buf , " \n new: " ) ;
bch2_bkey_val_to_text ( & buf , c , bkey_i_to_s_c ( insert ) ) ;
bch2_print_string_as_lines ( KERN_ERR , buf . buf ) ;
printbuf_exit ( & buf ) ;
bch2_fatal_error ( c ) ;
goto out ;
}
2023-11-24 21:52:17 -05:00
if ( trace_data_update_enabled ( ) ) {
struct printbuf buf = PRINTBUF ;
prt_str ( & buf , " \n old: " ) ;
bch2_bkey_val_to_text ( & buf , c , old ) ;
prt_str ( & buf , " \n k: " ) ;
bch2_bkey_val_to_text ( & buf , c , k ) ;
prt_str ( & buf , " \n new: " ) ;
bch2_bkey_val_to_text ( & buf , c , bkey_i_to_s_c ( insert ) ) ;
trace_data_update ( c , buf . buf ) ;
printbuf_exit ( & buf ) ;
}
2023-05-27 23:19:13 -04:00
ret = bch2_insert_snapshot_whiteouts ( trans , m - > btree_id ,
k . k - > p , bkey_start_pos ( & insert - > k ) ) ? :
bch2_insert_snapshot_whiteouts ( trans , m - > btree_id ,
bcachefs: rebalance_work
This adds a new btree, rebalance_work, to eliminate scanning required
for finding extents that need work done on them in the background - i.e.
for the background_target and background_compression options.
rebalance_work is a bitset btree, where a KEY_TYPE_set corresponds to an
extent in the extents or reflink btree at the same pos.
A new extent field is added, bch_extent_rebalance, which indicates that
this extent has work that needs to be done in the background - and which
options to use. This allows per-inode options to be propagated to
indirect extents - at least in some circumstances. In this patch,
changing IO options on a file will not propagate the new options to
indirect extents pointed to by that file.
Updating (setting/clearing) the rebalance_work btree is done by the
extent trigger, which looks at the bch_extent_rebalance field.
Scanning is still requrired after changing IO path options - either just
for a given inode, or for the whole filesystem. We indicate that
scanning is required by adding a KEY_TYPE_cookie key to the
rebalance_work btree: the cookie counter is so that we can detect that
scanning is still required when an option has been flipped mid-way
through an existing scan.
Future possible work:
- Propagate options to indirect extents when being changed
- Add other IO path options - nr_replicas, ec, to rebalance_work so
they can be applied in the background when they change
- Add a counter, for bcachefs fs usage output, showing the pending
amount of rebalance work: we'll probably want to do this after the
disk space accounting rewrite (moving it to a new btree)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-10-20 13:33:14 -04:00
k . k - > p , insert - > k . p ) ? :
2024-01-16 16:20:21 -05:00
bch2_bkey_set_needs_rebalance ( c , insert , & op - > opts ) ? :
bcachefs: rebalance_work
This adds a new btree, rebalance_work, to eliminate scanning required
for finding extents that need work done on them in the background - i.e.
for the background_target and background_compression options.
rebalance_work is a bitset btree, where a KEY_TYPE_set corresponds to an
extent in the extents or reflink btree at the same pos.
A new extent field is added, bch_extent_rebalance, which indicates that
this extent has work that needs to be done in the background - and which
options to use. This allows per-inode options to be propagated to
indirect extents - at least in some circumstances. In this patch,
changing IO options on a file will not propagate the new options to
indirect extents pointed to by that file.
Updating (setting/clearing) the rebalance_work btree is done by the
extent trigger, which looks at the bch_extent_rebalance field.
Scanning is still requrired after changing IO path options - either just
for a given inode, or for the whole filesystem. We indicate that
scanning is required by adding a KEY_TYPE_cookie key to the
rebalance_work btree: the cookie counter is so that we can detect that
scanning is still required when an option has been flipped mid-way
through an existing scan.
Future possible work:
- Propagate options to indirect extents when being changed
- Add other IO path options - nr_replicas, ec, to rebalance_work so
they can be applied in the background when they change
- Add a counter, for bcachefs fs usage output, showing the pending
amount of rebalance work: we'll probably want to do this after the
disk space accounting rewrite (moving it to a new btree)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-10-20 13:33:14 -04:00
bch2_trans_update ( trans , & iter , insert ,
2022-06-13 19:07:19 -04:00
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE ) ? :
2022-11-14 01:31:10 -05:00
bch2_trans_commit ( trans , & op - > res ,
2022-11-03 00:29:43 -04:00
NULL ,
2023-11-11 16:31:50 -05:00
BCH_TRANS_COMMIT_no_check_rw |
BCH_TRANS_COMMIT_no_enospc |
2022-06-13 19:07:19 -04:00
m - > data_opts . btree_insert_flags ) ;
if ( ! ret ) {
bch2_btree_iter_set_pos ( & iter , next_pos ) ;
2022-08-27 12:48:36 -04:00
this_cpu_add ( c - > counters [ BCH_COUNTER_move_extent_finish ] , new - > k . size ) ;
2023-04-20 15:24:07 -04:00
trace_move_extent_finish2 ( c , bkey_i_to_s_c ( & new - > k_i ) ) ;
2022-06-13 19:07:19 -04:00
}
err :
2022-07-17 23:06:38 -04:00
if ( bch2_err_matches ( ret , BCH_ERR_transaction_restart ) )
2022-06-13 19:07:19 -04:00
ret = 0 ;
if ( ret )
break ;
next :
2022-11-24 03:12:22 -05:00
while ( bkey_ge ( iter . pos , bch2_keylist_front ( keys ) - > k . p ) ) {
2022-06-13 19:07:19 -04:00
bch2_keylist_pop_front ( keys ) ;
if ( bch2_keylist_empty ( keys ) )
goto out ;
}
continue ;
2023-03-04 03:21:34 -05:00
nowork :
2023-11-27 21:52:33 +13:00
if ( m - > stats ) {
2022-06-13 19:07:19 -04:00
BUG_ON ( k . k - > p . offset < = iter . pos . offset ) ;
2023-10-23 15:36:45 -04:00
atomic64_inc ( & m - > stats - > keys_raced ) ;
2022-06-13 19:07:19 -04:00
atomic64_add ( k . k - > p . offset - iter . pos . offset ,
2023-10-23 15:36:45 -04:00
& m - > stats - > sectors_raced ) ;
2022-06-13 19:07:19 -04:00
}
2022-08-27 12:48:36 -04:00
2023-11-27 22:37:27 -05:00
count_event ( c , move_extent_fail ) ;
2022-08-27 12:48:36 -04:00
2022-06-13 19:07:19 -04:00
bch2_btree_iter_advance ( & iter ) ;
goto next ;
}
out :
2022-11-14 01:31:10 -05:00
bch2_trans_iter_exit ( trans , & iter ) ;
2022-06-13 19:07:19 -04:00
bch2_bkey_buf_exit ( & _insert , c ) ;
bch2_bkey_buf_exit ( & _new , c ) ;
2022-07-17 23:06:38 -04:00
BUG_ON ( bch2_err_matches ( ret , BCH_ERR_transaction_restart ) ) ;
2022-06-13 19:07:19 -04:00
return ret ;
}
2022-11-14 01:31:10 -05:00
int bch2_data_update_index_update ( struct bch_write_op * op )
{
2023-09-12 17:16:02 -04:00
return bch2_trans_run ( op - > c , __bch2_data_update_index_update ( trans , op ) ) ;
2022-11-14 01:31:10 -05:00
}
2022-06-13 19:17:45 -04:00
void bch2_data_update_read_done ( struct data_update * m ,
struct bch_extent_crc_unpacked crc )
2022-06-13 19:07:19 -04:00
{
/* write bio must own pages: */
BUG_ON ( ! m - > op . wbio . bio . bi_vcnt ) ;
2022-06-13 19:17:45 -04:00
m - > op . crc = crc ;
m - > op . wbio . bio . bi_iter . bi_size = crc . compressed_size < < 9 ;
2022-06-13 19:07:19 -04:00
2022-06-13 19:17:45 -04:00
closure_call ( & m - > op . cl , bch2_write , NULL , NULL ) ;
}
void bch2_data_update_exit ( struct data_update * update )
{
struct bch_fs * c = update - > op . c ;
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight, we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
struct bkey_ptrs_c ptrs =
bch2_bkey_ptrs_c ( bkey_i_to_s_c ( update - > k . k ) ) ;
2023-02-11 12:38:28 -05:00
bkey_for_each_ptr ( ptrs , ptr ) {
2023-02-24 19:07:21 -05:00
if ( c - > opts . nocow_enabled )
bch2_bucket_nocow_unlock ( & c - > nocow_locks ,
PTR_BUCKET_POS ( c , ptr ) , 0 ) ;
2023-02-11 12:38:28 -05:00
percpu_ref_put ( & bch_dev_bkey_exists ( c , ptr - > dev ) - > ref ) ;
}
2022-06-13 19:17:45 -04:00
bch2_bkey_buf_exit ( & update - > k , c ) ;
bch2_disk_reservation_put ( c , & update - > op . res ) ;
bch2_bio_free_pages_pool ( c , & update - > op . wbio . bio ) ;
2022-06-13 19:07:19 -04:00
}
2023-11-24 21:51:45 -05:00
static void bch2_update_unwritten_extent ( struct btree_trans * trans ,
2022-11-14 01:31:10 -05:00
struct data_update * update )
{
struct bch_fs * c = update - > op . c ;
struct bio * bio = & update - > op . wbio . bio ;
struct bkey_i_extent * e ;
struct write_point * wp ;
struct closure cl ;
struct btree_iter iter ;
struct bkey_s_c k ;
int ret ;
closure_init_stack ( & cl ) ;
bch2_keylist_init ( & update - > op . insert_keys , update - > op . inline_keys ) ;
while ( bio_sectors ( bio ) ) {
unsigned sectors = bio_sectors ( bio ) ;
bch2_trans_iter_init ( trans , & iter , update - > btree_id , update - > op . pos ,
BTREE_ITER_SLOTS ) ;
ret = lockrestart_do ( trans , ( {
k = bch2_btree_iter_peek_slot ( & iter ) ;
bkey_err ( k ) ;
} ) ) ;
bch2_trans_iter_exit ( trans , & iter ) ;
if ( ret | | ! bch2_extents_match ( k , bkey_i_to_s_c ( update - > k . k ) ) )
break ;
e = bkey_extent_init ( update - > op . insert_keys . top ) ;
e - > k . p = update - > op . pos ;
ret = bch2_alloc_sectors_start_trans ( trans ,
update - > op . target ,
false ,
update - > op . write_point ,
& update - > op . devs_have ,
update - > op . nr_replicas ,
update - > op . nr_replicas ,
2023-06-24 19:30:10 -04:00
update - > op . watermark ,
2022-11-14 01:31:10 -05:00
0 , & cl , & wp ) ;
if ( bch2_err_matches ( ret , BCH_ERR_operation_blocked ) ) {
bch2_trans_unlock ( trans ) ;
closure_sync ( & cl ) ;
continue ;
}
2023-12-19 18:08:19 -05:00
bch_err_fn_ratelimited ( c , ret ) ;
2022-11-14 01:31:10 -05:00
if ( ret )
return ;
sectors = min ( sectors , wp - > sectors_free ) ;
bch2_key_resize ( & e - > k , sectors ) ;
bch2_open_bucket_get ( c , wp , & update - > op . open_buckets ) ;
bch2_alloc_sectors_append_ptrs ( c , wp , & e - > k_i , sectors , false ) ;
bch2_alloc_sectors_done ( c , wp ) ;
bio_advance ( bio , sectors < < 9 ) ;
update - > op . pos . offset + = sectors ;
extent_for_each_ptr ( extent_i_to_s ( e ) , ptr )
ptr - > unwritten = true ;
bch2_keylist_push ( & update - > op . insert_keys ) ;
ret = __bch2_data_update_index_update ( trans , & update - > op ) ;
bch2_open_buckets_put ( c , & update - > op . open_buckets ) ;
if ( ret )
break ;
}
2023-08-12 16:51:45 -04:00
if ( closure_nr_remaining ( & cl ) ! = 1 ) {
2022-11-14 01:31:10 -05:00
bch2_trans_unlock ( trans ) ;
closure_sync ( & cl ) ;
}
}
2023-11-24 21:51:45 -05:00
int bch2_extent_drop_ptrs ( struct btree_trans * trans ,
struct btree_iter * iter ,
struct bkey_s_c k ,
struct data_update_opts data_opts )
{
struct bch_fs * c = trans - > c ;
struct bkey_i * n ;
int ret ;
n = bch2_bkey_make_mut_noupdate ( trans , k ) ;
ret = PTR_ERR_OR_ZERO ( n ) ;
if ( ret )
return ret ;
while ( data_opts . kill_ptrs ) {
unsigned i = 0 , drop = __fls ( data_opts . kill_ptrs ) ;
struct bch_extent_ptr * ptr ;
bch2_bkey_drop_ptrs ( bkey_i_to_s ( n ) , ptr , i + + = = drop ) ;
data_opts . kill_ptrs ^ = 1U < < drop ;
}
/*
* If the new extent no longer has any pointers , bch2_extent_normalize ( )
* will do the appropriate thing with it ( turning it into a
* KEY_TYPE_error key , or just a discard if it was a cached extent )
*/
bch2_extent_normalize ( c , bkey_i_to_s ( n ) ) ;
/*
* Since we ' re not inserting through an extent iterator
* ( BTREE_ITER_ALL_SNAPSHOTS iterators aren ' t extent iterators ) ,
* we aren ' t using the extent overwrite path to delete , we ' re
* just using the normal key deletion path :
*/
2023-12-02 02:43:58 -05:00
if ( bkey_deleted ( & n - > k ) & & ! ( iter - > flags & BTREE_ITER_IS_EXTENTS ) )
2023-11-24 21:51:45 -05:00
n - > k . size = 0 ;
return bch2_trans_relock ( trans ) ? :
bch2_trans_update ( trans , iter , n , BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE ) ? :
2023-11-11 16:31:50 -05:00
bch2_trans_commit ( trans , NULL , NULL , BCH_TRANS_COMMIT_no_enospc ) ;
2023-11-24 21:51:45 -05:00
}
2023-01-02 17:53:02 -05:00
int bch2_data_update_init ( struct btree_trans * trans ,
2023-11-24 21:51:45 -05:00
struct btree_iter * iter ,
2023-01-02 17:53:02 -05:00
struct moving_context * ctxt ,
struct data_update * m ,
2022-06-13 19:07:19 -04:00
struct write_point_specifier wp ,
struct bch_io_opts io_opts ,
2022-06-13 19:17:45 -04:00
struct data_update_opts data_opts ,
2022-06-13 19:07:19 -04:00
enum btree_id btree_id ,
struct bkey_s_c k )
{
2023-01-02 17:53:02 -05:00
struct bch_fs * c = trans - > c ;
2022-06-13 19:07:19 -04:00
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c ( k ) ;
const union bch_extent_entry * entry ;
struct extent_ptr_decoded p ;
2022-06-13 19:17:45 -04:00
unsigned i , reserve_sectors = k . k - > size * data_opts . extra_replicas ;
2023-02-27 23:16:37 -05:00
unsigned ptrs_locked = 0 ;
2023-11-24 21:51:45 -05:00
int ret = 0 ;
2022-06-13 19:07:19 -04:00
2024-03-22 16:29:23 -04:00
/*
* fs is corrupt we have a key for a snapshot node that doesn ' t exist ,
* and we have to check for this because we go rw before repairing the
* snapshots table - just skip it , we can move it later .
*/
if ( unlikely ( k . k - > p . snapshot & & ! bch2_snapshot_equiv ( c , k . k - > p . snapshot ) ) )
return - BCH_ERR_data_update_done ;
2022-06-13 19:17:45 -04:00
bch2_bkey_buf_init ( & m - > k ) ;
bch2_bkey_buf_reassemble ( & m - > k , c , k ) ;
2022-06-13 19:07:19 -04:00
m - > btree_id = btree_id ;
m - > data_opts = data_opts ;
2023-10-23 15:36:45 -04:00
m - > ctxt = ctxt ;
2023-10-23 16:21:54 -04:00
m - > stats = ctxt ? ctxt - > stats : NULL ;
2022-06-13 19:07:19 -04:00
bch2_write_op_init ( & m - > op , c , io_opts ) ;
2022-06-13 19:17:45 -04:00
m - > op . pos = bkey_start_pos ( k . k ) ;
m - > op . version = k . k - > version ;
2022-10-19 18:31:33 -04:00
m - > op . target = data_opts . target ;
2022-06-13 19:07:19 -04:00
m - > op . write_point = wp ;
2023-02-27 23:16:37 -05:00
m - > op . nr_replicas = 0 ;
2022-06-13 19:17:45 -04:00
m - > op . flags | = BCH_WRITE_PAGES_STABLE |
2022-06-13 19:07:19 -04:00
BCH_WRITE_PAGES_OWNED |
BCH_WRITE_DATA_ENCODED |
2022-06-13 19:17:45 -04:00
BCH_WRITE_MOVE |
m - > data_opts . write_flags ;
2024-01-16 16:20:21 -05:00
m - > op . compression_opt = background_compression ( io_opts ) ;
2023-06-27 17:32:48 -04:00
m - > op . watermark = m - > data_opts . btree_insert_flags & BCH_WATERMARK_MASK ;
2022-06-13 19:07:19 -04:00
2023-02-11 12:38:28 -05:00
bkey_for_each_ptr ( ptrs , ptr )
percpu_ref_get ( & bch_dev_bkey_exists ( c , ptr - > dev ) - > ref ) ;
2023-11-24 21:51:45 -05:00
unsigned durability_have = 0 , durability_removing = 0 ;
2022-06-13 19:17:45 -04:00
i = 0 ;
bkey_for_each_ptr_decode ( k . k , ptrs , p , entry ) {
2023-01-06 21:11:07 +13:00
bool locked ;
2023-02-27 23:16:37 -05:00
if ( ( ( 1U < < i ) & m - > data_opts . rewrite_ptrs ) ) {
BUG_ON ( p . ptr . cached ) ;
2022-06-13 19:07:19 -04:00
2023-02-27 23:16:37 -05:00
if ( crc_is_compressed ( p . crc ) )
reserve_sectors + = k . k - > size ;
2022-06-13 19:07:19 -04:00
2023-06-13 15:12:04 -04:00
m - > op . nr_replicas + = bch2_extent_ptr_desired_durability ( c , & p ) ;
2023-11-24 21:51:45 -05:00
durability_removing + = bch2_extent_ptr_desired_durability ( c , & p ) ;
} else if ( ! p . ptr . cached & &
! ( ( 1U < < i ) & m - > data_opts . kill_ptrs ) ) {
2023-02-27 23:16:37 -05:00
bch2_dev_list_add_dev ( & m - > op . devs_have , p . ptr . dev ) ;
2023-11-24 21:51:45 -05:00
durability_have + = bch2_extent_ptr_durability ( c , & p ) ;
2023-02-27 23:16:37 -05:00
}
2022-06-13 19:07:19 -04:00
2022-06-13 19:17:45 -04:00
/*
* op - > csum_type is normally initialized from the fs / file ' s
* current options - but if an extent is encrypted , we require
* that it stays encrypted :
*/
if ( bch2_csum_type_is_encryption ( p . crc . csum_type ) ) {
m - > op . nonce = p . crc . nonce + p . crc . offset ;
m - > op . csum_type = p . crc . csum_type ;
2022-06-13 19:07:19 -04:00
}
2022-06-13 19:17:45 -04:00
if ( p . crc . compression_type = = BCH_COMPRESSION_TYPE_incompressible )
m - > op . incompressible = true ;
2022-06-13 19:07:19 -04:00
2023-02-24 19:07:21 -05:00
if ( c - > opts . nocow_enabled ) {
if ( ctxt ) {
2023-10-20 13:32:42 -04:00
move_ctxt_wait_event ( ctxt ,
2023-02-24 19:07:21 -05:00
( locked = bch2_bucket_nocow_trylock ( & c - > nocow_locks ,
PTR_BUCKET_POS ( c , & p . ptr ) , 0 ) ) | |
2024-04-02 01:03:58 -04:00
list_empty ( & ctxt - > ios ) ) ;
2023-02-24 19:07:21 -05:00
if ( ! locked )
bch2_bucket_nocow_lock ( & c - > nocow_locks ,
PTR_BUCKET_POS ( c , & p . ptr ) , 0 ) ;
} else {
if ( ! bch2_bucket_nocow_trylock ( & c - > nocow_locks ,
PTR_BUCKET_POS ( c , & p . ptr ) , 0 ) ) {
ret = - BCH_ERR_nocow_lock_blocked ;
goto err ;
}
2023-01-06 21:11:07 +13:00
}
2023-02-24 19:07:21 -05:00
ptrs_locked | = ( 1U < < i ) ;
2023-01-02 17:53:02 -05:00
}
2023-02-24 19:07:21 -05:00
2023-01-06 21:11:07 +13:00
i + + ;
2022-06-13 19:07:19 -04:00
}
2022-06-13 19:17:45 -04:00
2024-04-05 02:43:08 -04:00
unsigned durability_required = max ( 0 , ( int ) ( io_opts . data_replicas - durability_have ) ) ;
2023-11-24 21:51:45 -05:00
/*
* If current extent durability is less than io_opts . data_replicas ,
* we ' re not trying to rereplicate the extent up to data_replicas here -
* unless extra_replicas was specified
*
* Increasing replication is an explicit operation triggered by
* rereplicate , currently , so that users don ' t get an unexpected - ENOSPC
*/
2023-12-23 23:29:05 -05:00
if ( ! ( m - > data_opts . write_flags & BCH_WRITE_CACHED ) & &
2024-04-05 02:43:08 -04:00
! durability_required ) {
2023-11-24 21:51:45 -05:00
m - > data_opts . kill_ptrs | = m - > data_opts . rewrite_ptrs ;
m - > data_opts . rewrite_ptrs = 0 ;
/* if iter == NULL, it's just a promote */
if ( iter )
2023-12-02 02:43:58 -05:00
ret = bch2_extent_drop_ptrs ( trans , iter , k , m - > data_opts ) ;
2023-11-24 21:51:45 -05:00
goto done ;
}
2024-04-05 02:43:08 -04:00
m - > op . nr_replicas = min ( durability_removing , durability_required ) +
2023-11-24 21:51:45 -05:00
m - > data_opts . extra_replicas ;
2024-04-05 02:43:08 -04:00
/*
* If device ( s ) were set to durability = 0 after data was written to them
* we can end up with a duribilty = 0 extent , and the normal algorithm
* that tries not to increase durability doesn ' t work :
*/
if ( ! ( durability_have + durability_removing ) )
m - > op . nr_replicas = max ( ( unsigned ) m - > op . nr_replicas , 1 ) ;
m - > op . nr_replicas_required = m - > op . nr_replicas ;
2023-11-24 21:51:45 -05:00
2022-06-13 19:17:45 -04:00
if ( reserve_sectors ) {
ret = bch2_disk_reservation_add ( c , & m - > op . res , reserve_sectors ,
m - > data_opts . extra_replicas
? 0
: BCH_DISK_RESERVATION_NOFAIL ) ;
if ( ret )
2022-12-09 12:37:56 +13:00
goto err ;
2022-06-13 19:07:19 -04:00
}
2023-11-24 21:51:45 -05:00
if ( bkey_extent_is_unwritten ( k ) ) {
bch2_update_unwritten_extent ( trans , m ) ;
goto done ;
}
2022-11-14 01:31:10 -05:00
2022-06-13 19:07:19 -04:00
return 0 ;
2022-12-09 12:37:56 +13:00
err :
2023-01-06 21:11:07 +13:00
i = 0 ;
bkey_for_each_ptr_decode ( k . k , ptrs , p , entry ) {
if ( ( 1U < < i ) & ptrs_locked )
bch2_bucket_nocow_unlock ( & c - > nocow_locks ,
2023-02-11 12:38:28 -05:00
PTR_BUCKET_POS ( c , & p . ptr ) , 0 ) ;
percpu_ref_put ( & bch_dev_bkey_exists ( c , p . ptr . dev ) - > ref ) ;
2023-01-06 21:11:07 +13:00
i + + ;
}
2022-12-09 12:37:56 +13:00
bch2_bkey_buf_exit ( & m - > k , c ) ;
bch2_bio_free_pages_pool ( c , & m - > op . wbio . bio ) ;
return ret ;
2023-11-24 21:51:45 -05:00
done :
bch2_data_update_exit ( m ) ;
return ret ? : - BCH_ERR_data_update_done ;
2022-06-13 19:07:19 -04:00
}
2022-10-09 03:32:17 -04:00
/*
 * Fix up @opts against the pointers of @k: any pointer that is selected in
 * rewrite_ptrs but is cached is moved from rewrite_ptrs to kill_ptrs.
 * Bit N of either mask refers to the Nth pointer of @k.
 */
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned mask = 1U;

	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->cached && (opts->rewrite_ptrs & mask)) {
			opts->kill_ptrs |= mask;
			opts->rewrite_ptrs &= ~mask;
		}
		mask <<= 1;
	}
}