// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "recovery.h"
#include "super-io.h"
#include "trace.h"

#include <linux/sched/mm.h>
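
/*
 * Btree node IO is serialized on the BTREE_NODE_write_in_flight flag, which
 * doubles as a bit lock: the helpers below set/clear it and sleep on it via
 * wait_on_bit_*(), waking waiters through wake_up_bit().
 */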
void bch2_btree_node_io_unlock(struct btree *b)
{
	EBUG_ON(!btree_node_write_in_flight(b));

	clear_btree_node_write_in_flight_inner(b);
	clear_btree_node_write_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}

void bch2_btree_node_io_lock(struct btree *b)
{
	bch2_assert_btree_nodes_not_locked();

	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
			    TASK_UNINTERRUPTIBLE);
}

void __bch2_btree_node_wait_on_read(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void __bch2_btree_node_wait_on_write(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void bch2_btree_node_wait_on_read(struct btree *b)
{
	bch2_assert_btree_nodes_not_locked();

	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void bch2_btree_node_wait_on_write(struct btree *b)
{
	bch2_assert_btree_nodes_not_locked();

	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

static void verify_no_dups(struct btree *b,
			   struct bkey_packed *start,
			   struct bkey_packed *end)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bkey_packed *k, *p;

	if (start == end)
		return;

	for (p = start, k = bkey_p_next(start);
	     k != end;
	     p = k, k = bkey_p_next(k)) {
		struct bkey l = bkey_unpack_key(b, p);
		struct bkey r = bkey_unpack_key(b, k);

		BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
	}
#endif
}

static void set_needs_whiteout(struct bset *i, int v)
{
	struct bkey_packed *k;

	for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
		k->needs_whiteout = v;
}
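
/*
 * Bounce buffers for sorting/compacting: allocated with vmalloc when memory
 * allows, falling back to a mempool so forward progress is guaranteed.
 * memalloc_nofs_save() prevents recursion back into filesystem reclaim.
 */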
static void btree_bounce_free(struct bch_fs *c, size_t size,
			      bool used_mempool, void *p)
{
	if (used_mempool)
		mempool_free(p, &c->btree_bounce_pool);
	else
		vpfree(p, size);
}

static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
				bool *used_mempool)
{
	unsigned flags = memalloc_nofs_save();
	void *p;

	BUG_ON(size > c->opts.btree_node_size);

	*used_mempool = false;
	p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
	if (!p) {
		*used_mempool = true;
		p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
	}
	memalloc_nofs_restore(flags);
	return p;
}
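
/*
 * Sort an array of pointers to packed bkeys, in key order: used to sort the
 * unwritten whiteouts without moving the keys themselves.
 */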
static void sort_bkey_ptrs(const struct btree *bt,
			   struct bkey_packed **ptrs, unsigned nr)
{
	unsigned n = nr, a = nr / 2, b, c, d;

	if (!a)
		return;

	/* Heap sort: see lib/sort.c: */
	while (1) {
		if (a)
			a--;
		else if (--n)
			swap(ptrs[0], ptrs[n]);
		else
			break;

		for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
			b = bch2_bkey_cmp_packed(bt,
						 ptrs[c],
						 ptrs[d]) >= 0 ? c : d;
		if (d == n)
			b = c;

		while (b != a &&
		       bch2_bkey_cmp_packed(bt,
					    ptrs[a],
					    ptrs[b]) >= 0)
			b = (b - 1) / 2;
		c = b;
		while (b != a) {
			b = (b - 1) / 2;
			swap(ptrs[b], ptrs[c]);
		}
	}
}
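
/*
 * Sort the node's unwritten whiteouts into key order, via a bounce buffer:
 * the write path expects them sorted so they can be merged with the keys
 * being written.
 */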
static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
{
	struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
	bool used_mempool = false;
	size_t bytes = b->whiteout_u64s * sizeof(u64);

	if (!b->whiteout_u64s)
		return;

	new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);

	ptrs = ptrs_end = ((void *) new_whiteouts + bytes);

	for (k = unwritten_whiteouts_start(b);
	     k != unwritten_whiteouts_end(b);
	     k = bkey_p_next(k))
		*--ptrs = k;

	sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);

	k = new_whiteouts;

	while (ptrs != ptrs_end) {
		bkey_p_copy(k, *ptrs);
		k = bkey_p_next(k);
		ptrs++;
	}

	verify_no_dups(b, new_whiteouts,
		       (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));

	memcpy_u64s(unwritten_whiteouts_start(b),
		    new_whiteouts, b->whiteout_u64s);

	btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
}

static bool should_compact_bset(struct btree *b, struct bset_tree *t,
				bool compacting, enum compact_mode mode)
{
	if (!bset_dead_u64s(b, t))
		return false;

	switch (mode) {
	case COMPACT_LAZY:
		return should_compact_bset_lazy(b, t) ||
			(compacting && !bset_written(b, bset(b, t)));
	case COMPACT_ALL:
		return true;
	default:
		BUG();
	}
}

static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
{
	struct bset_tree *t;
	bool ret = false;

	for_each_bset(b, t) {
		struct bset *i = bset(b, t);
		struct bkey_packed *k, *n, *out, *start, *end;
		struct btree_node_entry *src = NULL, *dst = NULL;

		if (t != b->set && !bset_written(b, i)) {
			src = container_of(i, struct btree_node_entry, keys);
			dst = max(write_block(b),
				  (void *) btree_bkey_last(b, t - 1));
		}

		if (src != dst)
			ret = true;

		if (!should_compact_bset(b, t, ret, mode)) {
			if (src != dst) {
				memmove(dst, src, sizeof(*src) +
					le16_to_cpu(src->keys.u64s) *
					sizeof(u64));
				i = &dst->keys;
				set_btree_bset(b, t, i);
			}
			continue;
		}

		start	= btree_bkey_first(b, t);
		end	= btree_bkey_last(b, t);

		if (src != dst) {
			memmove(dst, src, sizeof(*src));
			i = &dst->keys;
			set_btree_bset(b, t, i);
		}

		out = i->start;

		for (k = start; k != end; k = n) {
			n = bkey_p_next(k);

			if (!bkey_deleted(k)) {
				bkey_p_copy(out, k);
				out = bkey_p_next(out);
			} else {
				BUG_ON(k->needs_whiteout);
			}
		}

		i->u64s = cpu_to_le16((u64 *) out - i->_data);
		set_btree_bset_end(b, t);
		bch2_bset_set_no_aux_tree(b, t);
		ret = true;
	}

	bch2_verify_btree_nr_keys(b);

	bch2_btree_build_aux_trees(b);

	return ret;
}

bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
			    enum compact_mode mode)
{
	return bch2_drop_whiteouts(b, mode);
}
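
/*
 * Sort bsets [start_idx, end_idx) down into a single bset, using a bounce
 * buffer for the output; if the entire node is being sorted we can simply
 * swap buffers instead of copying the result back.
 */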
static void btree_node_sort(struct bch_fs *c, struct btree *b,
			    unsigned start_idx,
			    unsigned end_idx,
			    bool filter_whiteouts)
{
	struct btree_node *out;
	struct sort_iter_stack sort_iter;
	struct bset_tree *t;
	struct bset *start_bset = bset(b, &b->set[start_idx]);
	bool used_mempool = false;
	u64 start_time, seq = 0;
	unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
	bool sorting_entire_node = start_idx == 0 &&
		end_idx == b->nsets;

	sort_iter_stack_init(&sort_iter, b);

	for (t = b->set + start_idx;
	     t < b->set + end_idx;
	     t++) {
		u64s += le16_to_cpu(bset(b, t)->u64s);
		sort_iter_add(&sort_iter.iter,
			      btree_bkey_first(b, t),
			      btree_bkey_last(b, t));
	}

	bytes = sorting_entire_node
		? btree_buf_bytes(b)
		: __vstruct_bytes(struct btree_node, u64s);

	out = btree_bounce_alloc(c, bytes, &used_mempool);

	start_time = local_clock();

	u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);

	out->keys.u64s = cpu_to_le16(u64s);

	BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);

	if (sorting_entire_node)
		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
				       start_time);

	/* Make sure we preserve bset journal_seq: */
	for (t = b->set + start_idx; t < b->set + end_idx; t++)
		seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
	start_bset->journal_seq = cpu_to_le64(seq);

	if (sorting_entire_node) {
		u64s = le16_to_cpu(out->keys.u64s);

		BUG_ON(bytes != btree_buf_bytes(b));

		/*
		 * Our temporary buffer is the same size as the btree node's
		 * buffer, we can just swap buffers instead of doing a big
		 * memcpy()
		 */
		*out = *b->data;
		out->keys.u64s = cpu_to_le16(u64s);
		swap(out, b->data);
		set_btree_bset(b, b->set, &b->data->keys);
	} else {
		start_bset->u64s = out->keys.u64s;
		memcpy_u64s(start_bset->start,
			    out->keys.start,
			    le16_to_cpu(out->keys.u64s));
	}

	for (i = start_idx + 1; i < end_idx; i++)
		b->nr.bset_u64s[start_idx] +=
			b->nr.bset_u64s[i];

	b->nsets -= shift;

	for (i = start_idx + 1; i < b->nsets; i++) {
		b->nr.bset_u64s[i]	= b->nr.bset_u64s[i + shift];
		b->set[i]		= b->set[i + shift];
	}

	for (i = b->nsets; i < MAX_BSETS; i++)
		b->nr.bset_u64s[i] = 0;

	set_btree_bset_end(b, &b->set[start_idx]);
	bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);

	btree_bounce_free(c, bytes, used_mempool, out);

	bch2_verify_btree_nr_keys(b);
}

void bch2_btree_sort_into(struct bch_fs *c,
			  struct btree *dst,
			  struct btree *src)
{
	struct btree_nr_keys nr;
	struct btree_node_iter src_iter;
	u64 start_time = local_clock();

	BUG_ON(dst->nsets != 1);

	bch2_bset_set_no_aux_tree(dst, dst->set);

	bch2_btree_node_iter_init_from_start(&src_iter, src);

	nr = bch2_sort_repack(btree_bset_first(dst),
			      src, &src_iter,
			      &dst->format,
			      true);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
			       start_time);

	set_btree_bset_end(dst, dst->set);

	dst->nr.live_u64s	+= nr.live_u64s;
	dst->nr.bset_u64s[0]	+= nr.bset_u64s[0];
	dst->nr.packed_keys	+= nr.packed_keys;
	dst->nr.unpacked_keys	+= nr.unpacked_keys;

	bch2_verify_btree_nr_keys(dst);
}

/*
 * We're about to add another bset to the btree node, so if there are currently
 * too many bsets - sort some of them together:
 */
static bool btree_node_compact(struct bch_fs *c, struct btree *b)
{
	unsigned unwritten_idx;
	bool ret = false;

	for (unwritten_idx = 0;
	     unwritten_idx < b->nsets;
	     unwritten_idx++)
		if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
			break;

	if (b->nsets - unwritten_idx > 1) {
		btree_node_sort(c, b, unwritten_idx,
				b->nsets, false);
		ret = true;
	}

	if (unwritten_idx > 1) {
		btree_node_sort(c, b, 0, unwritten_idx, false);
		ret = true;
	}

	return ret;
}

void bch2_btree_build_aux_trees(struct btree *b)
{
	struct bset_tree *t;

	for_each_bset(b, t)
		bch2_bset_build_aux_tree(b, t,
				!bset_written(b, bset(b, t)) &&
				t == bset_tree_last(b));
}

/*
 * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
 *
 * The first bset is going to be of similar order to the size of the node, the
 * last bset is bounded by btree_write_set_buffer(), which is set to keep the
 * memmove on insert from being too expensive: the middle bset should, ideally,
 * be the geometric mean of the first and the last.
 *
 * Returns true if the middle bset is greater than that geometric mean:
 */
static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
{
	unsigned mid_u64s_bits =
		(ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;

	return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
}

/*
 * @bch2_btree_init_next - initialize a new (unwritten) bset that can then be
 * inserted into
 *
 * Safe to call if there already is an unwritten bset - will only add a new bset
 * if @b doesn't already have one.
 *
 * Returns true if we sorted (i.e. invalidated iterators)
 */
void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_node_entry *bne;
	bool reinit_iter = false;

	EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
	BUG_ON(bset_written(b, bset(b, &b->set[1])));
	BUG_ON(btree_node_just_written(b));

	if (b->nsets == MAX_BSETS &&
	    !btree_node_write_in_flight(b) &&
	    should_compact_all(c, b)) {
		bch2_btree_node_write(c, b, SIX_LOCK_write,
				      BTREE_WRITE_init_next_bset);
		reinit_iter = true;
	}

	if (b->nsets == MAX_BSETS &&
	    btree_node_compact(c, b))
		reinit_iter = true;

	BUG_ON(b->nsets >= MAX_BSETS);

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(b, bne);

	bch2_btree_build_aux_trees(b);

	if (reinit_iter)
		bch2_trans_node_reinit_iter(trans, b);
}
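
/*
 * Format the common prefix for btree node read/write error messages: which
 * device, btree/position, node offset and bset the error was found in.
 */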
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
			  struct bch_dev *ca,
			  struct btree *b, struct bset *i,
			  unsigned offset, int write)
{
	prt_printf(out, bch2_log_msg(c, "%s"),
		   write == READ
		   ? "error validating btree node "
		   : "corrupt btree node before write ");
	if (ca)
		prt_printf(out, "on %s ", ca->name);
	prt_printf(out, "at btree ");
	bch2_btree_pos_to_text(out, c, b);

	prt_printf(out, "\n  node offset %u/%u",
		   b->written, btree_ptr_sectors_written(&b->key));
	if (i)
		prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
	prt_str(out, ": ");
}

__printf(9, 10)
static int __btree_err(int ret,
		       struct bch_fs *c,
		       struct bch_dev *ca,
		       struct btree *b,
		       struct bset *i,
		       int write,
		       bool have_retry,
		       enum bch_sb_error_id err_type,
		       const char *fmt, ...)
{
	struct printbuf out = PRINTBUF;
	va_list args;

	btree_err_msg(&out, c, ca, b, i, b->written, write);

	va_start(args, fmt);
	prt_vprintf(&out, fmt, args);
	va_end(args);

	if (write == WRITE) {
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		ret = c->opts.errors == BCH_ON_ERROR_continue
			? 0
			: -BCH_ERR_fsck_errors_not_fixed;
		goto out;
	}

	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
		ret = -BCH_ERR_btree_node_read_err_fixable;
	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
		ret = -BCH_ERR_btree_node_read_err_bad_node;

	if (ret != -BCH_ERR_btree_node_read_err_fixable)
		bch2_sb_error_count(c, err_type);

	switch (ret) {
	case -BCH_ERR_btree_node_read_err_fixable:
		ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf);
		if (ret != -BCH_ERR_fsck_fix &&
		    ret != -BCH_ERR_fsck_ignore)
			goto fsck_err;
		ret = -BCH_ERR_fsck_fix;
		break;
	case -BCH_ERR_btree_node_read_err_want_retry:
	case -BCH_ERR_btree_node_read_err_must_retry:
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		break;
	case -BCH_ERR_btree_node_read_err_bad_node:
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		bch2_topology_error(c);
		ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
		break;
	case -BCH_ERR_btree_node_read_err_incompatible:
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		ret = -BCH_ERR_fsck_errors_not_fixed;
		break;
	default:
		BUG();
	}
out:
fsck_err:
	printbuf_exit(&out);
	return ret;
}
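
/*
 * btree_err()/btree_err_on(): report an error against a btree node/bset. If
 * __btree_err() didn't fix it, the error is stashed in @ret and we jump to
 * the caller's fsck_err label; if it was fixed, *saw_error records that we
 * saw (and repaired) an error so the caller can schedule a rewrite.
 */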
#define btree_err(type, c, ca, b, i, _err_type, msg, ...)		\
({									\
	int _ret = __btree_err(type, c, ca, b, i, write, have_retry,	\
			       BCH_FSCK_ERR_##_err_type,		\
			       msg, ##__VA_ARGS__);			\
									\
	if (_ret != -BCH_ERR_fsck_fix) {				\
		ret = _ret;						\
		goto fsck_err;						\
	}								\
									\
	*saw_error = true;						\
})

#define btree_err_on(cond, ...)	((cond) ? btree_err(__VA_ARGS__) : false)

/*
 * When btree topology repair changes the start or end of a node, that might
 * mean we have to drop keys that are no longer inside the node:
 */
__cold
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
	struct bset_tree *t;

	for_each_bset(b, t) {
		struct bset *i = bset(b, t);
		struct bkey_packed *k;

		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
			if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
				break;

		if (k != i->start) {
			unsigned shift = (u64 *) k - (u64 *) i->start;

			memmove_u64s_down(i->start, k,
					  (u64 *) vstruct_end(i) - (u64 *) k);
			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
			set_btree_bset_end(b, t);
		}

		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
			if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
				break;

		if (k != vstruct_last(i)) {
			i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
			set_btree_bset_end(b, t);
		}
	}

	/*
	 * Always rebuild search trees: eytzinger search tree nodes directly
	 * depend on the values of min/max key:
	 */
	bch2_bset_set_no_aux_tree(b, b->set);
	bch2_btree_build_aux_trees(b);

	struct bkey_s_c k;
	struct bkey unpacked;
	struct btree_node_iter iter;
	for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
	}
}
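
/*
 * Validate a single bset's header against the btree node it belongs to:
 * version, size, sector offset, and - for the first bset - the btree node
 * header fields (sequence number, btree id, level, min/max keys, format).
 */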
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
			 struct btree *b, struct bset *i,
			 unsigned offset, unsigned sectors,
			 int write, bool have_retry, bool *saw_error)
{
	unsigned version = le16_to_cpu(i->version);
	struct printbuf buf1 = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
	int ret = 0;

	btree_err_on(!bch2_version_compatible(version),
		     -BCH_ERR_btree_node_read_err_incompatible,
		     c, ca, b, i,
		     btree_node_unsupported_version,
		     "unsupported bset version %u.%u",
		     BCH_VERSION_MAJOR(version),
		     BCH_VERSION_MINOR(version));

	if (btree_err_on(version < c->sb.version_min,
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, NULL, b, i,
			 btree_node_bset_older_than_sb_min,
			 "bset version %u older than superblock version_min %u",
			 version, c->sb.version_min)) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->version_min = cpu_to_le16(version);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	if (btree_err_on(BCH_VERSION_MAJOR(version) >
			 BCH_VERSION_MAJOR(c->sb.version),
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, NULL, b, i,
			 btree_node_bset_newer_than_sb,
			 "bset version %u newer than superblock version %u",
			 version, c->sb.version)) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->version = cpu_to_le16(version);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
		     -BCH_ERR_btree_node_read_err_incompatible,
		     c, ca, b, i,
		     btree_node_unsupported_version,
		     "BSET_SEPARATE_WHITEOUTS no longer supported");

	if (btree_err_on(offset + sectors > btree_sectors(c),
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, ca, b, i,
			 bset_past_end_of_btree_node,
			 "bset past end of btree node")) {
		i->u64s = 0;
		ret = 0;
		goto out;
	}

	btree_err_on(offset && !i->u64s,
		     -BCH_ERR_btree_node_read_err_fixable,
		     c, ca, b, i,
		     bset_empty,
		     "empty bset");

	btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
		     -BCH_ERR_btree_node_read_err_want_retry,
		     c, ca, b, i,
		     bset_wrong_sector_offset,
		     "bset at wrong sector offset");

	if (!offset) {
		struct btree_node *bn =
			container_of(i, struct btree_node, keys);

		/* These indicate that we read the wrong btree node: */

		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
			struct bch_btree_ptr_v2 *bp =
				&bkey_i_to_btree_ptr_v2(&b->key)->v;

			/* XXX endianness */
			btree_err_on(bp->seq != bn->keys.seq,
				     -BCH_ERR_btree_node_read_err_must_retry,
				     c, ca, b, NULL,
				     bset_bad_seq,
				     "incorrect sequence number (wrong btree node)");
		}

		btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, i,
			     btree_node_bad_btree,
			     "incorrect btree id");

		btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, i,
			     btree_node_bad_level,
			     "incorrect level");

		if (!write)
			compat_btree_node(b->c.level, b->c.btree_id, version,
					  BSET_BIG_ENDIAN(i), write, bn);

		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
			struct bch_btree_ptr_v2 *bp =
				&bkey_i_to_btree_ptr_v2(&b->key)->v;

			if (BTREE_PTR_RANGE_UPDATED(bp)) {
				b->data->min_key = bp->min_key;
				b->data->max_key = b->key.k.p;
			}

			btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
				     -BCH_ERR_btree_node_read_err_must_retry,
				     c, ca, b, NULL,
				     btree_node_bad_min_key,
				     "incorrect min_key: got %s should be %s",
				     (printbuf_reset(&buf1),
				      bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
				     (printbuf_reset(&buf2),
				      bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
		}

		btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, i,
			     btree_node_bad_max_key,
			     "incorrect max key %s",
			     (printbuf_reset(&buf1),
			      bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));

		if (write)
			compat_btree_node(b->c.level, b->c.btree_id, version,
					  BSET_BIG_ENDIAN(i), write, bn);

		btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
			     -BCH_ERR_btree_node_read_err_bad_node,
			     c, ca, b, i,
			     btree_node_bad_format,
			     "invalid bkey format: %s\n%s", buf1.buf,
			     (printbuf_reset(&buf2),
			      bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
		printbuf_reset(&buf1);

		compat_bformat(b->c.level, b->c.btree_id, version,
			       BSET_BIG_ENDIAN(i), write,
			       &bn->format);
	}
out:
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}

static int bset_key_invalid(struct bch_fs *c, struct btree *b,
			    struct bkey_s_c k,
			    bool updated_range, int rw,
			    struct printbuf *err)
{
	return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
		(!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?:
		(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}
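
/*
 * Lightweight structural check of a packed bkey, used when scanning forward
 * for the next plausible key after a corrupt one has been dropped.
 */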
static bool __bkey_valid(struct bch_fs *c, struct btree *b,
			 struct bset *i, struct bkey_packed *k)
{
	if (bkey_p_next(k) > vstruct_last(i))
		return false;

	if (k->format > KEY_FORMAT_CURRENT)
		return false;

	struct printbuf buf = PRINTBUF;
	struct bkey tmp;
	struct bkey_s u = __bkey_disassemble(b, k, &tmp);
	bool ret = !__bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf);
	printbuf_exit(&buf);
	return ret;
}
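
/*
 * Validate the keys within a bset: structural checks, bkey_invalid checks,
 * and ordering; corrupt keys are dropped (or the bset truncated) rather than
 * failing the whole node.
 */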
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
			      struct bset *i, int write,
			      bool have_retry, bool *saw_error)
{
	unsigned version = le16_to_cpu(i->version);
	struct bkey_packed *k, *prev = NULL;
	struct printbuf buf = PRINTBUF;
	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
	int ret = 0;

	for (k = i->start;
	     k != vstruct_last(i);) {
		struct bkey_s u;
		struct bkey tmp;
		unsigned next_good_key;

		if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, i,
				 btree_node_bkey_past_bset_end,
				 "key extends past end of bset")) {
			i->u64s = cpu_to_le16((u64 *) k - i->_data);
			break;
		}

		if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, i,
				 btree_node_bkey_bad_format,
				 "invalid bkey format %u", k->format))
			goto drop_this_key;

		/* XXX: validate k->u64s */
		if (!write)
			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
					 BSET_BIG_ENDIAN(i), write,
					 &b->format, k);

		u = __bkey_disassemble(b, k, &tmp);

		printbuf_reset(&buf);
		if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
			printbuf_reset(&buf);
			bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
			prt_printf(&buf, "\n  ");
			bch2_bkey_val_to_text(&buf, c, u.s_c);

			btree_err(-BCH_ERR_btree_node_read_err_fixable,
				  c, NULL, b, i,
				  btree_node_bad_bkey,
				  "invalid bkey: %s", buf.buf);
			goto drop_this_key;
		}

		if (write)
			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
					 BSET_BIG_ENDIAN(i), write,
					 &b->format, k);

		if (prev && bkey_iter_cmp(b, prev, k) > 0) {
			struct bkey up = bkey_unpack_key(b, prev);

			printbuf_reset(&buf);
			prt_printf(&buf, "keys out of order: ");
			bch2_bkey_to_text(&buf, &up);
			prt_printf(&buf, " > ");
			bch2_bkey_to_text(&buf, u.k);

			if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
				      c, NULL, b, i,
				      btree_node_bkey_out_of_order,
				      "%s", buf.buf))
				goto drop_this_key;
		}

		prev = k;
		k = bkey_p_next(k);
		continue;
drop_this_key:
		next_good_key = k->u64s;

		if (!next_good_key ||
		    (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
		     version >= bcachefs_metadata_version_snapshot)) {
			/*
			 * only do scanning if bch2_bkey_compat() has nothing to
			 * do
			 */

			if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
				for (next_good_key = 1;
				     next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
				     next_good_key++)
					if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
						goto got_good_key;
			}

			/*
			 * didn't find a good key, have to truncate the rest of
			 * the bset
			 */
			next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
		}
got_good_key:
		le16_add_cpu(&i->u64s, -next_good_key);
		memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
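
/*
 * Read-side entry point: verify and sort all bsets found in a freshly read
 * btree node, dropping whatever can safely be dropped and returning nonzero
 * if the read should be retried from another replica.
 */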
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
			      struct btree *b, bool have_retry, bool *saw_error)
{
	struct btree_node_entry *bne;
	struct sort_iter *iter;
	struct btree_node *sorted;
	struct bkey_packed *k;
	struct bset *i;
	bool used_mempool, blacklisted;
	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
	unsigned u64s;
	unsigned ptr_written = btree_ptr_sectors_written(&b->key);
	struct printbuf buf = PRINTBUF;
	int ret = 0, retry_read = 0, write = READ;
	u64 start_time = local_clock();

	b->version_ondisk = U16_MAX;
	/* We might get called multiple times on read retry: */
	b->written = 0;

	iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
	sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);

	if (bch2_meta_read_fault("btree"))
		btree_err(-BCH_ERR_btree_node_read_err_must_retry,
			  c, ca, b, NULL,
			  btree_node_fault_injected,
			  "dynamic fault");

	btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
		     -BCH_ERR_btree_node_read_err_must_retry,
		     c, ca, b, NULL,
		     btree_node_bad_magic,
		     "bad magic: want %llx, got %llx",
		     bset_magic(c), le64_to_cpu(b->data->magic));

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		struct bch_btree_ptr_v2 *bp =
			&bkey_i_to_btree_ptr_v2(&b->key)->v;

		bch2_bpos_to_text(&buf, b->data->min_key);
		prt_str(&buf, "-");
		bch2_bpos_to_text(&buf, b->data->max_key);

		btree_err_on(b->data->keys.seq != bp->seq,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, NULL,
			     btree_node_bad_seq,
			     "got wrong btree node (want %llx got %llx)\n"
			     "got btree %s level %llu pos %s",
			     bp->seq, b->data->keys.seq,
			     bch2_btree_id_str(BTREE_NODE_ID(b->data)),
			     BTREE_NODE_LEVEL(b->data),
			     buf.buf);
	} else {
		btree_err_on(!b->data->keys.seq,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, NULL,
			     btree_node_bad_seq,
			     "bad btree header: seq 0");
	}

	while (b->written < (ptr_written ?: btree_sectors(c))) {
		unsigned sectors;
		struct nonce nonce;
		bool first = !b->written;
		bool csum_bad;

		if (!b->written) {
			i = &b->data->keys;

			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i,
				     bset_unknown_csum,
				     "unknown checksum type %llu", BSET_CSUM_TYPE(i));

			nonce = btree_nonce(i, b->written << 9);

			struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
			csum_bad = bch2_crc_cmp(b->data->csum, csum);
			if (csum_bad)
				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

			btree_err_on(csum_bad,
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i,
				     bset_bad_csum,
				     "%s",
				     (printbuf_reset(&buf),
				      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
				      buf.buf));

			ret = bset_encrypt(c, i, b->written << 9);
			if (bch2_fs_fatal_err_on(ret, c,
					"error decrypting btree node: %i", ret))
				goto fsck_err;

			btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
				     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
				     -BCH_ERR_btree_node_read_err_incompatible,
				     c, NULL, b, NULL,
				     btree_node_unsupported_version,
				     "btree node does not have NEW_EXTENT_OVERWRITE set");

			sectors = vstruct_sectors(b->data, c->block_bits);
		} else {
			bne = write_block(b);
			i = &bne->keys;

			if (i->seq != b->data->keys.seq)
				break;

			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i,
				     bset_unknown_csum,
				     "unknown checksum type %llu", BSET_CSUM_TYPE(i));

			nonce = btree_nonce(i, b->written << 9);

			struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
			csum_bad = bch2_crc_cmp(bne->csum, csum);
			if (csum_bad)
				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

			btree_err_on(csum_bad,
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i,
				     bset_bad_csum,
				     "%s",
				     (printbuf_reset(&buf),
				      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
				      buf.buf));

			ret = bset_encrypt(c, i, b->written << 9);
			if (bch2_fs_fatal_err_on(ret, c,
					"error decrypting btree node: %i\n", ret))
				goto fsck_err;

			sectors = vstruct_sectors(bne, c->block_bits);
		}

		b->version_ondisk = min(b->version_ondisk,
					le16_to_cpu(i->version));

		ret = validate_bset(c, ca, b, i, b->written, sectors,
				    READ, have_retry, saw_error);
		if (ret)
			goto fsck_err;

		if (!b->written)
			btree_node_set_format(b, b->data->format);

		ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
		if (ret)
			goto fsck_err;

		SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);

		blacklisted = bch2_journal_seq_is_blacklisted(c,
					le64_to_cpu(i->journal_seq),
					true);

		btree_err_on(blacklisted && first,
			     -BCH_ERR_btree_node_read_err_fixable,
			     c, ca, b, i,
			     bset_blacklisted_journal_seq,
			     "first btree node bset has blacklisted journal seq (%llu)",
			     le64_to_cpu(i->journal_seq));

		btree_err_on(blacklisted && ptr_written,
			     -BCH_ERR_btree_node_read_err_fixable,
			     c, ca, b, i,
			     first_bset_blacklisted_journal_seq,
			     "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
			     le64_to_cpu(i->journal_seq),
			     b->written, b->written + sectors, ptr_written);

		b->written += sectors;

		if (blacklisted && !first)
			continue;

		sort_iter_add(iter,
			      vstruct_idx(i, 0),
			      vstruct_last(i));
	}

	if (ptr_written) {
		btree_err_on(b->written < ptr_written,
			     -BCH_ERR_btree_node_read_err_want_retry,
			     c, ca, b, NULL,
			     btree_node_data_missing,
			     "btree node data missing: expected %u sectors, found %u",
			     ptr_written, b->written);
	} else {
		for (bne = write_block(b);
		     bset_byte_offset(b, bne) < btree_buf_bytes(b);
		     bne = (void *) bne + block_bytes(c))
			btree_err_on(bne->keys.seq == b->data->keys.seq &&
				     !bch2_journal_seq_is_blacklisted(c,
								      le64_to_cpu(bne->keys.journal_seq),
								      true),
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, NULL,
				     btree_node_bset_after_end,
				     "found bset signature after last bset");
	}

	sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
	sorted->keys.u64s = 0;

	set_btree_bset(b, b->set, &b->data->keys);

	b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);

	u64s = le16_to_cpu(sorted->keys.u64s);
	*sorted = *b->data;
	sorted->keys.u64s = cpu_to_le16(u64s);
	swap(sorted, b->data);
	set_btree_bset(b, b->set, &b->data->keys);
	b->nsets = 1;

	BUG_ON(b->nr.live_u64s != u64s);

	btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);

	if (updated_range)
		bch2_btree_node_drop_keys_outside_node(b);

	i = &b->data->keys;
	for (k = i->start; k != vstruct_last(i);) {
		struct bkey tmp;
		struct bkey_s u = __bkey_disassemble(b, k, &tmp);

		printbuf_reset(&buf);

		if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) ||
		    (bch2_inject_invalid_keys &&
		     !bversion_cmp(u.k->version, MAX_VERSION))) {
			printbuf_reset(&buf);

			prt_printf(&buf, "invalid bkey: ");
			bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
			prt_printf(&buf, "\n  ");
			bch2_bkey_val_to_text(&buf, c, u.s_c);

			btree_err(-BCH_ERR_btree_node_read_err_fixable,
				  c, NULL, b, i,
				  btree_node_bad_bkey,
				  "%s", buf.buf);

			btree_keys_account_key_drop(&b->nr, 0, k);

			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
			memmove_u64s_down(k, bkey_p_next(k),
					  (u64 *) vstruct_end(i) - (u64 *) k);
			set_btree_bset_end(b, b->set);
			continue;
		}

		if (u.k->type == KEY_TYPE_btree_ptr_v2) {
			struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);

			bp.v->mem_ptr = 0;
		}

		k = bkey_p_next(k);
	}

	bch2_bset_build_aux_tree(b, b->set, false);

	set_needs_whiteout(btree_bset_first(b), true);

	btree_node_reset_sib_u64s(b);

	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
		struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);

		if (ca2->mi.state != BCH_MEMBER_STATE_rw)
			set_btree_node_need_rewrite(b);
	}

	if (!ptr_written)
		set_btree_node_need_rewrite(b);
out:
	mempool_free(iter, &c->fill_iter);
	printbuf_exit(&buf);
	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
	return retry_read;
fsck_err:
	if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
	    ret == -BCH_ERR_btree_node_read_err_must_retry)
		retry_read = 1;
	else
		set_btree_node_read_error(b);
	goto out;
}
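
/*
 * Read completion: on error, walk the remaining replicas and retry the read
 * until we get a good copy or run out of devices to try.
 */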
static void btree_node_read_work(struct work_struct *work)
{
	struct btree_read_bio *rb =
		container_of(work, struct btree_read_bio, work);
	struct bch_fs *c = rb->c;
	struct btree *b = rb->b;
	struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
	struct bio *bio = &rb->bio;
	struct bch_io_failures failed = { .nr = 0 };
	struct printbuf buf = PRINTBUF;
	bool saw_error = false;
	bool retry = false;
	bool can_retry;

	goto start;
	while (1) {
		retry = true;
		bch_info(c, "retrying read");
		ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
		rb->have_ioref = bch2_dev_get_ioref(ca, READ);
		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
		bio->bi_iter.bi_sector = rb->pick.ptr.offset;
		bio->bi_iter.bi_size = btree_buf_bytes(b);

		if (rb->have_ioref) {
			bio_set_dev(bio, ca->disk_sb.bdev);
			submit_bio_wait(bio);
		} else {
			bio->bi_status = BLK_STS_REMOVED;
		}
start:
		printbuf_reset(&buf);
		bch2_btree_pos_to_text(&buf, c, b);
		bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
				   "btree read error %s for %s",
				   bch2_blk_status_to_str(bio->bi_status), buf.buf);
		if (rb->have_ioref)
			percpu_ref_put(&ca->io_ref);
		rb->have_ioref = false;

		bch2_mark_io_failure(&failed, &rb->pick);

		can_retry = bch2_bkey_pick_read_device(c,
				bkey_i_to_s_c(&b->key),
				&failed, &rb->pick) > 0;

		if (!bio->bi_status &&
		    !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
			if (retry)
				bch_info(c, "retry success");
			break;
		}

		saw_error = true;

		if (!can_retry) {
			set_btree_node_read_error(b);
			break;
		}
	}

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
			       rb->start_time);
	bio_put(&rb->bio);

	if (saw_error && !btree_node_read_error(b)) {
		printbuf_reset(&buf);
		bch2_bpos_to_text(&buf, b->key.k.p);
		bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
			 __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);

		bch2_btree_node_rewrite_async(c, b);
	}

	printbuf_exit(&buf);
	clear_btree_node_read_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}

static void btree_node_read_endio(struct bio *bio)
{
	struct btree_read_bio *rb =
		container_of(bio, struct btree_read_bio, bio);
	struct bch_fs *c = rb->c;

	if (rb->have_ioref) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);

		bch2_latency_acct(ca, rb->start_time, READ);
	}

	queue_work(c->io_complete_wq, &rb->work);
}

struct btree_node_read_all {
	struct closure		cl;
	struct bch_fs		*c;
	struct btree		*b;
	unsigned		nr;
	void			*buf[BCH_REPLICAS_MAX];
	struct bio		*bio[BCH_REPLICAS_MAX];
	blk_status_t		err[BCH_REPLICAS_MAX];
};
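
/*
 * Figure out how many sectors of a replica's buffer contain valid bsets, by
 * walking the bset headers and stopping at the first sequence mismatch.
 */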
static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;
	unsigned offset = 0;

	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return 0;

	while (offset < btree_sectors(c)) {
		if (!offset) {
			offset += vstruct_sectors(bn, c->block_bits);
		} else {
			bne = data + (offset << 9);
			if (bne->keys.seq != bn->keys.seq)
				break;
			offset += vstruct_sectors(bne, c->block_bits);
		}
	}

	return offset;
}

static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;

	if (!offset)
		return false;

	while (offset < btree_sectors(c)) {
		bne = data + (offset << 9);
		if (bne->keys.seq == bn->keys.seq)
			return true;
		offset++;
	}

	return false;
}
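
/*
 * All-replicas read completion: compare the copies we read, complain about
 * replicas that disagree, then validate the best (longest) copy.
 */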
static CLOSURE_CALLBACK ( btree_node_read_all_replicas_done )
2021-05-21 23:57:37 -04:00
{
2023-11-17 19:13:27 -05:00
closure_type ( ra , struct btree_node_read_all , cl ) ;
2021-05-21 23:57:37 -04:00
struct bch_fs * c = ra - > c ;
struct btree * b = ra - > b ;
2022-02-25 13:18:19 -05:00
struct printbuf buf = PRINTBUF ;
2021-05-21 23:57:37 -04:00
	bool dump_bset_maps = false;
	bool have_retry = false;
	int ret = 0, best = -1, write = READ;
	unsigned i, written = 0, written2 = 0;
	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
		? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
	bool _saw_error = false, *saw_error = &_saw_error;

	for (i = 0; i < ra->nr; i++) {
		struct btree_node *bn = ra->buf[i];

		if (ra->err[i])
			continue;

		if (le64_to_cpu(bn->magic) != bset_magic(c) ||
		    (seq && seq != bn->keys.seq))
			continue;

		if (best < 0) {
			best = i;
			written = btree_node_sectors_written(c, bn);
			continue;
		}

		written2 = btree_node_sectors_written(c, ra->buf[i]);

		if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, NULL,
				 btree_node_replicas_sectors_written_mismatch,
				 "btree node sectors written mismatch: %u != %u",
				 written, written2) ||
		    btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, NULL,
				 btree_node_bset_after_end,
				 "found bset signature after last bset") ||
		    btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, NULL,
				 btree_node_replicas_data_mismatch,
				 "btree node replicas content mismatch"))
			dump_bset_maps = true;

		if (written2 > written) {
			written = written2;
			best = i;
		}
	}
fsck_err:
	if (dump_bset_maps) {
		for (i = 0; i < ra->nr; i++) {
			struct btree_node *bn = ra->buf[i];
			struct btree_node_entry *bne = NULL;
			unsigned offset = 0, sectors;
			bool gap = false;

			if (ra->err[i])
				continue;

			printbuf_reset(&buf);

			while (offset < btree_sectors(c)) {
				if (!offset) {
					sectors = vstruct_sectors(bn, c->block_bits);
				} else {
					bne = ra->buf[i] + (offset << 9);
					if (bne->keys.seq != bn->keys.seq)
						break;
					sectors = vstruct_sectors(bne, c->block_bits);
				}

				prt_printf(&buf, " %u-%u", offset, offset + sectors);
				if (bne && bch2_journal_seq_is_blacklisted(c,
							le64_to_cpu(bne->keys.journal_seq), false))
					prt_printf(&buf, "*");
				offset += sectors;
			}

			while (offset < btree_sectors(c)) {
				bne = ra->buf[i] + (offset << 9);
				if (bne->keys.seq == bn->keys.seq) {
					if (!gap)
						prt_printf(&buf, " GAP");
					gap = true;

					sectors = vstruct_sectors(bne, c->block_bits);
					prt_printf(&buf, " %u-%u", offset, offset + sectors);
					if (bch2_journal_seq_is_blacklisted(c,
							le64_to_cpu(bne->keys.journal_seq), false))
						prt_printf(&buf, "*");
				}
				offset++;
			}

			bch_err(c, "replica %u:%s", i, buf.buf);
		}
	}

	if (best >= 0) {
		memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
		ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
	} else {
		ret = -1;
	}

	if (ret)
		set_btree_node_read_error(b);
	else if (*saw_error)
		bch2_btree_node_rewrite_async(c, b);

	for (i = 0; i < ra->nr; i++) {
		mempool_free(ra->buf[i], &c->btree_bounce_pool);
		bio_put(ra->bio[i]);
	}

	closure_debug_destroy(&ra->cl);
	kfree(ra);
	printbuf_exit(&buf);

	clear_btree_node_read_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}

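/*
 * Completion for one per-replica read issued by
 * btree_node_read_all_replicas(): account read latency if we held a device
 * ref, record this replica's bio status, and drop our ref on the parent
 * closure.
 */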
static void btree_node_read_all_replicas_endio(struct bio *bio)
{
	struct btree_read_bio *rb =
		container_of(bio, struct btree_read_bio, bio);
	struct bch_fs *c = rb->c;
	struct btree_node_read_all *ra = rb->ra;

	if (rb->have_ioref) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);

		bch2_latency_acct(ca, rb->start_time, READ);
	}

	ra->err[rb->idx] = bio->bi_status;
	closure_put(&ra->cl);
}

/*
 * XXX This allocates multiple times from the same mempools, and can deadlock
 * under sufficient memory pressure (but is only a debug path)
 */
static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
{
	struct bkey_s_c k = bkey_i_to_s_c(&b->key);
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded pick;
	struct btree_node_read_all *ra;
	unsigned i;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;

	closure_init(&ra->cl, NULL);
	ra->c	= c;
	ra->b	= b;
	ra->nr	= bch2_bkey_nr_ptrs(k);

	for (i = 0; i < ra->nr; i++) {
		ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
		ra->bio[i] = bio_alloc_bioset(NULL,
					      buf_pages(ra->buf[i], btree_buf_bytes(b)),
					      REQ_OP_READ|REQ_SYNC|REQ_META,
					      GFP_NOFS,
					      &c->btree_bio);
	}

	i = 0;
	bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
		struct btree_read_bio *rb =
			container_of(ra->bio[i], struct btree_read_bio, bio);
		rb->c			= c;
		rb->b			= b;
		rb->ra			= ra;
		rb->start_time		= local_clock();
		rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
		rb->idx			= i;
		rb->pick		= pick;
		rb->bio.bi_iter.bi_sector = pick.ptr.offset;
		rb->bio.bi_end_io	= btree_node_read_all_replicas_endio;
		bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));

		if (rb->have_ioref) {
			this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
				     bio_sectors(&rb->bio));
			bio_set_dev(&rb->bio, ca->disk_sb.bdev);

			closure_get(&ra->cl);
			submit_bio(&rb->bio);
		} else {
			ra->err[i] = BLK_STS_REMOVED;
		}

		i++;
	}

	if (sync) {
		closure_sync(&ra->cl);
		btree_node_read_all_replicas_done(&ra->cl.work);
	} else {
		continue_at(&ra->cl, btree_node_read_all_replicas_done,
			    c->io_complete_wq);
	}

	return 0;
}

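/*
 * Normal btree node read path: pick a device to read from, read into
 * b->data, and either complete the read synchronously via
 * btree_node_read_work() or let btree_node_read_endio() finish it
 * asynchronously.
 */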
void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
			  bool sync)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct btree_read_bio *rb;
	struct bch_dev *ca;
	struct bio *bio;
	int ret;

	trace_and_count(c, btree_node_read, trans, b);

	if (bch2_verify_all_btree_replicas &&
	    !btree_node_read_all_replicas(c, b, sync))
		return;

	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
					 NULL, &pick);

	if (ret <= 0) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "btree node read error: no device to read from\n at ");
		bch2_btree_pos_to_text(&buf, c, b);
		bch_err(c, "%s", buf.buf);

		if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
		    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
			bch2_fatal_error(c);

		set_btree_node_read_error(b);
		clear_btree_node_read_in_flight(b);
		wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
		printbuf_exit(&buf);
		return;
	}

	ca = bch_dev_bkey_exists(c, pick.ptr.dev);

	bio = bio_alloc_bioset(NULL,
			       buf_pages(b->data, btree_buf_bytes(b)),
			       REQ_OP_READ|REQ_SYNC|REQ_META,
			       GFP_NOFS,
			       &c->btree_bio);
	rb = container_of(bio, struct btree_read_bio, bio);
	rb->c			= c;
	rb->b			= b;
	rb->ra			= NULL;
	rb->start_time		= local_clock();
	rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
	rb->pick		= pick;
	INIT_WORK(&rb->work, btree_node_read_work);
	bio->bi_iter.bi_sector	= pick.ptr.offset;
	bio->bi_end_io		= btree_node_read_endio;
	bch2_bio_map(bio, b->data, btree_buf_bytes(b));

	if (rb->have_ioref) {
		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
			     bio_sectors(bio));
		bio_set_dev(bio, ca->disk_sb.bdev);

		if (sync) {
			submit_bio_wait(bio);
			bch2_latency_acct(ca, rb->start_time, READ);
			btree_node_read_work(&rb->work);
		} else {
			submit_bio(bio);
		}
	} else {
		bio->bi_status = BLK_STS_REMOVED;

		if (sync)
			btree_node_read_work(&rb->work);
		else
			queue_work(c->io_complete_wq, &rb->work);
	}
}

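/*
 * Read a btree root synchronously: allocate a node in the btree node cache
 * (cannibalizing another node if necessary), read it in, and set it as the
 * root for the given btree ID.
 */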
static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
				  const struct bkey_i *k, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(trans, level != 0);
	bch2_btree_cache_cannibalize_unlock(trans);

	BUG_ON(IS_ERR(b));

	bkey_copy(&b->key, k);
	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));

	set_btree_node_read_in_flight(b);

	bch2_btree_node_read(trans, b, true);

	if (btree_node_read_error(b)) {
		bch2_btree_node_hash_remove(&c->btree_cache, b);

		mutex_lock(&c->btree_cache.lock);
		list_move(&b->list, &c->btree_cache.freeable);
		mutex_unlock(&c->btree_cache.lock);

		ret = -EIO;
		goto err;
	}

	bch2_btree_set_root_for_read(c, b);
err:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	return ret;
}

int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
			 const struct bkey_i *k, unsigned level)
{
	return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
}

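/*
 * A btree node write has completed: clear the low bit of will_make_reachable
 * (dropping the btree_update's closure ref if it was set) and release the
 * journal pin held for this write.
 */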
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
				      struct btree_write *w)
{
	unsigned long old, new, v = READ_ONCE(b->will_make_reachable);

	do {
		old = new = v;
		if (!(old & 1))
			break;

		new &= ~1UL;
	} while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);

	if (old & 1)
		closure_put(&((struct btree_update *) new)->cl);

	bch2_journal_pin_drop(&c->journal, &w->journal);
}

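/*
 * Called with the node read locked once a write finishes: if the node was
 * redirtied and still needs writing (and nothing blocks it), start the next
 * write immediately with BTREE_WRITE_ALREADY_STARTED; otherwise clear the
 * write_in_flight bits and wake up waiters.
 */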
static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
{
	struct btree_write *w = btree_prev_write(b);
	unsigned long old, new, v;
	unsigned type = 0;

	bch2_btree_complete_write(c, b, w);

	v = READ_ONCE(b->flags);
	do {
		old = new = v;

		if ((old & (1U << BTREE_NODE_dirty)) &&
		    (old & (1U << BTREE_NODE_need_write)) &&
		    !(old & (1U << BTREE_NODE_never_write)) &&
		    !(old & (1U << BTREE_NODE_write_blocked)) &&
		    !(old & (1U << BTREE_NODE_will_make_reachable))) {
			new &= ~(1U << BTREE_NODE_dirty);
			new &= ~(1U << BTREE_NODE_need_write);
			new |=  (1U << BTREE_NODE_write_in_flight);
			new |=  (1U << BTREE_NODE_write_in_flight_inner);
			new |=  (1U << BTREE_NODE_just_written);
			new ^=  (1U << BTREE_NODE_write_idx);

			type = new & BTREE_WRITE_TYPE_MASK;
			new &= ~BTREE_WRITE_TYPE_MASK;
		} else {
			new &= ~(1U << BTREE_NODE_write_in_flight);
			new &= ~(1U << BTREE_NODE_write_in_flight_inner);
		}
	} while ((v = cmpxchg(&b->flags, old, new)) != old);

	if (new & (1U << BTREE_NODE_write_in_flight))
		__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
	else
		wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}

static void btree_node_write_done(struct bch_fs *c, struct btree *b)
{
	struct btree_trans *trans = bch2_trans_get(c);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
	__btree_node_write_done(c, b);
	six_unlock_read(&b->c.lock);

	bch2_trans_put(trans);
}

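/*
 * Workqueue context for finishing a btree node write: free the bounce
 * buffer, drop pointers to devices the write failed on, and - for writes to
 * already-existing nodes - update the node's key via
 * bch2_btree_node_update_key_get_iter().
 */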
static void btree_node_write_work(struct work_struct *work)
{
	struct btree_write_bio *wbio =
		container_of(work, struct btree_write_bio, work);
	struct bch_fs *c	= wbio->wbio.c;
	struct btree *b		= wbio->wbio.bio.bi_private;
	struct bch_extent_ptr *ptr;
	int ret = 0;

	btree_bounce_free(c,
		wbio->data_bytes,
		wbio->wbio.used_mempool,
		wbio->data);

	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));

	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
		ret = -BCH_ERR_btree_write_all_failed;
		goto err;
	}

	if (wbio->wbio.first_btree_write) {
		if (wbio->wbio.failed.nr) {

		}
	} else {
		ret = bch2_trans_do(c, NULL, NULL, 0,
			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
					BCH_WATERMARK_reclaim|
					BCH_TRANS_COMMIT_journal_reclaim|
					BCH_TRANS_COMMIT_no_enospc|
					BCH_TRANS_COMMIT_no_check_rw,
					!wbio->wbio.failed.nr));
		if (ret)
			goto err;
	}
out:
	bio_put(&wbio->wbio.bio);
	btree_node_write_done(c, b);
	return;
err:
	set_btree_node_noevict(b);
	if (!bch2_err_matches(ret, EROFS))
		bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret));
	goto out;
}

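/*
 * Bio completion for btree node writes: record any I/O error in the original
 * bio's failed-device list; split child bios just propagate completion to
 * their parent, while the original bio clears write_in_flight_inner and
 * punts the remaining work to btree_node_write_work().
 */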
static void btree_node_write_endio(struct bio *bio)
{
	struct bch_write_bio *wbio	= to_wbio(bio);
	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
	struct bch_write_bio *orig	= parent ?: wbio;
	struct btree_write_bio *wb	= container_of(orig, struct btree_write_bio, wbio);
	struct bch_fs *c		= wbio->c;
	struct btree *b			= wbio->bio.bi_private;
	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);
	unsigned long flags;

	if (wbio->have_ioref)
		bch2_latency_acct(ca, wbio->submit_time, WRITE);

	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
			       "btree write error: %s",
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("btree")) {
		spin_lock_irqsave(&c->btree_write_error_lock, flags);
		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
	}

	if (wbio->have_ioref)
		percpu_ref_put(&ca->io_ref);

	if (parent) {
		bio_put(bio);
		bio_endio(&parent->bio);
		return;
	}

	clear_btree_node_write_in_flight_inner(b);
	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
	INIT_WORK(&wb->work, btree_node_write_work);
	queue_work(c->btree_io_complete_wq, &wb->work);
}

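/*
 * Validate a bset (and the node's own key) before it goes out to disk;
 * failures here flag the filesystem as inconsistent instead of letting bad
 * metadata be written.
 */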
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
				   struct bset *i, unsigned sectors)
{
	struct printbuf buf = PRINTBUF;
	bool saw_error;
	int ret;

	ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
				BKEY_TYPE_btree, WRITE, &buf);

	if (ret)
		bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
	printbuf_exit(&buf);
	if (ret)
		return ret;

	ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
		validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
	if (ret) {
		bch2_inconsistent_error(c);
		dump_stack();
	}

	return ret;
}

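/*
 * Workqueue context for submitting the write: copy the key, offset each
 * pointer by the write's sector offset within the node, and submit to all
 * replicas.
 */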
static void btree_write_submit(struct work_struct *work)
{
	struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;

	bkey_copy(&tmp.k, &wbio->key);

	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
		ptr->offset += wbio->sector_offset;

	bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
				  &tmp.k, false);
}

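/*
 * Write out a btree node's unwritten bsets. May be called with only a read
 * lock held - see the comment below on how the dirty bit arbitrates which
 * thread actually does the write.
 */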
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
	struct btree_write_bio *wbio;
	struct bset_tree *t;
	struct bset *i;
	struct btree_node *bn = NULL;
	struct btree_node_entry *bne = NULL;
	struct sort_iter_stack sort_iter;
	struct nonce nonce;
	unsigned bytes_to_write, sectors_to_write, bytes, u64s;
	u64 seq = 0;
	bool used_mempool;
	unsigned long old, new;
	bool validate_before_checksum = false;
	enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
	void *data;
	int ret;

	if (flags & BTREE_WRITE_ALREADY_STARTED)
		goto do_write;

	/*
	 * We may only have a read lock on the btree node - the dirty bit is our
	 * "lock" against racing with other threads that may be trying to start
	 * a write, we do a write iff we clear the dirty bit. Since setting the
	 * dirty bit requires a write lock, we can't race with other threads
	 * redirtying it:
	 */
	do {
		old = new = READ_ONCE(b->flags);

		if (!(old & (1 << BTREE_NODE_dirty)))
			return;

		if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
		    !(old & (1 << BTREE_NODE_need_write)))
			return;

		if (old &
		    ((1 << BTREE_NODE_never_write)|
		     (1 << BTREE_NODE_write_blocked)))
			return;

		if (b->written &&
		    (old & (1 << BTREE_NODE_will_make_reachable)))
			return;

		if (old & (1 << BTREE_NODE_write_in_flight))
			return;

		if (flags & BTREE_WRITE_ONLY_IF_NEED)
			type = new & BTREE_WRITE_TYPE_MASK;
		new &= ~BTREE_WRITE_TYPE_MASK;

		new &= ~(1 << BTREE_NODE_dirty);
		new &= ~(1 << BTREE_NODE_need_write);
		new |=  (1 << BTREE_NODE_write_in_flight);
		new |=  (1 << BTREE_NODE_write_in_flight_inner);
		new |=  (1 << BTREE_NODE_just_written);
		new ^=  (1 << BTREE_NODE_write_idx);
	} while (cmpxchg_acquire(&b->flags, old, new) != old);

	if (new & (1U << BTREE_NODE_need_write))
		return;
do_write:
	BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));

	atomic_dec(&c->btree_cache.dirty);

	BUG_ON(btree_node_fake(b));
	BUG_ON((b->will_make_reachable != 0) != !b->written);

	BUG_ON(b->written >= btree_sectors(c));
	BUG_ON(b->written & (block_sectors(c) - 1));
	BUG_ON(bset_written(b, btree_bset_last(b)));
	BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
	BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));

	bch2_sort_whiteouts(c, b);

	sort_iter_stack_init(&sort_iter, b);

	bytes = !b->written
		? sizeof(struct btree_node)
		: sizeof(struct btree_node_entry);

	bytes += b->whiteout_u64s * sizeof(u64);

	for_each_bset(b, t) {
		i = bset(b, t);

		if (bset_written(b, i))
			continue;

		bytes += le16_to_cpu(i->u64s) * sizeof(u64);
		sort_iter_add(&sort_iter.iter,
			      btree_bkey_first(b, t),
			      btree_bkey_last(b, t));
		seq = max(seq, le64_to_cpu(i->journal_seq));
	}

	BUG_ON(b->written && !seq);

	/* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
	bytes += 8;

	/* buffer must be a multiple of the block size */
	bytes = round_up(bytes, block_bytes(c));

	data = btree_bounce_alloc(c, bytes, &used_mempool);

	if (!b->written) {
		bn = data;
		*bn = *b->data;
		i = &bn->keys;
	} else {
		bne = data;
		bne->keys = b->data->keys;
		i = &bne->keys;
	}

	i->journal_seq	= cpu_to_le64(seq);
	i->u64s		= 0;

	sort_iter_add(&sort_iter.iter,
		      unwritten_whiteouts_start(b),
		      unwritten_whiteouts_end(b));
	SET_BSET_SEPARATE_WHITEOUTS(i, false);

	b->whiteout_u64s = 0;

	u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
	le16_add_cpu(&i->u64s, u64s);

	BUG_ON(!b->written && i->u64s != b->data->keys.u64s);

	set_needs_whiteout(i, false);

	/* do we have data to write? */
	if (b->written && !i->u64s)
		goto nowrite;

	bytes_to_write = vstruct_end(i) - data;
	sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;

	if (!b->written &&
	    b->key.k.type == KEY_TYPE_btree_ptr_v2)
		BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);

	memset(data + bytes_to_write, 0,
	       (sectors_to_write << 9) - bytes_to_write);

	BUG_ON(b->written + sectors_to_write > btree_sectors(c));
	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
	BUG_ON(i->seq != b->data->keys.seq);

	i->version = cpu_to_le16(c->sb.version);
	SET_BSET_OFFSET(i, b->written);
	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
		validate_before_checksum = true;

	/* validate_bset will be modifying: */
	if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

	/* if we're going to be encrypting, check metadata validity first: */
	if (validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	ret = bset_encrypt(c, i, b->written << 9);
	if (bch2_fs_fatal_err_on(ret, c,
			"error encrypting btree node: %i\n", ret))
		goto err;

	nonce = btree_nonce(i, b->written << 9);

	if (bn)
		bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
	else
		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);

	/* if we're not encrypting, check metadata after checksumming: */
	if (!validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	/*
	 * We handle btree write errors by immediately halting the journal -
	 * after we've done that, we can't issue any subsequent btree writes
	 * because they might have pointers to new nodes that failed to write.
	 *
	 * Furthermore, there's no point in doing any more btree writes because
	 * with the journal stopped, we're never going to update the journal to
	 * reflect that those writes were done and the data flushed from the
	 * journal:
	 *
	 * Also on journal error, the pending write may have updates that were
	 * never journalled (interior nodes, see btree_update_nodes_written()) -
	 * it's critical that we don't do the write in that case otherwise we
	 * will have updates visible that weren't in the journal:
	 *
	 * Make sure to update b->written so bch2_btree_init_next() doesn't
	 * break:
	 */
	if (bch2_journal_error(&c->journal) ||
	    c->opts.nochanges)
		goto err;

	trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);

	wbio = container_of(bio_alloc_bioset(NULL,
				buf_pages(data, sectors_to_write << 9),
				REQ_OP_WRITE|REQ_META,
				GFP_NOFS,
				&c->btree_bio),
			    struct btree_write_bio, wbio.bio);
	wbio_init(&wbio->wbio.bio);
	wbio->data			= data;
	wbio->data_bytes		= bytes;
	wbio->sector_offset		= b->written;
	wbio->wbio.c			= c;
	wbio->wbio.used_mempool		= used_mempool;
	wbio->wbio.first_btree_write	= !b->written;
	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
	wbio->wbio.bio.bi_private	= b;

	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);

	bkey_copy(&wbio->key, &b->key);

	b->written += sectors_to_write;

	if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
			cpu_to_le16(b->written);

	atomic64_inc(&c->btree_write_stats[type].nr);
	atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);

	INIT_WORK(&wbio->work, btree_write_submit);
	queue_work(c->io_complete_wq, &wbio->work);
	return;
err:
	set_btree_node_noevict(b);
	b->written += sectors_to_write;
nowrite:
	btree_bounce_free(c, bytes, used_mempool, data);
	__btree_node_write_done(c, b);
}

/*
 * Work that must be done with write lock held:
 */
bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
{
	bool invalidated_iter = false;
	struct btree_node_entry *bne;
	struct bset_tree *t;

	if (!btree_node_just_written(b))
		return false;

	BUG_ON(b->whiteout_u64s);

	clear_btree_node_just_written(b);

	/*
	 * Note: immediately after write, bset_written() doesn't work - the
	 * amount of data we had to write after compaction might have been
	 * smaller than the offset of the last bset.
	 *
	 * However, we know that all bsets have been written here, as long as
	 * we're still holding the write lock:
	 */

	/*
	 * XXX: decide if we really want to unconditionally sort down to a
	 * single bset:
	 */
	if (b->nsets > 1) {
		btree_node_sort(c, b, 0, b->nsets, true);
		invalidated_iter = true;
	} else {
		invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
	}

	for_each_bset(b, t)
		set_needs_whiteout(bset(b, t), true);

	bch2_btree_verify(c, b);

	/*
	 * If later we don't unconditionally sort down to a single bset, we have
	 * to ensure this is still true:
	 */
	BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(b, bne);

	bch2_btree_build_aux_trees(b);

	return invalidated_iter;
}

/*
 * Use this one if the node is intent locked:
 */
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
			   enum six_lock_type lock_type_held,
			   unsigned flags)
{
	if (lock_type_held == SIX_LOCK_intent ||
	    (lock_type_held == SIX_LOCK_read &&
	     six_lock_tryupgrade(&b->c.lock))) {
		__bch2_btree_node_write(c, b, flags);

		/* don't cycle lock unnecessarily: */
		if (btree_node_just_written(b) &&
		    six_trylock_write(&b->c.lock)) {
			bch2_btree_post_write_cleanup(c, b);
			six_unlock_write(&b->c.lock);
		}

		if (lock_type_held == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	} else {
		__bch2_btree_node_write(c, b, flags);
		if (lock_type_held == SIX_LOCK_write &&
		    btree_node_just_written(b))
			bch2_btree_post_write_cleanup(c, b);
	}
}

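/*
 * Wait for all in-flight reads or writes (depending on @flag) on cached
 * btree nodes to complete; returns true if we had to wait for anything.
 */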
static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct btree *b;
	unsigned i;
	bool ret = false;
restart:
	rcu_read_lock();
	for_each_cached_btree(b, c, tbl, i, pos)
		if (test_bit(flag, &b->flags)) {
			rcu_read_unlock();
			wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
			ret = true;
			goto restart;
		}
	rcu_read_unlock();

	return ret;
}

bool bch2_btree_flush_all_reads(struct bch_fs *c)
{
	return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
}

bool bch2_btree_flush_all_writes(struct bch_fs *c)
{
	return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}

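/*
 * Per-write-type statistics: bch2_btree_write_stats_to_text() prints one row
 * per BCH_BTREE_WRITE_TYPES() entry with the number of writes and the mean
 * write size.
 */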
static const char * const bch2_btree_write_types[] = {
#define x(t, n)	[n] = #t,
	BCH_BTREE_WRITE_TYPES()
	NULL
};

void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 20);
	printbuf_tabstop_push(out, 10);

	prt_tab(out);
	prt_str(out, "nr");
	prt_tab(out);
	prt_str(out, "size");
	prt_newline(out);

	for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
		u64 nr		= atomic64_read(&c->btree_write_stats[i].nr);
		u64 bytes	= atomic64_read(&c->btree_write_stats[i].bytes);

		prt_printf(out, "%s:", bch2_btree_write_types[i]);
		prt_tab(out);
		prt_u64(out, nr);
		prt_tab(out);
		prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
		prt_newline(out);
	}
}