// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"

#include <linux/sort.h>

/*
 * For managing keys we read from the journal: until journal replay is
 * finished, normal btree lookups need to be able to find and return keys
 * from the journal where they overwrite what's in the btree, so we have a
 * special iterator and operations for the regular btree iter code to use:
 */

static int __journal_key_cmp(enum btree_id l_btree_id,
			     unsigned l_level,
			     struct bpos l_pos,
			     const struct journal_key *r)
{
	return (cmp_int(l_btree_id, r->btree_id) ?:
		cmp_int(l_level, r->level) ?:
		bpos_cmp(l_pos, r->k->k.p));
}

static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
	return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}
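
/*
 * journal_keys is stored as a gap buffer: a sorted array with a gap of
 * keys->size - keys->nr empty slots at position keys->gap, presumably so
 * that insertions near the same position don't each pay for a full
 * memmove(). idx_to_pos() translates a logical index - as if the array
 * were contiguous - into a physical position by skipping over the gap:
 */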
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
	size_t gap_size = keys->size - keys->nr;

	if (idx >= keys->gap)
		idx += gap_size;
	return idx;
}

static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
	return keys->data + idx_to_pos(keys, idx);
}
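
/*
 * Binary search (over logical indices) for the first key >= the search key;
 * the BUG_ON()s below assert that postcondition:
 */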
static size_t __bch2_journal_key_search(struct journal_keys *keys,
					enum btree_id id, unsigned level,
					struct bpos pos)
{
	size_t l = 0, r = keys->nr, m;

	while (l < r) {
		m = l + ((r - l) >> 1);
		if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
			l = m + 1;
		else
			r = m;
	}

	BUG_ON(l < keys->nr &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);

	BUG_ON(l &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);

	return l;
}

static size_t bch2_journal_key_search(struct journal_keys *keys,
				      enum btree_id id, unsigned level,
				      struct bpos pos)
{
	return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}

/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos,
					   struct bpos end_pos, size_t *idx)
{
	struct journal_keys *keys = &c->journal_keys;
	unsigned iters = 0;
	struct journal_key *k;

	BUG_ON(*idx > keys->nr);
search:
	if (!*idx)
		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);

	/*
	 * *idx is a cursor the caller retains across calls; if it has drifted
	 * past keys we still need (e.g. after insertions), walk it back,
	 * redoing the full search if that takes more than a few steps:
	 */
	while (*idx &&
	       __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
		--(*idx);
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
		if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
			return NULL;

		if (k->overwritten) {
			(*idx)++;
			continue;
		}

		if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
			return k->k;

		(*idx)++;
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	return NULL;
}

struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos)
{
	size_t idx = 0;

	return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}

static void journal_iters_fix(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	/* The key we just inserted is immediately before the gap: */
	size_t gap_end = keys->gap + (keys->size - keys->nr);
	struct btree_and_journal_iter *iter;

	/*
	 * If an iterator points one after the key we just inserted, decrement
	 * the iterator so it points at the key we just inserted - if the
	 * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
	 * handle that:
	 */
	list_for_each_entry(iter, &c->journal_iters, journal.list)
		if (iter->journal.idx == gap_end)
			iter->journal.idx = keys->gap - 1;
}
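
/*
 * When the gap moves, iterators holding physical indices have to be
 * retranslated: indices past the old gap lose the gap offset, and indices
 * at or past the new gap regain it:
 */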
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
{
	struct journal_keys *keys = &c->journal_keys;
	struct journal_iter *iter;
	size_t gap_size = keys->size - keys->nr;

	list_for_each_entry(iter, &c->journal_iters, list) {
		if (iter->idx > old_gap)
			iter->idx -= gap_size;
		if (iter->idx >= new_gap)
			iter->idx += gap_size;
	}
}
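
/*
 * Insert a key, taking ownership of @k (hence _take): on success the key is
 * marked allocated and will be kfreed when the journal keys are torn down,
 * so @k must be heap allocated. On error, ownership stays with the caller:
 */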
int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
				 unsigned level, struct bkey_i *k)
{
	struct journal_key n = {
		.btree_id	= id,
		.level		= level,
		.k		= k,
		.allocated	= true,
		/*
		 * Ensure these keys are done last by journal replay, to unblock
		 * journal reclaim:
		 */
		.journal_seq	= U32_MAX,
	};
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);

	BUG_ON(test_bit(BCH_FS_rw, &c->flags));

	if (idx < keys->size &&
	    journal_key_cmp(&n, &keys->data[idx]) == 0) {
		if (keys->data[idx].allocated)
			kfree(keys->data[idx].k);
		keys->data[idx] = n;
		return 0;
	}

	if (idx > keys->gap)
		idx -= keys->size - keys->nr;

	if (keys->nr == keys->size) {
		struct journal_keys new_keys = {
			.nr	= keys->nr,
			.size	= max_t(size_t, keys->size, 8) * 2,
		};

		new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
		if (!new_keys.data) {
			bch_err(c, "%s: error allocating new key array (size %zu)",
				__func__, new_keys.size);
			return -BCH_ERR_ENOMEM_journal_key_insert;
		}

		/* Since @keys was full, there was no gap: */
		memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
		kvfree(keys->data);
		keys->data = new_keys.data;
		keys->nr = new_keys.nr;
		keys->size = new_keys.size;

		/* And now the gap is at the end: */
		keys->gap = keys->nr;
	}

	journal_iters_move_gap(c, keys->gap, idx);

	move_gap(keys, idx);

	keys->nr++;
	keys->data[keys->gap++] = n;

	journal_iters_fix(c);

	return 0;
}

/*
 * Can only be used from the recovery thread while we're still RO - can't be
 * used once we've got RW, as journal_keys is at that point used by multiple
 * threads:
 */
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bkey_i *k)
{
	struct bkey_i *n;
	int ret;

	n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
	if (!n)
		return -BCH_ERR_ENOMEM_journal_key_insert;

	bkey_copy(n, k);
	ret = bch2_journal_key_insert_take(c, id, level, n);
	if (ret)
		kfree(n);
	return ret;
}
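
/*
 * Deletion in the journal keys overlay is just insertion of a whiteout: an
 * empty, deleted key at @pos that overrides whatever is in the btree:
 */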
int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bpos pos)
{
	struct bkey_i whiteout;

	bkey_init(&whiteout.k);
	whiteout.k.p = pos;

	return bch2_journal_key_insert(c, id, level, &whiteout);
}
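
/*
 * Mark a journal key as overwritten, so that iterators skip it; note that
 * this only matches an exact (btree, level, pos) hit:
 */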
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
				  unsigned level, struct bpos pos)
{
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, btree, level, pos);

	if (idx < keys->size &&
	    keys->data[idx].btree_id == btree &&
	    keys->data[idx].level == level &&
	    bpos_eq(keys->data[idx].k->k.p, pos))
		keys->data[idx].overwritten = true;
}
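
/*
 * journal_iter works in physical array positions; advancing past the last
 * key before the gap hops over it:
 */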
static void bch2_journal_iter_advance(struct journal_iter *iter)
{
	if (iter->idx < iter->keys->size) {
		iter->idx++;
		if (iter->idx == iter->keys->gap)
			iter->idx += iter->keys->size - iter->keys->nr;
	}
}

static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
	struct journal_key *k = iter->keys->data + iter->idx;

	while (k < iter->keys->data + iter->keys->size &&
	       k->btree_id == iter->btree_id &&
	       k->level == iter->level) {
		if (!k->overwritten)
			return bkey_i_to_s_c(k->k);

		bch2_journal_iter_advance(iter);
		k = iter->keys->data + iter->idx;
	}

	return bkey_s_c_null;
}

static void bch2_journal_iter_exit(struct journal_iter *iter)
{
	list_del(&iter->list);
}

static void bch2_journal_iter_init(struct bch_fs *c,
				   struct journal_iter *iter,
				   enum btree_id id, unsigned level,
				   struct bpos pos)
{
	iter->btree_id = id;
	iter->level = level;
	iter->keys = &c->journal_keys;
	iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
}

static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
						iter->b, &iter->unpacked);
}

static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}

void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
{
	if (bpos_eq(iter->pos, SPOS_MAX))
		iter->at_end = true;
	else
		iter->pos = bpos_successor(iter->pos);
}
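
/*
 * Prefetch child nodes as the iterator walks an interior node. The batch
 * sizes look empirically tuned: once the filesystem is started we prefetch
 * much less aggressively (and nothing above level 1):
 */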
static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
{
	struct btree_and_journal_iter iter = *_iter;
	struct bch_fs *c = iter.trans->c;
	unsigned level = iter.journal.level;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_started, &c->flags)
		? (level > 1 ? 0 :  2)
		: (level > 1 ? 1 : 16);

	iter.prefetch = false;
	bch2_bkey_buf_init(&tmp);

	while (nr--) {
		bch2_btree_and_journal_iter_advance(&iter);
		struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
		if (!k.k)
			break;

		bch2_bkey_buf_reassemble(&tmp, c, k);
		bch2_btree_node_prefetch(iter.trans, NULL, tmp.k,
					 iter.journal.btree_id, level - 1);
	}

	bch2_bkey_buf_exit(&tmp, c);
}
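
/*
 * The merged iterator: peek both the btree node and the journal overlay and
 * return whichever key sorts first. On a tie the journal key wins, since
 * journal keys override what's in the btree; deleted keys (whiteouts) are
 * skipped:
 */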
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
	struct bkey_s_c btree_k, journal_k, ret;

	if (iter->prefetch && iter->journal.level)
		btree_and_journal_iter_prefetch(iter);
again:
	if (iter->at_end)
		return bkey_s_c_null;

	while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
	       bpos_lt(btree_k.k->p, iter->pos))
		bch2_journal_iter_advance_btree(iter);

	while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
	       bpos_lt(journal_k.k->p, iter->pos))
		bch2_journal_iter_advance(&iter->journal);

	ret = journal_k.k &&
		(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
		? journal_k
		: btree_k;

	if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
		ret = bkey_s_c_null;

	if (ret.k) {
		iter->pos = ret.k->p;
		if (bkey_deleted(ret.k)) {
			bch2_btree_and_journal_iter_advance(iter);
			goto again;
		}
	} else {
		iter->pos = SPOS_MAX;
		iter->at_end = true;
	}

	return ret;
}

void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
	bch2_journal_iter_exit(&iter->journal);
}

void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
						  struct btree_and_journal_iter *iter,
						  struct btree *b,
						  struct btree_node_iter node_iter,
						  struct bpos pos)
{
	memset(iter, 0, sizeof(*iter));

	iter->trans = trans;
	iter->b = b;
	iter->node_iter = node_iter;
	bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
	INIT_LIST_HEAD(&iter->journal.list);
	iter->pos = b->data->min_key;
	iter->at_end = false;
}

/*
 * This version is used by btree_gc before the filesystem has gone RW and
 * multithreaded, so it uses the journal_iters list:
 */
void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
						struct btree_and_journal_iter *iter,
						struct btree *b)
{
	struct btree_node_iter node_iter;

	bch2_btree_node_iter_init_from_start(&node_iter, b);
	__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter,
						     b->data->min_key);
	list_add(&iter->journal.list, &trans->c->journal_iters);
}

/* sort and dedup all keys in the journal: */

void bch2_journal_entries_free(struct bch_fs *c)
{
	struct journal_replay **i;
	struct genradix_iter iter;

	genradix_for_each(&c->journal_entries, iter, i)
		kvfree(*i);
	genradix_free(&c->journal_entries);
}

/*
 * When keys compare equal, oldest compares first:
 */
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = _l;
	const struct journal_key *r = _r;

	return journal_key_cmp(l, r) ?:
		cmp_int(l->journal_seq, r->journal_seq) ?:
		cmp_int(l->journal_offset, r->journal_offset);
}
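
/*
 * Drop a reference on the journal keys; the last reference frees everything:
 * the keys we allocated for inserts, the flat keys array, and the journal
 * entries the remaining keys point into:
 */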
void bch2_journal_keys_put(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;

	BUG_ON(atomic_read(&keys->ref) <= 0);

	if (!atomic_dec_and_test(&keys->ref))
		return;

	move_gap(keys, keys->nr);

	darray_for_each(*keys, i)
		if (i->allocated)
			kfree(i->k);

	kvfree(keys->data);
	keys->data = NULL;
	keys->nr = keys->gap = keys->size = 0;

	bch2_journal_entries_free(c);
}
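
/*
 * Sort, then compact runs of equal keys down to a single entry. Since
 * journal_sort_key_cmp() orders equal keys oldest first, keeping only the
 * entry whose successor differs keeps the newest version of each key:
 */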
static void __journal_keys_sort(struct journal_keys *keys)
{
	sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);

	struct journal_key *dst = keys->data;

	darray_for_each(*keys, src) {
		if (src + 1 < &darray_top(*keys) &&
		    !journal_key_cmp(src, src + 1))
			continue;

		*dst++ = *src;
	}

	keys->nr = dst - keys->data;
}
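
/*
 * Collect the btree keys from every journal entry into one flat sorted
 * array. If darray_push() fails under memory pressure, sort and compact in
 * place to make room; if the array is still more than 7/8 full after
 * compacting, give up:
 */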
int bch2_journal_keys_sort(struct bch_fs *c)
{
	struct genradix_iter iter;
	struct journal_replay *i, **_i;
	struct journal_keys *keys = &c->journal_keys;
	size_t nr_read = 0;

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		cond_resched();

		for_each_jset_key(k, entry, &i->j) {
			struct journal_key n = (struct journal_key) {
				.btree_id	= entry->btree_id,
				.level		= entry->level,
				.k		= k,
				.journal_seq	= le64_to_cpu(i->j.seq),
				.journal_offset	= k->_data - i->j._data,
			};

			if (darray_push(keys, n)) {
				__journal_keys_sort(keys);

				if (keys->nr * 8 > keys->size * 7) {
					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
						keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
					return -BCH_ERR_ENOMEM_journal_keys_sort;
				}

				BUG_ON(darray_push(keys, n));
			}

			nr_read++;
		}
	}

	__journal_keys_sort(keys);
	keys->gap = keys->nr;

	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting",
		    nr_read, keys->nr);
	return 0;
}