2017-03-16 22:18:50 -08:00
// SPDX-License-Identifier: GPL-2.0
# include "bcachefs.h"
2020-12-17 15:08:58 -05:00
# include "bkey_buf.h"
2017-03-16 22:18:50 -08:00
# include "btree_cache.h"
# include "btree_io.h"
# include "btree_iter.h"
# include "btree_locking.h"
# include "debug.h"
2021-01-26 20:59:00 -05:00
# include "error.h"
2017-03-16 22:18:50 -08:00
# include "trace.h"
# include <linux/prefetch.h>
2019-06-11 21:03:23 -04:00
# include <linux/sched/mm.h>
2017-03-16 22:18:50 -08:00
void bch2_recalc_btree_reserve ( struct bch_fs * c )
{
unsigned i , reserve = 16 ;
if ( ! c - > btree_roots [ 0 ] . b )
reserve + = 8 ;
for ( i = 0 ; i < BTREE_ID_NR ; i + + )
if ( c - > btree_roots [ i ] . b )
reserve + = min_t ( unsigned , 1 ,
2020-06-06 12:28:01 -04:00
c - > btree_roots [ i ] . b - > c . level ) * 8 ;
2017-03-16 22:18:50 -08:00
c - > btree_cache . reserve = reserve ;
}
static inline unsigned btree_cache_can_free ( struct btree_cache * bc )
{
return max_t ( int , 0 , bc - > used - bc - > reserve ) ;
}
static void __btree_node_data_free ( struct bch_fs * c , struct btree * b )
{
EBUG_ON ( btree_node_write_in_flight ( b ) ) ;
kvpfree ( b - > data , btree_bytes ( c ) ) ;
b - > data = NULL ;
2020-07-25 15:07:37 -04:00
kvfree ( b - > aux_data ) ;
b - > aux_data = NULL ;
2017-03-16 22:18:50 -08:00
}
static void btree_node_data_free ( struct bch_fs * c , struct btree * b )
{
struct btree_cache * bc = & c - > btree_cache ;
__btree_node_data_free ( c , b ) ;
bc - > used - - ;
list_move ( & b - > list , & bc - > freed ) ;
}
static int bch2_btree_cache_cmp_fn ( struct rhashtable_compare_arg * arg ,
const void * obj )
{
const struct btree * b = obj ;
const u64 * v = arg - > key ;
2020-02-18 17:15:32 -05:00
return b - > hash_val = = * v ? 0 : 1 ;
2017-03-16 22:18:50 -08:00
}
static const struct rhashtable_params bch_btree_cache_params = {
. head_offset = offsetof ( struct btree , hash ) ,
2020-02-18 17:15:32 -05:00
. key_offset = offsetof ( struct btree , hash_val ) ,
. key_len = sizeof ( u64 ) ,
2017-03-16 22:18:50 -08:00
. obj_cmpfn = bch2_btree_cache_cmp_fn ,
} ;
2020-07-25 15:07:37 -04:00
/*
 * Allocate the data and auxiliary buffers for @b using @gfp.
 *
 * @b must not already own either buffer.  Returns 0 on success or -ENOMEM;
 * on failure both pointers are left NULL.
 */
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvpmalloc(btree_bytes(c), gfp);
	if (!b->data)
		goto err;

	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
	if (!b->aux_data)
		goto err_free_data;

	return 0;

err_free_data:
	/* Undo the first allocation so the node is left without any buffers */
	kvpfree(b->data, btree_bytes(c));
	b->data = NULL;
err:
	return -ENOMEM;
}
2020-07-25 15:07:37 -04:00
static struct btree * __btree_node_mem_alloc ( struct bch_fs * c )
2020-06-09 17:49:24 -04:00
{
2020-07-25 15:07:37 -04:00
struct btree * b = kzalloc ( sizeof ( struct btree ) , GFP_KERNEL ) ;
2017-03-16 22:18:50 -08:00
if ( ! b )
return NULL ;
2018-11-01 15:10:01 -04:00
bkey_btree_ptr_init ( & b - > key ) ;
2020-06-06 12:28:01 -04:00
six_lock_init ( & b - > c . lock ) ;
lockdep_set_novalidate_class ( & b - > c . lock ) ;
2017-03-16 22:18:50 -08:00
INIT_LIST_HEAD ( & b - > list ) ;
INIT_LIST_HEAD ( & b - > write_blocked ) ;
2020-07-25 15:07:37 -04:00
b - > byte_order = ilog2 ( btree_bytes ( c ) ) ;
return b ;
}
2021-04-20 20:21:12 -04:00
struct btree * __bch2_btree_node_mem_alloc ( struct bch_fs * c )
2020-07-25 15:07:37 -04:00
{
struct btree_cache * bc = & c - > btree_cache ;
struct btree * b = __btree_node_mem_alloc ( c ) ;
if ( ! b )
return NULL ;
2017-03-16 22:18:50 -08:00
2020-07-25 15:07:37 -04:00
if ( btree_node_data_alloc ( c , b , GFP_KERNEL ) ) {
kfree ( b ) ;
return NULL ;
}
bc - > used + + ;
list_add ( & b - > list , & bc - > freeable ) ;
return b ;
2017-03-16 22:18:50 -08:00
}
/* Btree in memory cache - hash table */
void bch2_btree_node_hash_remove ( struct btree_cache * bc , struct btree * b )
{
rhashtable_remove_fast ( & bc - > table , & b - > hash , bch_btree_cache_params ) ;
/* Cause future lookups for this node to fail: */
2020-02-18 17:15:32 -05:00
b - > hash_val = 0 ;
2020-07-25 15:37:14 -04:00
six_lock_wakeup_all ( & b - > c . lock ) ;
2017-03-16 22:18:50 -08:00
}
int __bch2_btree_node_hash_insert ( struct btree_cache * bc , struct btree * b )
{
2020-02-18 17:15:32 -05:00
BUG_ON ( b - > hash_val ) ;
b - > hash_val = btree_ptr_hash_val ( & b - > key ) ;
2017-03-16 22:18:50 -08:00
return rhashtable_lookup_insert_fast ( & bc - > table , & b - > hash ,
bch_btree_cache_params ) ;
}
int bch2_btree_node_hash_insert ( struct btree_cache * bc , struct btree * b ,
unsigned level , enum btree_id id )
{
int ret ;
2020-06-06 12:28:01 -04:00
b - > c . level = level ;
b - > c . btree_id = id ;
2017-03-16 22:18:50 -08:00
2021-03-23 23:52:27 -04:00
if ( level )
six_lock_pcpu_alloc ( & b - > c . lock ) ;
else
six_lock_pcpu_free_rcu ( & b - > c . lock ) ;
2017-03-16 22:18:50 -08:00
mutex_lock ( & bc - > lock ) ;
ret = __bch2_btree_node_hash_insert ( bc , b ) ;
if ( ! ret )
list_add ( & b - > list , & bc - > live ) ;
mutex_unlock ( & bc - > lock ) ;
return ret ;
}
__flatten
static inline struct btree * btree_cache_find ( struct btree_cache * bc ,
const struct bkey_i * k )
{
2020-02-18 17:15:32 -05:00
u64 v = btree_ptr_hash_val ( k ) ;
return rhashtable_lookup_fast ( & bc - > table , & v , bch_btree_cache_params ) ;
2017-03-16 22:18:50 -08:00
}
/*
 * Try to take @b for reuse.  Must be called with bc->lock held.
 *
 * On success (return 0) the node is returned with both its intent and write
 * locks held.  On failure -ENOMEM is returned and no locks are held.
 *
 * If the node is dirty or has IO in flight it is only reclaimed when @flush
 * is true, in which case the node is written out and the IO waited on.
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;

	lockdep_assert_held(&bc->lock);

	if (!six_trylock_intent(&b->c.lock))
		return -ENOMEM;

	if (!six_trylock_write(&b->c.lock))
		goto unlock_intent;

	if (btree_node_noevict(b) ||
	    !btree_node_may_write(b))
		goto unlock_write;

	/* Dirty nodes cannot be reclaimed while btree writes are being held */
	if (btree_node_dirty(b) &&
	    test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
		goto unlock_write;

	if (btree_node_dirty(b) ||
	    btree_node_write_in_flight(b) ||
	    btree_node_read_in_flight(b)) {
		if (!flush)
			goto unlock_write;

		wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
			       TASK_UNINTERRUPTIBLE);

		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent);
		else
			__bch2_btree_node_write(c, b);

		/* wait for any in flight btree write */
		btree_node_wait_on_io(b);
	}

	/* Only nodes that are still hashed are traced as reaped */
	if (b->hash_val)
		trace_btree_node_reap(c, b);
	return 0;

unlock_write:
	six_unlock_write(&b->c.lock);
unlock_intent:
	six_unlock_intent(&b->c.lock);
	return -ENOMEM;
}
static int btree_node_reclaim ( struct bch_fs * c , struct btree * b )
{
return __btree_node_reclaim ( c , b , false ) ;
}
static int btree_node_write_and_reclaim ( struct bch_fs * c , struct btree * b )
{
return __btree_node_reclaim ( c , b , true ) ;
}
static unsigned long bch2_btree_cache_scan ( struct shrinker * shrink ,
struct shrink_control * sc )
{
struct bch_fs * c = container_of ( shrink , struct bch_fs ,
btree_cache . shrink ) ;
struct btree_cache * bc = & c - > btree_cache ;
struct btree * b , * t ;
unsigned long nr = sc - > nr_to_scan ;
unsigned long can_free ;
unsigned long touched = 0 ;
unsigned long freed = 0 ;
2020-10-15 21:48:58 -04:00
unsigned i , flags ;
2017-03-16 22:18:50 -08:00
2020-11-02 18:20:44 -05:00
if ( bch2_btree_shrinker_disabled )
2017-03-16 22:18:50 -08:00
return SHRINK_STOP ;
/* Return -1 if we can't do anything right now */
2020-06-05 09:01:23 -04:00
if ( sc - > gfp_mask & __GFP_FS )
2017-03-16 22:18:50 -08:00
mutex_lock ( & bc - > lock ) ;
else if ( ! mutex_trylock ( & bc - > lock ) )
return - 1 ;
2020-10-15 21:48:58 -04:00
flags = memalloc_nofs_save ( ) ;
2017-03-16 22:18:50 -08:00
/*
* It ' s _really_ critical that we don ' t free too many btree nodes - we
* have to always leave ourselves a reserve . The reserve is how we
* guarantee that allocating memory for a new btree node can always
* succeed , so that inserting keys into the btree can always succeed and
* IO can always make forward progress :
*/
nr / = btree_pages ( c ) ;
can_free = btree_cache_can_free ( bc ) ;
nr = min_t ( unsigned long , nr , can_free ) ;
i = 0 ;
list_for_each_entry_safe ( b , t , & bc - > freeable , list ) {
2021-12-27 22:11:54 -05:00
/*
* Leave a few nodes on the freeable list , so that a btree split
* won ' t have to hit the system allocator :
*/
if ( + + i < = 3 )
continue ;
2017-03-16 22:18:50 -08:00
touched + + ;
if ( freed > = nr )
break ;
2021-12-27 22:11:54 -05:00
if ( ! btree_node_reclaim ( c , b ) ) {
2017-03-16 22:18:50 -08:00
btree_node_data_free ( c , b ) ;
2020-06-06 12:28:01 -04:00
six_unlock_write ( & b - > c . lock ) ;
six_unlock_intent ( & b - > c . lock ) ;
2017-03-16 22:18:50 -08:00
freed + + ;
}
}
restart :
list_for_each_entry_safe ( b , t , & bc - > live , list ) {
touched + + ;
if ( freed > = nr ) {
/* Save position */
if ( & t - > list ! = & bc - > live )
list_move_tail ( & bc - > live , & t - > list ) ;
break ;
}
if ( ! btree_node_accessed ( b ) & &
! btree_node_reclaim ( c , b ) ) {
/* can't call bch2_btree_node_hash_remove under lock */
freed + + ;
if ( & t - > list ! = & bc - > live )
list_move_tail ( & bc - > live , & t - > list ) ;
btree_node_data_free ( c , b ) ;
mutex_unlock ( & bc - > lock ) ;
bch2_btree_node_hash_remove ( bc , b ) ;
2020-06-06 12:28:01 -04:00
six_unlock_write ( & b - > c . lock ) ;
six_unlock_intent ( & b - > c . lock ) ;
2017-03-16 22:18:50 -08:00
if ( freed > = nr )
goto out ;
2020-06-15 20:18:02 -04:00
if ( sc - > gfp_mask & __GFP_FS )
2017-03-16 22:18:50 -08:00
mutex_lock ( & bc - > lock ) ;
else if ( ! mutex_trylock ( & bc - > lock ) )
goto out ;
goto restart ;
} else
clear_btree_node_accessed ( b ) ;
}
mutex_unlock ( & bc - > lock ) ;
out :
2020-11-11 18:59:41 -05:00
memalloc_nofs_restore ( flags ) ;
2017-03-16 22:18:50 -08:00
return ( unsigned long ) freed * btree_pages ( c ) ;
}
static unsigned long bch2_btree_cache_count ( struct shrinker * shrink ,
struct shrink_control * sc )
{
struct bch_fs * c = container_of ( shrink , struct bch_fs ,
btree_cache . shrink ) ;
struct btree_cache * bc = & c - > btree_cache ;
2020-11-02 18:20:44 -05:00
if ( bch2_btree_shrinker_disabled )
2017-03-16 22:18:50 -08:00
return 0 ;
return btree_cache_can_free ( bc ) * btree_pages ( c ) ;
}
void bch2_fs_btree_cache_exit ( struct bch_fs * c )
{
struct btree_cache * bc = & c - > btree_cache ;
struct btree * b ;
2020-10-11 16:33:49 -04:00
unsigned i , flags ;
2017-03-16 22:18:50 -08:00
if ( bc - > shrink . list . next )
unregister_shrinker ( & bc - > shrink ) ;
2020-10-11 16:33:49 -04:00
/* vfree() can allocate memory: */
flags = memalloc_nofs_save ( ) ;
2017-03-16 22:18:50 -08:00
mutex_lock ( & bc - > lock ) ;
if ( c - > verify_data )
list_move ( & c - > verify_data - > list , & bc - > live ) ;
kvpfree ( c - > verify_ondisk , btree_bytes ( c ) ) ;
for ( i = 0 ; i < BTREE_ID_NR ; i + + )
if ( c - > btree_roots [ i ] . b )
list_add ( & c - > btree_roots [ i ] . b - > list , & bc - > live ) ;
list_splice ( & bc - > freeable , & bc - > live ) ;
while ( ! list_empty ( & bc - > live ) ) {
b = list_first_entry ( & bc - > live , struct btree , list ) ;
BUG_ON ( btree_node_read_in_flight ( b ) | |
btree_node_write_in_flight ( b ) ) ;
if ( btree_node_dirty ( b ) )
bch2_btree_complete_write ( c , b , btree_current_write ( b ) ) ;
2020-11-09 13:01:52 -05:00
clear_btree_node_dirty ( c , b ) ;
2017-03-16 22:18:50 -08:00
btree_node_data_free ( c , b ) ;
}
2020-11-09 13:01:52 -05:00
BUG_ON ( atomic_read ( & c - > btree_cache . dirty ) ) ;
2017-03-16 22:18:50 -08:00
while ( ! list_empty ( & bc - > freed ) ) {
b = list_first_entry ( & bc - > freed , struct btree , list ) ;
list_del ( & b - > list ) ;
2021-03-23 23:52:27 -04:00
six_lock_pcpu_free ( & b - > c . lock ) ;
2017-03-16 22:18:50 -08:00
kfree ( b ) ;
}
mutex_unlock ( & bc - > lock ) ;
2020-10-11 16:33:49 -04:00
memalloc_nofs_restore ( flags ) ;
2017-03-16 22:18:50 -08:00
if ( bc - > table_init_done )
rhashtable_destroy ( & bc - > table ) ;
}
int bch2_fs_btree_cache_init ( struct bch_fs * c )
{
struct btree_cache * bc = & c - > btree_cache ;
unsigned i ;
int ret = 0 ;
pr_verbose_init ( c - > opts , " " ) ;
ret = rhashtable_init ( & bc - > table , & bch_btree_cache_params ) ;
if ( ret )
goto out ;
bc - > table_init_done = true ;
bch2_recalc_btree_reserve ( c ) ;
for ( i = 0 ; i < bc - > reserve ; i + + )
2021-04-20 20:21:12 -04:00
if ( ! __bch2_btree_node_mem_alloc ( c ) ) {
2017-03-16 22:18:50 -08:00
ret = - ENOMEM ;
goto out ;
}
list_splice_init ( & bc - > live , & bc - > freeable ) ;
mutex_init ( & c - > verify_lock ) ;
bc - > shrink . count_objects = bch2_btree_cache_count ;
bc - > shrink . scan_objects = bch2_btree_cache_scan ;
bc - > shrink . seeks = 4 ;
bc - > shrink . batch = btree_pages ( c ) * 2 ;
2020-11-15 16:31:58 -05:00
ret = register_shrinker ( & bc - > shrink , " %s/btree_cache " , c - > name ) ;
2017-03-16 22:18:50 -08:00
out :
pr_verbose_init ( c - > opts , " ret %i " , ret ) ;
return ret ;
}
void bch2_fs_btree_cache_init_early ( struct btree_cache * bc )
{
mutex_init ( & bc - > lock ) ;
INIT_LIST_HEAD ( & bc - > live ) ;
INIT_LIST_HEAD ( & bc - > freeable ) ;
INIT_LIST_HEAD ( & bc - > freed ) ;
}
/*
* We can only have one thread cannibalizing other cached btree nodes at a time ,
* or we ' ll deadlock . We use an open coded mutex to ensure that , which a
* cannibalize_bucket ( ) will take . This means every time we unlock the root of
* the btree , we need to release this lock if we have it held .
*/
void bch2_btree_cache_cannibalize_unlock ( struct bch_fs * c )
{
struct btree_cache * bc = & c - > btree_cache ;
if ( bc - > alloc_lock = = current ) {
trace_btree_node_cannibalize_unlock ( c ) ;
bc - > alloc_lock = NULL ;
closure_wake_up ( & bc - > alloc_wait ) ;
}
}
int bch2_btree_cache_cannibalize_lock ( struct bch_fs * c , struct closure * cl )
{
struct btree_cache * bc = & c - > btree_cache ;
struct task_struct * old ;
old = cmpxchg ( & bc - > alloc_lock , NULL , current ) ;
if ( old = = NULL | | old = = current )
goto success ;
if ( ! cl ) {
trace_btree_node_cannibalize_lock_fail ( c ) ;
return - ENOMEM ;
}
closure_wait ( & bc - > alloc_wait , cl ) ;
/* Try again, after adding ourselves to waitlist */
old = cmpxchg ( & bc - > alloc_lock , NULL , current ) ;
if ( old = = NULL | | old = = current ) {
/* We raced */
closure_wake_up ( & bc - > alloc_wait ) ;
goto success ;
}
trace_btree_node_cannibalize_lock_fail ( c ) ;
return - EAGAIN ;
success :
trace_btree_node_cannibalize_lock ( c ) ;
return 0 ;
}
static struct btree * btree_node_cannibalize ( struct bch_fs * c )
{
struct btree_cache * bc = & c - > btree_cache ;
struct btree * b ;
list_for_each_entry_reverse ( b , & bc - > live , list )
if ( ! btree_node_reclaim ( c , b ) )
return b ;
while ( 1 ) {
list_for_each_entry_reverse ( b , & bc - > live , list )
if ( ! btree_node_write_and_reclaim ( c , b ) )
return b ;
/*
* Rare case : all nodes were intent - locked .
* Just busy - wait .
*/
WARN_ONCE ( 1 , " btree cache cannibalize failed \n " ) ;
cond_resched ( ) ;
}
}
struct btree * bch2_btree_node_mem_alloc ( struct bch_fs * c )
{
struct btree_cache * bc = & c - > btree_cache ;
struct btree * b ;
u64 start_time = local_clock ( ) ;
2019-06-11 21:03:23 -04:00
unsigned flags ;
2017-03-16 22:18:50 -08:00
2019-06-11 21:03:23 -04:00
flags = memalloc_nofs_save ( ) ;
2017-03-16 22:18:50 -08:00
mutex_lock ( & bc - > lock ) ;
/*
* btree_free ( ) doesn ' t free memory ; it sticks the node on the end of
* the list . Check if there ' s any freed nodes there :
*/
list_for_each_entry ( b , & bc - > freeable , list )
if ( ! btree_node_reclaim ( c , b ) )
2020-06-09 17:49:24 -04:00
goto got_node ;
2017-03-16 22:18:50 -08:00
/*
* We never free struct btree itself , just the memory that holds the on
* disk node . Check the freed list before allocating a new one :
*/
list_for_each_entry ( b , & bc - > freed , list )
2020-06-09 17:49:24 -04:00
if ( ! btree_node_reclaim ( c , b ) )
goto got_node ;
2017-03-16 22:18:50 -08:00
2020-06-09 17:49:24 -04:00
b = NULL ;
got_node :
if ( b )
list_del_init ( & b - > list ) ;
mutex_unlock ( & bc - > lock ) ;
if ( ! b ) {
2020-07-25 15:07:37 -04:00
b = __btree_node_mem_alloc ( c ) ;
2020-06-09 17:49:24 -04:00
if ( ! b )
2017-03-16 22:18:50 -08:00
goto err ;
2020-06-09 17:49:24 -04:00
BUG_ON ( ! six_trylock_intent ( & b - > c . lock ) ) ;
BUG_ON ( ! six_trylock_write ( & b - > c . lock ) ) ;
}
if ( ! b - > data ) {
2020-07-25 15:07:37 -04:00
if ( btree_node_data_alloc ( c , b , __GFP_NOWARN | GFP_KERNEL ) )
2020-06-09 17:49:24 -04:00
goto err ;
mutex_lock ( & bc - > lock ) ;
bc - > used + + ;
mutex_unlock ( & bc - > lock ) ;
}
2017-03-16 22:18:50 -08:00
BUG_ON ( btree_node_hashed ( b ) ) ;
BUG_ON ( btree_node_write_in_flight ( b ) ) ;
out :
b - > flags = 0 ;
b - > written = 0 ;
b - > nsets = 0 ;
b - > sib_u64s [ 0 ] = 0 ;
b - > sib_u64s [ 1 ] = 0 ;
b - > whiteout_u64s = 0 ;
2020-11-02 18:20:44 -05:00
bch2_btree_keys_init ( b ) ;
2017-03-16 22:18:50 -08:00
bch2_time_stats_update ( & c - > times [ BCH_TIME_btree_node_mem_alloc ] ,
start_time ) ;
2020-05-27 14:10:27 -04:00
memalloc_nofs_restore ( flags ) ;
2017-03-16 22:18:50 -08:00
return b ;
err :
2020-06-09 17:49:24 -04:00
mutex_lock ( & bc - > lock ) ;
if ( b ) {
list_add ( & b - > list , & bc - > freed ) ;
six_unlock_write ( & b - > c . lock ) ;
six_unlock_intent ( & b - > c . lock ) ;
}
2017-03-16 22:18:50 -08:00
/* Try to cannibalize another cached btree node: */
if ( bc - > alloc_lock = = current ) {
b = btree_node_cannibalize ( c ) ;
list_del_init ( & b - > list ) ;
mutex_unlock ( & bc - > lock ) ;
bch2_btree_node_hash_remove ( bc , b ) ;
trace_btree_node_cannibalize ( c ) ;
goto out ;
}
mutex_unlock ( & bc - > lock ) ;
2020-05-27 14:10:27 -04:00
memalloc_nofs_restore ( flags ) ;
2017-03-16 22:18:50 -08:00
return ERR_PTR ( - ENOMEM ) ;
}
/*
 * Slowpath, don't want it inlined into btree_iter_traverse().
 *
 * Allocate a cache node for key @k, hash it, and start reading it in.
 *
 * Returns:
 *  - the node, holding a lock of type @lock_type, when @sync is true;
 *  - NULL when @sync is false, or when the hash insert failed because
 *    another fill of the same node got there first;
 *  - ERR_PTR(-EINTR) when @iter is given and the parent could not be
 *    relocked;
 *  - the error from bch2_btree_node_mem_alloc() otherwise.
 */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
				struct btree_iter *iter,
				const struct bkey_i *k,
				enum btree_id btree_id,
				unsigned level,
				enum six_lock_type lock_type,
				bool sync)
{
	struct btree_cache *bc = &c->btree_cache;
	const unsigned parent_level = level + 1;
	struct btree *b;

	BUG_ON(parent_level >= BTREE_MAX_DEPTH);

	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (iter && !bch2_btree_node_relock(iter, parent_level))
		return ERR_PTR(-EINTR);

	b = bch2_btree_node_mem_alloc(c);
	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);

	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	/*
	 * Unlock before doing IO:
	 *
	 * XXX: ideally should be dropping all btree node locks here
	 */
	if (iter && btree_node_read_locked(iter, parent_level))
		btree_node_unlock(iter, parent_level);

	bch2_btree_node_read(c, b, sync);

	six_unlock_write(&b->c.lock);

	if (!sync) {
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	/* We still hold the intent lock; convert it if a read lock was asked for */
	if (lock_type == SIX_LOCK_read)
		six_lock_downgrade(&b->c.lock);

	return b;
}
2020-06-12 22:29:48 -04:00
static int lock_node_check_fn ( struct six_lock * lock , void * p )
{
struct btree * b = container_of ( lock , struct btree , c . lock ) ;
const struct bkey_i * k = p ;
return b - > hash_val = = btree_ptr_hash_val ( k ) ? 0 : - 1 ;
}
2017-03-16 22:18:50 -08:00
/**
* bch_btree_node_get - find a btree node in the cache and lock it , reading it
* in from disk if necessary .
*
* If IO is necessary and running under generic_make_request , returns - EAGAIN .
*
* The btree node will have either a read or a write lock held , depending on
* the @ write parameter .
*/
struct btree * bch2_btree_node_get ( struct bch_fs * c , struct btree_iter * iter ,
const struct bkey_i * k , unsigned level ,
2020-10-28 14:17:46 -04:00
enum six_lock_type lock_type ,
unsigned long trace_ip )
2017-03-16 22:18:50 -08:00
{
struct btree_cache * bc = & c - > btree_cache ;
struct btree * b ;
struct bset_tree * t ;
EBUG_ON ( level > = BTREE_MAX_DEPTH ) ;
2020-02-24 15:25:00 -05:00
b = btree_node_mem_ptr ( k ) ;
if ( b )
goto lock_node ;
2017-03-16 22:18:50 -08:00
retry :
b = btree_cache_find ( bc , k ) ;
if ( unlikely ( ! b ) ) {
/*
* We must have the parent locked to call bch2_btree_node_fill ( ) ,
* else we could read in a btree node from disk that ' s been
* freed :
*/
2020-03-15 23:29:43 -04:00
b = bch2_btree_node_fill ( c , iter , k , iter - > btree_id ,
level , lock_type , true ) ;
2017-03-16 22:18:50 -08:00
/* We raced and found the btree node in the cache */
if ( ! b )
goto retry ;
if ( IS_ERR ( b ) )
return b ;
} else {
2020-02-24 15:25:00 -05:00
lock_node :
2017-03-16 22:18:50 -08:00
/*
* There ' s a potential deadlock with splits and insertions into
* interior nodes we have to avoid :
*
* The other thread might be holding an intent lock on the node
* we want , and they want to update its parent node so they ' re
* going to upgrade their intent lock on the parent node to a
* write lock .
*
* But if we ' re holding a read lock on the parent , and we ' re
* trying to get the intent lock they ' re holding , we deadlock .
*
* So to avoid this we drop the read locks on parent nodes when
* we ' re starting to take intent locks - and handle the race .
*
* The race is that they might be about to free the node we
* want , and dropping our read lock on the parent node lets them
* update the parent marking the node we want as freed , and then
* free it :
*
* To guard against this , btree nodes are evicted from the cache
2020-02-18 17:15:32 -05:00
* when they ' re freed - and b - > hash_val is zeroed out , which we
2017-03-16 22:18:50 -08:00
* check for after we lock the node .
*
* Then , bch2_btree_node_relock ( ) on the parent will fail - because
* the parent was modified , when the pointer to the node we want
* was removed - and we ' ll bail out :
*/
if ( btree_node_read_locked ( iter , level + 1 ) )
btree_node_unlock ( iter , level + 1 ) ;
2020-06-12 22:29:48 -04:00
if ( ! btree_node_lock ( b , k - > k . p , level , iter , lock_type ,
2020-10-28 14:17:46 -04:00
lock_node_check_fn , ( void * ) k , trace_ip ) ) {
2020-06-12 22:29:48 -04:00
if ( b - > hash_val ! = btree_ptr_hash_val ( k ) )
goto retry ;
2017-03-16 22:18:50 -08:00
return ERR_PTR ( - EINTR ) ;
2020-06-12 22:29:48 -04:00
}
2017-03-16 22:18:50 -08:00
2020-02-18 17:15:32 -05:00
if ( unlikely ( b - > hash_val ! = btree_ptr_hash_val ( k ) | |
2020-06-06 12:28:01 -04:00
b - > c . level ! = level | |
2017-03-16 22:18:50 -08:00
race_fault ( ) ) ) {
2020-06-06 12:28:01 -04:00
six_unlock_type ( & b - > c . lock , lock_type ) ;
2017-03-16 22:18:50 -08:00
if ( bch2_btree_node_relock ( iter , level + 1 ) )
goto retry ;
2019-05-15 10:54:43 -04:00
trace_trans_restart_btree_node_reused ( iter - > trans - > ip ) ;
2017-03-16 22:18:50 -08:00
return ERR_PTR ( - EINTR ) ;
}
}
2020-02-24 15:25:00 -05:00
/* XXX: waiting on IO with btree locks held: */
2017-03-16 22:18:50 -08:00
wait_on_bit_io ( & b - > flags , BTREE_NODE_read_in_flight ,
TASK_UNINTERRUPTIBLE ) ;
prefetch ( b - > aux_data ) ;
for_each_bset ( b , t ) {
void * p = ( u64 * ) b - > aux_data + t - > aux_data_offset ;
prefetch ( p + L1_CACHE_BYTES * 0 ) ;
prefetch ( p + L1_CACHE_BYTES * 1 ) ;
prefetch ( p + L1_CACHE_BYTES * 2 ) ;
}
/* avoid atomic set bit if it's not needed: */
2020-02-26 17:25:13 -05:00
if ( ! btree_node_accessed ( b ) )
2017-03-16 22:18:50 -08:00
set_btree_node_accessed ( b ) ;
if ( unlikely ( btree_node_read_error ( b ) ) ) {
2020-06-06 12:28:01 -04:00
six_unlock_type ( & b - > c . lock , lock_type ) ;
2017-03-16 22:18:50 -08:00
return ERR_PTR ( - EIO ) ;
}
2021-01-26 20:59:00 -05:00
EBUG_ON ( b - > c . btree_id ! = iter - > btree_id ) ;
EBUG_ON ( BTREE_NODE_LEVEL ( b - > data ) ! = level ) ;
2021-03-04 16:20:16 -05:00
EBUG_ON ( bpos_cmp ( b - > data - > max_key , k - > k . p ) ) ;
2021-01-26 20:59:00 -05:00
EBUG_ON ( b - > key . k . type = = KEY_TYPE_btree_ptr_v2 & &
2021-03-04 16:20:16 -05:00
bpos_cmp ( b - > data - > min_key ,
2021-01-26 20:59:00 -05:00
bkey_i_to_btree_ptr_v2 ( & b - > key ) - > v . min_key ) ) ;
2017-03-16 22:18:50 -08:00
return b ;
}
2020-03-15 23:29:43 -04:00
/*
 * Variant of bch2_btree_node_get() for callers without a btree iterator: find
 * the node for @k in the cache and take a read lock on it, reading it in
 * unless @nofill is set.
 *
 * Returns the read-locked node, NULL when the node is not cached and @nofill
 * is set, ERR_PTR(-EIO) if the node has a read error, or the error from
 * bch2_btree_node_fill().  The cannibalize lock is always dropped before
 * returning.
 */
struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *set;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);
	if (b)
		goto lock_node;
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(c, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b)) {
			/* Retry with the cannibalize lock held if we can get it */
			if (!bch2_btree_cache_cannibalize_lock(c, NULL))
				goto retry;
			goto out;
		}
	} else {
lock_node:
		/* A nonzero return means the node no longer matches @k */
		if (six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k))
			goto retry;

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);

	prefetch(b->aux_data);

	for_each_bset(b, set) {
		void *aux = (u64 *) b->aux_data + set->aux_data_offset;

		prefetch(aux + L1_CACHE_BYTES * 0);
		prefetch(aux + L1_CACHE_BYTES * 1);
		prefetch(aux + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-EIO);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
	EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		bpos_cmp(b->data->min_key,
			 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
out:
	bch2_btree_cache_cannibalize_unlock(c);
	return b;
}
2017-03-16 22:18:50 -08:00
void bch2_btree_node_prefetch ( struct bch_fs * c , struct btree_iter * iter ,
2021-01-11 16:11:02 -05:00
const struct bkey_i * k ,
enum btree_id btree_id , unsigned level )
2017-03-16 22:18:50 -08:00
{
struct btree_cache * bc = & c - > btree_cache ;
struct btree * b ;
2021-01-11 16:11:02 -05:00
BUG_ON ( iter & & ! btree_node_locked ( iter , level + 1 ) ) ;
2017-03-16 22:18:50 -08:00
BUG_ON ( level > = BTREE_MAX_DEPTH ) ;
b = btree_cache_find ( bc , k ) ;
if ( b )
return ;
2021-01-11 16:11:02 -05:00
bch2_btree_node_fill ( c , iter , k , btree_id , level , SIX_LOCK_read , false ) ;
2017-03-16 22:18:50 -08:00
}
2018-11-09 01:24:07 -05:00
void bch2_btree_node_to_text ( struct printbuf * out , struct bch_fs * c ,
struct btree * b )
2017-03-16 22:18:50 -08:00
{
const struct bkey_format * f = & b - > format ;
struct bset_stats stats ;
memset ( & stats , 0 , sizeof ( stats ) ) ;
bch2_btree_keys_stats ( b , & stats ) ;
2021-03-04 15:20:22 -05:00
pr_buf ( out , " l %u " , b - > c . level ) ;
bch2_bpos_to_text ( out , b - > data - > min_key ) ;
pr_buf ( out , " - " ) ;
bch2_bpos_to_text ( out , b - > data - > max_key ) ;
pr_buf ( out , " : \n "
" ptrs: " ) ;
2018-11-01 15:10:01 -04:00
bch2_val_to_text ( out , c , bkey_i_to_s_c ( & b - > key ) ) ;
2021-03-04 15:20:22 -05:00
2018-11-09 01:24:07 -05:00
pr_buf ( out , " \n "
" format: u64s %u fields %u %u %u %u %u \n "
" unpack fn len: %u \n "
" bytes used %zu/%zu (%zu%% full) \n "
2021-03-29 01:13:31 -04:00
" sib u64s: %u, %u (merge threshold %u) \n "
2018-11-09 01:24:07 -05:00
" nr packed keys %u \n "
" nr unpacked keys %u \n "
" floats %zu \n "
2019-10-23 14:56:20 -04:00
" failed unpacked %zu \n " ,
2018-11-09 01:24:07 -05:00
f - > key_u64s ,
f - > bits_per_field [ 0 ] ,
f - > bits_per_field [ 1 ] ,
f - > bits_per_field [ 2 ] ,
f - > bits_per_field [ 3 ] ,
f - > bits_per_field [ 4 ] ,
b - > unpack_fn_len ,
b - > nr . live_u64s * sizeof ( u64 ) ,
btree_bytes ( c ) - sizeof ( struct btree_node ) ,
b - > nr . live_u64s * 100 / btree_max_u64s ( c ) ,
b - > sib_u64s [ 0 ] ,
b - > sib_u64s [ 1 ] ,
2021-03-29 01:13:31 -04:00
c - > btree_foreground_merge_threshold ,
2018-11-09 01:24:07 -05:00
b - > nr . packed_keys ,
b - > nr . unpacked_keys ,
stats . floats ,
2019-10-23 14:56:20 -04:00
stats . failed ) ;
2017-03-16 22:18:50 -08:00
}
2020-11-19 20:13:30 -05:00
void bch2_btree_cache_to_text ( struct printbuf * out , struct bch_fs * c )
{
2021-01-11 13:37:35 -05:00
pr_buf ( out , " nr nodes: \t \t %u \n " , c - > btree_cache . used ) ;
pr_buf ( out , " nr dirty: \t \t %u \n " , atomic_read ( & c - > btree_cache . dirty ) ) ;
pr_buf ( out , " cannibalize lock: \t %p \n " , c - > btree_cache . alloc_lock ) ;
2020-11-19 20:13:30 -05:00
}