// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
const char * const bch2_btree_node_flags[] = {
#define x(f)	#f,
	BTREE_FLAGS()
#undef x
	NULL
};
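
/*
 * The cache reserve is how we guarantee that allocating memory for a new
 * btree node can always succeed; size it from the number of btree roots
 * currently in use and whether they have interior nodes.
 */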
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
	return max_t(int, 0, bc->used - bc->reserve);
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
	if (b->c.lock.readers)
		list_move(&b->list, &bc->freed_pcpu);
	else
		list_move(&b->list, &bc->freed_nonpcpu);
}
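
/*
 * Free a node's data and aux_data buffers and move it to the appropriate
 * freed list; struct btree itself is never freed here.
 */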
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvpfree(b->data, btree_bytes(c));
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	bc->used--;

	btree_node_to_freedlist(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvpmalloc(btree_bytes(c), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvpfree(b->data, btree_bytes(c));
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b;

	b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(btree_bytes(c));
	return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add_tail(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);
wait_on_io:
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			return -BCH_ERR_ENOMEM_btree_node_reclaim;

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock))
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			goto out_unlock;
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b) ||
	    btree_node_write_blocked(b) ||
	    btree_node_will_make_reachable(b))
		goto out_unlock;

	if (btree_node_dirty(b)) {
		if (!flush)
			goto out_unlock;
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}
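
/*
 * Shrinker scan: first reclaim from the freeable list (nodes whose buffers can
 * be freed without touching the hash table), then walk the live list evicting
 * clean nodes that haven't been accessed since the last scan; when a large
 * fraction of the cache is dirty, also kick off writes so that future scans
 * can make progress.
 */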
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
		bc->used * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (btree_node_accessed(b)) {
			clear_btree_node_accessed(b);
		} else if (!btree_node_reclaim(c, b)) {
			freed++;
			btree_node_data_free(c, b);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			list_move(&bc->live, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	if (&t->list != &bc->live)
		list_move_tail(&bc->live, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(bc);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	unregister_shrinker(&bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvpfree(c->verify_ondisk, btree_bytes(c));

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live);
	}

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		if (btree_node_dirty(b))
			bch2_btree_complete_write(c, b, btree_current_write(b));
		clear_btree_node_dirty_acct(c, b);

		btree_node_data_free(c, b);
	}

	BUG_ON(atomic_read(&c->btree_cache.dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	while (!list_empty(&bc->freed_nonpcpu)) {
		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live, &bc->freeable);

	mutex_init(&c->verify_lock);

	bc->shrink.count_objects	= bch2_btree_cache_count;
	bc->shrink.scan_objects		= bch2_btree_cache_scan;
	bc->shrink.seeks		= 4;
	ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
	if (ret)
		goto err;

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, c);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, c);
	return 0;
}

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}
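
/*
 * Allocate a btree node: the struct btree comes from the freed list if
 * possible and the data buffers from the freeable list if possible; otherwise
 * allocate fresh memory, and as a last resort cannibalize another cached node
 * (if this thread holds the cannibalize lock).
 */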
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b)) {
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2)) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

	mutex_lock(&bc->lock);
	bc->used++;
got_mem:
	mutex_unlock(&bc->lock);

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			b = b2;
			list_del_init(&b->list);
		}

		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, c);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
						   struct btree_path *path,
						   const struct bkey_i *k,
						   enum btree_id btree_id,
						   unsigned level,
						   enum six_lock_type lock_type,
						   bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u32 seq;

	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	/*
	 * Btree nodes read in from disk should not have the accessed bit set
	 * initially, so that linear scans don't thrash the cache:
	 */
	clear_btree_node_accessed(b);

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);

	six_unlock_write(&b->c.lock);
	seq = six_lock_seq(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* Unlock before doing IO: */
	if (trans && sync)
		bch2_trans_unlock_noassert(trans);

	bch2_btree_node_read(c, b, sync);

	if (!sync)
		return NULL;

	if (path) {
		int ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			BUG_ON(!trans->restarted);
			return ERR_PTR(ret);
		}
	}

	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
		if (path)
			trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
	}

	return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
	      "btree node header doesn't match ptr\n"
	      "btree %s level %u\n"
	      "ptr: ",
	      bch2_btree_ids[b->c.btree_id], b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_printf(&buf, "\nheader: btree %s level %llu\n"
	      "min ",
	      bch2_btree_ids[BTREE_NODE_ID(b->data)],
	      BTREE_NODE_LEVEL(b->data));
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_inconsistent(c, "%s", buf.buf);
	printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}
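
/*
 * Slow path for bch2_btree_node_get(): look the node up in the hash table,
 * reading it in from disk if it isn't cached, and take the requested lock.
 */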
static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	bool need_relock = false;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	if (unlikely(need_relock)) {
		int ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * The btree node will have either a read or a write lock held, depending on
 * @lock_type.
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unnecessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);

		bch2_btree_node_wait_on_read(b);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (trans) {
			int ret = bch2_trans_relock(trans) ?:
				bch2_btree_path_relock_intent(trans, path);
			if (ret) {
				BUG_ON(!trans->restarted);
				return ERR_PTR(ret);
			}
		}

		if (!six_relock_type(&b->c.lock, lock_type, seq))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}
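
/*
 * Variant of bch2_btree_node_get() for callers that don't have a btree_path:
 * always takes a read lock on the node, and on memory allocation failure
 * retries after taking the cannibalize lock.
 */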
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	if (c->opts.btree_node_mem_ptr_optimization) {
		b = btree_node_mem_ptr(k);
		if (b)
			goto lock_node;
	}
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(trans, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(c, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	__bch2_btree_node_wait_on_read(b);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-EIO);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(c);
	return b;
}
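
/*
 * Start an asynchronous read of a btree node into the cache; returns 0 if the
 * node is already cached or once the read has been issued.
 */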
int bch2_btree_node_prefetch(struct btree_trans *trans,
			     struct btree_path *path,
			     const struct bkey_i *k,
			     enum btree_id btree_id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(trans && !btree_node_locked(path, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_cache_find(bc, k);
	if (b)
		return 0;

	b = bch2_btree_node_fill(trans, path, k, btree_id,
				 level, SIX_LOCK_read, false);
	return PTR_ERR_OR_ZERO(b);
}
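
/*
 * Drop a btree node from the cache: wait for in-flight IO, flush it if dirty,
 * then free its buffers and remove it from the hash table.
 */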
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;
wait_on_io:
	/* not allowed to wait on io with btree locks held: */

	/* XXX we're called from btree_gc which will be holding other btree
	 * nodes locked
	 */
	__bch2_btree_node_wait_on_read(b);
	__bch2_btree_node_wait_on_write(b);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

	if (btree_node_dirty(b)) {
		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	btree_node_data_free(c, b);
	bch2_btree_node_hash_remove(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
			     const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
	       "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
	       "    format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
	       "    unpack fn len: %u\n"
	       "    bytes used %zu/%zu (%zu%% full)\n"
	       "    sib u64s: %u, %u (merge threshold %u)\n"
	       "    nr packed keys %u\n"
	       "    nr unpacked keys %u\n"
	       "    floats %zu\n"
	       "    failed unpacked %zu\n",
	       b->unpack_fn_len,
	       b->nr.live_u64s * sizeof(u64),
	       btree_bytes(c) - sizeof(struct btree_node),
	       b->nr.live_u64s * 100 / btree_max_u64s(c),
	       b->sib_u64s[0],
	       b->sib_u64s[1],
	       c->btree_foreground_merge_threshold,
	       b->nr.packed_keys,
	       b->nr.unpacked_keys,
	       stats.floats,
	       stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
{
	prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
	prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
	prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}