// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/random.h>

static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
				   struct btree_path *, struct btree *,
				   struct keylist *, unsigned);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);

/* Debug code: */

/*
 * Verify that child nodes correctly span parent node's range:
 */
static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bpos next_node = b->data->min_key;
	struct btree_node_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_btree_ptr_v2 bp;
	struct bkey unpacked;
	char buf1[100], buf2[100];

	BUG_ON(!b->c.level);

	if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
		return;

	bch2_btree_node_iter_init_from_start(&iter, b);

	while (1) {
		k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
		if (k.k->type != KEY_TYPE_btree_ptr_v2)
			break;
		bp = bkey_s_c_to_btree_ptr_v2(k);

		if (bpos_cmp(next_node, bp.v->min_key)) {
			bch2_dump_btree_node(c, b);
			panic("expected next min_key %s got %s\n",
			      (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1),
			      (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2));
		}

		bch2_btree_node_iter_advance(&iter, b);

		if (bch2_btree_node_iter_end(&iter)) {
			if (bpos_cmp(k.k->p, b->key.k.p)) {
				bch2_dump_btree_node(c, b);
				panic("expected end %s got %s\n",
				      (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1),
				      (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2));
			}
			break;
		}

		next_node = bpos_successor(k.k->p);
	}
#endif
}

/* Calculate ideal packed bkey format for new btree nodes: */

void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
{
	struct bkey_packed *k;
	struct bset_tree *t;
	struct bkey uk;

	for_each_bset(b, t)
		bset_tree_for_each_key(b, t, k)
			if (!bkey_deleted(k)) {
				uk = bkey_unpack_key(b, k);
				bch2_bkey_format_add_key(s, &uk);
			}
}

static struct bkey_format bch2_btree_calc_format(struct btree *b)
{
	struct bkey_format_state s;

	bch2_bkey_format_init(&s);
	bch2_bkey_format_add_pos(&s, b->data->min_key);
	bch2_bkey_format_add_pos(&s, b->data->max_key);
	__bch2_btree_calc_format(&s, b);

	return bch2_bkey_format_done(&s);
}

static size_t btree_node_u64s_with_format(struct btree *b,
					   struct bkey_format *new_f)
{
	struct bkey_format *old_f = &b->format;

	/* stupid integer promotion rules */
	ssize_t delta =
	    (((int) new_f->key_u64s - old_f->key_u64s) *
	     (int) b->nr.packed_keys) +
	    (((int) new_f->key_u64s - BKEY_U64s) *
	     (int) b->nr.unpacked_keys);

	BUG_ON(delta + b->nr.live_u64s < 0);

	return b->nr.live_u64s + delta;
}
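
/*
 * Worked example (added for illustration, numbers are hypothetical): if the
 * new format's key_u64s is 5, the old format's is 6, BKEY_U64s is (say) 8,
 * and the node holds 1000 packed and 10 unpacked keys, then
 * delta = (5 - 6) * 1000 + (5 - 8) * 10 = -1030, and the function returns
 * live_u64s - 1030.
 */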

/**
 * btree_node_format_fits - check if we could rewrite node with a new format
 *
 * This assumes all keys can pack with the new format -- it just checks if
 * the re-packed keys would fit inside the node itself.
 */
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
				 struct bkey_format *new_f)
{
	size_t u64s = btree_node_u64s_with_format(b, new_f);

	return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
}

/* Btree node freeing/allocation: */

static void __btree_node_free(struct bch_fs *c, struct btree *b)
{
	trace_btree_node_free(c, b);

	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_need_write(b));
	BUG_ON(b == btree_node_root(c, b));
	BUG_ON(b->ob.nr);
	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON(b->will_make_reachable);

	clear_btree_node_noevict(b);

	mutex_lock(&c->btree_cache.lock);
	list_move(&b->list, &c->btree_cache.freeable);
	mutex_unlock(&c->btree_cache.lock);
}

static void bch2_btree_node_free_inmem(struct btree_trans *trans,
				       struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_path *path;

	trans_for_each_path(trans, path)
		BUG_ON(path->l[b->c.level].b == b &&
		       path->l[b->c.level].lock_seq == b->c.lock.state.seq);

	six_lock_write(&b->c.lock, NULL, NULL);

	bch2_btree_node_hash_remove(&c->btree_cache, b);
	__btree_node_free(c, b);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}
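
/*
 * Summary (added for clarity): __bch2_btree_node_alloc() below allocates
 * on-disk space and an in-memory node for a new btree node. It first tries
 * the btree reserve cache (space left over from previous updates); otherwise
 * it allocates fresh sectors from the btree write point, retrying until it
 * finds buckets with room for a full btree node.
 */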
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
					     struct disk_reservation *res,
					     struct closure *cl,
					     unsigned flags)
{
	struct write_point *wp;
	struct btree *b;
	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
	struct open_buckets ob = { .nr = 0 };
	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
	unsigned nr_reserve;
	enum alloc_reserve alloc_reserve;

	if (flags & BTREE_INSERT_USE_RESERVE) {
		nr_reserve	= 0;
		alloc_reserve	= RESERVE_BTREE_MOVINGGC;
	} else {
		nr_reserve	= BTREE_NODE_RESERVE;
		alloc_reserve	= RESERVE_BTREE;
	}

	mutex_lock(&c->btree_reserve_cache_lock);
	if (c->btree_reserve_cache_nr > nr_reserve) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		ob = a->ob;
		bkey_copy(&tmp.k, &a->k);
		mutex_unlock(&c->btree_reserve_cache_lock);
		goto mem_alloc;
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

retry:
	wp = bch2_alloc_sectors_start(c,
				      c->opts.metadata_target ?:
				      c->opts.foreground_target,
				      0,
				      writepoint_ptr(&c->btree_write_point),
				      &devs_have,
				      res->nr_replicas,
				      c->opts.metadata_replicas_required,
				      alloc_reserve, 0, cl);
	if (IS_ERR(wp))
		return ERR_CAST(wp);

	if (wp->sectors_free < btree_sectors(c)) {
		struct open_bucket *ob;
		unsigned i;

		open_bucket_for_each(c, &wp->ptrs, ob, i)
			if (ob->sectors_free < btree_sectors(c))
				ob->sectors_free = 0;

		bch2_alloc_sectors_done(c, wp);
		goto retry;
	}

	bkey_btree_ptr_v2_init(&tmp.k);
	bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);

	bch2_open_bucket_get(c, wp, &ob);
	bch2_alloc_sectors_done(c, wp);
mem_alloc:
	b = bch2_btree_node_mem_alloc(c);
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* we hold cannibalize_lock: */
	BUG_ON(IS_ERR(b));
	BUG_ON(b->ob.nr);

	bkey_copy(&b->key, &tmp.k);
	b->ob = ob;

	return b;
}

static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	int ret;

	BUG_ON(level >= BTREE_MAX_DEPTH);
	BUG_ON(!as->nr_prealloc_nodes);

	b = as->prealloc_nodes[--as->nr_prealloc_nodes];

	six_lock_intent(&b->c.lock, NULL, NULL);
	six_lock_write(&b->c.lock, NULL, NULL);

	set_btree_node_accessed(b);
	set_btree_node_dirty(c, b);
	set_btree_node_need_write(b);

	bch2_bset_init_first(b, &b->data->keys);
	b->c.level	= level;
	b->c.btree_id	= as->btree_id;
	b->version_ondisk = c->sb.version;

	memset(&b->nr, 0, sizeof(b->nr));
	b->data->magic = cpu_to_le64(bset_magic(c));
	memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
	b->data->flags = 0;
	SET_BTREE_NODE_ID(b->data, as->btree_id);
	SET_BTREE_NODE_LEVEL(b->data, level);

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);

		bp->v.mem_ptr		= 0;
		bp->v.seq		= b->data->keys.seq;
		bp->v.sectors_written	= 0;
	}

	SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);

	bch2_btree_build_aux_trees(b);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
	BUG_ON(ret);

	trace_btree_node_alloc(c, b);
	return b;
}

static void btree_set_min(struct btree *b, struct bpos pos)
{
	if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
	b->data->min_key = pos;
}

static void btree_set_max(struct btree *b, struct bpos pos)
{
	b->key.k.p = pos;
	b->data->max_key = pos;
}

struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
						  struct btree *b,
						  struct bkey_format format)
{
	struct btree *n;

	n = bch2_btree_node_alloc(as, b->c.level);

	SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);

	btree_set_min(n, b->data->min_key);
	btree_set_max(n, b->data->max_key);

	n->data->format		= format;
	btree_node_set_format(n, format);

	bch2_btree_sort_into(as->c, n, b);

	btree_node_reset_sib_u64s(n);

	n->key.k.p = b->key.k.p;
	return n;
}

static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
						       struct btree *b)
{
	struct bkey_format new_f = bch2_btree_calc_format(b);

	/*
	 * The keys might expand with the new format - if they wouldn't fit in
	 * the btree node anymore, use the old format for now:
	 */
	if (!bch2_btree_node_format_fits(as->c, b, &new_f))
		new_f = b->format;

	return __bch2_btree_node_alloc_replacement(as, b, new_f);
}

static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
{
	struct btree *b = bch2_btree_node_alloc(as, level);

	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);

	b->data->format = bch2_btree_calc_format(b);
	btree_node_set_format(b, b->data->format);
	bch2_btree_build_aux_trees(b);

	bch2_btree_update_add_new_node(as, b);
	six_unlock_write(&b->c.lock);

	return b;
}

static void bch2_btree_reserve_put(struct btree_update *as)
{
	struct bch_fs *c = as->c;

	mutex_lock(&c->btree_reserve_cache_lock);

	while (as->nr_prealloc_nodes) {
		struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];

		six_lock_intent(&b->c.lock, NULL, NULL);
		six_lock_write(&b->c.lock, NULL, NULL);

		if (c->btree_reserve_cache_nr <
		    ARRAY_SIZE(c->btree_reserve_cache)) {
			struct btree_alloc *a =
				&c->btree_reserve_cache[c->btree_reserve_cache_nr++];

			a->ob = b->ob;
			b->ob.nr = 0;
			bkey_copy(&a->k, &b->key);
		} else {
			bch2_open_buckets_put(c, &b->ob);
		}

		__btree_node_free(c, b);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
	}

	mutex_unlock(&c->btree_reserve_cache_lock);
}

static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
				  unsigned flags, struct closure *cl)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	int ret;

	BUG_ON(nr_nodes > BTREE_RESERVE_MAX);

	/*
	 * Protects reaping from the btree node cache and using the btree node
	 * open bucket reserve:
	 */
	ret = bch2_btree_cache_cannibalize_lock(c, cl);
	if (ret)
		return ret;

	while (as->nr_prealloc_nodes < nr_nodes) {
		b = __bch2_btree_node_alloc(c, &as->disk_res,
					    flags & BTREE_INSERT_NOWAIT
					    ? NULL : cl, flags);
		if (IS_ERR(b)) {
			ret = PTR_ERR(b);
			goto err_free;
		}

		as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
	}

	bch2_btree_cache_cannibalize_unlock(c);
	return 0;
err_free:
	bch2_btree_cache_cannibalize_unlock(c);
	trace_btree_reserve_get_fail(c, nr_nodes, cl);
	return ret;
}

/* Asynchronous interior node update machinery */
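
/*
 * Overview (added summary): a struct btree_update collects everything needed
 * to make a set of new btree nodes reachable - the keys going into the parent
 * (or the new root), journal pins, open buckets, and the nodes being
 * replaced. bch2_btree_update_start() allocates and reserves,
 * the split/rewrite code fills it in, bch2_btree_update_done() hands it off
 * to a closure, and btree_update_nodes_written() later does the transactional
 * part once the new nodes have hit disk.
 */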

static void bch2_btree_update_free(struct btree_update *as)
{
	struct bch_fs *c = as->c;

	if (as->took_gc_lock)
		up_read(&c->gc_lock);
	as->took_gc_lock = false;

	bch2_journal_preres_put(&c->journal, &as->journal_preres);
	bch2_journal_pin_drop(&c->journal, &as->journal);
	bch2_journal_pin_flush(&c->journal, &as->journal);
	bch2_disk_reservation_put(c, &as->disk_res);
	bch2_btree_reserve_put(as);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
			       as->start_time);

	mutex_lock(&c->btree_interior_update_lock);
	list_del(&as->unwritten_list);
	list_del(&as->list);

	closure_debug_destroy(&as->cl);
	mempool_free(as, &c->btree_interior_update_pool);

	/*
	 * Have to do the wakeup with btree_interior_update_lock still held,
	 * since being on btree_interior_update_list is our ref on @c:
	 */
	closure_wake_up(&c->btree_interior_update_wait);

	mutex_unlock(&c->btree_interior_update_lock);
}

static void btree_update_will_delete_key(struct btree_update *as,
					 struct bkey_i *k)
{
	BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s >
	       ARRAY_SIZE(as->_old_keys));
	bch2_keylist_add(&as->old_keys, k);
}

static void btree_update_will_add_key(struct btree_update *as,
				      struct bkey_i *k)
{
	BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s >
	       ARRAY_SIZE(as->_new_keys));
	bch2_keylist_add(&as->new_keys, k);
}
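
/*
 * Note (added for clarity): the old_keys/new_keys lists built up above are
 * consumed in btree_update_nodes_written_trans() below, where new keys are
 * marked with BTREE_TRIGGER_INSERT and old keys with BTREE_TRIGGER_OVERWRITE,
 * so allocation/replicas accounting stays consistent with the interior node
 * update.
 */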

/*
 * The transactional part of an interior btree node update, where we journal the
 * update we did to the interior node and update alloc info:
 */
static int btree_update_nodes_written_trans(struct btree_trans *trans,
					    struct btree_update *as)
{
	struct bkey_i *k;
	int ret;

	trans->extra_journal_entries = (void *) &as->journal_entries[0];
	trans->extra_journal_entry_u64s = as->journal_u64s;
	trans->journal_pin = &as->journal;

	for_each_keylist_key(&as->new_keys, k) {
		ret = bch2_trans_mark_key(trans,
					  bkey_s_c_null,
					  bkey_i_to_s_c(k),
					  BTREE_TRIGGER_INSERT);
		if (ret)
			return ret;
	}

	for_each_keylist_key(&as->old_keys, k) {
		ret = bch2_trans_mark_key(trans,
					  bkey_i_to_s_c(k),
					  bkey_s_c_null,
					  BTREE_TRIGGER_OVERWRITE);
		if (ret)
			return ret;
	}

	return 0;
}

static void btree_update_nodes_written(struct btree_update *as)
{
	struct bch_fs *c = as->c;
	struct btree *b = as->b;
	struct btree_trans trans;
	u64 journal_seq = 0;
	unsigned i;
	int ret;

	/*
	 * If we're already in an error state, it might be because a btree node
	 * was never written, and we might be trying to free that same btree
	 * node here, but it won't have been marked as allocated and we'll see
	 * spurious disk usage inconsistencies in the transactional part below
	 * if we don't skip it:
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	BUG_ON(!journal_pin_active(&as->journal));

	/*
	 * Wait for any in flight writes to finish before we free the old nodes
	 * on disk:
	 */
	for (i = 0; i < as->nr_old_nodes; i++) {
		struct btree *old = as->old_nodes[i];
		__le64 seq;

		six_lock_read(&old->c.lock, NULL, NULL);
		seq = old->data ? old->data->keys.seq : 0;
		six_unlock_read(&old->c.lock);

		if (seq == as->old_nodes_seq[i])
			wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner,
				       TASK_UNINTERRUPTIBLE);
	}

	/*
	 * We did an update to a parent node where the pointers we added pointed
	 * to child nodes that weren't written yet: now, the child nodes have
	 * been written so we can write out the update to the interior node.
	 */

	/*
	 * We can't call into journal reclaim here: we'd block on the journal
	 * reclaim lock, but we may need to release the open buckets we have
	 * pinned in order for other btree updates to make forward progress, and
	 * journal reclaim does btree updates when flushing bkey_cached entries,
	 * which may require allocations as well.
	 */
	bch2_trans_init(&trans, c, 0, 512);
	ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
			      BTREE_INSERT_NOFAIL|
			      BTREE_INSERT_NOCHECK_RW|
			      BTREE_INSERT_JOURNAL_RECLAIM|
			      BTREE_INSERT_JOURNAL_RESERVED,
			      btree_update_nodes_written_trans(&trans, as));
	bch2_trans_exit(&trans);

	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
			     "error %i in btree_update_nodes_written()", ret);
err:
	if (b) {
		/*
		 * @b is the node we did the final insert into:
		 *
		 * On failure to get a journal reservation, we still have to
		 * unblock the write and allow most of the write path to happen
		 * so that shutdown works, but the i->journal_seq mechanism
		 * won't work to prevent the btree write from being visible (we
		 * didn't get a journal sequence number) - instead
		 * __bch2_btree_node_write() doesn't do the actual write if
		 * we're in journal error state:
		 */

		six_lock_intent(&b->c.lock, NULL, NULL);
		six_lock_write(&b->c.lock, NULL, NULL);
		mutex_lock(&c->btree_interior_update_lock);

		list_del(&as->write_blocked_list);

		/*
		 * Node might have been freed, recheck under
		 * btree_interior_update_lock:
		 */
		if (as->b == b) {
			struct bset *i = btree_bset_last(b);

			BUG_ON(!b->c.level);
			BUG_ON(!btree_node_dirty(b));

			if (!ret) {
				i->journal_seq = cpu_to_le64(
					max(journal_seq,
					    le64_to_cpu(i->journal_seq)));

				bch2_btree_add_journal_pin(c, b, journal_seq);
			} else {
				/*
				 * If we didn't get a journal sequence number we
				 * can't write this btree node, because recovery
				 * won't know to ignore this write:
				 */
				set_btree_node_never_write(b);
			}
		}

		mutex_unlock(&c->btree_interior_update_lock);
		six_unlock_write(&b->c.lock);

		btree_node_write_if_need(c, b, SIX_LOCK_intent);
		six_unlock_intent(&b->c.lock);
	}

	bch2_journal_pin_drop(&c->journal, &as->journal);
	bch2_journal_preres_put(&c->journal, &as->journal_preres);

	mutex_lock(&c->btree_interior_update_lock);
	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];

		BUG_ON(b->will_make_reachable != (unsigned long) as);
		b->will_make_reachable = 0;
	}
	mutex_unlock(&c->btree_interior_update_lock);

	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];

		six_lock_read(&b->c.lock, NULL, NULL);
		btree_node_write_if_need(c, b, SIX_LOCK_read);
		six_unlock_read(&b->c.lock);
	}

	for (i = 0; i < as->nr_open_buckets; i++)
		bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);

	bch2_btree_update_free(as);
}

static void btree_interior_update_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, btree_interior_update_work);
	struct btree_update *as;

	while (1) {
		mutex_lock(&c->btree_interior_update_lock);
		as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
					      struct btree_update, unwritten_list);
		if (as && !as->nodes_written)
			as = NULL;
		mutex_unlock(&c->btree_interior_update_lock);

		if (!as)
			break;

		btree_update_nodes_written(as);
	}
}

static void btree_update_set_nodes_written(struct closure *cl)
{
	struct btree_update *as = container_of(cl, struct btree_update, cl);
	struct bch_fs *c = as->c;

	mutex_lock(&c->btree_interior_update_lock);
	as->nodes_written = true;
	mutex_unlock(&c->btree_interior_update_lock);

	queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
}

/*
 * We're updating @b with pointers to nodes that haven't finished writing yet:
 * block @b from being written until @as completes
 */
static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);

	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
	BUG_ON(!btree_node_dirty(b));

	as->mode	= BTREE_INTERIOR_UPDATING_NODE;
	as->b		= b;
	list_add(&as->write_blocked_list, &b->write_blocked);

	mutex_unlock(&c->btree_interior_update_lock);
}

static void btree_update_reparent(struct btree_update *as,
				  struct btree_update *child)
{
	struct bch_fs *c = as->c;

	lockdep_assert_held(&c->btree_interior_update_lock);

	child->b = NULL;
	child->mode = BTREE_INTERIOR_UPDATING_AS;

	bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
}

static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
	struct bkey_i *insert = &b->key;
	struct bch_fs *c = as->c;

	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);

	as->mode = BTREE_INTERIOR_UPDATING_ROOT;
	mutex_unlock(&c->btree_interior_update_lock);
}
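
/*
 * Note (added for clarity): the BCH_JSET_ENTRY_btree_root entry staged here is
 * emitted with the rest of as->journal_entries when the update's transaction
 * commits, which is what lets recovery find the new root.
 */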

/*
 * bch2_btree_update_add_new_node:
 *
 * This causes @as to wait on @b to be written, before it gets to
 * bch2_btree_update_nodes_written
 *
 * Additionally, it sets b->will_make_reachable to prevent any additional writes
 * to @b from happening besides the first until @b is reachable on disk
 *
 * And it adds @b to the list of @as's new nodes, so that we can update sector
 * counts in bch2_btree_update_nodes_written:
 */
static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	closure_get(&as->cl);

	mutex_lock(&c->btree_interior_update_lock);
	BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
	BUG_ON(b->will_make_reachable);

	as->new_nodes[as->nr_new_nodes++] = b;
	b->will_make_reachable = 1UL|(unsigned long) as;

	mutex_unlock(&c->btree_interior_update_lock);

	btree_update_will_add_key(as, &b->key);
}
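
/*
 * Note (added for clarity): will_make_reachable is a tagged pointer - the low
 * bit records that we hold the closure ref taken above, and whoever clears the
 * field (btree_update_drop_new_node() below, or the btree write completion
 * path) puts that ref if the bit was set.
 */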

/*
 * returns true if @b was a new node
 */
static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
{
	struct btree_update *as;
	unsigned long v;
	unsigned i;

	mutex_lock(&c->btree_interior_update_lock);
	/*
	 * When b->will_make_reachable != 0, it owns a ref on as->cl that's
	 * dropped when it gets written by bch2_btree_complete_write - the
	 * xchg() is for synchronization with bch2_btree_complete_write:
	 */
	v = xchg(&b->will_make_reachable, 0);
	as = (struct btree_update *) (v & ~1UL);

	if (!as) {
		mutex_unlock(&c->btree_interior_update_lock);
		return;
	}

	for (i = 0; i < as->nr_new_nodes; i++)
		if (as->new_nodes[i] == b)
			goto found;

	BUG();
found:
	array_remove_item(as->new_nodes, as->nr_new_nodes, i);
	mutex_unlock(&c->btree_interior_update_lock);

	if (v & 1)
		closure_put(&as->cl);
}

static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
{
	while (b->ob.nr)
		as->open_buckets[as->nr_open_buckets++] =
			b->ob.v[--b->ob.nr];
}
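
/*
 * Note (added for clarity): open buckets transferred here are held by @as
 * until btree_update_nodes_written() puts them, i.e. until the new nodes are
 * fully written out.
 */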

/*
 * @b is being split/rewritten: it may have pointers to not-yet-written btree
 * nodes and thus outstanding btree_updates - redirect @b's
 * btree_updates to point to this btree_update:
 */
static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
						      struct btree *b)
{
	struct bch_fs *c = as->c;
	struct btree_update *p, *n;
	struct btree_write *w;

	set_btree_node_dying(b);

	if (btree_node_fake(b))
		return;

	mutex_lock(&c->btree_interior_update_lock);

	/*
	 * Does this node have any btree_update operations preventing
	 * it from being written?
	 *
	 * If so, redirect them to point to this btree_update: we can
	 * write out our new nodes, but we won't make them visible until those
	 * operations complete
	 */
	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
		list_del_init(&p->write_blocked_list);
		btree_update_reparent(as, p);

		/*
		 * for flush_held_btree_writes() waiting on updates to flush or
		 * nodes to be writeable:
		 */
		closure_wake_up(&c->btree_interior_update_wait);
	}

	clear_btree_node_dirty(c, b);
	clear_btree_node_need_write(b);

	/*
	 * Does this node have unwritten data that has a pin on the journal?
	 *
	 * If so, transfer that pin to the btree_update operation -
	 * note that if we're freeing multiple nodes, we only need to keep the
	 * oldest pin of any of the nodes we're freeing. We'll release the pin
	 * when the new nodes are persistent and reachable on disk:
	 */
	w = btree_current_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	w = btree_prev_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * Is this a node that isn't reachable on disk yet?
	 *
	 * Nodes that aren't reachable yet have writes blocked until they're
	 * reachable - now that we've cancelled any pending writes and moved
	 * things waiting on that write to wait on this update, we can drop this
	 * node from the list of nodes that the other update is making
	 * reachable, prior to freeing it:
	 */
	btree_update_drop_new_node(c, b);

	btree_update_will_delete_key(as, &b->key);

	as->old_nodes[as->nr_old_nodes] = b;
	as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
	as->nr_old_nodes++;
}

static void bch2_btree_update_done(struct btree_update *as)
{
	struct bch_fs *c = as->c;
	u64 start_time = as->start_time;

	BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);

	if (as->took_gc_lock)
		up_read(&as->c->gc_lock);
	as->took_gc_lock = false;

	bch2_btree_reserve_put(as);

	continue_at(&as->cl, btree_update_set_nodes_written,
		    as->c->btree_interior_update_worker);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
			       start_time);
}

static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
			unsigned level, unsigned nr_nodes, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_update *as;
	struct closure cl;
	u64 start_time = local_clock();
	int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
		? BCH_DISK_RESERVATION_NOFAIL : 0;
	int journal_flags = 0;
	int ret = 0;

	BUG_ON(!path->should_be_locked);

	if (flags & BTREE_INSERT_JOURNAL_RESERVED)
		journal_flags |= JOURNAL_RES_GET_RESERVED;

	closure_init_stack(&cl);
retry:
	/*
	 * XXX: figure out how far we might need to split,
	 * instead of locking/reserving all the way to the root:
	 */
	if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
		trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
						 path->btree_id, &path->pos);
		ret = btree_trans_restart(trans);
		return ERR_PTR(ret);
	}

	if (flags & BTREE_INSERT_GC_LOCK_HELD)
		lockdep_assert_held(&c->gc_lock);
	else if (!down_read_trylock(&c->gc_lock)) {
		bch2_trans_unlock(trans);
		down_read(&c->gc_lock);
		if (!bch2_trans_relock(trans)) {
			up_read(&c->gc_lock);
			return ERR_PTR(-EINTR);
		}
	}

	as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
	memset(as, 0, sizeof(*as));
	closure_init(&as->cl, NULL);
	as->c		= c;
	as->start_time	= start_time;
	as->mode	= BTREE_INTERIOR_NO_UPDATE;
	as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
	as->btree_id	= path->btree_id;
	INIT_LIST_HEAD(&as->list);
	INIT_LIST_HEAD(&as->unwritten_list);
	INIT_LIST_HEAD(&as->write_blocked_list);
	bch2_keylist_init(&as->old_keys, as->_old_keys);
	bch2_keylist_init(&as->new_keys, as->_new_keys);
	bch2_keylist_init(&as->parent_keys, as->inline_keys);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->list, &c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * We don't want to allocate if we're in an error state, that can cause
	 * deadlock on emergency shutdown due to open buckets getting stuck in
	 * the btree_reserve_cache after allocator shutdown has cleared it out.
	 * This check needs to come after adding us to the btree_interior_update
	 * list but before calling bch2_btree_reserve_get, to synchronize with
	 * __bch2_fs_read_only().
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
				      BTREE_UPDATE_JOURNAL_RES,
				      journal_flags|JOURNAL_RES_GET_NONBLOCK);
	if (ret == -EAGAIN) {
		bch2_trans_unlock(trans);

		if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
			bch2_btree_update_free(as);
			btree_trans_restart(trans);
			return ERR_PTR(ret);
		}

		ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
					      BTREE_UPDATE_JOURNAL_RES,
					      journal_flags);
		if (ret) {
			trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_);
			goto err;
		}

		if (!bch2_trans_relock(trans)) {
			ret = -EINTR;
			goto err;
		}
	}

	ret = bch2_disk_reservation_get(c, &as->disk_res,
			nr_nodes * btree_sectors(c),
			c->opts.metadata_replicas,
			disk_res_flags);
	if (ret)
		goto err;

	ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
	if (ret)
		goto err;

	bch2_journal_pin_add(&c->journal,
			     atomic64_read(&c->journal.seq),
			     &as->journal, NULL);

	return as;
err:
	bch2_btree_update_free(as);

	if (ret == -EAGAIN) {
		bch2_trans_unlock(trans);
		closure_sync(&cl);
		ret = -EINTR;
	}

	if (ret == -EINTR && bch2_trans_relock(trans))
		goto retry;

	return ERR_PTR(ret);
}
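
/*
 * Note (added for clarity): on -EAGAIN from allocation the error path above
 * unlocks the transaction, waits on @cl for space to become available and
 * converts the error to -EINTR; if the transaction can be relocked we retry
 * from the top, otherwise the caller sees a transaction restart.
 */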

/* Btree root updates: */

static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
{
	/* Root nodes cannot be reaped */
	mutex_lock(&c->btree_cache.lock);
	list_del_init(&b->list);
	mutex_unlock(&c->btree_cache.lock);

	if (b->c.level)
		six_lock_pcpu_alloc(&b->c.lock);
	else
		six_lock_pcpu_free(&b->c.lock);

	mutex_lock(&c->btree_root_lock);
	BUG_ON(btree_node_root(c, b) &&
	       (b->c.level < btree_node_root(c, b)->c.level ||
		!btree_node_dying(btree_node_root(c, b))));

	btree_node_root(c, b) = b;
	mutex_unlock(&c->btree_root_lock);

	bch2_recalc_btree_reserve(c);
}

/**
 * bch_btree_set_root - update the root in memory and on disk
 *
 * To ensure forward progress, the current task must not be holding any
 * btree node write locks. However, you must hold an intent lock on the
 * old root.
 *
 * Note: This allocates a journal entry but doesn't add any keys to
 * it. All the btree roots are part of every journal write, so there
 * is nothing new to be done. This just guarantees that there is a
 * journal write.
 */
static void bch2_btree_set_root(struct btree_update *as,
				struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b)
{
	struct bch_fs *c = as->c;
	struct btree *old;

	trace_btree_set_root(c, b);
	BUG_ON(!b->written &&
	       !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));

	old = btree_node_root(c, b);

	/*
	 * Ensure no one is using the old root while we switch to the
	 * new root:
	 */
	bch2_btree_node_lock_write(trans, path, old);

	bch2_btree_set_root_inmem(c, b);

	btree_update_updated_root(as, b);

	/*
	 * Unlock old root after new root is visible:
	 *
	 * The new root isn't persistent, but that's ok: we still have
	 * an intent lock on the new root, and any updates that would
	 * depend on the new root would have to update the new root.
	 */
	bch2_btree_node_unlock_write(trans, path, old);
}

/* Interior node updates: */

static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
					struct btree_trans *trans,
					struct btree_path *path,
					struct btree *b,
					struct btree_node_iter *node_iter,
					struct bkey_i *insert)
{
	struct bch_fs *c = as->c;
	struct bkey_packed *k;
	const char *invalid;

	BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
	       !btree_ptr_sectors_written(insert));

	if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
		bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);

	invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
		bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
	if (invalid) {
		char buf[160];

		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
		bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
		dump_stack();
	}

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_keys,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);

	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
	       bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
		bch2_btree_node_iter_advance(node_iter, b);

	bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
	set_btree_node_dirty(c, b);
	set_btree_node_need_write(b);
}

static void
__bch2_btree_insert_keys_interior(struct btree_update *as,
				  struct btree_trans *trans,
				  struct btree_path *path,
				  struct btree *b,
				  struct btree_node_iter node_iter,
				  struct keylist *keys)
{
	struct bkey_i *insert = bch2_keylist_front(keys);
	struct bkey_packed *k;

	BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);

	while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
	       (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
		;

	while (!bch2_keylist_empty(keys)) {
		bch2_insert_fixup_btree_ptr(as, trans, path, b,
				&node_iter, bch2_keylist_front(keys));
		bch2_keylist_pop_front(keys);
	}
}

/*
 * Move keys from n1 (original replacement node, now lower node) to n2 (higher
 * node)
 */
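
/*
 * Note (added for clarity): the pivot is chosen below by walking set1 until
 * roughly 3/5 of its u64s have been seen, so n1 keeps the first ~60% of the
 * keys and n2 gets the rest, repacked into a format recomputed for its
 * narrower key range.
 */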
static struct btree *__btree_split_node(struct btree_update *as,
					struct btree *n1)
{
	struct bkey_format_state s;
	size_t nr_packed = 0, nr_unpacked = 0;
	struct btree *n2;
	struct bset *set1, *set2;
	struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
	struct bpos n1_pos;

	n2 = bch2_btree_node_alloc(as, n1->c.level);
	bch2_btree_update_add_new_node(as, n2);

	n2->data->max_key	= n1->data->max_key;
	n2->data->format	= n1->format;
	SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
	n2->key.k.p = n1->key.k.p;

	set1 = btree_bset_first(n1);
	set2 = btree_bset_first(n2);

	/*
	 * Has to be a linear search because we don't have an auxiliary
	 * search tree yet
	 */
	k = set1->start;
	while (1) {
		struct bkey_packed *n = bkey_next(k);

		if (n == vstruct_last(set1))
			break;
		if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
			break;

		if (bkey_packed(k))
			nr_packed++;
		else
			nr_unpacked++;

		prev = k;
		k = n;
	}

	BUG_ON(!prev);
	set2_start	= k;
	set2_end	= vstruct_last(set1);

	set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
	set_btree_bset_end(n1, n1->set);

	n1->nr.live_u64s	= le16_to_cpu(set1->u64s);
	n1->nr.bset_u64s[0]	= le16_to_cpu(set1->u64s);
	n1->nr.packed_keys	= nr_packed;
	n1->nr.unpacked_keys	= nr_unpacked;

	n1_pos = bkey_unpack_pos(n1, prev);
	if (as->c->sb.version < bcachefs_metadata_version_snapshot)
		n1_pos.snapshot = U32_MAX;

	btree_set_max(n1, n1_pos);
	btree_set_min(n2, bpos_successor(n1->key.k.p));

	bch2_bkey_format_init(&s);
	bch2_bkey_format_add_pos(&s, n2->data->min_key);
	bch2_bkey_format_add_pos(&s, n2->data->max_key);

	for (k = set2_start; k != set2_end; k = bkey_next(k)) {
		struct bkey uk = bkey_unpack_key(n1, k);
		bch2_bkey_format_add_key(&s, &uk);
	}

	n2->data->format = bch2_bkey_format_done(&s);
	btree_node_set_format(n2, n2->data->format);

	out = set2->start;
	memset(&n2->nr, 0, sizeof(n2->nr));

	for (k = set2_start; k != set2_end; k = bkey_next(k)) {
		BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
				       ? &n1->format : &bch2_bkey_format_current, k));
		out->format = KEY_FORMAT_LOCAL_BTREE;
		btree_keys_account_key_add(&n2->nr, 0, out);
		out = bkey_next(out);
	}

	set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
	set_btree_bset_end(n2, n2->set);

	BUG_ON(!set1->u64s);
	BUG_ON(!set2->u64s);

	btree_node_reset_sib_u64s(n1);
	btree_node_reset_sib_u64s(n2);

	bch2_verify_btree_nr_keys(n1);
	bch2_verify_btree_nr_keys(n2);

	if (n1->c.level) {
		btree_node_interior_verify(as->c, n1);
		btree_node_interior_verify(as->c, n2);
	}

	return n2;
}

/*
 * For updates to interior nodes, we've got to do the insert before we split
 * because the stuff we're inserting has to be inserted atomically. Post split,
 * the keys might have to go in different nodes and the split would no longer be
 * atomic.
 *
 * Worse, if the insert is from btree node coalescing, if we do the insert after
 * we do the split (and pick the pivot) - the pivot we pick might be between
 * nodes that were coalesced, and thus in the middle of a child node post
 * coalescing:
 */
static void btree_split_insert_keys(struct btree_update *as,
				    struct btree_trans *trans,
				    struct btree_path *path,
				    struct btree *b,
				    struct keylist *keys)
{
	struct btree_node_iter node_iter;
	struct bkey_i *k = bch2_keylist_front(keys);
	struct bkey_packed *src, *dst, *n;
	struct bset *i;

	bch2_btree_node_iter_init(&node_iter, b, &k->k.p);

	__bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);

	/*
	 * We can't tolerate whiteouts here - with whiteouts there can be
	 * duplicate keys, and it would be rather bad if we picked a duplicate
	 * for the pivot:
	 */
	i = btree_bset_first(b);
	src = dst = i->start;
	while (src != vstruct_last(i)) {
		n = bkey_next(src);
		if (!bkey_deleted(src)) {
			memmove_u64s_down(dst, src, src->u64s);
			dst = bkey_next(dst);
		}
		src = n;
	}

	/* Also clear out the unwritten whiteouts area: */
	b->whiteout_u64s = 0;

	i->u64s = cpu_to_le16((u64 *) dst - i->_data);
	set_btree_bset_end(b, b->set);

	BUG_ON(b->nsets != 1 ||
	       b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));

	btree_node_interior_verify(as->c, b);
}
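
/*
 * Summary (added for clarity): btree_split() replaces @b with one new node
 * (a compaction) or, if the result would exceed BTREE_SPLIT_THRESHOLD(c),
 * with two nodes n1/n2; if @b was the root, a new root n3 is allocated one
 * level up to point at them. The new keys are then inserted into the parent
 * via bch2_btree_insert_node(), which may in turn split the parent.
 */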
2021-08-30 22:18:31 +03:00
static void btree_split ( struct btree_update * as , struct btree_trans * trans ,
struct btree_path * path , struct btree * b ,
struct keylist * keys , unsigned flags )
2017-03-17 09:18:50 +03:00
{
struct bch_fs * c = as - > c ;
2021-08-30 22:18:31 +03:00
struct btree * parent = btree_node_parent ( path , b ) ;
2017-03-17 09:18:50 +03:00
struct btree * n1 , * n2 = NULL , * n3 = NULL ;
u64 start_time = local_clock ( ) ;
BUG_ON ( ! parent & & ( b ! = btree_node_root ( c , b ) ) ) ;
2021-08-30 22:18:31 +03:00
BUG_ON ( ! btree_node_intent_locked ( path , btree_node_root ( c , b ) - > c . level ) ) ;
2017-03-17 09:18:50 +03:00
bch2_btree_interior_update_will_free_node ( as , b ) ;
n1 = bch2_btree_node_alloc_replacement ( as , b ) ;
2020-05-25 21:57:06 +03:00
bch2_btree_update_add_new_node ( as , n1 ) ;
2017-03-17 09:18:50 +03:00
if ( keys )
2021-08-30 22:18:31 +03:00
btree_split_insert_keys ( as , trans , path , n1 , keys ) ;
2017-03-17 09:18:50 +03:00
2020-01-15 23:11:22 +03:00
if ( bset_u64s ( & n1 - > set [ 0 ] ) > BTREE_SPLIT_THRESHOLD ( c ) ) {
2017-03-17 09:18:50 +03:00
trace_btree_split ( c , b ) ;
2021-08-30 22:18:31 +03:00
n2 = __btree_split_node ( as , n1 ) ;
2017-03-17 09:18:50 +03:00
bch2_btree_build_aux_trees ( n2 ) ;
bch2_btree_build_aux_trees ( n1 ) ;
2020-06-06 19:28:01 +03:00
six_unlock_write ( & n2 - > c . lock ) ;
six_unlock_write ( & n1 - > c . lock ) ;
2017-03-17 09:18:50 +03:00
2021-07-10 20:44:42 +03:00
bch2_btree_node_write ( c , n1 , SIX_LOCK_intent ) ;
2017-03-17 09:18:50 +03:00
bch2_btree_node_write ( c , n2 , SIX_LOCK_intent ) ;
/*
* Note that on recursive splits parent_keys == keys, so we can't start
* adding new keys to parent_keys before emptying it out (which we did
* with btree_split_insert_keys() above)
*/
bch2_keylist_add ( & as - > parent_keys , & n1 - > key ) ;
bch2_keylist_add ( & as - > parent_keys , & n2 - > key ) ;
if ( ! parent ) {
/* Depth increases, make a new root */
2020-06-06 19:28:01 +03:00
n3 = __btree_root_alloc ( as , b - > c . level + 1 ) ;
2017-03-17 09:18:50 +03:00
n3 - > sib_u64s [ 0 ] = U16_MAX ;
n3 - > sib_u64s [ 1 ] = U16_MAX ;
2021-08-30 22:18:31 +03:00
btree_split_insert_keys ( as , trans , path , n3 , & as - > parent_keys ) ;
2017-03-17 09:18:50 +03:00
bch2_btree_node_write ( c , n3 , SIX_LOCK_intent ) ;
}
} else {
trace_btree_compact ( c , b ) ;
bch2_btree_build_aux_trees ( n1 ) ;
2020-06-06 19:28:01 +03:00
six_unlock_write ( & n1 - > c . lock ) ;
2017-03-17 09:18:50 +03:00
2021-07-10 20:44:42 +03:00
bch2_btree_node_write ( c , n1 , SIX_LOCK_intent ) ;
2020-02-09 03:06:31 +03:00
if ( parent )
bch2_keylist_add ( & as - > parent_keys , & n1 - > key ) ;
2017-03-17 09:18:50 +03:00
}
/* New nodes all written, now make them visible: */
if ( parent ) {
/* Split a non-root node */
2021-08-30 22:18:31 +03:00
bch2_btree_insert_node ( as , trans , path , parent , & as - > parent_keys , flags ) ;
2017-03-17 09:18:50 +03:00
} else if ( n3 ) {
2021-08-30 22:18:31 +03:00
bch2_btree_set_root ( as , trans , path , n3 ) ;
2017-03-17 09:18:50 +03:00
} else {
/* Root filled up but didn't need to be split */
2021-08-30 22:18:31 +03:00
bch2_btree_set_root ( as , trans , path , n1 ) ;
2017-03-17 09:18:50 +03:00
}
2020-05-25 21:57:06 +03:00
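/*
* Hand the open buckets the new nodes were allocated from over to the
* btree_update, to be released once the update completes:
*/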
bch2_btree_update_get_open_buckets ( as , n1 ) ;
2017-03-17 09:18:50 +03:00
if ( n2 )
2020-05-25 21:57:06 +03:00
bch2_btree_update_get_open_buckets ( as , n2 ) ;
2017-03-17 09:18:50 +03:00
if ( n3 )
2020-05-25 21:57:06 +03:00
bch2_btree_update_get_open_buckets ( as , n3 ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
/* Successful split, update the path to point to the new nodes: */
2017-03-17 09:18:50 +03:00
2020-06-06 19:28:01 +03:00
six_lock_increment ( & b - > c . lock , SIX_LOCK_intent ) ;
2017-03-17 09:18:50 +03:00
if ( n3 )
2021-08-30 21:36:03 +03:00
bch2_trans_node_add ( trans , n3 ) ;
2017-03-17 09:18:50 +03:00
if ( n2 )
2021-08-30 21:36:03 +03:00
bch2_trans_node_add ( trans , n2 ) ;
bch2_trans_node_add ( trans , n1 ) ;
2017-03-17 09:18:50 +03:00
2019-10-11 21:45:22 +03:00
/*
* The old node must be freed (in memory) _before_ unlocking the new
* nodes - else another thread could re-acquire a read lock on the old
* node after another thread has locked and updated the new node, thus
* seeing stale data:
*/
2021-08-30 21:36:03 +03:00
bch2_btree_node_free_inmem ( trans , b ) ;
2018-11-23 13:19:25 +03:00
2019-10-11 21:45:22 +03:00
if ( n3 )
six_unlock_intent ( & n3 - > c . lock ) ;
if ( n2 )
six_unlock_intent ( & n2 - > c . lock ) ;
six_unlock_intent ( & n1 - > c . lock ) ;
2021-08-30 21:45:11 +03:00
bch2_trans_verify_locks ( trans ) ;
2018-11-23 13:19:25 +03:00
2021-12-10 23:41:38 +03:00
bch2_time_stats_update ( & c - > times [ n2
? BCH_TIME_btree_node_split
: BCH_TIME_btree_node_compact ] ,
2019-03-21 23:28:57 +03:00
start_time ) ;
2017-03-17 09:18:50 +03:00
}
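/*
* Insert @keys into interior node @b at the position tracked by @path, mark
* the update as having modified @b, then re-peek every path's node iterator
* on @b so they stay valid after the insert:
*/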
static void
2021-08-25 04:30:06 +03:00
bch2_btree_insert_keys_interior ( struct btree_update * as ,
struct btree_trans * trans ,
2021-08-30 22:18:31 +03:00
struct btree_path * path ,
2021-08-25 04:30:06 +03:00
struct btree * b ,
struct keylist * keys )
2017-03-17 09:18:50 +03:00
{
2021-08-30 22:18:31 +03:00
struct btree_path * linked ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
__bch2_btree_insert_keys_interior ( as , trans , path , b ,
path - > l [ b - > c . level ] . iter , keys ) ;
2017-03-17 09:18:50 +03:00
btree_update_updated_node ( as , b ) ;
2021-08-30 22:18:31 +03:00
trans_for_each_path_with_node ( trans , b , linked )
2020-06-06 19:28:01 +03:00
bch2_btree_node_iter_peek ( & linked - > l [ b - > c . level ] . iter , b ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
bch2_trans_verify_paths ( trans ) ;
2017-03-17 09:18:50 +03:00
}
/**
* bch2_btree_insert_node - insert bkeys into a given btree node
*
* @as:     btree_update object
* @trans:  btree_trans object
* @path:   path pointing to @b
* @b:      interior node to insert keys into
* @keys:   list of keys to insert
* @flags:  transaction commit flags
*
* Inserts as many keys as it can into a given btree node, splitting it if full.
* If a split occurred, this function will return early. This can only happen
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
2021-08-30 22:18:31 +03:00
static void bch2_btree_insert_node ( struct btree_update * as , struct btree_trans * trans ,
struct btree_path * path , struct btree * b ,
struct keylist * keys , unsigned flags )
2017-03-17 09:18:50 +03:00
{
struct bch_fs * c = as - > c ;
int old_u64s = le16_to_cpu ( btree_bset_last ( b ) - > u64s ) ;
int old_live_u64s = b - > nr . live_u64s ;
int live_u64s_added , u64s_added ;
2021-03-31 22:21:37 +03:00
lockdep_assert_held ( & c - > gc_lock ) ;
2021-08-30 22:18:31 +03:00
BUG_ON ( ! btree_node_intent_locked ( path , btree_node_root ( c , b ) - > c . level ) ) ;
2020-06-06 19:28:01 +03:00
BUG_ON ( ! b - > c . level ) ;
2017-03-17 09:18:50 +03:00
BUG_ON ( ! as | | as - > b ) ;
bch2_verify_keylist_sorted ( keys ) ;
2021-08-30 22:18:31 +03:00
bch2_btree_node_lock_for_insert ( trans , path , b ) ;
2017-03-17 09:18:50 +03:00
2020-05-25 21:57:06 +03:00
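/* If the keys don't fit in @b, drop the write lock and split the node: */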
if ( ! bch2_btree_node_insert_fits ( c , b , bch2_keylist_u64s ( keys ) ) ) {
2021-08-30 22:18:31 +03:00
bch2_btree_node_unlock_write ( trans , path , b ) ;
2017-03-17 09:18:50 +03:00
goto split ;
}
2020-11-16 22:16:42 +03:00
btree_node_interior_verify ( c , b ) ;
2021-08-30 22:18:31 +03:00
bch2_btree_insert_keys_interior ( as , trans , path , b , keys ) ;
2017-03-17 09:18:50 +03:00
live_u64s_added = ( int ) b - > nr . live_u64s - old_live_u64s ;
u64s_added = ( int ) le16_to_cpu ( btree_bset_last ( b ) - > u64s ) - old_u64s ;
if ( b - > sib_u64s [ 0 ] ! = U16_MAX & & live_u64s_added < 0 )
b - > sib_u64s [ 0 ] = max ( 0 , ( int ) b - > sib_u64s [ 0 ] + live_u64s_added ) ;
if ( b - > sib_u64s [ 1 ] ! = U16_MAX & & live_u64s_added < 0 )
b - > sib_u64s [ 1 ] = max ( 0 , ( int ) b - > sib_u64s [ 1 ] + live_u64s_added ) ;
if ( u64s_added > live_u64s_added & &
bch2_maybe_compact_whiteouts ( c , b ) )
2021-08-30 21:36:03 +03:00
bch2_trans_node_reinit_iter ( trans , b ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
bch2_btree_node_unlock_write ( trans , path , b ) ;
2017-03-17 09:18:50 +03:00
2020-07-21 18:51:17 +03:00
btree_node_interior_verify ( c , b ) ;
2017-03-17 09:18:50 +03:00
return ;
split :
2021-08-30 22:18:31 +03:00
btree_split ( as , trans , path , b , keys , flags ) ;
2017-03-17 09:18:50 +03:00
}
2021-07-11 06:22:06 +03:00
int bch2_btree_split_leaf ( struct btree_trans * trans ,
2021-08-30 22:18:31 +03:00
struct btree_path * path ,
2017-03-17 09:18:50 +03:00
unsigned flags )
{
2021-07-11 06:22:06 +03:00
struct bch_fs * c = trans - > c ;
2021-08-30 22:18:31 +03:00
struct btree * b = path_l ( path ) - > b ;
2017-03-17 09:18:50 +03:00
struct btree_update * as ;
2021-03-31 22:39:16 +03:00
unsigned l ;
int ret = 0 ;
2020-04-11 19:32:27 +03:00
2021-08-30 22:18:31 +03:00
as = bch2_btree_update_start ( trans , path , path - > level ,
2021-03-31 22:21:37 +03:00
btree_update_reserve_required ( c , b ) , flags ) ;
if ( IS_ERR ( as ) )
return PTR_ERR ( as ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
btree_split ( as , trans , path , b , NULL , flags ) ;
2017-03-17 09:18:50 +03:00
bch2_btree_update_done ( as ) ;
2021-03-31 22:39:16 +03:00
2021-08-30 22:18:31 +03:00
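/*
* The split may have changed ancestor nodes; walk up from @path's level and
* see whether any of them can now be merged with a sibling:
*/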
for ( l = path - > level + 1 ; btree_path_node ( path , l ) & & ! ret ; l + + )
ret = bch2_foreground_maybe_merge ( trans , path , l , flags ) ;
2021-03-31 22:39:16 +03:00
return ret ;
2017-03-17 09:18:50 +03:00
}
2021-07-11 06:22:06 +03:00
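/*
* Try to merge the node at @level in @path with its @sib sibling: if both
* nodes' keys would pack into a single node under the merge threshold, sort
* them into a new node, update the parent, and free the two originals.
*/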
int __bch2_foreground_maybe_merge ( struct btree_trans * trans ,
2021-08-30 22:18:31 +03:00
struct btree_path * path ,
2021-03-31 23:16:39 +03:00
unsigned level ,
unsigned flags ,
enum btree_node_sibling sib )
2017-03-17 09:18:50 +03:00
{
2021-07-11 06:22:06 +03:00
struct bch_fs * c = trans - > c ;
2021-08-30 22:18:31 +03:00
struct btree_path * sib_path = NULL ;
2017-03-17 09:18:50 +03:00
struct btree_update * as ;
struct bkey_format_state new_s ;
struct bkey_format new_f ;
struct bkey_i delete ;
struct btree * b , * m , * n , * prev , * next , * parent ;
2021-03-29 08:13:31 +03:00
struct bpos sib_pos ;
2017-03-17 09:18:50 +03:00
size_t sib_u64s ;
2021-12-10 23:41:38 +03:00
u64 start_time = local_clock ( ) ;
2021-09-05 07:22:32 +03:00
int ret = 0 ;
2021-03-29 08:13:31 +03:00
2021-08-30 22:18:31 +03:00
BUG_ON ( ! path - > should_be_locked ) ;
BUG_ON ( ! btree_node_locked ( path , level ) ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
b = path - > l [ level ] . b ;
2017-03-17 09:18:50 +03:00
2021-03-29 08:13:31 +03:00
if ( ( sib = = btree_prev_sib & & ! bpos_cmp ( b - > data - > min_key , POS_MIN ) ) | |
2021-07-06 05:02:07 +03:00
( sib = = btree_next_sib & & ! bpos_cmp ( b - > data - > max_key , SPOS_MAX ) ) ) {
2021-03-29 08:13:31 +03:00
b - > sib_u64s [ sib ] = U16_MAX ;
2021-09-05 07:22:32 +03:00
return 0 ;
2021-03-29 08:13:31 +03:00
}
2017-03-17 09:18:50 +03:00
2021-03-29 08:13:31 +03:00
sib_pos = sib = = btree_prev_sib
? bpos_predecessor ( b - > data - > min_key )
: bpos_successor ( b - > data - > max_key ) ;
2017-03-17 09:18:50 +03:00
2021-12-22 04:48:26 +03:00
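/* Get and traverse an intent-locked path to the sibling node at @level: */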
sib_path = bch2_path_get ( trans , path - > btree_id , sib_pos ,
U8_MAX , level , BTREE_ITER_INTENT ) ;
2021-08-30 22:18:31 +03:00
ret = bch2_btree_path_traverse ( trans , sib_path , false ) ;
2021-03-29 08:13:31 +03:00
if ( ret )
2017-03-17 09:18:50 +03:00
goto err ;
2021-08-30 22:18:31 +03:00
sib_path - > should_be_locked = true ;
m = sib_path - > l [ level ] . b ;
2021-03-29 08:13:31 +03:00
2021-08-30 22:18:31 +03:00
if ( btree_node_parent ( path , b ) ! =
btree_node_parent ( sib_path , m ) ) {
2017-03-17 09:18:50 +03:00
b - > sib_u64s [ sib ] = U16_MAX ;
goto out ;
}
if ( sib = = btree_prev_sib ) {
prev = m ;
next = b ;
} else {
prev = b ;
next = m ;
}
2021-04-23 23:05:49 +03:00
if ( bkey_cmp ( bpos_successor ( prev - > data - > max_key ) , next - > data - > min_key ) ) {
char buf1 [ 100 ] , buf2 [ 100 ] ;
bch2_bpos_to_text ( & PBUF ( buf1 ) , prev - > data - > max_key ) ;
bch2_bpos_to_text ( & PBUF ( buf2 ) , next - > data - > min_key ) ;
2021-04-24 23:32:35 +03:00
bch_err ( c ,
" btree topology error in btree merge: \n "
" prev ends at %s \n "
" next starts at %s " ,
buf1 , buf2 ) ;
bch2_topology_error ( c ) ;
2021-04-23 23:05:49 +03:00
ret = - EIO ;
goto err ;
}
2021-03-29 08:13:31 +03:00
2017-03-17 09:18:50 +03:00
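/*
* Compute a key format that can pack every key from both nodes, and estimate
* how many u64s the merged node would need:
*/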
bch2_bkey_format_init ( & new_s ) ;
2021-03-27 03:29:04 +03:00
bch2_bkey_format_add_pos ( & new_s , prev - > data - > min_key ) ;
__bch2_btree_calc_format ( & new_s , prev ) ;
__bch2_btree_calc_format ( & new_s , next ) ;
bch2_bkey_format_add_pos ( & new_s , next - > data - > max_key ) ;
2017-03-17 09:18:50 +03:00
new_f = bch2_bkey_format_done ( & new_s ) ;
sib_u64s = btree_node_u64s_with_format ( b , & new_f ) +
btree_node_u64s_with_format ( m , & new_f ) ;
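/*
* Apply hysteresis so nodes hovering around the merge threshold don't
* repeatedly merge and then split again:
*/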
if ( sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS ( c ) ) {
sib_u64s - = BTREE_FOREGROUND_MERGE_HYSTERESIS ( c ) ;
sib_u64s / = 2 ;
sib_u64s + = BTREE_FOREGROUND_MERGE_HYSTERESIS ( c ) ;
}
sib_u64s = min ( sib_u64s , btree_max_u64s ( c ) ) ;
2021-03-29 08:13:31 +03:00
sib_u64s = min ( sib_u64s , ( size_t ) U16_MAX - 1 ) ;
2017-03-17 09:18:50 +03:00
b - > sib_u64s [ sib ] = sib_u64s ;
2021-03-29 08:13:31 +03:00
if ( b - > sib_u64s [ sib ] > c - > btree_foreground_merge_threshold )
2017-03-17 09:18:50 +03:00
goto out ;
2021-08-30 22:18:31 +03:00
parent = btree_node_parent ( path , b ) ;
as = bch2_btree_update_start ( trans , path , level ,
2017-03-17 09:18:50 +03:00
btree_update_reserve_required ( c , parent ) + 1 ,
2020-04-08 00:27:12 +03:00
flags |
2017-03-17 09:18:50 +03:00
BTREE_INSERT_NOFAIL |
2021-03-31 22:21:37 +03:00
BTREE_INSERT_USE_RESERVE ) ;
ret = PTR_ERR_OR_ZERO ( as ) ;
2021-03-29 08:13:31 +03:00
if ( ret )
2021-03-31 22:21:37 +03:00
goto err ;
2017-03-17 09:18:50 +03:00
trace_btree_merge ( c , b ) ;
bch2_btree_interior_update_will_free_node ( as , b ) ;
bch2_btree_interior_update_will_free_node ( as , m ) ;
2020-06-06 19:28:01 +03:00
n = bch2_btree_node_alloc ( as , b - > c . level ) ;
2020-05-25 21:57:06 +03:00
bch2_btree_update_add_new_node ( as , n ) ;
2017-03-17 09:18:50 +03:00
2020-02-07 21:38:02 +03:00
btree_set_min ( n , prev - > data - > min_key ) ;
btree_set_max ( n , next - > data - > max_key ) ;
2017-03-17 09:18:50 +03:00
n - > data - > format = new_f ;
btree_node_set_format ( n , new_f ) ;
bch2_btree_sort_into ( c , n , prev ) ;
bch2_btree_sort_into ( c , n , next ) ;
bch2_btree_build_aux_trees ( n ) ;
2020-06-06 19:28:01 +03:00
six_unlock_write ( & n - > c . lock ) ;
2017-03-17 09:18:50 +03:00
2021-07-10 20:44:42 +03:00
bch2_btree_node_write ( c , n , SIX_LOCK_intent ) ;
2017-03-17 09:18:50 +03:00
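/*
* Update the parent: a whiteout at @prev's old max key deletes its pointer,
* and the merged node's key replaces @next's pointer:
*/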
bkey_init ( & delete . k ) ;
delete . k . p = prev - > key . k . p ;
bch2_keylist_add ( & as - > parent_keys , & delete ) ;
bch2_keylist_add ( & as - > parent_keys , & n - > key ) ;
2021-09-05 04:23:11 +03:00
bch2_trans_verify_paths ( trans ) ;
2021-08-30 22:18:31 +03:00
bch2_btree_insert_node ( as , trans , path , parent , & as - > parent_keys , flags ) ;
2017-03-17 09:18:50 +03:00
2021-09-05 04:23:11 +03:00
bch2_trans_verify_paths ( trans ) ;
2020-05-25 21:57:06 +03:00
bch2_btree_update_get_open_buckets ( as , n ) ;
2018-11-23 13:19:25 +03:00
2020-06-06 19:28:01 +03:00
six_lock_increment ( & b - > c . lock , SIX_LOCK_intent ) ;
2021-03-29 08:13:31 +03:00
six_lock_increment ( & m - > c . lock , SIX_LOCK_intent ) ;
2019-03-29 21:42:34 +03:00
2021-08-30 21:36:03 +03:00
bch2_trans_node_add ( trans , n ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
bch2_trans_verify_paths ( trans ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 21:36:03 +03:00
bch2_btree_node_free_inmem ( trans , b ) ;
bch2_btree_node_free_inmem ( trans , m ) ;
2018-11-23 13:19:25 +03:00
2019-10-11 21:45:22 +03:00
six_unlock_intent ( & n - > c . lock ) ;
2017-03-17 09:18:50 +03:00
bch2_btree_update_done ( as ) ;
2021-12-10 23:41:38 +03:00
bch2_time_stats_update ( & c - > times [ BCH_TIME_btree_node_merge ] , start_time ) ;
2017-03-17 09:18:50 +03:00
out :
err :
2021-09-05 07:22:32 +03:00
bch2_path_put ( trans , sib_path , true ) ;
bch2_trans_verify_locks ( trans ) ;
return ret ;
2017-03-17 09:18:50 +03:00
}
2021-03-31 22:21:37 +03:00
/**
* bch2_btree_node_rewrite - Rewrite / move a btree node
*/
2021-07-11 06:22:06 +03:00
int bch2_btree_node_rewrite ( struct btree_trans * trans ,
struct btree_iter * iter ,
2021-10-24 23:59:33 +03:00
struct btree * b ,
unsigned flags )
2017-03-17 09:18:50 +03:00
{
2021-07-11 06:22:06 +03:00
struct bch_fs * c = trans - > c ;
2021-10-24 23:59:33 +03:00
struct btree * n , * parent ;
2017-03-17 09:18:50 +03:00
struct btree_update * as ;
2021-03-31 22:21:37 +03:00
int ret ;
flags | = BTREE_INSERT_NOFAIL ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
parent = btree_node_parent ( iter - > path , b ) ;
as = bch2_btree_update_start ( trans , iter - > path , b - > c . level ,
2017-03-17 09:18:50 +03:00
( parent
? btree_update_reserve_required ( c , parent )
: 0 ) + 1 ,
2021-03-31 22:21:37 +03:00
flags ) ;
ret = PTR_ERR_OR_ZERO ( as ) ;
if ( ret ) {
2017-03-17 09:18:50 +03:00
trace_btree_gc_rewrite_node_fail ( c , b ) ;
2021-03-31 22:21:37 +03:00
goto out ;
2017-03-17 09:18:50 +03:00
}
bch2_btree_interior_update_will_free_node ( as , b ) ;
n = bch2_btree_node_alloc_replacement ( as , b ) ;
2020-05-25 21:57:06 +03:00
bch2_btree_update_add_new_node ( as , n ) ;
2017-03-17 09:18:50 +03:00
bch2_btree_build_aux_trees ( n ) ;
2020-06-06 19:28:01 +03:00
six_unlock_write ( & n - > c . lock ) ;
2017-03-17 09:18:50 +03:00
trace_btree_gc_rewrite_node ( c , b ) ;
bch2_btree_node_write ( c , n , SIX_LOCK_intent ) ;
if ( parent ) {
bch2_keylist_add ( & as - > parent_keys , & n - > key ) ;
2021-08-30 22:18:31 +03:00
bch2_btree_insert_node ( as , trans , iter - > path , parent ,
2021-07-11 06:22:06 +03:00
& as - > parent_keys , flags ) ;
2017-03-17 09:18:50 +03:00
} else {
2021-08-30 22:18:31 +03:00
bch2_btree_set_root ( as , trans , iter - > path , n ) ;
2017-03-17 09:18:50 +03:00
}
2020-05-25 21:57:06 +03:00
bch2_btree_update_get_open_buckets ( as , n ) ;
2017-03-17 09:18:50 +03:00
2020-06-06 19:28:01 +03:00
six_lock_increment ( & b - > c . lock , SIX_LOCK_intent ) ;
2021-08-30 21:36:03 +03:00
bch2_trans_node_add ( trans , n ) ;
bch2_btree_node_free_inmem ( trans , b ) ;
2019-10-11 21:45:22 +03:00
six_unlock_intent ( & n - > c . lock ) ;
2017-03-17 09:18:50 +03:00
bch2_btree_update_done ( as ) ;
2021-03-31 22:21:37 +03:00
out :
2021-08-30 22:18:31 +03:00
bch2_btree_path_downgrade ( iter - > path ) ;
2017-03-17 09:18:50 +03:00
return ret ;
}
2021-04-24 09:47:41 +03:00
struct async_btree_rewrite {
struct bch_fs * c ;
struct work_struct work ;
enum btree_id btree_id ;
unsigned level ;
struct bpos pos ;
__le64 seq ;
} ;
2021-10-24 23:59:33 +03:00
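/*
* Re-find the node by btree id, level and position, check that it's still the
* same node (the sequence number must match), then rewrite it:
*/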
static int async_btree_node_rewrite_trans ( struct btree_trans * trans ,
struct async_btree_rewrite * a )
{
struct btree_iter iter ;
struct btree * b ;
int ret ;
bch2_trans_node_iter_init ( trans , & iter , a - > btree_id , a - > pos ,
BTREE_MAX_DEPTH , a - > level , 0 ) ;
b = bch2_btree_iter_peek_node ( & iter ) ;
ret = PTR_ERR_OR_ZERO ( b ) ;
if ( ret )
goto out ;
if ( ! b | | b - > data - > keys . seq ! = a - > seq )
goto out ;
ret = bch2_btree_node_rewrite ( trans , & iter , b , 0 ) ;
out :
bch2_trans_iter_exit ( trans , & iter ) ;
return ret ;
}
2021-04-24 09:47:41 +03:00
void async_btree_node_rewrite_work ( struct work_struct * work )
{
struct async_btree_rewrite * a =
container_of ( work , struct async_btree_rewrite , work ) ;
struct bch_fs * c = a - > c ;
2021-10-24 23:59:33 +03:00
bch2_trans_do ( c , NULL , NULL , 0 ,
async_btree_node_rewrite_trans ( & trans , a ) ) ;
2021-04-24 09:47:41 +03:00
percpu_ref_put ( & c - > writes ) ;
kfree ( a ) ;
}
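/*
* Queue a node rewrite to run from the btree interior update workqueue: only
* what's needed to re-find the node (btree id, level, position, sequence
* number) is saved, and the worker re-checks the sequence number in case the
* node was freed or rewritten in the meantime:
*/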
void bch2_btree_node_rewrite_async ( struct bch_fs * c , struct btree * b )
{
struct async_btree_rewrite * a ;
if ( ! percpu_ref_tryget ( & c - > writes ) )
return ;
a = kmalloc ( sizeof ( * a ) , GFP_NOFS ) ;
if ( ! a ) {
percpu_ref_put ( & c - > writes ) ;
return ;
}
a - > c = c ;
a - > btree_id = b - > c . btree_id ;
a - > level = b - > c . level ;
a - > pos = b - > key . k . p ;
a - > seq = b - > data - > keys . seq ;
INIT_WORK ( & a - > work , async_btree_node_rewrite_work ) ;
2021-05-23 00:37:25 +03:00
queue_work ( c - > btree_interior_update_worker , & a - > work ) ;
2021-04-24 09:47:41 +03:00
}
2021-07-10 20:44:42 +03:00
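/*
* Change the key (i.e. the pointer) for btree node @b to @new_key: run the
* usual insert/overwrite triggers unless @skip_triggers, update the pointer in
* the parent node (or emit a new btree_root journal entry if @b is a root),
* and re-hash @b in the btree node cache under its new key.
*/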
static int __bch2_btree_node_update_key ( struct btree_trans * trans ,
struct btree_iter * iter ,
struct btree * b , struct btree * new_hash ,
struct bkey_i * new_key ,
bool skip_triggers )
2017-03-17 09:18:50 +03:00
{
2021-07-10 20:44:42 +03:00
struct bch_fs * c = trans - > c ;
2021-08-30 22:18:31 +03:00
struct btree_iter iter2 = { NULL } ;
2017-03-17 09:18:50 +03:00
struct btree * parent ;
2021-07-10 20:44:42 +03:00
u64 journal_entries [ BKEY_BTREE_PTR_U64s_MAX ] ;
2017-03-17 09:18:50 +03:00
int ret ;
2021-07-10 20:44:42 +03:00
if ( ! skip_triggers ) {
ret = bch2_trans_mark_key ( trans ,
bkey_s_c_null ,
bkey_i_to_s_c ( new_key ) ,
BTREE_TRIGGER_INSERT ) ;
if ( ret )
return ret ;
ret = bch2_trans_mark_key ( trans ,
bkey_i_to_s_c ( & b - > key ) ,
bkey_s_c_null ,
BTREE_TRIGGER_OVERWRITE ) ;
if ( ret )
return ret ;
}
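/*
* If the caller passed @new_hash, the new key hashes differently: insert the
* placeholder node under the new key now; @b itself is re-hashed under
* @new_key below, after the commit succeeds:
*/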
if ( new_hash ) {
bkey_copy ( & new_hash - > key , new_key ) ;
ret = bch2_btree_node_hash_insert ( & c - > btree_cache ,
new_hash , b - > c . level , b - > c . btree_id ) ;
BUG_ON ( ret ) ;
}
2018-07-23 12:32:01 +03:00
2021-08-30 22:18:31 +03:00
parent = btree_node_parent ( iter - > path , b ) ;
2017-03-17 09:18:50 +03:00
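/*
* If @b has a parent, update the parent's pointer with a normal btree update
* (triggers suppressed - they were run above); otherwise @b is a root, so
* emit a btree_root journal entry carrying the new key instead:
*/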
if ( parent ) {
2021-08-30 22:18:31 +03:00
bch2_trans_copy_iter ( & iter2 , iter ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
iter2 . path = bch2_btree_path_make_mut ( trans , iter2 . path ,
iter2 . flags & BTREE_ITER_INTENT ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
BUG_ON ( iter2 . path - > level ! = b - > c . level ) ;
BUG_ON ( bpos_cmp ( iter2 . path - > pos , new_key - > k . p ) ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
btree_node_unlock ( iter2 . path , iter2 . path - > level ) ;
path_l ( iter2 . path ) - > b = BTREE_ITER_NO_NODE_UP ;
iter2 . path - > level + + ;
ret = bch2_btree_iter_traverse ( & iter2 ) ? :
bch2_trans_update ( trans , & iter2 , new_key , BTREE_TRIGGER_NORUN ) ;
2021-07-10 20:44:42 +03:00
if ( ret )
goto err ;
2017-03-17 09:18:50 +03:00
} else {
BUG_ON ( btree_node_root ( c , b ) ! = b ) ;
2021-07-10 20:44:42 +03:00
trans - > extra_journal_entries = ( void * ) & journal_entries [ 0 ] ;
trans - > extra_journal_entry_u64s =
journal_entry_set ( ( void * ) & journal_entries [ 0 ] ,
BCH_JSET_ENTRY_btree_root ,
b - > c . btree_id , b - > c . level ,
new_key , new_key - > k . u64s ) ;
}
2017-03-17 09:18:50 +03:00
2021-07-10 20:44:42 +03:00
ret = bch2_trans_commit ( trans , NULL , NULL ,
BTREE_INSERT_NOFAIL |
BTREE_INSERT_NOCHECK_RW |
2022-01-08 11:39:54 +03:00
BTREE_INSERT_USE_RESERVE |
2021-07-10 20:44:42 +03:00
BTREE_INSERT_JOURNAL_RECLAIM |
2021-07-28 05:15:04 +03:00
BTREE_INSERT_JOURNAL_RESERVED ) ;
2021-07-10 20:44:42 +03:00
if ( ret )
goto err ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
bch2_btree_node_lock_write ( trans , iter - > path , b ) ;
2017-03-17 09:18:50 +03:00
2021-07-10 20:44:42 +03:00
if ( new_hash ) {
mutex_lock ( & c - > btree_cache . lock ) ;
bch2_btree_node_hash_remove ( & c - > btree_cache , new_hash ) ;
bch2_btree_node_hash_remove ( & c - > btree_cache , b ) ;
bkey_copy ( & b - > key , new_key ) ;
ret = __bch2_btree_node_hash_insert ( & c - > btree_cache , b ) ;
BUG_ON ( ret ) ;
mutex_unlock ( & c - > btree_cache . lock ) ;
} else {
bkey_copy ( & b - > key , new_key ) ;
2017-03-17 09:18:50 +03:00
}
2021-08-30 22:18:31 +03:00
bch2_btree_node_unlock_write ( trans , iter - > path , b ) ;
2021-07-10 20:44:42 +03:00
out :
2021-08-30 22:18:31 +03:00
bch2_trans_iter_exit ( trans , & iter2 ) ;
2021-07-10 20:44:42 +03:00
return ret ;
err :
if ( new_hash ) {
mutex_lock ( & c - > btree_cache . lock ) ;
bch2_btree_node_hash_remove ( & c - > btree_cache , b ) ;
mutex_unlock ( & c - > btree_cache . lock ) ;
}
goto out ;
2017-03-17 09:18:50 +03:00
}
2021-07-10 20:44:42 +03:00
int bch2_btree_node_update_key ( struct btree_trans * trans , struct btree_iter * iter ,
struct btree * b , struct bkey_i * new_key ,
bool skip_triggers )
2017-03-17 09:18:50 +03:00
{
2021-07-11 06:22:06 +03:00
struct bch_fs * c = trans - > c ;
2017-03-17 09:18:50 +03:00
struct btree * new_hash = NULL ;
2021-10-07 21:56:56 +03:00
struct btree_path * path = iter - > path ;
2017-03-17 09:18:50 +03:00
struct closure cl ;
2021-03-31 22:21:37 +03:00
int ret = 0 ;
2017-03-17 09:18:50 +03:00
2021-10-07 21:56:56 +03:00
if ( ! btree_node_intent_locked ( path , b - > c . level ) & &
! bch2_btree_path_upgrade ( trans , path , b - > c . level + 1 ) ) {
btree_trans_restart ( trans ) ;
return - EINTR ;
}
2017-03-17 09:18:50 +03:00
closure_init_stack ( & cl ) ;
2020-02-19 01:15:32 +03:00
/*
* check btree_ptr_hash_val() after @b is locked by
* btree_iter_traverse():
*/
if ( btree_ptr_hash_val ( new_key ) ! = b - > hash_val ) {
2017-03-17 09:18:50 +03:00
ret = bch2_btree_cache_cannibalize_lock ( c , & cl ) ;
if ( ret ) {
2021-07-10 20:44:42 +03:00
bch2_trans_unlock ( trans ) ;
2017-03-17 09:18:50 +03:00
closure_sync ( & cl ) ;
2021-07-10 20:44:42 +03:00
if ( ! bch2_trans_relock ( trans ) )
2021-03-31 22:21:37 +03:00
return - EINTR ;
2017-03-17 09:18:50 +03:00
}
new_hash = bch2_btree_node_mem_alloc ( c ) ;
}
2021-10-07 21:56:56 +03:00
path - > intent_ref + + ;
2021-07-10 20:44:42 +03:00
ret = __bch2_btree_node_update_key ( trans , iter , b , new_hash ,
new_key , skip_triggers ) ;
2021-10-07 21:56:56 +03:00
- - path - > intent_ref ;
2017-03-17 09:18:50 +03:00
if ( new_hash ) {
mutex_lock ( & c - > btree_cache . lock ) ;
list_move ( & new_hash - > list , & c - > btree_cache . freeable ) ;
mutex_unlock ( & c - > btree_cache . lock ) ;
2020-06-06 19:28:01 +03:00
six_unlock_write ( & new_hash - > c . lock ) ;
six_unlock_intent ( & new_hash - > c . lock ) ;
2017-03-17 09:18:50 +03:00
}
closure_sync ( & cl ) ;
2021-03-31 22:21:37 +03:00
bch2_btree_cache_cannibalize_unlock ( c ) ;
2017-03-17 09:18:50 +03:00
return ret ;
}
2021-07-10 20:44:42 +03:00
int bch2_btree_node_update_key_get_iter ( struct btree_trans * trans ,
struct btree * b , struct bkey_i * new_key ,
bool skip_triggers )
{
2021-08-30 22:18:31 +03:00
struct btree_iter iter ;
2021-07-10 20:44:42 +03:00
int ret ;
2021-08-30 22:18:31 +03:00
bch2_trans_node_iter_init ( trans , & iter , b - > c . btree_id , b - > key . k . p ,
BTREE_MAX_DEPTH , b - > c . level ,
BTREE_ITER_INTENT ) ;
ret = bch2_btree_iter_traverse ( & iter ) ;
2021-07-10 20:44:42 +03:00
if ( ret )
goto out ;
/* has node been freed? */
2021-08-30 22:18:31 +03:00
if ( iter . path - > l [ b - > c . level ] . b ! = b ) {
2021-07-10 20:44:42 +03:00
/* node has been freed: */
BUG_ON ( ! btree_node_dying ( b ) ) ;
goto out ;
}
BUG_ON ( ! btree_node_hashed ( b ) ) ;
2021-08-30 22:18:31 +03:00
ret = bch2_btree_node_update_key ( trans , & iter , b , new_key , skip_triggers ) ;
2021-07-10 20:44:42 +03:00
out :
2021-08-30 22:18:31 +03:00
bch2_trans_iter_exit ( trans , & iter ) ;
2021-07-10 20:44:42 +03:00
return ret ;
}
2017-03-17 09:18:50 +03:00
/* Init code: */
/*
* Only for filesystem bringup, when first reading the btree roots or allocating
* btree roots when initializing a new filesystem:
*/
void bch2_btree_set_root_for_read ( struct bch_fs * c , struct btree * b )
{
BUG_ON ( btree_node_root ( c , b ) ) ;
2020-05-25 21:57:06 +03:00
bch2_btree_set_root_inmem ( c , b ) ;
2017-03-17 09:18:50 +03:00
}
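/*
* Allocate an empty, fake root node for btree @id spanning the whole key range
* and set it as the in-memory root:
*/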
void bch2_btree_root_alloc ( struct bch_fs * c , enum btree_id id )
{
struct closure cl ;
struct btree * b ;
int ret ;
closure_init_stack ( & cl ) ;
do {
ret = bch2_btree_cache_cannibalize_lock ( c , & cl ) ;
closure_sync ( & cl ) ;
} while ( ret ) ;
b = bch2_btree_node_mem_alloc ( c ) ;
bch2_btree_cache_cannibalize_unlock ( c ) ;
set_btree_node_fake ( b ) ;
2020-07-03 23:32:00 +03:00
set_btree_node_need_rewrite ( b ) ;
2020-06-06 19:28:01 +03:00
b - > c . level = 0 ;
b - > c . btree_id = id ;
2017-03-17 09:18:50 +03:00
2018-11-01 22:10:01 +03:00
bkey_btree_ptr_init ( & b - > key ) ;
2021-07-06 05:02:07 +03:00
b - > key . k . p = SPOS_MAX ;
2020-02-19 01:15:32 +03:00
* ( ( u64 * ) bkey_i_to_btree_ptr ( & b - > key ) - > v . start ) = U64_MAX - id ;
2017-03-17 09:18:50 +03:00
bch2_bset_init_first ( b , & b - > data - > keys ) ;
bch2_btree_build_aux_trees ( b ) ;
2019-10-12 03:56:27 +03:00
b - > data - > flags = 0 ;
2020-02-07 21:38:02 +03:00
btree_set_min ( b , POS_MIN ) ;
2021-07-06 05:02:07 +03:00
btree_set_max ( b , SPOS_MAX ) ;
2017-03-17 09:18:50 +03:00
b - > data - > format = bch2_btree_calc_format ( b ) ;
btree_node_set_format ( b , b - > data - > format ) ;
2020-06-06 19:28:01 +03:00
ret = bch2_btree_node_hash_insert ( & c - > btree_cache , b ,
b - > c . level , b - > c . btree_id ) ;
2017-03-17 09:18:50 +03:00
BUG_ON ( ret ) ;
2020-05-25 21:57:06 +03:00
bch2_btree_set_root_inmem ( c , b ) ;
2017-03-17 09:18:50 +03:00
2020-06-06 19:28:01 +03:00
six_unlock_write ( & b - > c . lock ) ;
six_unlock_intent ( & b - > c . lock ) ;
2017-03-17 09:18:50 +03:00
}
2020-07-26 00:06:11 +03:00
void bch2_btree_updates_to_text ( struct printbuf * out , struct bch_fs * c )
2017-03-17 09:18:50 +03:00
{
struct btree_update * as ;
mutex_lock ( & c - > btree_interior_update_lock ) ;
list_for_each_entry ( as , & c - > btree_interior_update_list , list )
2020-07-26 00:06:11 +03:00
pr_buf ( out , " %p m %u w %u r %u j %llu \n " ,
2018-11-09 09:24:07 +03:00
as ,
as - > mode ,
as - > nodes_written ,
atomic_read ( & as - > cl . remaining ) & CLOSURE_REMAINING_MASK ,
as - > journal . seq ) ;
2017-03-17 09:18:50 +03:00
mutex_unlock ( & c - > btree_interior_update_lock ) ;
}
size_t bch2_btree_interior_updates_nr_pending ( struct bch_fs * c )
{
size_t ret = 0 ;
struct list_head * i ;
mutex_lock ( & c - > btree_interior_update_lock ) ;
list_for_each ( i , & c - > btree_interior_update_list )
ret + + ;
mutex_unlock ( & c - > btree_interior_update_lock ) ;
return ret ;
}
2020-05-26 03:35:53 +03:00
2020-05-25 21:57:06 +03:00
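/*
* Walk a journal set and record each btree_root entry as the corresponding
* in-memory btree root:
*/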
void bch2_journal_entries_to_btree_roots ( struct bch_fs * c , struct jset * jset )
{
struct btree_root * r ;
struct jset_entry * entry ;
mutex_lock ( & c - > btree_root_lock ) ;
vstruct_for_each ( jset , entry )
if ( entry - > type = = BCH_JSET_ENTRY_btree_root ) {
r = & c - > btree_roots [ entry - > btree_id ] ;
r - > level = entry - > level ;
r - > alive = true ;
bkey_copy ( & r - > key , & entry - > start [ 0 ] ) ;
}
mutex_unlock ( & c - > btree_root_lock ) ;
}
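/*
* The inverse: append a btree_root journal entry for every live root that does
* not already appear in @start..@end, returning the new end of the buffer:
*/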
struct jset_entry *
bch2_btree_roots_to_journal_entries ( struct bch_fs * c ,
struct jset_entry * start ,
struct jset_entry * end )
{
struct jset_entry * entry ;
unsigned long have = 0 ;
unsigned i ;
for ( entry = start ; entry < end ; entry = vstruct_next ( entry ) )
if ( entry - > type = = BCH_JSET_ENTRY_btree_root )
__set_bit ( entry - > btree_id , & have ) ;
mutex_lock ( & c - > btree_root_lock ) ;
for ( i = 0 ; i < BTREE_ID_NR ; i + + )
if ( c - > btree_roots [ i ] . alive & & ! test_bit ( i , & have ) ) {
journal_entry_set ( end ,
BCH_JSET_ENTRY_btree_root ,
i , c - > btree_roots [ i ] . level ,
& c - > btree_roots [ i ] . key ,
c - > btree_roots [ i ] . key . u64s ) ;
end = vstruct_next ( end ) ;
}
mutex_unlock ( & c - > btree_root_lock ) ;
return end ;
}
2020-05-26 03:35:53 +03:00
void bch2_fs_btree_interior_update_exit ( struct bch_fs * c )
{
2020-05-25 21:57:06 +03:00
if ( c - > btree_interior_update_worker )
destroy_workqueue ( c - > btree_interior_update_worker ) ;
2020-05-26 03:35:53 +03:00
mempool_exit ( & c - > btree_interior_update_pool ) ;
}
int bch2_fs_btree_interior_update_init ( struct bch_fs * c )
{
mutex_init ( & c - > btree_reserve_cache_lock ) ;
INIT_LIST_HEAD ( & c - > btree_interior_update_list ) ;
INIT_LIST_HEAD ( & c - > btree_interior_updates_unwritten ) ;
mutex_init ( & c - > btree_interior_update_lock ) ;
2020-05-25 21:57:06 +03:00
INIT_WORK ( & c - > btree_interior_update_work , btree_interior_update_work ) ;
c - > btree_interior_update_worker =
alloc_workqueue ( " btree_update " , WQ_UNBOUND | WQ_MEM_RECLAIM , 1 ) ;
if ( ! c - > btree_interior_update_worker )
return - ENOMEM ;
2020-05-26 03:35:53 +03:00
2020-05-25 21:57:06 +03:00
return mempool_init_kmalloc_pool ( & c - > btree_interior_update_pool , 1 ,
sizeof ( struct btree_update ) ) ;
2020-05-26 03:35:53 +03:00
}