2017-03-17 09:18:50 +03:00
// SPDX-License-Identifier: GPL-2.0
/*
* Assorted bcachefs debug code
*
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
*/
# include "bcachefs.h"
# include "bkey_methods.h"
# include "btree_cache.h"
# include "btree_io.h"
# include "btree_iter.h"
2022-08-23 06:12:11 +03:00
# include "btree_locking.h"
2017-03-17 09:18:50 +03:00
# include "btree_update.h"
# include "buckets.h"
# include "debug.h"
# include "error.h"
# include "extents.h"
# include "fsck.h"
# include "inode.h"
# include "io.h"
# include "super.h"
# include <linux/console.h>
# include <linux/debugfs.h>
# include <linux/module.h>
# include <linux/random.h>
# include <linux/seq_file.h>
static struct dentry * bch_debug ;
2021-04-21 03:21:12 +03:00
/*
 * Read one on-disk replica of btree node @b back in and compare it against
 * the in-memory copy.
 *
 * Returns true if verification FAILED for this replica (mismatch found),
 * false if the replica matched or could not be read/decoded at all.
 */
static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
				      struct extent_ptr_decoded pick)
{
	struct btree *v = c->verify_data;
	struct btree_node *n_ondisk = c->verify_ondisk;
	struct btree_node *n_sorted = c->verify_data->data;
	struct bset *sorted, *inmemory = &b->data->keys;
	struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
	struct bio *bio;
	bool failed = false, saw_error = false;

	/* Device may be going away; without an ioref we can't read from it */
	if (!bch2_dev_get_ioref(ca, READ))
		return false;

	bio = bio_alloc_bioset(ca->disk_sb.bdev,
			       buf_pages(n_sorted, btree_bytes(c)),
			       REQ_OP_READ|REQ_META,
			       GFP_NOIO,
			       &c->btree_bio);
	bio->bi_iter.bi_sector	= pick.ptr.offset;
	bch2_bio_map(bio, n_sorted, btree_bytes(c));

	submit_bio_wait(bio);

	bio_put(bio);
	percpu_ref_put(&ca->io_ref);

	/*
	 * Keep a pristine copy of the raw on-disk node: read_done() below
	 * sorts/modifies n_sorted in place, and we want to dump the original
	 * bsets if verification fails.
	 */
	memcpy(n_ondisk, n_sorted, btree_bytes(c));

	v->written = 0;
	/* Decode failure or checksum/validate error: can't compare, not a mismatch */
	if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
		return false;

	n_sorted = c->verify_data->data;
	sorted = &n_sorted->keys;

	if (inmemory->u64s != sorted->u64s ||
	    memcmp(inmemory->start,
		   sorted->start,
		   vstruct_end(inmemory) - (void *) inmemory->start)) {
		unsigned offset = 0, sectors;
		struct bset *i;
		unsigned j;

		/* Hold the console lock so the dump comes out contiguous */
		console_lock();

		printk(KERN_ERR "*** in memory:\n");
		bch2_dump_bset(c, b, inmemory, 0);

		printk(KERN_ERR "*** read back in:\n");
		bch2_dump_bset(c, v, sorted, 0);

		/*
		 * Walk the raw on-disk bsets by sector offset: the first bset
		 * lives in the btree_node header, subsequent ones in
		 * btree_node_entry structs.
		 */
		while (offset < v->written) {
			if (!offset) {
				i = &n_ondisk->keys;
				sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
					c->block_bits;
			} else {
				struct btree_node_entry *bne =
					(void *) n_ondisk + (offset << 9);
				i = &bne->keys;

				sectors = vstruct_blocks(bne, c->block_bits) <<
					c->block_bits;
			}

			printk(KERN_ERR "*** on disk block %u:\n", offset);
			bch2_dump_bset(c, b, i, offset);

			offset += sectors;
		}

		/* Locate the first differing u64 for the error message */
		for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
			if (inmemory->_data[j] != sorted->_data[j])
				break;

		console_unlock();
		bch_err(c, "verify failed at key %u", j);

		failed = true;
	}

	if (v->written != b->written) {
		bch_err(c, "written wrong: expected %u, got %u",
			b->written, v->written);
		failed = true;
	}

	return failed;
}
/*
 * Verify the in-memory contents of btree node @b against every on-disk
 * replica; emits a fatal filesystem error if any replica mismatches.
 *
 * No-op in nochanges mode. Serialized on c->verify_lock since the scratch
 * buffers (verify_ondisk/verify_data) are shared, filesystem-wide state.
 */
void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
	struct bkey_ptrs_c ptrs;
	struct extent_ptr_decoded p;
	const union bch_extent_entry *entry;
	struct btree *v;
	struct bset *inmemory = &b->data->keys;
	struct bkey_packed *k;
	bool failed = false;

	if (c->opts.nochanges)
		return;

	/* Lock order: node IO lock, then verify_lock */
	bch2_btree_node_io_lock(b);
	mutex_lock(&c->verify_lock);

	/* Lazily allocate the shared scratch buffers on first use */
	if (!c->verify_ondisk) {
		c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
		if (!c->verify_ondisk)
			goto out;
	}

	if (!c->verify_data) {
		c->verify_data = __bch2_btree_node_mem_alloc(c);
		if (!c->verify_data)
			goto out;

		/* Keep the scratch node off the btree cache LRU */
		list_del_init(&c->verify_data->list);
	}

	BUG_ON(b->nsets != 1);

	/*
	 * mem_ptr is an in-memory-only cache of the child node pointer; zero
	 * it so the comparison against the on-disk copy doesn't see it.
	 */
	for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
		if (k->type == KEY_TYPE_btree_ptr_v2) {
			struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k);
			v->mem_ptr = 0;
		}

	/* Set up the scratch node to mirror @b's identity */
	v = c->verify_data;
	bkey_copy(&v->key, &b->key);
	v->c.level	= b->c.level;
	v->c.btree_id	= b->c.btree_id;
	bch2_btree_keys_init(v);

	ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
	bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
		failed |= bch2_btree_verify_replica(c, b, p);

	if (failed) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
		printbuf_exit(&buf);
	}
out:
	mutex_unlock(&c->verify_lock);
	bch2_btree_node_io_unlock(b);
}
# ifdef CONFIG_DEBUG_FS
/* XXX: bch_fs refcounting */
/*
 * Per-open-file state for the debugfs dump files: tracks where the previous
 * read() left off so a dump can be produced incrementally across reads.
 */
struct dump_iter {
	struct bch_fs	*c;
	enum btree_id	 id;		/* btree being dumped */
	struct bpos	 from;		/* resume position for btree walks */
	struct bpos	 prev_node;	/* last node header printed (bfloat dump) */
	u64		 iter;		/* generic cursor (hash bucket, pid, index) */

	struct printbuf	 buf;		/* staging buffer flushed to userspace */

	char __user	*ubuf;		/* destination user buffer */
	size_t		 size;		/* size of requested read */
	ssize_t		 ret;		/* bytes read so far */
};
2022-08-14 23:11:35 +03:00
static ssize_t flush_buf ( struct dump_iter * i )
2017-03-17 09:18:50 +03:00
{
2022-02-25 21:18:19 +03:00
if ( i - > buf . pos ) {
size_t bytes = min_t ( size_t , i - > buf . pos , i - > size ) ;
int err = copy_to_user ( i - > ubuf , i - > buf . buf , bytes ) ;
2017-03-17 09:18:50 +03:00
if ( err )
return err ;
i - > ret + = bytes ;
i - > ubuf + = bytes ;
i - > size - = bytes ;
2022-02-25 21:18:19 +03:00
i - > buf . pos - = bytes ;
memmove ( i - > buf . buf , i - > buf . buf + bytes , i - > buf . pos ) ;
2017-03-17 09:18:50 +03:00
}
2022-08-14 23:11:35 +03:00
return i - > size ? 0 : i - > ret ;
2017-03-17 09:18:50 +03:00
}
static int bch2_dump_open ( struct inode * inode , struct file * file )
{
struct btree_debug * bd = inode - > i_private ;
struct dump_iter * i ;
i = kzalloc ( sizeof ( struct dump_iter ) , GFP_KERNEL ) ;
if ( ! i )
return - ENOMEM ;
file - > private_data = i ;
i - > from = POS_MIN ;
2022-02-26 19:48:34 +03:00
i - > iter = 0 ;
2017-03-17 09:18:50 +03:00
i - > c = container_of ( bd , struct bch_fs , btree_debug [ bd - > id ] ) ;
i - > id = bd - > id ;
2022-02-25 21:18:19 +03:00
i - > buf = PRINTBUF ;
2017-03-17 09:18:50 +03:00
return 0 ;
}
static int bch2_dump_release ( struct inode * inode , struct file * file )
{
2022-02-25 21:18:19 +03:00
struct dump_iter * i = file - > private_data ;
printbuf_exit ( & i - > buf ) ;
kfree ( i ) ;
2017-03-17 09:18:50 +03:00
return 0 ;
}
/*
 * debugfs read: dump every key in btree i->id as text, resuming from
 * i->from on each read() call.
 */
static ssize_t bch2_read_btree(struct file *file, char __user *buf,
			       size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	ssize_t ret;

	i->ubuf = buf;
	i->size	= size;
	i->ret	= 0;

	bch2_trans_init(&trans, i->c, 0, 0);

	ret = for_each_btree_key2(&trans, iter, i->id, i->from,
				  BTREE_ITER_PREFETCH|
				  BTREE_ITER_ALL_SNAPSHOTS, k, ({
		/* Stop once the user buffer is full (or on fault) */
		ret = flush_buf(i);
		if (ret)
			break;

		bch2_bkey_val_to_text(&i->buf, i->c, k);
		prt_newline(&i->buf);
		0;	/* statement-expression loop body must yield 0 to continue */
	}));
	/* Remember where we stopped so the next read() resumes here */
	i->from = iter.pos;

	if (!ret)
		ret = flush_buf(i);

	bch2_trans_exit(&trans);

	/* flush_buf() returns i->ret when the buffer filled; 0 means done */
	return ret ?: i->ret;
}
/* debugfs ops for the per-btree key dump files ("btrees/<name>") */
static const struct file_operations btree_debug_ops = {
	.owner		= THIS_MODULE,
	.open		= bch2_dump_open,
	.release	= bch2_dump_release,
	.read		= bch2_read_btree,
};
/*
 * debugfs read: dump the node format of every node in btree i->id,
 * resuming from i->from on each read() call.
 */
static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
				       size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct btree_trans trans;
	struct btree_iter iter;
	struct btree *b;
	ssize_t ret;

	i->ubuf = buf;
	i->size	= size;
	i->ret	= 0;

	/* Flush anything left over from the previous read() first */
	ret = flush_buf(i);
	if (ret)
		return ret;

	/* from == SPOS_MAX marks the walk as finished (see loop below) */
	if (bpos_eq(SPOS_MAX, i->from))
		return i->ret;

	bch2_trans_init(&trans, i->c, 0, 0);

	for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) {
		ret = flush_buf(i);
		if (ret)
			break;

		bch2_btree_node_to_text(&i->buf, i->c, b);
		/*
		 * Resume after this node next time; if this was the last node
		 * (key == SPOS_MAX, which has no successor) stay at SPOS_MAX
		 * so the check above terminates the dump.
		 */
		i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
			? bpos_successor(b->key.k.p)
			: b->key.k.p;
	}
	bch2_trans_iter_exit(&trans, &iter);

	bch2_trans_exit(&trans);

	if (!ret)
		ret = flush_buf(i);

	return ret ?: i->ret;
}
/* debugfs ops for the per-btree node format dump files ("<name>-formats") */
static const struct file_operations btree_format_debug_ops = {
	.owner		= THIS_MODULE,
	.open		= bch2_dump_open,
	.release	= bch2_dump_release,
	.read		= bch2_read_btree_formats,
};
/*
 * debugfs read: for every key in btree i->id, dump the bkey_float/auxiliary
 * search tree info for the corresponding packed key, printing each node's
 * header the first time a key from that node is seen.
 */
static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
				       size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	ssize_t ret;

	i->ubuf = buf;
	i->size	= size;
	i->ret	= 0;

	ret = flush_buf(i);
	if (ret)
		return ret;

	bch2_trans_init(&trans, i->c, 0, 0);

	ret = for_each_btree_key2(&trans, iter, i->id, i->from,
				  BTREE_ITER_PREFETCH|
				  BTREE_ITER_ALL_SNAPSHOTS, k, ({
		/* Peek at the raw packed key underlying k in the leaf node */
		struct btree_path_level *l = &iter.path->l[0];
		struct bkey_packed *_k =
			bch2_btree_node_iter_peek(&l->iter, l->b);

		ret = flush_buf(i);
		if (ret)
			break;

		/* Print the node header once per node, tracked via prev_node */
		if (bpos_gt(l->b->key.k.p, i->prev_node)) {
			bch2_btree_node_to_text(&i->buf, i->c, l->b);
			i->prev_node = l->b->key.k.p;
		}

		bch2_bfloat_to_text(&i->buf, l->b, _k);
		0;	/* loop body must yield 0 to continue iterating */
	}));
	/* Remember where we stopped so the next read() resumes here */
	i->from = iter.pos;

	bch2_trans_exit(&trans);

	if (!ret)
		ret = flush_buf(i);

	return ret ?: i->ret;
}
/* debugfs ops for the per-btree bfloat dump files ("<name>-bfloat-failed") */
static const struct file_operations bfloat_failed_debug_ops = {
	.owner		= THIS_MODULE,
	.open		= bch2_dump_open,
	.release	= bch2_dump_release,
	.read		= bch2_read_bfloat_failed,
};
2022-02-26 19:48:34 +03:00
/*
 * Print one cached btree node's state (key, flags, lock/journal info) to
 * @out, as a two-column label/value layout using printbuf tabstops.
 */
static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
					   struct btree *b)
{
	/* Set the value column once per printbuf */
	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);

	prt_printf(out, "%px btree=%s l=%u ",
	       b,
	       bch2_btree_ids[b->c.btree_id],
	       b->c.level);
	prt_newline(out);

	printbuf_indent_add(out, 2);

	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out, "flags: ");
	prt_tab(out);
	prt_bitflags(out, bch2_btree_node_flags, b->flags);
	prt_newline(out);

	prt_printf(out, "pcpu read locks: ");
	prt_tab(out);
	prt_printf(out, "%u", b->c.lock.readers != NULL);
	prt_newline(out);

	prt_printf(out, "written: ");
	prt_tab(out);
	prt_printf(out, "%u", b->written);
	prt_newline(out);

	prt_printf(out, "writes blocked: ");
	prt_tab(out);
	prt_printf(out, "%u", !list_empty_careful(&b->write_blocked));
	prt_newline(out);

	prt_printf(out, "will make reachable: ");
	prt_tab(out);
	prt_printf(out, "%lx", b->will_make_reachable);
	prt_newline(out);

	/* One journal pin per write index (double-buffered writes) */
	prt_printf(out, "journal pin %px: ", &b->writes[0].journal);
	prt_tab(out);
	prt_printf(out, "%llu", b->writes[0].journal.seq);
	prt_newline(out);

	prt_printf(out, "journal pin %px: ", &b->writes[1].journal);
	prt_tab(out);
	prt_printf(out, "%llu", b->writes[1].journal.seq);
	prt_newline(out);

	printbuf_indent_sub(out, 2);
}
/*
 * debugfs read: dump every node in the btree node cache, walking the
 * rhashtable one bucket per iteration (bucket index kept in i->iter so the
 * walk resumes across read() calls).
 */
static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
					    size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct bch_fs *c = i->c;
	bool done = false;
	ssize_t ret = 0;

	i->ubuf = buf;
	i->size	= size;
	i->ret	= 0;

	do {
		struct bucket_table *tbl;
		struct rhash_head *pos;
		struct btree *b;

		ret = flush_buf(i);
		if (ret)
			return ret;

		rcu_read_lock();
		/* Printbuf must not allocate (and sleep) inside the RCU section */
		i->buf.atomic++;
		tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
					  &c->btree_cache.table);
		if (i->iter < tbl->size) {
			rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
				bch2_cached_btree_node_to_text(&i->buf, c, b);
			i->iter++;
		} else {
			done = true;
		}
		--i->buf.atomic;
		rcu_read_unlock();
	} while (!done);

	/* Atomic-mode printbuf appends may have failed silently; report it */
	if (i->buf.allocation_failure)
		ret = -ENOMEM;

	if (!ret)
		ret = flush_buf(i);

	return ret ?: i->ret;
}
/* debugfs ops for the "cached_btree_nodes" file */
static const struct file_operations cached_btree_nodes_ops = {
	.owner		= THIS_MODULE,
	.open		= bch2_dump_open,
	.release	= bch2_dump_release,
	.read		= bch2_dump_release == NULL ? NULL : bch2_cached_btree_nodes_read,
};
2022-06-18 03:12:02 +03:00
# ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
/*
 * debugfs read: dump every in-flight btree transaction plus a backtrace of
 * its owning task. Transactions are walked in pid order; i->iter holds the
 * last pid dumped so the walk resumes correctly across read() calls even
 * if the list changes in between.
 */
static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
					    size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct bch_fs *c = i->c;
	struct btree_trans *trans;
	ssize_t ret = 0;

	i->ubuf = buf;
	i->size	= size;
	i->ret	= 0;

	mutex_lock(&c->btree_trans_lock);
	list_for_each_entry(trans, &c->btree_trans_list, list) {
		/* Skip transactions already dumped in a previous read() */
		if (trans->locking_wait.task->pid <= i->iter)
			continue;

		ret = flush_buf(i);
		if (ret)
			break;

		bch2_btree_trans_to_text(&i->buf, trans);

		prt_printf(&i->buf, "backtrace:");
		prt_newline(&i->buf);
		printbuf_indent_add(&i->buf, 2);
		bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task);
		printbuf_indent_sub(&i->buf, 2);
		prt_newline(&i->buf);

		i->iter = trans->locking_wait.task->pid;
	}
	mutex_unlock(&c->btree_trans_lock);

	if (i->buf.allocation_failure)
		ret = -ENOMEM;

	if (!ret)
		ret = flush_buf(i);

	return ret ?: i->ret;
}
/* debugfs ops for the "btree_transactions" file (CONFIG_BCACHEFS_DEBUG_TRANSACTIONS) */
static const struct file_operations btree_transactions_ops = {
	.owner		= THIS_MODULE,
	.open		= bch2_dump_open,
	.release	= bch2_dump_release,
	.read		= bch2_btree_transactions_read,
};
# endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */
2022-02-26 19:48:34 +03:00
/*
 * debugfs read: dump journal pins one sequence number per iteration,
 * tracking the current sequence in i->iter across read() calls.
 */
static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
				      size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct bch_fs *c = i->c;
	bool done = false;

	i->ubuf = buf;
	i->size	= size;
	i->ret	= 0;

	while (!done) {
		int err = flush_buf(i);

		if (err)
			return err;

		/* User buffer full: stop here, resume on the next read() */
		if (!i->size)
			break;

		done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
		i->iter++;
	}

	if (i->buf.allocation_failure)
		return -ENOMEM;

	return i->ret;
}
/* debugfs ops for the "journal_pins" file */
static const struct file_operations journal_pins_ops = {
	.owner		= THIS_MODULE,
	.open		= bch2_dump_open,
	.release	= bch2_dump_release,
	.read		= bch2_journal_pins_read,
};
2022-07-14 11:33:09 +03:00
static int lock_held_stats_open ( struct inode * inode , struct file * file )
{
struct bch_fs * c = inode - > i_private ;
struct dump_iter * i ;
i = kzalloc ( sizeof ( struct dump_iter ) , GFP_KERNEL ) ;
if ( ! i )
return - ENOMEM ;
i - > iter = 0 ;
i - > c = c ;
i - > buf = PRINTBUF ;
file - > private_data = i ;
return 0 ;
}
static int lock_held_stats_release ( struct inode * inode , struct file * file )
{
struct dump_iter * i = file - > private_data ;
printbuf_exit ( & i - > buf ) ;
kfree ( i ) ;
return 0 ;
}
/*
 * debugfs read: dump per-transaction-fn statistics (max mem, lock hold
 * times, max allocated btree paths), one fn per loop iteration, indexed by
 * i->iter so output resumes across read() calls.
 */
static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
				    size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct bch_fs *c = i->c;
	int err;

	i->ubuf = buf;
	i->size	= size;
	i->ret	= 0;

	while (1) {
		struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];

		err = flush_buf(i);
		if (err)
			return err;

		if (!i->size)
			break;

		/* Stop at the end of the fn table (or its first empty slot) */
		if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) ||
		    !bch2_btree_transaction_fns[i->iter])
			break;

		prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]);
		prt_newline(&i->buf);
		printbuf_indent_add(&i->buf, 2);

		/* s->lock protects the stats against concurrent updates */
		mutex_lock(&s->lock);

		prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
		prt_newline(&i->buf);

		if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
			prt_printf(&i->buf, "Lock hold times:");
			prt_newline(&i->buf);

			printbuf_indent_add(&i->buf, 2);
			bch2_time_stats_to_text(&i->buf, &s->lock_hold_times);
			printbuf_indent_sub(&i->buf, 2);
		}

		if (s->max_paths_text) {
			prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths);
			prt_newline(&i->buf);

			printbuf_indent_add(&i->buf, 2);
			prt_str_indented(&i->buf, s->max_paths_text);
			printbuf_indent_sub(&i->buf, 2);
		}

		mutex_unlock(&s->lock);

		printbuf_indent_sub(&i->buf, 2);
		prt_newline(&i->buf);
		i->iter++;
	}

	if (i->buf.allocation_failure)
		return -ENOMEM;

	return i->ret;
}
/* debugfs ops for the "btree_transaction_stats" file */
static const struct file_operations lock_held_stats_op = {
	.owner		= THIS_MODULE,
	.open		= lock_held_stats_open,
	.release	= lock_held_stats_release,
	.read		= lock_held_stats_read,
};
2022-08-23 06:12:11 +03:00
/*
 * debugfs read: run the lock-cycle detector against each in-flight btree
 * transaction and dump any deadlock found. i->iter doubles as a "have we
 * already produced output" flag: once nonzero, subsequent reads just drain
 * the printbuf instead of re-running the scan.
 */
static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
					size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct bch_fs *c = i->c;
	struct btree_trans *trans;
	ssize_t ret = 0;

	i->ubuf = buf;
	i->size	= size;
	i->ret	= 0;

	if (i->iter)
		goto out;

	mutex_lock(&c->btree_trans_lock);
	list_for_each_entry(trans, &c->btree_trans_list, list) {
		/* Walk transactions in pid order, skipping already-seen pids */
		if (trans->locking_wait.task->pid <= i->iter)
			continue;

		ret = flush_buf(i);
		if (ret)
			break;

		bch2_check_for_deadlock(trans, &i->buf);

		i->iter = trans->locking_wait.task->pid;
	}
	mutex_unlock(&c->btree_trans_lock);
out:
	if (i->buf.allocation_failure)
		ret = -ENOMEM;

	if (!ret)
		ret = flush_buf(i);

	return ret ?: i->ret;
}
/* debugfs ops for the "btree_deadlock" file */
static const struct file_operations btree_deadlock_ops = {
	.owner		= THIS_MODULE,
	.open		= bch2_dump_open,
	.release	= bch2_dump_release,
	.read		= bch2_btree_deadlock_read,
};
2017-03-17 09:18:50 +03:00
/* Remove this filesystem's debugfs directory and everything under it. */
void bch2_fs_debug_exit(struct bch_fs *c)
{
	if (!IS_ERR_OR_NULL(c->fs_debug_dir))
		debugfs_remove_recursive(c->fs_debug_dir);
}
/*
 * Create this filesystem's debugfs hierarchy: a directory named after the
 * fs UUID containing the global dump files, plus a "btrees" subdirectory
 * with per-btree dump/format/bfloat files.
 *
 * debugfs creation failures are non-fatal; we just stop creating entries.
 */
void bch2_fs_debug_init(struct bch_fs *c)
{
	struct btree_debug *bd;
	char name[100];

	/* Module-wide "bcachefs" dir wasn't created; nothing to attach to */
	if (IS_ERR_OR_NULL(bch_debug))
		return;

	snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
	c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
	if (IS_ERR_OR_NULL(c->fs_debug_dir))
		return;

	debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
			    c->btree_debug, &cached_btree_nodes_ops);

#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
	debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
			    c->btree_debug, &btree_transactions_ops);
#endif

	debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
			    c->btree_debug, &journal_pins_ops);

	/* Note: this one passes c itself as i_private, not btree_debug */
	debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
			    c, &lock_held_stats_op);

	debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
			    c->btree_debug, &btree_deadlock_ops);

	c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
	if (IS_ERR_OR_NULL(c->btree_debug_dir))
		return;

	/* One slot per btree id; the slot records its own index for open() */
	for (bd = c->btree_debug;
	     bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
	     bd++) {
		bd->id = bd - c->btree_debug;
		debugfs_create_file(bch2_btree_ids[bd->id],
				    0400, c->btree_debug_dir, bd,
				    &btree_debug_ops);

		snprintf(name, sizeof(name), "%s-formats",
			 bch2_btree_ids[bd->id]);

		debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
				    &btree_format_debug_ops);

		snprintf(name, sizeof(name), "%s-bfloat-failed",
			 bch2_btree_ids[bd->id]);

		debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
				    &bfloat_failed_debug_ops);
	}
}
# endif
/* Module teardown: remove the top-level "bcachefs" debugfs directory. */
void bch2_debug_exit(void)
{
	if (!IS_ERR_OR_NULL(bch_debug))
		debugfs_remove_recursive(bch_debug);
}
/*
 * Module init: create the top-level "bcachefs" debugfs directory.
 *
 * Always returns 0: debugfs failure is deliberately non-fatal (bch_debug
 * holding an error pointer is checked at every use site), so the dead
 * always-zero `ret` local the original carried has been dropped.
 */
int __init bch2_debug_init(void)
{
	bch_debug = debugfs_create_dir("bcachefs", NULL);
	return 0;
}