2017-03-17 09:18:50 +03:00
/* SPDX-License-Identifier: GPL-2.0 */
# ifndef _BCACHEFS_BTREE_LOCKING_H
# define _BCACHEFS_BTREE_LOCKING_H
/*
* Only for internal btree use :
*
* The btree iterator tracks what locks it wants to take , and what locks it
* currently has - here we have wrappers for locking / unlocking btree nodes and
* updating the iterator state
*/
# include "btree_iter.h"
# include "six.h"
2022-08-19 22:35:34 +03:00
/*
 * Only declared here (extern); the definition is not in this part of the file.
 * NOTE(review): presumably the lockdep class key for btree node locks - confirm.
 */
extern struct lock_class_key bch2_btree_node_lock_key ;
2022-08-10 19:42:55 +03:00
static inline bool is_btree_node ( struct btree_path * path , unsigned l )
{
return l < BTREE_MAX_DEPTH & & ! IS_ERR_OR_NULL ( path - > l [ l ] . b ) ;
}
2022-08-20 02:50:18 +03:00
static inline struct btree_transaction_stats * btree_trans_stats ( struct btree_trans * trans )
{
return trans - > fn_idx < ARRAY_SIZE ( trans - > c - > btree_transaction_stats )
? & trans - > c - > btree_transaction_stats [ trans - > fn_idx ]
: NULL ;
}
2017-03-17 09:18:50 +03:00
/* matches six lock types */
enum btree_node_locked_type {
BTREE_NODE_UNLOCKED = - 1 ,
BTREE_NODE_READ_LOCKED = SIX_LOCK_read ,
BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent ,
2022-08-23 04:05:31 +03:00
BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write ,
2017-03-17 09:18:50 +03:00
} ;
2021-08-30 22:18:31 +03:00
static inline int btree_node_locked_type ( struct btree_path * path ,
2017-03-17 09:18:50 +03:00
unsigned level )
{
2022-08-22 00:20:42 +03:00
return BTREE_NODE_UNLOCKED + ( ( path - > nodes_locked > > ( level < < 1 ) ) & 3 ) ;
2017-03-17 09:18:50 +03:00
}
2022-08-23 04:05:31 +03:00
/* True when @path records a write lock at level @l. */
static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
{
	const int held = btree_node_locked_type(path, l);

	return held == BTREE_NODE_WRITE_LOCKED;
}
static inline bool btree_node_intent_locked ( struct btree_path * path , unsigned l )
2017-03-17 09:18:50 +03:00
{
2022-08-23 04:05:31 +03:00
return btree_node_locked_type ( path , l ) = = BTREE_NODE_INTENT_LOCKED ;
2017-03-17 09:18:50 +03:00
}
2022-08-23 04:05:31 +03:00
static inline bool btree_node_read_locked ( struct btree_path * path , unsigned l )
2017-03-17 09:18:50 +03:00
{
2022-08-23 04:05:31 +03:00
return btree_node_locked_type ( path , l ) = = BTREE_NODE_READ_LOCKED ;
2017-03-17 09:18:50 +03:00
}
2021-08-30 22:18:31 +03:00
static inline bool btree_node_locked ( struct btree_path * path , unsigned level )
2017-03-17 09:18:50 +03:00
{
2022-08-22 01:17:51 +03:00
return btree_node_locked_type ( path , level ) ! = BTREE_NODE_UNLOCKED ;
2017-03-17 09:18:50 +03:00
}
2022-08-22 01:17:51 +03:00
static inline void mark_btree_node_locked_noreset ( struct btree_path * path ,
2022-08-22 00:20:42 +03:00
unsigned level ,
enum btree_node_locked_type type )
2017-03-17 09:18:50 +03:00
{
/* relying on this to avoid a branch */
BUILD_BUG_ON ( SIX_LOCK_read ! = 0 ) ;
BUILD_BUG_ON ( SIX_LOCK_intent ! = 1 ) ;
2022-08-22 00:20:42 +03:00
path - > nodes_locked & = ~ ( 3U < < ( level < < 1 ) ) ;
path - > nodes_locked | = ( type + 1 ) < < ( level < < 1 ) ;
2022-08-22 01:17:51 +03:00
}
static inline void mark_btree_node_unlocked ( struct btree_path * path ,
unsigned level )
{
2022-08-23 04:05:31 +03:00
EBUG_ON ( btree_node_write_locked ( path , level ) ) ;
2022-08-22 01:17:51 +03:00
mark_btree_node_locked_noreset ( path , level , BTREE_NODE_UNLOCKED ) ;
2017-03-17 09:18:50 +03:00
}
2022-07-14 11:33:09 +03:00
/*
 * Record that @path now holds a lock of @type at @level and, when
 * CONFIG_BCACHEFS_LOCK_TIME_STATS is enabled, timestamp the acquisition so
 * the hold time can be accounted on unlock.
 *
 * @trans is not referenced by the body.
 */
static inline void mark_btree_node_locked(struct btree_trans *trans,
					  struct btree_path *path,
					  unsigned level,
					  enum six_lock_type type)
{
	/*
	 * The locked values of enum btree_node_locked_type are defined as the
	 * six lock types, so the conversion is value preserving; make it
	 * explicit rather than relying on an implicit enum-to-enum conversion.
	 */
	mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
	path->l[level].lock_taken_time = local_clock();
#endif
}
2021-08-30 22:18:31 +03:00
static inline enum six_lock_type __btree_lock_want ( struct btree_path * path , int level )
2017-03-17 09:18:50 +03:00
{
2021-08-30 22:18:31 +03:00
return level < path - > locks_want
2017-03-17 09:18:50 +03:00
? SIX_LOCK_intent
: SIX_LOCK_read ;
}
static inline enum btree_node_locked_type
2021-08-30 22:18:31 +03:00
btree_lock_want ( struct btree_path * path , int level )
2017-03-17 09:18:50 +03:00
{
2021-08-30 22:18:31 +03:00
if ( level < path - > level )
2017-03-17 09:18:50 +03:00
return BTREE_NODE_UNLOCKED ;
2021-08-30 22:18:31 +03:00
if ( level < path - > locks_want )
2017-03-17 09:18:50 +03:00
return BTREE_NODE_INTENT_LOCKED ;
2021-08-30 22:18:31 +03:00
if ( level = = path - > level )
2017-03-17 09:18:50 +03:00
return BTREE_NODE_READ_LOCKED ;
return BTREE_NODE_UNLOCKED ;
}
2022-08-12 02:36:24 +03:00
/*
 * Account how long the lock at @level of @path was held.
 *
 * With CONFIG_BCACHEFS_LOCK_TIME_STATS enabled, feeds the interval from
 * path->l[level].lock_taken_time to now into the transaction's
 * lock_hold_times statistics, if the transaction has a stats slot.
 * Without that option this is a no-op.
 *
 * Marked inline like the other helpers in this header so that translation
 * units which include the header without calling it do not get an unused
 * static function.
 */
static inline void btree_trans_lock_hold_time_update(struct btree_trans *trans,
						     struct btree_path *path, unsigned level)
{
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
	struct btree_transaction_stats *s = btree_trans_stats(trans);

	if (s)
		__bch2_time_stats_update(&s->lock_hold_times,
					 path->l[level].lock_taken_time,
					 local_clock());
#endif
}
2022-08-20 02:50:18 +03:00
/* unlock: */
2022-07-14 09:58:23 +03:00
static inline void btree_node_unlock ( struct btree_trans * trans ,
struct btree_path * path , unsigned level )
2017-03-17 09:18:50 +03:00
{
2021-08-30 22:18:31 +03:00
int lock_type = btree_node_locked_type ( path , level ) ;
2017-03-17 09:18:50 +03:00
EBUG_ON ( level > = BTREE_MAX_DEPTH ) ;
2022-07-14 11:33:09 +03:00
if ( lock_type ! = BTREE_NODE_UNLOCKED ) {
2021-08-30 22:18:31 +03:00
six_unlock_type ( & path - > l [ level ] . b - > c . lock , lock_type ) ;
2022-08-12 02:36:24 +03:00
btree_trans_lock_hold_time_update ( trans , path , level ) ;
2022-07-14 11:33:09 +03:00
}
2021-08-30 22:18:31 +03:00
mark_btree_node_unlocked ( path , level ) ;
2017-03-17 09:18:50 +03:00
}
2022-08-22 01:17:51 +03:00
static inline int btree_path_lowest_level_locked ( struct btree_path * path )
{
2022-08-22 00:20:42 +03:00
return __ffs ( path - > nodes_locked ) > > 1 ;
2022-08-22 01:17:51 +03:00
}
static inline int btree_path_highest_level_locked ( struct btree_path * path )
{
2022-08-22 00:20:42 +03:00
return __fls ( path - > nodes_locked ) > > 1 ;
2022-08-22 01:17:51 +03:00
}
2022-07-14 09:58:23 +03:00
static inline void __bch2_btree_path_unlock ( struct btree_trans * trans ,
struct btree_path * path )
2017-03-17 09:18:50 +03:00
{
2021-08-30 22:18:31 +03:00
btree_path_set_dirty ( path , BTREE_ITER_NEED_RELOCK ) ;
2017-03-17 09:18:50 +03:00
2021-08-30 22:18:31 +03:00
while ( path - > nodes_locked )
2022-08-22 01:17:51 +03:00
btree_node_unlock ( trans , path , btree_path_lowest_level_locked ( path ) ) ;
2017-03-17 09:18:50 +03:00
}
2022-08-20 02:50:18 +03:00
/*
* Updates the saved lock sequence number , so that bch2_btree_node_relock ( ) will
* succeed :
*/
static inline void
bch2_btree_node_unlock_write_inlined ( struct btree_trans * trans , struct btree_path * path ,
struct btree * b )
2017-03-17 09:18:50 +03:00
{
2022-08-20 02:50:18 +03:00
struct btree_path * linked ;
EBUG_ON ( path - > l [ b - > c . level ] . b ! = b ) ;
EBUG_ON ( path - > l [ b - > c . level ] . lock_seq + 1 ! = b - > c . lock . state . seq ) ;
2022-08-23 04:05:31 +03:00
EBUG_ON ( btree_node_locked_type ( path , b - > c . level ) ! = SIX_LOCK_write ) ;
mark_btree_node_locked_noreset ( path , b - > c . level , SIX_LOCK_intent ) ;
2022-08-20 02:50:18 +03:00
trans_for_each_path_with_node ( trans , b , linked )
linked - > l [ b - > c . level ] . lock_seq + = 2 ;
six_unlock_write ( & b - > c . lock ) ;
2017-03-17 09:18:50 +03:00
}
2022-08-20 02:50:18 +03:00
/*
 * Non-inline declaration taking the same argument types as
 * bch2_btree_node_unlock_write_inlined().
 * NOTE(review): presumably an out-of-line wrapper around the inlined version - confirm.
 */
void bch2_btree_node_unlock_write ( struct btree_trans * ,
struct btree_path * , struct btree * ) ;
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 20:23:47 +03:00
/*
 * Callback that __btree_node_lock_nopath() hands to six_lock_type_ip_waiter(),
 * followed by the btree_trans as the next argument.
 * NOTE(review): presumably @p is that btree_trans - confirm.
 */
int bch2_six_check_for_deadlock ( struct six_lock * lock , void * p ) ;
2022-08-20 02:50:18 +03:00
/* lock: */
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 20:23:47 +03:00
static inline int __btree_node_lock_nopath ( struct btree_trans * trans ,
struct btree_bkey_cached_common * b ,
enum six_lock_type type ,
2023-02-05 03:39:59 +03:00
bool lock_may_not_fail ,
unsigned long ip )
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 20:23:47 +03:00
{
int ret ;
trans - > lock_may_not_fail = lock_may_not_fail ;
trans - > lock_must_abort = false ;
2022-08-22 22:29:53 +03:00
trans - > locking = b ;
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 20:23:47 +03:00
2023-02-05 03:39:59 +03:00
ret = six_lock_type_ip_waiter ( & b - > lock , type , & trans - > locking_wait ,
bch2_six_check_for_deadlock , trans , ip ) ;
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 20:23:47 +03:00
WRITE_ONCE ( trans - > locking , NULL ) ;
WRITE_ONCE ( trans - > locking_wait . start_time , 0 ) ;
return ret ;
}
2022-08-21 21:29:43 +03:00
/*
 * Take a lock of @type on @b that is allowed to fail: calls
 * __btree_node_lock_nopath() with lock_may_not_fail = false and returns its
 * result, which the caller must check.
 */
static inline int __must_check
btree_node_lock_nopath(struct btree_trans *trans,
		       struct btree_bkey_cached_common *b,
		       enum six_lock_type type,
		       unsigned long ip)
{
	const bool lock_may_not_fail = false;

	return __btree_node_lock_nopath(trans, b, type, lock_may_not_fail, ip);
}
static inline void btree_node_lock_nopath_nofail ( struct btree_trans * trans ,
struct btree_bkey_cached_common * b ,
enum six_lock_type type )
{
2023-02-05 03:39:59 +03:00
int ret = __btree_node_lock_nopath ( trans , b , type , true , _THIS_IP_ ) ;
2022-08-21 21:29:43 +03:00
BUG_ON ( ret ) ;
}
2018-07-24 23:42:27 +03:00
/*
* Lock a btree node if we already have it locked on one of our linked
* iterators :
*/
2020-06-12 21:58:07 +03:00
static inline bool btree_node_lock_increment ( struct btree_trans * trans ,
2022-08-22 20:21:10 +03:00
struct btree_bkey_cached_common * b ,
unsigned level ,
2018-07-24 23:42:27 +03:00
enum btree_node_locked_type want )
{
2021-08-30 22:18:31 +03:00
struct btree_path * path ;
2018-07-24 23:42:27 +03:00
2021-08-30 22:18:31 +03:00
trans_for_each_path ( trans , path )
2022-08-22 20:21:10 +03:00
if ( & path - > l [ level ] . b - > c = = b & &
2021-08-30 22:18:31 +03:00
btree_node_locked_type ( path , level ) > = want ) {
2022-08-22 20:21:10 +03:00
six_lock_increment ( & b - > lock , want ) ;
2018-07-24 23:42:27 +03:00
return true ;
}
return false ;
}
2022-07-18 06:06:38 +03:00
static inline int btree_node_lock ( struct btree_trans * trans ,
2021-08-30 22:18:31 +03:00
struct btree_path * path ,
2022-08-22 20:21:10 +03:00
struct btree_bkey_cached_common * b ,
2022-08-22 22:29:53 +03:00
unsigned level ,
2020-06-13 05:29:48 +03:00
enum six_lock_type type ,
2020-10-28 21:17:46 +03:00
unsigned long ip )
2017-03-17 09:18:50 +03:00
{
2022-07-18 06:06:38 +03:00
int ret = 0 ;
2017-03-17 09:18:50 +03:00
EBUG_ON ( level > = BTREE_MAX_DEPTH ) ;
2021-08-30 22:18:31 +03:00
EBUG_ON ( ! ( trans - > paths_allocated & ( 1ULL < < path - > idx ) ) ) ;
2020-06-13 05:29:48 +03:00
2022-08-22 20:21:10 +03:00
if ( likely ( six_trylock_type ( & b - > lock , type ) ) | |
2022-07-18 06:06:38 +03:00
btree_node_lock_increment ( trans , b , level , type ) | |
2023-02-05 03:39:59 +03:00
! ( ret = btree_node_lock_nopath ( trans , b , type , btree_path_ip_allocated ( path ) ) ) ) {
2022-07-14 11:33:09 +03:00
# ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
2022-10-15 08:03:14 +03:00
path - > l [ b - > level ] . lock_taken_time = local_clock ( ) ;
2022-07-14 11:33:09 +03:00
# endif
}
2022-07-18 06:06:38 +03:00
return ret ;
2017-03-17 09:18:50 +03:00
}
2022-08-22 22:29:53 +03:00
/*
 * Called by __btree_node_lock_write() when six_trylock_write() fails, with
 * the same four arguments; its return value becomes that function's result.
 */
int __bch2_btree_node_lock_write ( struct btree_trans * , struct btree_path * ,
struct btree_bkey_cached_common * b , bool ) ;
2017-03-17 09:18:50 +03:00
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totalling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 20:23:47 +03:00
/*
 * Upgrade the intent lock @path holds on @b to a write lock.
 *
 * Debug builds assert that @path points at @b at b's level, that the saved
 * lock sequence number matches the lock's, and that the level is recorded as
 * intent locked.
 *
 * Returns 0 if the trylock succeeds; otherwise returns the result of
 * __bch2_btree_node_lock_write(), to which @lock_may_not_fail is passed
 * through.
 */
static inline int __btree_node_lock_write(struct btree_trans *trans,
					  struct btree_path *path,
					  struct btree_bkey_cached_common *b,
					  bool lock_may_not_fail)
{
	EBUG_ON(&path->l[b->level].b->c != b);
	EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
	EBUG_ON(!btree_node_intent_locked(path, b->level));

	/*
	 * six locks are unfair, and read locks block while a thread wants a
	 * write lock: thus, we need to tell the cycle detector we have a write
	 * lock _before_ taking the lock:
	 */
	mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED);

	return likely(six_trylock_write(&b->lock))
		? 0
		: __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
}
2022-08-23 06:39:23 +03:00
/*
 * Write-lock helper for @b, called on behalf of @trans / @path.
 *
 * Thin wrapper: forwards to __btree_node_lock_write() with the final argument
 * fixed to false and returns its int result unchanged.  Marked __must_check,
 * so callers have to look at the return value.
 *
 * NOTE(review): the false argument presumably means "this lock attempt is
 * allowed to fail" (contrast bch2_btree_node_lock_write_nofail()) - confirm
 * against the definition of __btree_node_lock_write().
 */
static inline int __must_check
bch2_btree_node_lock_write ( struct btree_trans * trans ,
struct btree_path * path ,
2022-09-04 04:09:54 +03:00
struct btree_bkey_cached_common * b )
2022-08-23 06:39:23 +03:00
{
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occured. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totally 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourself to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we can not allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
2022-08-22 20:23:47 +03:00
return __btree_node_lock_write ( trans , path , b , false ) ;
2022-08-23 06:39:23 +03:00
}
2023-03-06 16:58:02 +03:00
/*
 * Out-of-line write-lock entry point taking the same (trans, path, b)
 * arguments as bch2_btree_node_lock_write(), but returning void, so the
 * caller gets no error code to check.
 * NOTE(review): presumably this variant is not allowed to fail (per the
 * _nofail suffix) - confirm against the definition.
 */
void bch2_btree_node_lock_write_nofail ( struct btree_trans * ,
struct btree_path * ,
struct btree_bkey_cached_common * ) ;
2022-08-20 02:50:18 +03:00
/* relock: */
/*
 * Out-of-line; reports its result as a bool rather than an error code.
 * NOTE(review): presumably relocks the path without issuing a transaction
 * restart on failure, and the unsigned long is presumably a trace ip as in
 * bch2_btree_path_relock() - confirm against the definition.
 */
bool bch2_btree_path_relock_norestart ( struct btree_trans * ,
struct btree_path * , unsigned long ) ;
/*
 * Slow path behind bch2_btree_path_relock(): that wrapper calls this with its
 * trace_ip argument only when the node at path->level is not already locked,
 * and hands this function's int result straight back to its caller.
 */
int __bch2_btree_path_relock ( struct btree_trans * ,
struct btree_path * , unsigned long ) ;
static inline int bch2_btree_path_relock ( struct btree_trans * trans ,
struct btree_path * path , unsigned long trace_ip )
{
return btree_node_locked ( path , path - > level )
? 0
: __bch2_btree_path_relock ( trans , path , trace_ip ) ;
}
2022-09-25 23:42:53 +03:00
/*
 * Out-of-line relock of a single level of a path.  The inline wrappers
 * bch2_btree_node_relock() and bch2_btree_node_relock_notrace() call this
 * with trace = true and trace = false respectively, and only when the level
 * is not already locked and path->l[level].b is neither NULL nor an error
 * pointer.
 */
bool __bch2_btree_node_relock ( struct btree_trans * , struct btree_path * , unsigned , bool trace ) ;
2022-08-20 02:50:18 +03:00
static inline bool bch2_btree_node_relock ( struct btree_trans * trans ,
struct btree_path * path , unsigned level )
{
EBUG_ON ( btree_node_locked ( path , level ) & &
2022-08-23 04:05:31 +03:00
! btree_node_write_locked ( path , level ) & &
btree_node_locked_type ( path , level ) ! = __btree_lock_want ( path , level ) ) ;
2022-08-20 02:50:18 +03:00
return likely ( btree_node_locked ( path , level ) ) | |
2022-08-27 19:28:09 +03:00
( ! IS_ERR_OR_NULL ( path - > l [ level ] . b ) & &
2022-09-25 23:42:53 +03:00
__bch2_btree_node_relock ( trans , path , level , true ) ) ;
}
static inline bool bch2_btree_node_relock_notrace ( struct btree_trans * trans ,
struct btree_path * path , unsigned level )
{
EBUG_ON ( btree_node_locked ( path , level ) & &
! btree_node_write_locked ( path , level ) & &
btree_node_locked_type ( path , level ) ! = __btree_lock_want ( path , level ) ) ;
return likely ( btree_node_locked ( path , level ) ) | |
( ! IS_ERR_OR_NULL ( path - > l [ level ] . b ) & &
__bch2_btree_node_relock ( trans , path , level , false ) ) ;
2022-08-20 02:50:18 +03:00
}
/* upgrade */
2022-08-19 22:35:34 +03:00
/*
 * Out-of-line; takes (trans, path, unsigned) and reports its result as a bool.
 * NOTE(review): presumably upgrades the locks wanted on this one path without
 * also upgrading sibling paths (per the name) - confirm against the
 * definition.
 */
bool bch2_btree_path_upgrade_noupgrade_sibs ( struct btree_trans * ,
struct btree_path * , unsigned ) ;
/*
 * Slow path behind bch2_btree_path_upgrade(): called with the requested
 * locks_want (already clamped to BTREE_MAX_DEPTH) only when the path's current
 * locks_want is lower.  A true result makes the wrapper return 0; a false
 * result makes it restart the transaction.
 */
bool __bch2_btree_path_upgrade ( struct btree_trans * ,
struct btree_path * , unsigned ) ;
2022-09-17 21:36:24 +03:00
static inline int bch2_btree_path_upgrade ( struct btree_trans * trans ,
struct btree_path * path ,
unsigned new_locks_want )
2022-08-19 22:35:34 +03:00
{
2022-09-17 21:36:24 +03:00
unsigned old_locks_want = path - > locks_want ;
2022-08-19 22:35:34 +03:00
new_locks_want = min ( new_locks_want , BTREE_MAX_DEPTH ) ;
2022-09-17 21:36:24 +03:00
if ( path - > locks_want < new_locks_want
? __bch2_btree_path_upgrade ( trans , path , new_locks_want )
: path - > uptodate = = BTREE_ITER_UPTODATE )
return 0 ;
trace_and_count ( trans - > c , trans_restart_upgrade , trans , _THIS_IP_ , path ,
old_locks_want , new_locks_want ) ;
return btree_trans_restart ( trans , BCH_ERR_transaction_restart_upgrade ) ;
2022-08-19 22:35:34 +03:00
}
2022-08-20 02:50:18 +03:00
/* misc: */
2022-08-11 01:55:53 +03:00
static inline void btree_path_set_should_be_locked ( struct btree_path * path )
{
EBUG_ON ( ! btree_node_locked ( path , path - > level ) ) ;
EBUG_ON ( path - > uptodate ) ;
path - > should_be_locked = true ;
}
2022-08-11 02:08:30 +03:00
static inline void __btree_path_set_level_up ( struct btree_trans * trans ,
struct btree_path * path ,
unsigned l )
{
btree_node_unlock ( trans , path , l ) ;
path - > l [ l ] . b = ERR_PTR ( - BCH_ERR_no_btree_node_up ) ;
}
static inline void btree_path_set_level_up ( struct btree_trans * trans ,
struct btree_path * path )
{
__btree_path_set_level_up ( trans , path , path - > level + + ) ;
btree_path_set_dirty ( path , BTREE_ITER_NEED_TRAVERSE ) ;
}
2022-08-20 02:50:18 +03:00
/* debug */
2022-08-10 19:42:55 +03:00
/*
 * Out-of-line debug helper returning a struct six_lock_count by value.
 * NOTE(review): presumably reports how many locks this transaction holds on
 * @b, with the trailing unsigned being a btree level - confirm against the
 * definition.
 */
struct six_lock_count bch2_btree_node_lock_counts ( struct btree_trans * ,
2022-08-22 20:21:10 +03:00
struct btree_path * ,
struct btree_bkey_cached_common * b ,
unsigned ) ;
2022-08-10 19:42:55 +03:00
2022-08-23 06:12:11 +03:00
/*
 * Out-of-line; takes a transaction and a printbuf and returns an int.
 * NOTE(review): presumably runs the deadlock cycle detection for the given
 * transaction and can describe what it found into the printbuf - confirm
 * against the definition, including whether the printbuf may be NULL.
 */
int bch2_check_for_deadlock ( struct btree_trans * , struct printbuf * ) ;
2022-08-19 22:35:34 +03:00
/*
 * Lock verification hooks.  With CONFIG_BCACHEFS_DEBUG they are real
 * out-of-line functions; otherwise they are empty inline stubs, so callers
 * can invoke them unconditionally without their own #ifdef.
 */
# ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_path_verify_locks ( struct btree_path * ) ;
void bch2_trans_verify_locks ( struct btree_trans * ) ;
# else
/* Non-debug build: verification compiles away to nothing. */
static inline void bch2_btree_path_verify_locks ( struct btree_path * path ) { }
static inline void bch2_trans_verify_locks ( struct btree_trans * trans ) { }
# endif
2017-03-17 09:18:50 +03:00
# endif /* _BCACHEFS_BTREE_LOCKING_H */