2018-10-06 00:46:55 -04:00
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright 2012 Google , Inc .
*
2021-04-17 20:37:04 -04:00
* Foreground allocator code : allocate buckets from freelist , and allocate in
* sector granularity from writepoints .
2018-10-06 00:46:55 -04:00
*
* bch2_bucket_alloc ( ) allocates a single bucket from a specific device .
*
* bch2_bucket_alloc_set ( ) allocates one or more buckets from different devices
* in a given filesystem .
*/
# include "bcachefs.h"
# include "alloc_background.h"
# include "alloc_foreground.h"
2022-03-17 20:51:27 -04:00
# include "backpointers.h"
2022-01-09 20:48:31 -05:00
# include "btree_iter.h"
# include "btree_update.h"
2018-10-06 00:46:55 -04:00
# include "btree_gc.h"
# include "buckets.h"
2022-01-09 20:48:31 -05:00
# include "buckets_waiting_for_journal.h"
2018-10-06 00:46:55 -04:00
# include "clock.h"
# include "debug.h"
# include "disk_groups.h"
2018-11-01 15:13:19 -04:00
# include "ec.h"
2022-01-09 20:48:31 -05:00
# include "error.h"
2023-09-10 18:05:17 -04:00
# include "io_write.h"
2022-01-09 20:48:31 -05:00
# include "journal.h"
2022-07-17 21:40:39 -04:00
# include "movinggc.h"
2022-12-14 20:52:11 -05:00
# include "nocow_locking.h"
2018-10-06 00:46:55 -04:00
# include "trace.h"
# include <linux/math64.h>
# include <linux/rculist.h>
# include <linux/rcupdate.h>
2023-03-16 18:05:00 -04:00
/*
 * Acquire @lock on behalf of @trans.
 *
 * The mutex is first attempted with mutex_trylock(); only when that fails is
 * bch2_trans_unlock() called on the transaction before blocking in
 * mutex_lock(). The transaction is not relocked here.
 */
static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
					   struct mutex *lock)
{
	/* Fast path: uncontended, transaction left untouched */
	if (mutex_trylock(lock))
		return;

	bch2_trans_unlock(trans);
	mutex_lock(lock);
}
2023-06-24 19:30:10 -04:00
const char * const bch2_watermarks [ ] = {
2022-03-13 19:27:55 -04:00
# define x(t) #t,
2023-06-24 19:30:10 -04:00
BCH_WATERMARKS ( )
2022-03-13 19:27:55 -04:00
# undef x
NULL
} ;
2018-10-06 00:46:55 -04:00
/*
* Open buckets represent a bucket that ' s currently being allocated from . They
* serve two purposes :
*
* - They track buckets that have been partially allocated , allowing for
* sub - bucket sized allocations - they ' re used by the sector allocator below
*
* - They provide a reference to the buckets they own that mark and sweep GC
* can find , until the new allocation has a pointer to it inserted into the
* btree
*
* When allocating some space with the sector allocator , the allocation comes
* with a reference to an open bucket - the caller is required to put that
* reference _after_ doing the index update that makes its allocation reachable .
*/
2022-01-09 20:48:31 -05:00
void bch2_reset_alloc_cursors ( struct bch_fs * c )
{
rcu_read_lock ( ) ;
2023-12-17 02:34:05 -05:00
for_each_member_device_rcu ( c , ca , NULL )
2022-01-09 20:48:31 -05:00
ca - > alloc_cursor = 0 ;
rcu_read_unlock ( ) ;
}
2021-12-25 21:43:29 -05:00
static void bch2_open_bucket_hash_add ( struct bch_fs * c , struct open_bucket * ob )
{
open_bucket_idx_t idx = ob - c - > open_buckets ;
open_bucket_idx_t * slot = open_bucket_hashslot ( c , ob - > dev , ob - > bucket ) ;
ob - > hash = * slot ;
* slot = idx ;
}
static void bch2_open_bucket_hash_remove ( struct bch_fs * c , struct open_bucket * ob )
{
open_bucket_idx_t idx = ob - c - > open_buckets ;
open_bucket_idx_t * slot = open_bucket_hashslot ( c , ob - > dev , ob - > bucket ) ;
while ( * slot ! = idx ) {
BUG_ON ( ! * slot ) ;
slot = & c - > open_buckets [ * slot ] . hash ;
}
* slot = ob - > hash ;
ob - > hash = 0 ;
}
2018-10-06 00:46:55 -04:00
void __bch2_open_bucket_put ( struct bch_fs * c , struct open_bucket * ob )
{
2021-12-25 21:21:46 -05:00
struct bch_dev * ca = bch_dev_bkey_exists ( c , ob - > dev ) ;
2018-10-06 00:46:55 -04:00
2018-11-01 15:13:19 -04:00
if ( ob - > ec ) {
2023-03-09 10:18:09 -05:00
ec_stripe_new_put ( c , ob - > ec , STRIPE_REF_io ) ;
2018-11-01 15:13:19 -04:00
return ;
}
2018-11-26 00:13:33 -05:00
percpu_down_read ( & c - > mark_lock ) ;
2018-10-06 00:46:55 -04:00
spin_lock ( & ob - > lock ) ;
ob - > valid = false ;
2021-12-25 21:21:46 -05:00
ob - > data_type = 0 ;
2018-10-06 00:46:55 -04:00
spin_unlock ( & ob - > lock ) ;
2018-11-26 00:13:33 -05:00
percpu_up_read ( & c - > mark_lock ) ;
2018-10-06 00:46:55 -04:00
spin_lock ( & c - > freelist_lock ) ;
2021-12-25 21:43:29 -05:00
bch2_open_bucket_hash_remove ( c , ob ) ;
2018-10-06 00:46:55 -04:00
ob - > freelist = c - > open_buckets_freelist ;
c - > open_buckets_freelist = ob - c - > open_buckets ;
2021-04-13 09:49:23 -04:00
2018-10-06 00:46:55 -04:00
c - > open_buckets_nr_free + + ;
2021-04-13 09:49:23 -04:00
ca - > nr_open_buckets - - ;
2018-10-06 00:46:55 -04:00
spin_unlock ( & c - > freelist_lock ) ;
closure_wake_up ( & c - > open_buckets_wait ) ;
}
2018-11-01 15:13:19 -04:00
void bch2_open_bucket_write_error ( struct bch_fs * c ,
struct open_buckets * obs ,
unsigned dev )
{
struct open_bucket * ob ;
unsigned i ;
open_bucket_for_each ( c , obs , ob , i )
2021-12-25 21:21:46 -05:00
if ( ob - > dev = = dev & & ob - > ec )
2018-11-01 15:13:19 -04:00
bch2_ec_bucket_cancel ( c , ob ) ;
}
2018-10-06 00:46:55 -04:00
static struct open_bucket * bch2_open_bucket_alloc ( struct bch_fs * c )
{
struct open_bucket * ob ;
BUG_ON ( ! c - > open_buckets_freelist | | ! c - > open_buckets_nr_free ) ;
ob = c - > open_buckets + c - > open_buckets_freelist ;
c - > open_buckets_freelist = ob - > freelist ;
atomic_set ( & ob - > pin , 1 ) ;
2021-12-25 21:21:46 -05:00
ob - > data_type = 0 ;
2018-10-06 00:46:55 -04:00
c - > open_buckets_nr_free - - ;
return ob ;
}
2023-03-16 18:05:00 -04:00
static void open_bucket_free_unused ( struct bch_fs * c , struct open_bucket * ob )
2018-10-06 04:12:42 -04:00
{
2023-02-25 00:32:34 -05:00
BUG_ON ( c - > open_buckets_partial_nr > =
ARRAY_SIZE ( c - > open_buckets_partial ) ) ;
2018-10-06 04:12:42 -04:00
2023-02-25 02:22:49 -05:00
spin_lock ( & c - > freelist_lock ) ;
ob - > on_partial_list = true ;
c - > open_buckets_partial [ c - > open_buckets_partial_nr + + ] =
ob - c - > open_buckets ;
spin_unlock ( & c - > freelist_lock ) ;
2018-10-06 04:12:42 -04:00
2023-02-25 02:22:49 -05:00
closure_wake_up ( & c - > open_buckets_wait ) ;
closure_wake_up ( & c - > freelist_wait ) ;
2018-10-06 04:12:42 -04:00
}
2018-10-06 00:46:55 -04:00
/* _only_ for allocating the journal on a new device: */
long bch2_bucket_alloc_new_fs ( struct bch_dev * ca )
{
2021-12-24 04:22:20 -05:00
while ( ca - > new_fs_bucket_idx < ca - > mi . nbuckets ) {
u64 b = ca - > new_fs_bucket_idx + + ;
if ( ! is_superblock_bucket ( ca , b ) & &
( ! ca - > buckets_nouse | | ! test_bit ( b , ca - > buckets_nouse ) ) )
return b ;
}
return - 1 ;
2018-10-06 00:46:55 -04:00
}
2023-06-24 19:30:10 -04:00
static inline unsigned open_buckets_reserved ( enum bch_watermark watermark )
2018-10-06 00:46:55 -04:00
{
2023-06-24 19:30:10 -04:00
switch ( watermark ) {
2023-06-27 17:29:20 -04:00
case BCH_WATERMARK_reclaim :
return 0 ;
2023-06-24 19:30:10 -04:00
case BCH_WATERMARK_btree :
case BCH_WATERMARK_btree_copygc :
2021-01-07 17:18:14 -05:00
return OPEN_BUCKETS_COUNT / 4 ;
2023-06-27 17:29:20 -04:00
case BCH_WATERMARK_copygc :
return OPEN_BUCKETS_COUNT / 3 ;
2018-10-06 00:46:55 -04:00
default :
2020-06-09 15:46:22 -04:00
return OPEN_BUCKETS_COUNT / 2 ;
2018-10-06 00:46:55 -04:00
}
}
2022-01-09 20:48:31 -05:00
/*
 * Try to turn @bucket on device @ca into an open bucket.
 *
 * Returns:
 *   - NULL if the bucket must be skipped; the matching skipped_* counter in
 *     @s is bumped (nouse bitmap, already open, waiting on a journal commit,
 *     or nocow locked)
 *   - ERR_PTR(-BCH_ERR_open_buckets_empty) if too few open buckets are free
 *     for @watermark; if @cl is non-NULL it is added to c->open_buckets_wait
 *   - the new open_bucket on success, with gen taken from @a and sectors_free
 *     set to the device's bucket size
 */
static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
					      u64 bucket,
					      enum bch_watermark watermark,
					      const struct bch_alloc_v4 *a,
					      struct bucket_alloc_state *s,
					      struct closure *cl)
{
	struct open_bucket *ob;

	/* Cheap checks first, without taking freelist_lock: */

	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
		s->skipped_nouse++;
		return NULL;
	}

	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
		s->skipped_open++;
		return NULL;
	}

	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
					     c->journal.flushed_seq_ondisk,
					     ca->dev_idx, bucket)) {
		s->skipped_need_journal_commit++;
		return NULL;
	}

	/* Skip buckets whose nocow lock is currently held */
	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
		s->skipped_nocow++;
		return NULL;
	}

	spin_lock(&c->freelist_lock);

	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
		if (cl)
			closure_wait(&c->open_buckets_wait, cl);

		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
				   &c->blocked_allocate_open_bucket, true);

		spin_unlock(&c->freelist_lock);
		return ERR_PTR(-BCH_ERR_open_buckets_empty);
	}

	/* Recheck under lock: */
	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
		spin_unlock(&c->freelist_lock);
		s->skipped_open++;
		return NULL;
	}

	ob = bch2_open_bucket_alloc(c);

	spin_lock(&ob->lock);
	ob->valid		= true;
	ob->sectors_free	= ca->mi.bucket_size;
	ob->dev			= ca->dev_idx;
	ob->gen			= a->gen;
	ob->bucket		= bucket;
	spin_unlock(&ob->lock);

	ca->nr_open_buckets++;
	bch2_open_bucket_hash_add(c, ob);

	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
			   &c->blocked_allocate_open_bucket, false);

	track_event_change(&c->times[BCH_TIME_blocked_allocate],
			   &c->blocked_allocate, false);

	spin_unlock(&c->freelist_lock);

	return ob;
}
/*
 * Validate one freespace btree entry against the alloc btree and, if it checks
 * out, try to allocate the bucket it names.
 *
 * @free_entry packs the bucket number in its low 56 bits and the generation
 * bits in the top 8; @freespace_k is the freespace key it came from and is
 * only used for error reporting.
 *
 * Returns an open_bucket on success, NULL if this bucket should be skipped,
 * or an ERR_PTR(): -EIO after reporting an inconsistency, or the error from
 * the alloc key / backpointer lookup or from __try_alloc_bucket().
 */
static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
					    enum bch_watermark watermark, u64 free_entry,
					    struct bucket_alloc_state *s,
					    struct bkey_s_c freespace_k,
					    struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter = { NULL };
	struct printbuf msg = PRINTBUF;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	struct open_bucket *ob;
	struct bkey_s_c alloc_k;
	u64 bucket_nr = free_entry & ~(~0ULL << 56);
	unsigned entry_genbits = free_entry >> 56;
	int ret;

	if (bucket_nr < ca->mi.first_bucket || bucket_nr >= ca->mi.nbuckets) {
		prt_printf(&msg, " freespace btree has bucket outside allowed range %u-%llu \n "
			   " freespace key ",
			   ca->mi.first_bucket, ca->mi.nbuckets);
		bch2_bkey_val_to_text(&msg, c, freespace_k);
		bch2_trans_inconsistent(trans, " %s ", msg.buf);
		ob = ERR_PTR(-EIO);
		goto err;
	}

	alloc_k = bch2_bkey_get_iter(trans, &iter,
				     BTREE_ID_alloc, POS(ca->dev_idx, bucket_nr),
				     BTREE_ITER_CACHED);
	ret = bkey_err(alloc_k);
	if (ret) {
		ob = ERR_PTR(ret);
		goto err;
	}

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	if (a->data_type != BCH_DATA_free) {
		/* Up to and including check_alloc_info, silently skip: */
		if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
			ob = NULL;
			goto err;
		}

		prt_printf(&msg, " non free bucket in freespace btree \n "
			   " freespace key ");
		bch2_bkey_val_to_text(&msg, c, freespace_k);
		prt_printf(&msg, " \n ");
		bch2_bkey_val_to_text(&msg, c, alloc_k);
		bch2_trans_inconsistent(trans, " %s ", msg.buf);
		ob = ERR_PTR(-EIO);
		goto err;
	}

	if (entry_genbits != (alloc_freespace_genbits(*a) >> 56) &&
	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
		prt_printf(&msg, " bucket in freespace btree with wrong genbits (got %u should be %llu) \n "
			   " freespace key ",
			   entry_genbits, alloc_freespace_genbits(*a) >> 56);
		bch2_bkey_val_to_text(&msg, c, freespace_k);
		prt_printf(&msg, " \n ");
		bch2_bkey_val_to_text(&msg, c, alloc_k);
		bch2_trans_inconsistent(trans, " %s ", msg.buf);
		ob = ERR_PTR(-EIO);
		goto err;
	}

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
		struct bch_backpointer bp;
		struct bpos bp_pos = POS_MIN;

		ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, bucket_nr), -1,
						&bp_pos, &bp,
						BTREE_ITER_NOPRESERVE);
		if (ret) {
			ob = ERR_PTR(ret);
			goto err;
		}

		if (!bkey_eq(bp_pos, POS_MAX)) {
			/*
			 * Bucket may have data in it - we don't call
			 * bch2_trans_inconsistent() because fsck hasn't
			 * finished yet
			 */
			ob = NULL;
			goto err;
		}
	}

	ob = __try_alloc_bucket(c, ca, bucket_nr, watermark, a, s, cl);
	if (!ob)
		set_btree_iter_dontneed(&iter);
err:
	/* iter is only initialized once the alloc key lookup was attempted */
	if (iter.trans && iter.path)
		set_btree_iter_dontneed(&iter);
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&msg);
	return ob;
}
/*
* This path is for before the freespace btree is initialized :
*
* If ca - > new_fs_bucket_idx is nonzero , we haven ' t yet marked superblock &
* journal buckets - journal buckets will be < ca - > new_fs_bucket_idx
*/
static noinline struct open_bucket *
bch2_bucket_alloc_early ( struct btree_trans * trans ,
struct bch_dev * ca ,
2023-06-24 19:30:10 -04:00
enum bch_watermark watermark ,
2022-11-04 16:06:55 -04:00
struct bucket_alloc_state * s ,
2022-01-09 20:48:31 -05:00
struct closure * cl )
{
bcachefs: serialize on cached key in early bucket allocator
bcachefs had a transient bug where freespace_initialized was not
properly being set, which lead to unexpected use of the early bucket
allocator at runtime. This issue has been fixed, but the existence
of it uncovered a coherency issue in the early bucket allocation
code that is somewhat related to how uncached iterators deal with
the key cache.
The problem itself manifests as occasional failure of generic/113
due to corruption, often seen as a duplicate backpointer or multiple
data types per-bucket error. The immediate cause of the error is a
racing bucket allocation along the lines of the following sequence:
- Task 1 selects key A in bch2_bucket_alloc_early() and schedules.
- Task 2 selects the same key A, but proceeds to complete the
allocation and associated I/O, after which it releases the
open_bucket.
- Task 1 resumes with key A, but does not recognize the bucket is
now allocated because the open_bucket has been removed
from the hash when it was released in the previous step.
This generally shouldn't happen because the allocating task updates
the alloc btree key before releasing the bucket. This is not
sufficient in this particular instance, however, because an uncached
iterator for a cached btree doesn't actually lock the key cache slot
when no key exists for a given slot in the cache. Thus the fact that
the allocation side updates the cached key means that multiple
uncached iters can stumble across the same alloc key and duplicate
the bucket allocation as described above.
This is something that probably needs a longer term fix in the
iterator code. As a short term fix, close the race through explicit
use of a cached iterator for likely allocation candidates. We don't
want to scan the btree with a cached iterator because that would
unnecessarily pollute the cache. This mitigates cache pollution by
primarily scanning the tree with an uncached iterator, but closes
the race by creating a key cache entry for any prospective slot
prior to the bucket allocation attempt (also similar to how
_alloc_freelist() works via try_alloc_bucket()). This survives many
iterations of generic/113 on a kernel hacked to always use the early
bucket allocator.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-11-01 15:02:44 -04:00
struct btree_iter iter , citer ;
struct bkey_s_c k , ck ;
2022-01-09 20:48:31 -05:00
struct open_bucket * ob = NULL ;
2023-11-01 15:02:45 -04:00
u64 first_bucket = max_t ( u64 , ca - > mi . first_bucket , ca - > new_fs_bucket_idx ) ;
u64 alloc_start = max ( first_bucket , READ_ONCE ( ca - > alloc_cursor ) ) ;
u64 alloc_cursor = alloc_start ;
2022-01-09 20:48:31 -05:00
int ret ;
bcachefs: serialize on cached key in early bucket allocator
bcachefs had a transient bug where freespace_initialized was not
properly being set, which lead to unexpected use of the early bucket
allocator at runtime. This issue has been fixed, but the existence
of it uncovered a coherency issue in the early bucket allocation
code that is somewhat related to how uncached iterators deal with
the key cache.
The problem itself manifests as occasional failure of generic/113
due to corruption, often seen as a duplicate backpointer or multiple
data types per-bucket error. The immediate cause of the error is a
racing bucket allocation along the lines of the following sequence:
- Task 1 selects key A in bch2_bucket_alloc_early() and schedules.
- Task 2 selects the same key A, but proceeds to complete the
allocation and associated I/O, after which it releases the
open_bucket.
- Task 1 resumes with key A, but does not recognize the bucket is
now allocated because the open_bucket has been removed
from the hash when it was released in the previous step.
This generally shouldn't happen because the allocating task updates
the alloc btree key before releasing the bucket. This is not
sufficient in this particular instance, however, because an uncached
iterator for a cached btree doesn't actually lock the key cache slot
when no key exists for a given slot in the cache. Thus the fact that
the allocation side updates the cached key means that multiple
uncached iters can stumble across the same alloc key and duplicate
the bucket allocation as described above.
This is something that probably needs a longer term fix in the
iterator code. As a short term fix, close the race through explicit
use of a cached iterator for likely allocation candidates. We don't
want to scan the btree with a cached iterator because that would
unnecessarily pollute the cache. This mitigates cache pollution by
primarily scanning the tree with an uncached iterator, but closes
the race by creating a key cache entry for any prospective slot
prior to the bucket allocation attempt (also similar to how
_alloc_freelist() works via try_alloc_bucket()). This survives many
iterations of generic/113 on a kernel hacked to always use the early
bucket allocator.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-11-01 15:02:44 -04:00
/*
* Scan with an uncached iterator to avoid polluting the key cache . An
* uncached iter will return a cached key if one exists , but if not
* there is no other underlying protection for the associated key cache
* slot . To avoid racing bucket allocations , look up the cached key slot
* of any likely allocation candidate before attempting to proceed with
* the allocation . This provides proper exclusion on the associated
* bucket .
*/
2022-01-09 20:48:31 -05:00
again :
2022-07-17 02:46:46 -04:00
for_each_btree_key_norestart ( trans , iter , BTREE_ID_alloc , POS ( ca - > dev_idx , alloc_cursor ) ,
2022-01-09 20:48:31 -05:00
BTREE_ITER_SLOTS , k , ret ) {
2023-01-30 20:58:43 -05:00
struct bch_alloc_v4 a_convert ;
const struct bch_alloc_v4 * a ;
2022-01-09 20:48:31 -05:00
2022-11-24 03:12:22 -05:00
if ( bkey_ge ( k . k - > p , POS ( ca - > dev_idx , ca - > mi . nbuckets ) ) )
2022-01-09 20:48:31 -05:00
break ;
if ( ca - > new_fs_bucket_idx & &
is_superblock_bucket ( ca , k . k - > p . offset ) )
continue ;
2023-01-30 20:58:43 -05:00
a = bch2_alloc_to_v4 ( k , & a_convert ) ;
if ( a - > data_type ! = BCH_DATA_free )
2022-01-09 20:48:31 -05:00
continue ;
bcachefs: serialize on cached key in early bucket allocator
bcachefs had a transient bug where freespace_initialized was not
properly being set, which lead to unexpected use of the early bucket
allocator at runtime. This issue has been fixed, but the existence
of it uncovered a coherency issue in the early bucket allocation
code that is somewhat related to how uncached iterators deal with
the key cache.
The problem itself manifests as occasional failure of generic/113
due to corruption, often seen as a duplicate backpointer or multiple
data types per-bucket error. The immediate cause of the error is a
racing bucket allocation along the lines of the following sequence:
- Task 1 selects key A in bch2_bucket_alloc_early() and schedules.
- Task 2 selects the same key A, but proceeds to complete the
allocation and associated I/O, after which it releases the
open_bucket.
- Task 1 resumes with key A, but does not recognize the bucket is
now allocated because the open_bucket has been removed
from the hash when it was released in the previous step.
This generally shouldn't happen because the allocating task updates
the alloc btree key before releasing the bucket. This is not
sufficient in this particular instance, however, because an uncached
iterator for a cached btree doesn't actually lock the key cache slot
when no key exists for a given slot in the cache. Thus the fact that
the allocation side updates the cached key means that multiple
uncached iters can stumble across the same alloc key and duplicate
the bucket allocation as described above.
This is something that probably needs a longer term fix in the
iterator code. As a short term fix, close the race through explicit
use of a cached iterator for likely allocation candidates. We don't
want to scan the btree with a cached iterator because that would
unnecessarily pollute the cache. This mitigates cache pollution by
primarily scanning the tree with an uncached iterator, but closes
the race by creating a key cache entry for any prospective slot
prior to the bucket allocation attempt (also similar to how
_alloc_freelist() works via try_alloc_bucket()). This survives many
iterations of generic/113 on a kernel hacked to always use the early
bucket allocator.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-11-01 15:02:44 -04:00
/* now check the cached key to serialize concurrent allocs of the bucket */
ck = bch2_bkey_get_iter ( trans , & citer , BTREE_ID_alloc , k . k - > p , BTREE_ITER_CACHED ) ;
ret = bkey_err ( ck ) ;
if ( ret )
break ;
a = bch2_alloc_to_v4 ( ck , & a_convert ) ;
if ( a - > data_type ! = BCH_DATA_free )
goto next ;
2022-11-04 16:06:55 -04:00
s - > buckets_seen + + ;
2022-01-09 20:48:31 -05:00
2023-06-24 19:30:10 -04:00
ob = __try_alloc_bucket ( trans - > c , ca , k . k - > p . offset , watermark , a , s , cl ) ;
bcachefs: serialize on cached key in early bucket allocator
bcachefs had a transient bug where freespace_initialized was not
properly being set, which lead to unexpected use of the early bucket
allocator at runtime. This issue has been fixed, but the existence
of it uncovered a coherency issue in the early bucket allocation
code that is somewhat related to how uncached iterators deal with
the key cache.
The problem itself manifests as occasional failure of generic/113
due to corruption, often seen as a duplicate backpointer or multiple
data types per-bucket error. The immediate cause of the error is a
racing bucket allocation along the lines of the following sequence:
- Task 1 selects key A in bch2_bucket_alloc_early() and schedules.
- Task 2 selects the same key A, but proceeds to complete the
allocation and associated I/O, after which it releases the
open_bucket.
- Task 1 resumes with key A, but does not recognize the bucket is
now allocated because the open_bucket has been removed
from the hash when it was released in the previous step.
This generally shouldn't happen because the allocating task updates
the alloc btree key before releasing the bucket. This is not
sufficient in this particular instance, however, because an uncached
iterator for a cached btree doesn't actually lock the key cache slot
when no key exists for a given slot in the cache. Thus the fact that
the allocation side updates the cached key means that multiple
uncached iters can stumble across the same alloc key and duplicate
the bucket allocation as described above.
This is something that probably needs a longer term fix in the
iterator code. As a short term fix, close the race through explicit
use of a cached iterator for likely allocation candidates. We don't
want to scan the btree with a cached iterator because that would
unnecessarily pollute the cache. This mitigates cache pollution by
primarily scanning the tree with an uncached iterator, but closes
the race by creating a key cache entry for any prospective slot
prior to the bucket allocation attempt (also similar to how
_alloc_freelist() works via try_alloc_bucket()). This survives many
iterations of generic/113 on a kernel hacked to always use the early
bucket allocator.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-11-01 15:02:44 -04:00
next :
2023-12-06 17:53:59 -05:00
set_btree_iter_dontneed ( & citer ) ;
bcachefs: serialize on cached key in early bucket allocator
bcachefs had a transient bug where freespace_initialized was not
properly being set, which lead to unexpected use of the early bucket
allocator at runtime. This issue has been fixed, but the existence
of it uncovered a coherency issue in the early bucket allocation
code that is somewhat related to how uncached iterators deal with
the key cache.
The problem itself manifests as occasional failure of generic/113
due to corruption, often seen as a duplicate backpointer or multiple
data types per-bucket error. The immediate cause of the error is a
racing bucket allocation along the lines of the following sequence:
- Task 1 selects key A in bch2_bucket_alloc_early() and schedules.
- Task 2 selects the same key A, but proceeds to complete the
allocation and associated I/O, after which it releases the
open_bucket.
- Task 1 resumes with key A, but does not recognize the bucket is
now allocated because the open_bucket has been removed
from the hash when it was released in the previous step.
This generally shouldn't happen because the allocating task updates
the alloc btree key before releasing the bucket. This is not
sufficient in this particular instance, however, because an uncached
iterator for a cached btree doesn't actually lock the key cache slot
when no key exists for a given slot in the cache. Thus the fact that
the allocation side updates the cached key means that multiple
uncached iters can stumble across the same alloc key and duplicate
the bucket allocation as described above.
This is something that probably needs a longer term fix in the
iterator code. As a short term fix, close the race through explicit
use of a cached iterator for likely allocation candidates. We don't
want to scan the btree with a cached iterator because that would
unnecessarily pollute the cache. This mitigates cache pollution by
primarily scanning the tree with an uncached iterator, but closes
the race by creating a key cache entry for any prospective slot
prior to the bucket allocation attempt (also similar to how
_alloc_freelist() works via try_alloc_bucket()). This survives many
iterations of generic/113 on a kernel hacked to always use the early
bucket allocator.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-11-01 15:02:44 -04:00
bch2_trans_iter_exit ( trans , & citer ) ;
2022-01-09 20:48:31 -05:00
if ( ob )
break ;
}
bch2_trans_iter_exit ( trans , & iter ) ;
2023-11-01 15:02:45 -04:00
alloc_cursor = iter . pos . offset ;
2022-01-09 20:48:31 -05:00
ca - > alloc_cursor = alloc_cursor ;
2023-01-23 20:28:59 -05:00
if ( ! ob & & ret )
ob = ERR_PTR ( ret ) ;
2023-11-01 15:02:45 -04:00
if ( ! ob & & alloc_start > first_bucket ) {
alloc_cursor = alloc_start = first_bucket ;
2022-01-09 20:48:31 -05:00
goto again ;
}
2023-01-23 20:28:59 -05:00
return ob ;
2022-01-09 20:48:31 -05:00
}
/*
 * bch2_bucket_alloc_freelist - look for an allocatable bucket by walking the
 * BTREE_ID_freespace keys of @ca.
 *
 * The scan starts at the larger of ca->mi.first_bucket and the device's saved
 * ca->alloc_cursor. Every bucket offset covered by a freespace key is offered
 * to try_alloc_bucket() until that returns something non-NULL. The position
 * reached is stored back into ca->alloc_cursor. If nothing was found and the
 * scan did not start at ca->mi.first_bucket, it is repeated once from there.
 *
 * Returns whatever non-NULL value try_alloc_bucket() produced, an ERR_PTR()
 * built from an iteration error, or NULL if no bucket was found.
 */
static struct open_bucket * bch2_bucket_alloc_freelist ( struct btree_trans * trans ,
struct bch_dev * ca ,
2023-06-24 19:30:10 -04:00
enum bch_watermark watermark ,
2022-11-04 16:06:55 -04:00
struct bucket_alloc_state * s ,
2022-01-09 20:48:31 -05:00
struct closure * cl )
{
struct btree_iter iter ;
struct bkey_s_c k ;
struct open_bucket * ob = NULL ;
u64 alloc_start = max_t ( u64 , ca - > mi . first_bucket , READ_ONCE ( ca - > alloc_cursor ) ) ;
u64 alloc_cursor = alloc_start ;
int ret ;
BUG_ON ( ca - > new_fs_bucket_idx ) ;
again :
for_each_btree_key_norestart ( trans , iter , BTREE_ID_freespace ,
POS ( ca - > dev_idx , alloc_cursor ) , 0 , k , ret ) {
/* Stop once the iterator has moved on to keys of a different device: */
if ( k . k - > p . inode ! = ca - > dev_idx )
break ;
/* Try every bucket offset in this key's range, never moving the cursor backwards: */
for ( alloc_cursor = max ( alloc_cursor , bkey_start_offset ( k . k ) ) ;
alloc_cursor < k . k - > p . offset ;
alloc_cursor + + ) {
2022-07-17 23:06:38 -04:00
/* A nonzero return here aborts the scan and is handed back as an ERR_PTR(): */
ret = btree_trans_too_many_iters ( trans ) ;
if ( ret ) {
ob = ERR_PTR ( ret ) ;
2022-01-09 20:48:31 -05:00
break ;
}
2022-11-04 16:06:55 -04:00
s - > buckets_seen + + ;
2022-01-09 20:48:31 -05:00
2023-06-24 19:30:10 -04:00
ob = try_alloc_bucket ( trans , ca , watermark ,
2022-11-04 16:06:55 -04:00
alloc_cursor , s , k , cl ) ;
2022-01-09 20:48:31 -05:00
if ( ob ) {
2023-12-06 17:53:59 -05:00
set_btree_iter_dontneed ( & iter ) ;
2022-01-09 20:48:31 -05:00
break ;
}
}
2022-07-17 23:06:38 -04:00
/* Leave the outer loop as well once we have a result or an error: */
if ( ob | | ret )
2022-01-09 20:48:31 -05:00
break ;
}
bch2_trans_iter_exit ( trans , & iter ) ;
/* Remember how far we got so the next allocation resumes from here: */
ca - > alloc_cursor = alloc_cursor ;
if ( ! ob & & ret )
ob = ERR_PTR ( ret ) ;
/*
 * Nothing found and we started past the first bucket: rescan from the
 * first bucket. alloc_start is reset too, so this happens at most once.
 */
if ( ! ob & & alloc_start > ca - > mi . first_bucket ) {
alloc_cursor = alloc_start = ca - > mi . first_bucket ;
goto again ;
}
return ob ;
}
/**
2023-09-12 18:41:22 -04:00
* bch2_bucket_alloc_trans - allocate a single bucket from a specific device
* @ trans : transaction object
* @ ca : device to allocate from
* @ watermark : how important is this allocation ?
* @ cl : if not NULL , closure to be used to wait if buckets not available
* @ usage : for secondarily also returning the current device usage
2022-01-09 20:48:31 -05:00
*
2023-09-12 18:41:22 -04:00
* Returns : an open_bucket on success , or an ERR_PTR ( ) on failure .
*
* As a side effect this kicks the discard, gc_gens and invalidate workers when
* the usage counters read here suggest they are needed. When no bucket is free
* the result is -BCH_ERR_freelist_empty; if @cl is non-NULL it is first put on
* c->freelist_wait and availability is re-checked once. A NULL result from the
* underlying allocators is turned into -BCH_ERR_no_buckets_found.
2022-10-19 18:31:33 -04:00
*/
2022-01-09 20:48:31 -05:00
static struct open_bucket * bch2_bucket_alloc_trans ( struct btree_trans * trans ,
struct bch_dev * ca ,
2023-06-24 19:30:10 -04:00
enum bch_watermark watermark ,
2022-10-21 14:01:19 -04:00
struct closure * cl ,
struct bch_dev_usage * usage )
2022-01-09 20:48:31 -05:00
{
struct bch_fs * c = trans - > c ;
struct open_bucket * ob = NULL ;
2022-12-11 19:14:30 -05:00
bool freespace = READ_ONCE ( ca - > mi . freespace_initialized ) ;
2022-04-01 01:29:59 -04:00
u64 avail ;
2022-11-04 16:06:55 -04:00
struct bucket_alloc_state s = { 0 } ;
2022-04-01 01:29:59 -04:00
bool waiting = false ;
2022-01-09 20:48:31 -05:00
again :
2022-10-21 14:01:19 -04:00
/* Usage is re-read on every pass, including the one after queueing on the wait list: */
bch2_dev_usage_read_fast ( ca , usage ) ;
2023-06-24 19:30:10 -04:00
avail = dev_buckets_free ( ca , * usage , watermark ) ;
2022-04-01 01:29:59 -04:00
2022-10-21 14:01:19 -04:00
/* Kick background work when more buckets are waiting on it than are currently free: */
if ( usage - > d [ BCH_DATA_need_discard ] . buckets > avail )
2022-04-01 01:29:59 -04:00
bch2_do_discards ( c ) ;
2022-10-21 14:01:19 -04:00
if ( usage - > d [ BCH_DATA_need_gc_gens ] . buckets > avail )
2022-04-01 01:29:59 -04:00
bch2_do_gc_gens ( c ) ;
2022-10-21 14:01:19 -04:00
if ( should_invalidate_buckets ( ca , * usage ) )
2022-04-01 01:29:59 -04:00
bch2_do_invalidates ( c ) ;
2022-01-09 20:48:31 -05:00
if ( ! avail ) {
2022-04-01 01:29:59 -04:00
/*
 * With a closure: add it to the wait list first, then check
 * availability one more time before reporting failure.
 */
if ( cl & & ! waiting ) {
2022-01-09 20:48:31 -05:00
closure_wait ( & c - > freelist_wait , cl ) ;
2022-04-01 01:29:59 -04:00
waiting = true ;
goto again ;
2022-01-09 20:48:31 -05:00
}
2023-12-11 10:15:18 -05:00
track_event_change ( & c - > times [ BCH_TIME_blocked_allocate ] ,
& c - > blocked_allocate , true ) ;
2022-01-09 20:48:31 -05:00
2022-07-17 22:31:21 -04:00
ob = ERR_PTR ( - BCH_ERR_freelist_empty ) ;
2022-01-09 20:48:31 -05:00
goto err ;
}
2022-04-01 01:29:59 -04:00
/* Buckets turned out to be available after we queued on freelist_wait: wake that list */
if ( waiting )
closure_wake_up ( & c - > freelist_wait ) ;
2022-12-11 19:14:30 -05:00
alloc :
/* Use the freespace btree when it is initialized for this device, else the early allocator: */
ob = likely ( freespace )
2023-06-24 19:30:10 -04:00
? bch2_bucket_alloc_freelist ( trans , ca , watermark , & s , cl )
: bch2_bucket_alloc_early ( trans , ca , watermark , & s , cl ) ;
2022-11-04 16:06:55 -04:00
/* Start an async journal flush when the skipped_need_journal_commit count is more than half of avail: */
if ( s . skipped_need_journal_commit * 2 > avail )
2022-01-09 20:48:31 -05:00
bch2_journal_flush_async ( & c - > journal , NULL ) ;
2022-12-11 19:14:30 -05:00
2023-07-07 02:42:28 -04:00
/*
 * The freespace path found nothing and recovery has not yet passed
 * check_alloc_info: retry once with the early allocator.
 */
if ( ! ob & & freespace & & c - > curr_recovery_pass < = BCH_RECOVERY_PASS_check_alloc_info ) {
2022-12-11 19:14:30 -05:00
freespace = false ;
goto alloc ;
}
2022-01-09 20:48:31 -05:00
err :
/* Callers only ever see an open_bucket or an ERR_PTR(), never NULL: */
if ( ! ob )
2022-07-17 22:31:21 -04:00
ob = ERR_PTR ( - BCH_ERR_no_buckets_found ) ;
2022-01-09 20:48:31 -05:00
2022-08-27 12:48:36 -04:00
if ( ! IS_ERR ( ob ) )
trace_and_count ( c , bucket_alloc , ca ,
2023-06-24 19:30:10 -04:00
bch2_watermarks [ watermark ] ,
2022-09-26 18:18:00 -04:00
ob - > bucket ,
2022-10-21 14:01:19 -04:00
usage - > d [ BCH_DATA_free ] . buckets ,
2022-08-27 12:48:36 -04:00
avail ,
bch2_copygc_wait_amount ( c ) ,
c - > copygc_wait - atomic64_read ( & c - > io_clock [ WRITE ] . now ) ,
2022-11-04 16:06:55 -04:00
& s ,
2022-08-27 12:48:36 -04:00
cl = = NULL ,
" " ) ;
2022-10-17 07:07:28 -04:00
/* Transaction restarts are not traced as allocation failures: */
else if ( ! bch2_err_matches ( PTR_ERR ( ob ) , BCH_ERR_transaction_restart ) )
2022-08-27 12:48:36 -04:00
trace_and_count ( c , bucket_alloc_fail , ca ,
2023-06-24 19:30:10 -04:00
bch2_watermarks [ watermark ] ,
2022-09-26 18:18:00 -04:00
0 ,
2022-10-21 14:01:19 -04:00
usage - > d [ BCH_DATA_free ] . buckets ,
2022-08-27 12:48:36 -04:00
avail ,
bch2_copygc_wait_amount ( c ) ,
c - > copygc_wait - atomic64_read ( & c - > io_clock [ WRITE ] . now ) ,
2022-11-04 16:06:55 -04:00
& s ,
2022-08-27 12:48:36 -04:00
cl = = NULL ,
bch2_err_str ( PTR_ERR ( ob ) ) ) ;
2022-01-09 20:48:31 -05:00
return ob ;
}
2018-10-06 00:46:55 -04:00
2022-01-09 20:48:31 -05:00
struct open_bucket * bch2_bucket_alloc ( struct bch_fs * c , struct bch_dev * ca ,
2023-06-24 19:30:10 -04:00
enum bch_watermark watermark ,
2022-01-09 20:48:31 -05:00
struct closure * cl )
{
2022-10-21 14:01:19 -04:00
struct bch_dev_usage usage ;
2022-01-09 20:48:31 -05:00
struct open_bucket * ob ;
2018-10-06 00:46:55 -04:00
2022-01-09 20:48:31 -05:00
bch2_trans_do ( c , NULL , NULL , 0 ,
2023-09-12 17:16:02 -04:00
PTR_ERR_OR_ZERO ( ob = bch2_bucket_alloc_trans ( trans , ca , watermark ,
2023-02-25 02:22:49 -05:00
cl , & usage ) ) ) ;
2018-10-06 04:12:42 -04:00
return ob ;
2018-10-06 00:46:55 -04:00
}
2018-11-01 15:13:19 -04:00
static int __dev_stripe_cmp ( struct dev_stripe_state * stripe ,
unsigned l , unsigned r )
2018-10-06 00:46:55 -04:00
{
2018-11-01 15:13:19 -04:00
return ( ( stripe - > next_alloc [ l ] > stripe - > next_alloc [ r ] ) -
( stripe - > next_alloc [ l ] < stripe - > next_alloc [ r ] ) ) ;
2018-10-06 00:46:55 -04:00
}
2018-11-01 15:13:19 -04:00
# define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
2018-10-06 00:46:55 -04:00
2018-11-01 15:13:19 -04:00
struct dev_alloc_list bch2_dev_alloc_list ( struct bch_fs * c ,
struct dev_stripe_state * stripe ,
struct bch_devs_mask * devs )
2018-10-06 00:46:55 -04:00
{
struct dev_alloc_list ret = { . nr = 0 } ;
unsigned i ;
2020-07-06 22:33:54 -04:00
for_each_set_bit ( i , devs - > d , BCH_SB_MEMBERS_MAX )
2018-10-06 00:46:55 -04:00
ret . devs [ ret . nr + + ] = i ;
2018-11-01 15:13:19 -04:00
bubble_sort ( ret . devs , ret . nr , dev_stripe_cmp ) ;
2018-10-06 00:46:55 -04:00
return ret ;
}
2022-10-21 14:01:19 -04:00
static inline void bch2_dev_stripe_increment_inlined ( struct bch_dev * ca ,
struct dev_stripe_state * stripe ,
struct bch_dev_usage * usage )
2018-10-06 00:46:55 -04:00
{
2018-11-01 15:13:19 -04:00
u64 * v = stripe - > next_alloc + ca - > dev_idx ;
2023-06-24 19:30:10 -04:00
u64 free_space = dev_buckets_available ( ca , BCH_WATERMARK_normal ) ;
2018-10-06 00:46:55 -04:00
u64 free_space_inv = free_space
? div64_u64 ( 1ULL < < 48 , free_space )
: 1ULL < < 48 ;
u64 scale = * v / 4 ;
if ( * v + free_space_inv > = * v )
* v + = free_space_inv ;
else
* v = U64_MAX ;
2018-11-01 15:13:19 -04:00
for ( v = stripe - > next_alloc ;
v < stripe - > next_alloc + ARRAY_SIZE ( stripe - > next_alloc ) ; v + + )
2018-10-06 00:46:55 -04:00
* v = * v < scale ? 0 : * v - scale ;
}
2022-10-21 14:01:19 -04:00
void bch2_dev_stripe_increment ( struct bch_dev * ca ,
struct dev_stripe_state * stripe )
{
struct bch_dev_usage usage ;
bch2_dev_usage_read_fast ( ca , & usage ) ;
bch2_dev_stripe_increment_inlined ( ca , stripe , & usage ) ;
}
2023-02-25 02:22:49 -05:00
/*
 * add_new_bucket - account for @ob and append it to @ptrs.
 *
 * Clears @ob's device from @devs_may_alloc, adds the device's durability to
 * *@nr_effective and sets *@have_cache when that durability is zero.
 *
 * Returns 1 when *@nr_effective has reached @nr_replicas or @ob belongs to an
 * erasure coded stripe (ob->ec set), 0 otherwise.
 */
static int add_new_bucket(struct bch_fs *c,
			  struct open_buckets *ptrs,
			  struct bch_devs_mask *devs_may_alloc,
			  unsigned nr_replicas,
			  unsigned *nr_effective,
			  bool *have_cache,
			  unsigned flags,
			  struct open_bucket *ob)
{
	unsigned dev_durability = bch_dev_bkey_exists(c, ob->dev)->mi.durability;

	BUG_ON(*nr_effective >= nr_replicas);

	__clear_bit(ob->dev, devs_may_alloc->d);

	*nr_effective += dev_durability;
	if (!dev_durability)
		*have_cache = true;

	ob_push(c, ptrs, ob);

	return *nr_effective >= nr_replicas || ob->ec != NULL;
}
2023-02-17 20:50:55 -05:00
/*
 * bch2_bucket_alloc_set_trans - allocate buckets from the devices in
 * @devs_may_alloc, one per device, until add_new_bucket() reports that enough
 * have been gathered.
 *
 * Devices are visited in the order produced by bch2_dev_alloc_list() for
 * @stripe. Each new bucket is tagged with @data_type before being handed to
 * add_new_bucket().
 *
 * Returns 0 once add_new_bucket() returns nonzero; otherwise the error of the
 * most recent failed allocation, or -BCH_ERR_insufficient_devices if no
 * allocation failed but the device list ran out.
 */
int bch2_bucket_alloc_set_trans ( struct btree_trans * trans ,
2020-07-11 18:52:14 -04:00
struct open_buckets * ptrs ,
struct dev_stripe_state * stripe ,
struct bch_devs_mask * devs_may_alloc ,
unsigned nr_replicas ,
unsigned * nr_effective ,
bool * have_cache ,
unsigned flags ,
2023-02-25 02:22:49 -05:00
enum bch_data_type data_type ,
2023-06-24 19:30:10 -04:00
enum bch_watermark watermark ,
2020-07-11 18:52:14 -04:00
struct closure * cl )
2018-10-06 00:46:55 -04:00
{
2022-01-09 20:48:31 -05:00
struct bch_fs * c = trans - > c ;
2018-10-06 04:12:42 -04:00
struct dev_alloc_list devs_sorted =
2018-11-01 15:13:19 -04:00
bch2_dev_alloc_list ( c , stripe , devs_may_alloc ) ;
2022-01-09 20:48:31 -05:00
unsigned dev ;
2018-10-06 00:46:55 -04:00
struct bch_dev * ca ;
2022-10-01 23:54:46 -04:00
int ret = - BCH_ERR_insufficient_devices ;
2019-06-10 11:31:07 -04:00
unsigned i ;
2018-10-06 00:46:55 -04:00
2018-10-06 04:12:42 -04:00
BUG_ON ( * nr_effective > = nr_replicas ) ;
2018-10-06 00:46:55 -04:00
for ( i = 0 ; i < devs_sorted . nr ; i + + ) {
2022-10-21 14:01:19 -04:00
struct bch_dev_usage usage ;
2018-10-06 04:12:42 -04:00
struct open_bucket * ob ;
2018-10-06 00:46:55 -04:00
2022-01-09 20:48:31 -05:00
dev = devs_sorted . devs [ i ] ;
/* Look the device up under RCU and take a ref on it for the duration of the allocation: */
rcu_read_lock ( ) ;
ca = rcu_dereference ( c - > devs [ dev ] ) ;
if ( ca )
percpu_ref_get ( & ca - > ref ) ;
rcu_read_unlock ( ) ;
2018-10-06 00:46:55 -04:00
if ( ! ca )
continue ;
2022-01-09 20:48:31 -05:00
/* Skip durability 0 devices once *have_cache is set: */
if ( ! ca - > mi . durability & & * have_cache ) {
percpu_ref_put ( & ca - > ref ) ;
2018-10-06 00:46:55 -04:00
continue ;
2022-01-09 20:48:31 -05:00
}
2018-10-06 00:46:55 -04:00
2023-06-24 19:30:10 -04:00
ob = bch2_bucket_alloc_trans ( trans , ca , watermark , cl , & usage ) ;
2022-01-09 20:48:31 -05:00
/* The stripe counters are only advanced for successful allocations: */
if ( ! IS_ERR ( ob ) )
2022-10-21 14:01:19 -04:00
bch2_dev_stripe_increment_inlined ( ca , stripe , & usage ) ;
2022-01-09 20:48:31 -05:00
percpu_ref_put ( & ca - > ref ) ;
2022-10-01 23:54:46 -04:00
if ( IS_ERR ( ob ) ) {
ret = PTR_ERR ( ob ) ;
2022-07-17 23:06:38 -04:00
/*
 * Stop on a transaction restart, or on any failure when a
 * closure was supplied; otherwise try the next device.
 */
if ( bch2_err_matches ( ret , BCH_ERR_transaction_restart ) | | cl )
2022-01-09 20:48:31 -05:00
break ;
2018-10-06 00:46:55 -04:00
continue ;
}
2023-02-25 02:22:49 -05:00
ob - > data_type = data_type ;
2018-10-06 00:46:55 -04:00
2023-02-25 02:22:49 -05:00
/* Nonzero from add_new_bucket() means we are done: */
if ( add_new_bucket ( c , ptrs , devs_may_alloc ,
nr_replicas , nr_effective ,
have_cache , flags , ob ) ) {
2022-10-01 23:54:46 -04:00
ret = 0 ;
2022-01-09 20:48:31 -05:00
break ;
2022-10-01 23:54:46 -04:00
}
2018-10-06 00:46:55 -04:00
}
2020-07-11 18:52:14 -04:00
return ret ;
2018-10-06 00:46:55 -04:00
}
2018-11-01 15:13:19 -04:00
/* Allocate from stripes: */
/*
 * If we can't allocate a new stripe because there are already too many
 * partially filled stripes, force allocating from an existing stripe even when
 * it's to a device we don't want.
 *
 * bucket_alloc_from_stripe - take one data block's bucket from an erasure
 * coding stripe head and add it to @ptrs.
 *
 * Returns 0 without doing anything when @nr_replicas < 2, when
 * ec_open_bucket(c, ptrs) is true, or when no stripe head is returned; returns
 * the error if bch2_ec_stripe_head_get() fails. Otherwise returns the result
 * of add_new_bucket(), or 0 if no suitable block was found in the stripe.
 */
2023-02-17 20:50:55 -05:00
static int bucket_alloc_from_stripe ( struct btree_trans * trans ,
2020-12-15 12:38:17 -05:00
struct open_buckets * ptrs ,
struct write_point * wp ,
struct bch_devs_mask * devs_may_alloc ,
u16 target ,
unsigned nr_replicas ,
unsigned * nr_effective ,
bool * have_cache ,
2023-06-24 19:30:10 -04:00
enum bch_watermark watermark ,
2020-12-15 12:38:17 -05:00
unsigned flags ,
struct closure * cl )
2018-11-01 15:13:19 -04:00
{
2023-02-17 20:50:55 -05:00
struct bch_fs * c = trans - > c ;
2018-11-01 15:13:19 -04:00
struct dev_alloc_list devs_sorted ;
struct ec_stripe_head * h ;
struct open_bucket * ob ;
unsigned i , ec_idx ;
2023-02-25 02:22:49 -05:00
int ret = 0 ;
2018-11-01 15:13:19 -04:00
if ( nr_replicas < 2 )
2020-12-15 12:38:17 -05:00
return 0 ;
2018-11-01 15:13:19 -04:00
/* NOTE(review): presumably true when @ptrs already holds an EC bucket - confirm */
if ( ec_open_bucket ( c , ptrs ) )
2020-12-15 12:38:17 -05:00
return 0 ;
2018-11-01 15:13:19 -04:00
2023-06-24 19:30:10 -04:00
h = bch2_ec_stripe_head_get ( trans , target , 0 , nr_replicas - 1 , watermark , cl ) ;
2020-12-15 12:38:17 -05:00
if ( IS_ERR ( h ) )
2023-02-17 20:50:55 -05:00
return PTR_ERR ( h ) ;
2018-11-01 15:13:19 -04:00
if ( ! h )
2020-12-15 12:38:17 -05:00
return 0 ;
2018-11-01 15:13:19 -04:00
devs_sorted = bch2_dev_alloc_list ( c , & wp - > stripe , devs_may_alloc ) ;
/*
 * In device preference order, look for a data block of the stripe whose
 * bucket is on that device and whose bit in blocks_allocated we are the
 * first to set:
 */
for ( i = 0 ; i < devs_sorted . nr ; i + + )
2021-01-18 23:26:42 -05:00
for ( ec_idx = 0 ; ec_idx < h - > s - > nr_data ; ec_idx + + ) {
if ( ! h - > s - > blocks [ ec_idx ] )
continue ;
ob = c - > open_buckets + h - > s - > blocks [ ec_idx ] ;
2021-12-25 21:21:46 -05:00
if ( ob - > dev = = devs_sorted . devs [ i ] & &
2021-01-18 23:26:42 -05:00
! test_and_set_bit ( ec_idx , h - > s - > blocks_allocated ) )
2018-11-01 15:13:19 -04:00
goto got_bucket ;
2021-01-18 23:26:42 -05:00
}
2018-11-01 15:13:19 -04:00
goto out_put_head ;
got_bucket :
2021-01-18 23:26:42 -05:00
/* Record which stripe and block this bucket belongs to, and take an io ref on the stripe: */
ob - > ec_idx = ec_idx ;
2018-11-01 15:13:19 -04:00
ob - > ec = h - > s ;
2023-03-09 10:18:09 -05:00
ec_stripe_new_get ( h - > s , STRIPE_REF_io ) ;
2018-11-01 15:13:19 -04:00
2023-02-25 02:22:49 -05:00
ret = add_new_bucket ( c , ptrs , devs_may_alloc ,
nr_replicas , nr_effective ,
have_cache , flags , ob ) ;
2018-11-01 15:13:19 -04:00
out_put_head :
2020-07-06 20:59:46 -04:00
bch2_ec_stripe_head_put ( c , h ) ;
2023-02-25 02:22:49 -05:00
return ret ;
2018-11-01 15:13:19 -04:00
}
2018-10-06 00:46:55 -04:00
/* Sector allocator */
2023-02-25 02:22:49 -05:00
/*
 * want_bucket - decide whether the already-open bucket @ob can be used for an
 * allocation on write point @wp.
 *
 * All of the following must hold:
 *  - @ob's device is still set in @devs_may_alloc
 *  - @ob's data type matches the write point's
 *  - if the device has zero durability: the write point is not for btree
 *    data, @ec is false and *@have_cache is not already set
 *  - @ob is part of a stripe (ob->ec set) exactly when @ec is true
 */
static bool want_bucket(struct bch_fs *c,
			struct write_point *wp,
			struct bch_devs_mask *devs_may_alloc,
			bool *have_cache, bool ec,
			struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);

	return test_bit(ob->dev, devs_may_alloc->d) &&
	       ob->data_type == wp->data_type &&
	       (ca->mi.durability ||
		!(wp->data_type == BCH_DATA_btree || ec || *have_cache)) &&
	       ec == (ob->ec != NULL);
}
/*
 * bucket_alloc_set_writepoint - move suitable buckets that @wp already holds
 * into @ptrs.
 *
 * Buckets accepted by want_bucket() are passed to add_new_bucket() until that
 * returns nonzero. Every other bucket, including all those seen after that
 * point, stays with the write point.
 *
 * Returns the last add_new_bucket() result, or 0 if none was taken.
 */
static int bucket_alloc_set_writepoint(struct bch_fs *c,
				       struct open_buckets *ptrs,
				       struct write_point *wp,
				       struct bch_devs_mask *devs_may_alloc,
				       unsigned nr_replicas,
				       unsigned *nr_effective,
				       bool *have_cache,
				       bool ec, unsigned flags)
{
	struct open_buckets remaining = { .nr = 0 };
	struct open_bucket *ob;
	unsigned idx;
	int done = 0;

	open_bucket_for_each(c, &wp->ptrs, ob, idx) {
		bool keep_on_wp = done ||
			!want_bucket(c, wp, devs_may_alloc,
				     have_cache, ec, ob);

		if (keep_on_wp)
			ob_push(c, &remaining, ob);
		else
			done = add_new_bucket(c, ptrs, devs_may_alloc,
					      nr_replicas, nr_effective,
					      have_cache, flags, ob);
	}

	wp->ptrs = remaining;

	return done;
}
2023-02-25 02:22:49 -05:00
/*
 * bucket_alloc_set_partial - take suitable buckets from the filesystem-wide
 * c->open_buckets_partial list and add them to @ptrs.
 *
 * A bucket is taken when want_bucket() accepts it and its device reports free
 * buckets at @watermark. Taken buckets are removed from the partial list.
 *
 * Returns the last add_new_bucket() result, or 0 if nothing was taken.
 */
static int bucket_alloc_set_partial ( struct bch_fs * c ,
struct open_buckets * ptrs ,
struct write_point * wp ,
struct bch_devs_mask * devs_may_alloc ,
unsigned nr_replicas ,
unsigned * nr_effective ,
bool * have_cache , bool ec ,
2023-06-24 19:30:10 -04:00
enum bch_watermark watermark ,
2023-02-25 02:22:49 -05:00
unsigned flags )
{
int i , ret = 0 ;
/* Unlocked early-out; the count is checked again under freelist_lock below */
if ( ! c - > open_buckets_partial_nr )
return 0 ;
spin_lock ( & c - > freelist_lock ) ;
if ( ! c - > open_buckets_partial_nr )
goto unlock ;
/*
 * Walk from the end of the list towards the start.
 * NOTE(review): presumably so that array_remove_item() at index i does not
 * disturb the entries still to be visited - confirm.
 */
for ( i = c - > open_buckets_partial_nr - 1 ; i > = 0 ; - - i ) {
struct open_bucket * ob = c - > open_buckets + c - > open_buckets_partial [ i ] ;
if ( want_bucket ( c , wp , devs_may_alloc , have_cache , ec , ob ) ) {
struct bch_dev * ca = bch_dev_bkey_exists ( c , ob - > dev ) ;
struct bch_dev_usage usage ;
u64 avail ;
bch2_dev_usage_read_fast ( ca , & usage ) ;
2023-06-24 19:30:10 -04:00
avail = dev_buckets_free ( ca , usage , watermark ) ;
2023-02-25 02:22:49 -05:00
/* Leave the bucket on the list if its device has nothing free at this watermark: */
if ( ! avail )
continue ;
array_remove_item ( c - > open_buckets_partial ,
c - > open_buckets_partial_nr ,
i ) ;
ob - > on_partial_list = false ;
ret = add_new_bucket ( c , ptrs , devs_may_alloc ,
nr_replicas , nr_effective ,
have_cache , flags , ob ) ;
if ( ret )
break ;
}
}
unlock :
spin_unlock ( & c - > freelist_lock ) ;
return ret ;
}
/*
 * __open_bucket_add_buckets - gather buckets for a write into @ptrs, trying
 * progressively more expensive sources:
 *
 *  1. buckets the write point already holds (bucket_alloc_set_writepoint())
 *  2. the shared partial list (bucket_alloc_set_partial())
 *  3. an erasure coding stripe if @erasure_code, otherwise fresh buckets via
 *     bch2_bucket_alloc_set_trans()
 *
 * Devices in @devs_have and devices already referenced from @ptrs are
 * excluded. A nonzero return from step 1 or 2 is returned immediately.
 *
 * @_cl is only passed down to the fresh-bucket allocator on a second attempt,
 * after a first attempt without a closure has failed.
 */
static int __open_bucket_add_buckets ( struct btree_trans * trans ,
2020-07-11 18:52:14 -04:00
struct open_buckets * ptrs ,
struct write_point * wp ,
struct bch_devs_list * devs_have ,
u16 target ,
2023-02-25 02:22:49 -05:00
bool erasure_code ,
2020-07-11 18:52:14 -04:00
unsigned nr_replicas ,
unsigned * nr_effective ,
bool * have_cache ,
2023-06-24 19:30:10 -04:00
enum bch_watermark watermark ,
2020-07-11 18:52:14 -04:00
unsigned flags ,
struct closure * _cl )
2018-10-06 00:46:55 -04:00
{
2022-01-09 20:48:31 -05:00
struct bch_fs * c = trans - > c ;
2018-10-06 04:12:42 -04:00
struct bch_devs_mask devs ;
2018-10-06 00:46:55 -04:00
struct open_bucket * ob ;
2018-11-01 15:13:19 -04:00
struct closure * cl = NULL ;
2019-06-10 11:31:07 -04:00
unsigned i ;
2023-02-25 02:22:49 -05:00
int ret ;
2018-10-06 00:46:55 -04:00
2021-12-25 21:21:46 -05:00
devs = target_rw_devs ( c , wp - > data_type , target ) ;
2018-10-06 04:12:42 -04:00
2018-10-06 00:46:55 -04:00
/* Don't allocate from devices we already have pointers to: */
2023-12-23 21:02:45 -05:00
/* NOTE(review): 'i' is dereferenced here, so darray_for_each() presumably declares its own iterator - confirm */
darray_for_each ( * devs_have , i )
__clear_bit ( * i , devs . d ) ;
2018-10-06 00:46:55 -04:00
2018-10-06 04:12:42 -04:00
open_bucket_for_each ( c , ptrs , ob , i )
2021-12-25 21:21:46 -05:00
__clear_bit ( ob - > dev , devs . d ) ;
2018-10-06 00:46:55 -04:00
2023-02-25 02:22:49 -05:00
if ( erasure_code & & ec_open_bucket ( c , ptrs ) )
return 0 ;
ret = bucket_alloc_set_writepoint ( c , ptrs , wp , & devs ,
nr_replicas , nr_effective ,
have_cache , erasure_code , flags ) ;
if ( ret )
return ret ;
ret = bucket_alloc_set_partial ( c , ptrs , wp , & devs ,
nr_replicas , nr_effective ,
2023-06-24 19:30:10 -04:00
have_cache , erasure_code , watermark , flags ) ;
2023-02-25 02:22:49 -05:00
if ( ret )
return ret ;
2018-11-01 15:13:19 -04:00
if ( erasure_code ) {
2023-02-25 02:22:49 -05:00
ret = bucket_alloc_from_stripe ( trans , ptrs , wp , & devs ,
target ,
nr_replicas , nr_effective ,
have_cache ,
2023-06-24 19:30:10 -04:00
watermark , flags , _cl ) ;
2023-02-25 02:22:49 -05:00
} else {
retry_blocking :
/*
 * Try nonblocking first, so that if one device is full we'll try from
 * other devices:
 */
ret = bch2_bucket_alloc_set_trans ( trans , ptrs , & wp - > stripe , & devs ,
nr_replicas , nr_effective , have_cache ,
2023-06-24 19:30:10 -04:00
flags , wp - > data_type , watermark , cl ) ;
2023-02-25 02:22:49 -05:00
/*
 * The attempt without a closure failed with something other than a
 * transaction restart or insufficient devices, and the caller gave us
 * a closure: retry once with it.
 */
if ( ret & &
! bch2_err_matches ( ret , BCH_ERR_transaction_restart ) & &
! bch2_err_matches ( ret , BCH_ERR_insufficient_devices ) & &
! cl & & _cl ) {
cl = _cl ;
goto retry_blocking ;
2020-07-23 11:31:01 -04:00
}
2018-11-01 15:13:19 -04:00
}
2023-02-25 02:22:49 -05:00
return ret ;
}
2018-11-01 15:13:19 -04:00
2023-02-25 02:22:49 -05:00
/*
 * open_bucket_add_buckets - gather buckets for a write, preferring erasure
 * coded buckets when @erasure_code is set.
 *
 * With @erasure_code, an erasure coded pass runs first. Its result is
 * returned as is when it matches transaction_restart, operation_blocked,
 * freelist_empty or open_buckets_empty; 0 is returned when it already
 * satisfied @nr_replicas. In every other case a non-erasure-coded pass
 * follows, whose negative results are returned and whose positive results
 * are flattened to 0.
 */
static int open_bucket_add_buckets(struct btree_trans *trans,
				   struct open_buckets *ptrs,
				   struct write_point *wp,
				   struct bch_devs_list *devs_have,
				   u16 target,
				   unsigned erasure_code,
				   unsigned nr_replicas,
				   unsigned *nr_effective,
				   bool *have_cache,
				   enum bch_watermark watermark,
				   unsigned flags,
				   struct closure *cl)
{
	int err;

	if (erasure_code) {
		bool give_up;

		err = __open_bucket_add_buckets(trans, ptrs, wp,
						devs_have, target, erasure_code,
						nr_replicas, nr_effective, have_cache,
						watermark, flags, cl);

		give_up = bch2_err_matches(err, BCH_ERR_transaction_restart) ||
			  bch2_err_matches(err, BCH_ERR_operation_blocked) ||
			  bch2_err_matches(err, BCH_ERR_freelist_empty) ||
			  bch2_err_matches(err, BCH_ERR_open_buckets_empty);
		if (give_up)
			return err;

		if (*nr_effective >= nr_replicas)
			return 0;
	}

	err = __open_bucket_add_buckets(trans, ptrs, wp,
					devs_have, target, false,
					nr_replicas, nr_effective, have_cache,
					watermark, flags, cl);
	if (err < 0)
		return err;

	return 0;
}
2023-08-12 16:46:54 -04:00
/**
* should_drop_bucket - check if this is open_bucket should go away
2023-09-12 18:41:22 -04:00
* @ ob : open_bucket to predicate on
* @ c : filesystem handle
2023-08-12 16:46:54 -04:00
* @ ca : if set , we ' re killing buckets for a particular device
* @ ec : if true , we ' re shutting down erasure coding and killing all ec
* open_buckets
* otherwise , return true
2023-09-12 18:41:22 -04:00
* Returns : true if we should kill this open_bucket
2023-08-12 16:46:54 -04:00
*
* We ' re killing open_buckets because we ' re shutting down a device , erasure
* coding , or the entire filesystem - check if this open_bucket matches :
*/
2023-03-13 22:01:47 -04:00
static bool should_drop_bucket ( struct open_bucket * ob , struct bch_fs * c ,
struct bch_dev * ca , bool ec )
2018-10-06 00:46:55 -04:00
{
2023-03-13 22:01:47 -04:00
if ( ec ) {
return ob - > ec ! = NULL ;
} else if ( ca ) {
bool drop = ob - > dev = = ca - > dev_idx ;
struct open_bucket * ob2 ;
unsigned i ;
2018-11-01 15:13:19 -04:00
if ( ! drop & & ob - > ec ) {
2023-08-01 20:06:45 -04:00
unsigned nr_blocks ;
2018-11-01 15:13:19 -04:00
mutex_lock ( & ob - > ec - > lock ) ;
2023-08-01 20:06:45 -04:00
nr_blocks = bkey_i_to_stripe ( & ob - > ec - > new_stripe . key ) - > v . nr_blocks ;
for ( i = 0 ; i < nr_blocks ; i + + ) {
2023-03-13 22:01:47 -04:00
if ( ! ob - > ec - > blocks [ i ] )
2021-01-18 23:26:42 -05:00
continue ;
2023-03-13 22:01:47 -04:00
ob2 = c - > open_buckets + ob - > ec - > blocks [ i ] ;
2021-12-25 21:21:46 -05:00
drop | = ob2 - > dev = = ca - > dev_idx ;
2021-01-18 23:26:42 -05:00
}
2018-11-01 15:13:19 -04:00
mutex_unlock ( & ob - > ec - > lock ) ;
}
2023-03-13 22:01:47 -04:00
return drop ;
} else {
return true ;
2018-11-01 15:13:19 -04:00
}
}
2023-03-13 22:01:47 -04:00
/*
 * bch2_writepoint_stop - under wp->lock, release every bucket of @wp for
 * which should_drop_bucket() is true and keep the rest on the write point.
 */
static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
				 bool ec, struct write_point *wp)
{
	struct open_buckets kept = { .nr = 0 };
	struct open_bucket *ob;
	unsigned idx;

	mutex_lock(&wp->lock);

	open_bucket_for_each(c, &wp->ptrs, ob, idx) {
		if (!should_drop_bucket(ob, c, ca, ec))
			ob_push(c, &kept, ob);
		else
			bch2_open_bucket_put(c, ob);
	}

	wp->ptrs = kept;

	mutex_unlock(&wp->lock);
}
2023-03-13 22:01:47 -04:00
/*
 * bch2_open_buckets_stop - release open buckets selected by
 * should_drop_bucket() for (@ca, @ec) from every place this file tracks them:
 * all write points, the btree reserve cache (emptied unconditionally) and the
 * partial list. Finishes by calling bch2_ec_stop_dev().
 */
void bch2_open_buckets_stop ( struct bch_fs * c , struct bch_dev * ca ,
bool ec )
{
unsigned i ;
/* Close write points holding buckets that should be dropped: */
for ( i = 0 ; i < ARRAY_SIZE ( c - > write_points ) ; i + + )
bch2_writepoint_stop ( c , ca , ec , & c - > write_points [ i ] ) ;
bch2_writepoint_stop ( c , ca , ec , & c - > copygc_write_point ) ;
bch2_writepoint_stop ( c , ca , ec , & c - > rebalance_write_point ) ;
bch2_writepoint_stop ( c , ca , ec , & c - > btree_write_point ) ;
/* Drain the btree reserve cache entirely, regardless of @ca / @ec: */
mutex_lock ( & c - > btree_reserve_cache_lock ) ;
while ( c - > btree_reserve_cache_nr ) {
struct btree_alloc * a =
& c - > btree_reserve_cache [ - - c - > btree_reserve_cache_nr ] ;
bch2_open_buckets_put ( c , & a - > ob ) ;
}
mutex_unlock ( & c - > btree_reserve_cache_lock ) ;
spin_lock ( & c - > freelist_lock ) ;
i = 0 ;
while ( i < c - > open_buckets_partial_nr ) {
struct open_bucket * ob =
c - > open_buckets + c - > open_buckets_partial [ i ] ;
if ( should_drop_bucket ( ob , c , ca , ec ) ) {
/*
 * Remove entry i by swapping it with the last entry and
 * shrinking the list; i is not advanced, so the entry swapped
 * in is examined next.
 */
- - c - > open_buckets_partial_nr ;
swap ( c - > open_buckets_partial [ i ] ,
c - > open_buckets_partial [ c - > open_buckets_partial_nr ] ) ;
ob - > on_partial_list = false ;
/* freelist_lock is released around bch2_open_bucket_put() and retaken afterwards: */
spin_unlock ( & c - > freelist_lock ) ;
bch2_open_bucket_put ( c , ob ) ;
spin_lock ( & c - > freelist_lock ) ;
} else {
i + + ;
}
}
spin_unlock ( & c - > freelist_lock ) ;
bch2_ec_stop_dev ( c , ca ) ;
}
2018-11-04 21:55:35 -05:00
static inline struct hlist_head * writepoint_hash ( struct bch_fs * c ,
unsigned long write_point )
{
unsigned hash =
hash_long ( write_point , ilog2 ( ARRAY_SIZE ( c - > write_points_hash ) ) ) ;
return & c - > write_points_hash [ hash ] ;
}
2018-10-06 00:46:55 -04:00
static struct write_point * __writepoint_find ( struct hlist_head * head ,
unsigned long write_point )
{
struct write_point * wp ;
2021-04-15 18:31:58 -04:00
rcu_read_lock ( ) ;
2018-10-06 00:46:55 -04:00
hlist_for_each_entry_rcu ( wp , head , node )
if ( wp - > write_point = = write_point )
2021-04-15 18:31:58 -04:00
goto out ;
wp = NULL ;
out :
rcu_read_unlock ( ) ;
return wp ;
2018-10-06 00:46:55 -04:00
}
2018-11-04 21:55:35 -05:00
static inline bool too_many_writepoints ( struct bch_fs * c , unsigned factor )
{
u64 stranded = c - > write_points_nr * c - > bucket_size_max ;
2019-02-14 18:38:52 -05:00
u64 free = bch2_fs_usage_read_short ( c ) . free ;
2018-11-04 21:55:35 -05:00
return stranded * factor > free ;
}
static bool try_increase_writepoints ( struct bch_fs * c )
{
struct write_point * wp ;
if ( c - > write_points_nr = = ARRAY_SIZE ( c - > write_points ) | |
too_many_writepoints ( c , 32 ) )
return false ;
wp = c - > write_points + c - > write_points_nr + + ;
hlist_add_head_rcu ( & wp - > node , writepoint_hash ( c , wp - > write_point ) ) ;
return true ;
}
2023-03-16 18:05:00 -04:00
static bool try_decrease_writepoints ( struct btree_trans * trans , unsigned old_nr )
2018-11-04 21:55:35 -05:00
{
2023-03-16 18:05:00 -04:00
struct bch_fs * c = trans - > c ;
2018-11-04 21:55:35 -05:00
struct write_point * wp ;
2023-03-16 18:05:00 -04:00
struct open_bucket * ob ;
unsigned i ;
2018-11-04 21:55:35 -05:00
mutex_lock ( & c - > write_points_hash_lock ) ;
if ( c - > write_points_nr < old_nr ) {
mutex_unlock ( & c - > write_points_hash_lock ) ;
return true ;
}
if ( c - > write_points_nr = = 1 | |
! too_many_writepoints ( c , 8 ) ) {
mutex_unlock ( & c - > write_points_hash_lock ) ;
return false ;
}
wp = c - > write_points + - - c - > write_points_nr ;
hlist_del_rcu ( & wp - > node ) ;
mutex_unlock ( & c - > write_points_hash_lock ) ;
2023-03-16 18:05:00 -04:00
bch2_trans_mutex_lock_norelock ( trans , & wp - > lock ) ;
open_bucket_for_each ( c , & wp - > ptrs , ob , i )
open_bucket_free_unused ( c , ob ) ;
2023-07-09 13:49:34 -04:00
wp - > ptrs . nr = 0 ;
2023-03-16 18:05:00 -04:00
mutex_unlock ( & wp - > lock ) ;
2018-11-04 21:55:35 -05:00
return true ;
}
2022-01-09 20:48:31 -05:00
/*
 * writepoint_find - resolve a write point specifier to a locked write_point.
 *
 * If bit 0 of @write_point is clear the value is used directly as a pointer to
 * a write_point. Otherwise it is looked up in the write point hash table; if
 * no entry carries that identifier, either the set of write points is grown or
 * the least recently used one is reassigned to @write_point.
 *
 * Returns the write point with wp->lock held. On the hashed path wp->last_used
 * is updated as well.
 */
static struct write_point * writepoint_find ( struct btree_trans * trans ,
2018-10-06 00:46:55 -04:00
unsigned long write_point )
{
2022-01-09 20:48:31 -05:00
struct bch_fs * c = trans - > c ;
2018-10-06 00:46:55 -04:00
struct write_point * wp , * oldest ;
struct hlist_head * head ;
/* Bit 0 clear: the specifier is itself a pointer to the write point */
if ( ! ( write_point & 1UL ) ) {
wp = ( struct write_point * ) write_point ;
2023-02-17 22:43:47 -05:00
bch2_trans_mutex_lock_norelock ( trans , & wp - > lock ) ;
2018-10-06 00:46:55 -04:00
return wp ;
}
head = writepoint_hash ( c , write_point ) ;
restart_find :
wp = __writepoint_find ( head , write_point ) ;
if ( wp ) {
lock_wp :
2023-02-17 22:43:47 -05:00
bch2_trans_mutex_lock_norelock ( trans , & wp - > lock ) ;
2018-10-06 00:46:55 -04:00
/* Re-check the identifier now that we hold the lock; if it changed, search again */
if ( wp - > write_point = = write_point )
goto out ;
mutex_unlock ( & wp - > lock ) ;
goto restart_find ;
}
2018-11-04 21:55:35 -05:00
restart_find_oldest :
2018-10-06 00:46:55 -04:00
/* No entry with this identifier: pick the in-use write point with the earliest last_used */
oldest = NULL ;
for ( wp = c - > write_points ;
2018-11-04 21:55:35 -05:00
wp < c - > write_points + c - > write_points_nr ; wp + + )
2018-10-06 00:46:55 -04:00
if ( ! oldest | | time_before64 ( wp - > last_used , oldest - > last_used ) )
oldest = wp ;
2023-02-17 22:43:47 -05:00
bch2_trans_mutex_lock_norelock ( trans , & oldest - > lock ) ;
bch2_trans_mutex_lock_norelock ( trans , & c - > write_points_hash_lock ) ;
2018-11-04 21:55:35 -05:00
/*
 * Start over if our candidate is no longer within the in-use part of the
 * array, or if we managed to add another write point:
 */
if ( oldest > = c - > write_points + c - > write_points_nr | |
try_increase_writepoints ( c ) ) {
mutex_unlock ( & c - > write_points_hash_lock ) ;
mutex_unlock ( & oldest - > lock ) ;
goto restart_find_oldest ;
}
2018-10-06 00:46:55 -04:00
/* Look up again under the hash lock; if a different write point now has this identifier, use that one */
wp = __writepoint_find ( head , write_point ) ;
if ( wp & & wp ! = oldest ) {
mutex_unlock ( & c - > write_points_hash_lock ) ;
mutex_unlock ( & oldest - > lock ) ;
goto lock_wp ;
}
/* Reassign the oldest write point to this identifier and move it to the matching hash chain */
wp = oldest ;
hlist_del_rcu ( & wp - > node ) ;
wp - > write_point = write_point ;
hlist_add_head_rcu ( & wp - > node , head ) ;
mutex_unlock ( & c - > write_points_hash_lock ) ;
out :
2022-10-17 07:07:28 -04:00
wp - > last_used = local_clock ( ) ;
2018-10-06 00:46:55 -04:00
return wp ;
}
2023-11-20 18:23:26 -05:00
static noinline void
deallocate_extra_replicas ( struct bch_fs * c ,
struct open_buckets * ptrs ,
struct open_buckets * ptrs_no_use ,
unsigned extra_replicas )
{
struct open_buckets ptrs2 = { 0 } ;
struct open_bucket * ob ;
unsigned i ;
open_bucket_for_each ( c , ptrs , ob , i ) {
unsigned d = bch_dev_bkey_exists ( c , ob - > dev ) - > mi . durability ;
if ( d & & d < = extra_replicas ) {
extra_replicas - = d ;
ob_push ( c , ptrs_no_use , ob ) ;
} else {
ob_push ( c , & ptrs2 , ob ) ;
}
}
* ptrs = ptrs2 ;
}
2018-10-06 00:46:55 -04:00
/*
* Get us an open_bucket we can allocate from , return with it locked :
*/
2022-01-09 20:48:31 -05:00
int bch2_alloc_sectors_start_trans ( struct btree_trans * trans ,
2022-10-31 16:13:05 -04:00
unsigned target ,
unsigned erasure_code ,
struct write_point_specifier write_point ,
struct bch_devs_list * devs_have ,
unsigned nr_replicas ,
unsigned nr_replicas_required ,
2023-06-24 19:30:10 -04:00
enum bch_watermark watermark ,
2022-10-31 16:13:05 -04:00
unsigned flags ,
struct closure * cl ,
struct write_point * * wp_ret )
2018-10-06 00:46:55 -04:00
{
2022-01-09 20:48:31 -05:00
struct bch_fs * c = trans - > c ;
2018-10-06 00:46:55 -04:00
struct write_point * wp ;
struct open_bucket * ob ;
2019-01-18 18:58:51 -05:00
struct open_buckets ptrs ;
unsigned nr_effective , write_points_nr ;
bool have_cache ;
2021-11-28 13:42:05 -05:00
int ret ;
2020-07-11 18:52:14 -04:00
int i ;
2018-10-06 00:46:55 -04:00
2023-11-23 17:56:14 -05:00
if ( ! IS_ENABLED ( CONFIG_BCACHEFS_ERASURE_CODING ) )
erasure_code = false ;
2023-02-25 02:22:49 -05:00
BUG_ON ( flags & BCH_WRITE_ONLY_SPECIFIED_DEVS ) ;
2019-06-10 11:31:07 -04:00
2018-10-06 00:46:55 -04:00
BUG_ON ( ! nr_replicas | | ! nr_replicas_required ) ;
2018-11-04 21:55:35 -05:00
retry :
2019-01-18 18:58:51 -05:00
ptrs . nr = 0 ;
nr_effective = 0 ;
2018-11-04 21:55:35 -05:00
write_points_nr = c - > write_points_nr ;
2019-01-18 18:58:51 -05:00
have_cache = false ;
2018-11-01 15:13:19 -04:00
2022-01-09 20:48:31 -05:00
* wp_ret = wp = writepoint_find ( trans , write_point . v ) ;
2018-10-06 00:46:55 -04:00
2018-11-01 15:13:19 -04:00
/* metadata may not allocate on cache devices: */
2021-12-25 21:21:46 -05:00
if ( wp - > data_type ! = BCH_DATA_user )
2018-11-01 15:13:19 -04:00
have_cache = true ;
2023-02-25 02:22:49 -05:00
if ( target & & ! ( flags & BCH_WRITE_ONLY_SPECIFIED_DEVS ) ) {
2022-01-09 20:48:31 -05:00
ret = open_bucket_add_buckets ( trans , & ptrs , wp , devs_have ,
2018-11-01 15:13:19 -04:00
target , erasure_code ,
2018-10-06 04:12:42 -04:00
nr_replicas , & nr_effective ,
2023-06-24 19:30:10 -04:00
& have_cache , watermark ,
2023-02-25 02:22:49 -05:00
flags , NULL ) ;
2022-07-17 23:06:38 -04:00
if ( ! ret | |
bch2_err_matches ( ret , BCH_ERR_transaction_restart ) )
2018-10-06 00:46:55 -04:00
goto alloc_done ;
2023-02-25 02:22:49 -05:00
/* Don't retry from all devices if we're out of open buckets: */
2023-12-19 17:16:34 -05:00
if ( bch2_err_matches ( ret , BCH_ERR_open_buckets_empty ) ) {
int ret = open_bucket_add_buckets ( trans , & ptrs , wp , devs_have ,
target , erasure_code ,
nr_replicas , & nr_effective ,
& have_cache , watermark ,
flags , cl ) ;
if ( ! ret | |
bch2_err_matches ( ret , BCH_ERR_transaction_restart ) | |
bch2_err_matches ( ret , BCH_ERR_open_buckets_empty ) )
goto alloc_done ;
}
2023-02-25 02:22:49 -05:00
/*
* Only try to allocate cache ( durability = 0 devices ) from the
* specified target :
*/
have_cache = true ;
2022-01-09 20:48:31 -05:00
ret = open_bucket_add_buckets ( trans , & ptrs , wp , devs_have ,
2018-11-01 15:13:19 -04:00
0 , erasure_code ,
2018-10-06 04:12:42 -04:00
nr_replicas , & nr_effective ,
2023-06-24 19:30:10 -04:00
& have_cache , watermark ,
2023-02-25 02:22:49 -05:00
flags , cl ) ;
} else {
ret = open_bucket_add_buckets ( trans , & ptrs , wp , devs_have ,
target , erasure_code ,
nr_replicas , & nr_effective ,
2023-06-24 19:30:10 -04:00
& have_cache , watermark ,
2023-02-25 02:22:49 -05:00
flags , cl ) ;
2018-10-06 00:46:55 -04:00
}
alloc_done :
2018-10-06 04:12:42 -04:00
BUG_ON ( ! ret & & nr_effective < nr_replicas ) ;
2018-10-06 00:46:55 -04:00
2018-11-01 15:13:19 -04:00
if ( erasure_code & & ! ec_open_bucket ( c , & ptrs ) )
pr_debug ( " failed to get ec bucket: ret %u " , ret ) ;
2022-07-17 22:31:21 -04:00
if ( ret = = - BCH_ERR_insufficient_devices & &
2018-10-06 04:12:42 -04:00
nr_effective > = nr_replicas_required )
2018-10-06 00:46:55 -04:00
ret = 0 ;
if ( ret )
goto err ;
2023-11-20 18:23:26 -05:00
if ( nr_effective > nr_replicas )
deallocate_extra_replicas ( c , & ptrs , & wp - > ptrs , nr_effective - nr_replicas ) ;
2018-10-06 04:12:42 -04:00
/* Free buckets we didn't use: */
open_bucket_for_each ( c , & wp - > ptrs , ob , i )
2023-03-16 18:05:00 -04:00
open_bucket_free_unused ( c , ob ) ;
2018-10-06 00:46:55 -04:00
2018-10-06 04:12:42 -04:00
wp - > ptrs = ptrs ;
2018-10-06 00:46:55 -04:00
wp - > sectors_free = UINT_MAX ;
2018-10-06 04:12:42 -04:00
open_bucket_for_each ( c , & wp - > ptrs , ob , i )
2018-10-06 00:46:55 -04:00
wp - > sectors_free = min ( wp - > sectors_free , ob - > sectors_free ) ;
BUG_ON ( ! wp - > sectors_free | | wp - > sectors_free = = UINT_MAX ) ;
2022-10-31 16:13:05 -04:00
return 0 ;
2018-10-06 00:46:55 -04:00
err :
2018-10-06 04:12:42 -04:00
open_bucket_for_each ( c , & wp - > ptrs , ob , i )
if ( ptrs . nr < ARRAY_SIZE ( ptrs . v ) )
ob_push ( c , & ptrs , ob ) ;
else
2023-03-16 18:05:00 -04:00
open_bucket_free_unused ( c , ob ) ;
2018-10-06 04:12:42 -04:00
wp - > ptrs = ptrs ;
2018-10-06 00:46:55 -04:00
mutex_unlock ( & wp - > lock ) ;
2018-11-04 21:55:35 -05:00
2022-07-17 22:31:21 -04:00
if ( bch2_err_matches ( ret , BCH_ERR_freelist_empty ) & &
2023-03-16 18:05:00 -04:00
try_decrease_writepoints ( trans , write_points_nr ) )
2018-11-04 21:55:35 -05:00
goto retry ;
2022-07-17 22:31:21 -04:00
if ( bch2_err_matches ( ret , BCH_ERR_open_buckets_empty ) | |
bch2_err_matches ( ret , BCH_ERR_freelist_empty ) )
2022-09-18 17:10:33 -04:00
return cl
2022-12-13 15:17:40 -05:00
? - BCH_ERR_bucket_alloc_blocked
2022-09-18 17:10:33 -04:00
: - BCH_ERR_ENOSPC_bucket_alloc ;
2022-07-17 22:31:21 -04:00
return ret ;
2018-10-06 00:46:55 -04:00
}
2021-12-25 21:21:46 -05:00
struct bch_extent_ptr bch2_ob_ptr ( struct bch_fs * c , struct open_bucket * ob )
{
struct bch_dev * ca = bch_dev_bkey_exists ( c , ob - > dev ) ;
return ( struct bch_extent_ptr ) {
. type = 1 < < BCH_EXTENT_ENTRY_ptr ,
. gen = ob - > gen ,
. dev = ob - > dev ,
. offset = bucket_to_sector ( ca , ob - > bucket ) +
ca - > mi . bucket_size -
ob - > sectors_free ,
} ;
}
2018-10-06 00:46:55 -04:00
/*
 * Out-of-line entry point for bch2_alloc_sectors_append_ptrs_inlined(): every
 * argument is forwarded unchanged and nothing else is done here.
 */
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
				    struct bkey_i *k, unsigned sectors,
				    bool cached)
{
	bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
}
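/*
 * NOTE: the description below refers to @k and @sectors, which are parameters
 * of bch2_alloc_sectors_append_ptrs(); bch2_alloc_sectors_done() takes neither
 * and simply calls bch2_alloc_sectors_done_inlined().
 *
 * Append pointers to the space we just allocated to @k, and mark @sectors space
 * as allocated out of @ob
 */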
void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
{
	/* Out-of-line entry point: all the work is in the inlined variant. */
	bch2_alloc_sectors_done_inlined(c, wp);
}
2018-11-04 21:55:35 -05:00
2020-07-21 17:12:39 -04:00
static inline void writepoint_init ( struct write_point * wp ,
enum bch_data_type type )
{
mutex_init ( & wp - > lock ) ;
2021-12-25 21:21:46 -05:00
wp - > data_type = type ;
2022-10-31 16:13:05 -04:00
INIT_WORK ( & wp - > index_update_work , bch2_write_point_do_index_updates ) ;
INIT_LIST_HEAD ( & wp - > writes ) ;
spin_lock_init ( & wp - > writes_lock ) ;
2020-07-21 17:12:39 -04:00
}
2018-11-04 21:55:35 -05:00
void bch2_fs_allocator_foreground_init ( struct bch_fs * c )
{
struct open_bucket * ob ;
struct write_point * wp ;
mutex_init ( & c - > write_points_hash_lock ) ;
c - > write_points_nr = ARRAY_SIZE ( c - > write_points ) ;
/* open bucket 0 is a sentinal NULL: */
spin_lock_init ( & c - > open_buckets [ 0 ] . lock ) ;
for ( ob = c - > open_buckets + 1 ;
ob < c - > open_buckets + ARRAY_SIZE ( c - > open_buckets ) ; ob + + ) {
spin_lock_init ( & ob - > lock ) ;
c - > open_buckets_nr_free + + ;
ob - > freelist = c - > open_buckets_freelist ;
c - > open_buckets_freelist = ob - c - > open_buckets ;
}
2020-07-11 16:28:54 -04:00
writepoint_init ( & c - > btree_write_point , BCH_DATA_btree ) ;
writepoint_init ( & c - > rebalance_write_point , BCH_DATA_user ) ;
writepoint_init ( & c - > copygc_write_point , BCH_DATA_user ) ;
2018-11-04 21:55:35 -05:00
for ( wp = c - > write_points ;
wp < c - > write_points + c - > write_points_nr ; wp + + ) {
2020-07-09 18:28:11 -04:00
writepoint_init ( wp , BCH_DATA_user ) ;
2018-11-04 21:55:35 -05:00
2022-10-17 07:07:28 -04:00
wp - > last_used = local_clock ( ) ;
2018-11-04 21:55:35 -05:00
wp - > write_point = ( unsigned long ) wp ;
hlist_add_head_rcu ( & wp - > node ,
writepoint_hash ( c , wp - > write_point ) ) ;
}
}
2021-12-25 21:21:46 -05:00
2023-02-28 23:08:48 -05:00
static void bch2_open_bucket_to_text ( struct printbuf * out , struct bch_fs * c , struct open_bucket * ob )
{
struct bch_dev * ca = bch_dev_bkey_exists ( c , ob - > dev ) ;
unsigned data_type = ob - > data_type ;
barrier ( ) ; /* READ_ONCE() doesn't work on bitfields */
prt_printf ( out , " %zu ref %u %s %u:%llu gen %u allocated %u/%u " ,
ob - c - > open_buckets ,
atomic_read ( & ob - > pin ) ,
data_type < BCH_DATA_NR ? bch2_data_types [ data_type ] : " invalid data type " ,
ob - > dev , ob - > bucket , ob - > gen ,
ca - > mi . bucket_size - ob - > sectors_free , ca - > mi . bucket_size ) ;
if ( ob - > ec )
prt_printf ( out , " ec idx %llu " , ob - > ec - > idx ) ;
if ( ob - > on_partial_list )
prt_str ( out , " partial " ) ;
prt_newline ( out ) ;
}
2021-12-25 21:21:46 -05:00
void bch2_open_buckets_to_text ( struct printbuf * out , struct bch_fs * c )
{
struct open_bucket * ob ;
2023-02-28 23:08:48 -05:00
out - > atomic + + ;
2021-12-25 21:21:46 -05:00
for ( ob = c - > open_buckets ;
ob < c - > open_buckets + ARRAY_SIZE ( c - > open_buckets ) ;
ob + + ) {
spin_lock ( & ob - > lock ) ;
2023-02-28 23:08:48 -05:00
if ( ob - > valid & & ! ob - > on_partial_list )
bch2_open_bucket_to_text ( out , c , ob ) ;
2021-12-25 21:21:46 -05:00
spin_unlock ( & ob - > lock ) ;
}
2023-02-28 23:08:48 -05:00
- - out - > atomic ;
}
void bch2_open_buckets_partial_to_text ( struct printbuf * out , struct bch_fs * c )
{
unsigned i ;
out - > atomic + + ;
spin_lock ( & c - > freelist_lock ) ;
for ( i = 0 ; i < c - > open_buckets_partial_nr ; i + + )
bch2_open_bucket_to_text ( out , c ,
c - > open_buckets + c - > open_buckets_partial [ i ] ) ;
spin_unlock ( & c - > freelist_lock ) ;
- - out - > atomic ;
2021-12-25 21:21:46 -05:00
}
2022-10-31 16:13:05 -04:00
/*
 * Names of the write point states, generated by stringifying each entry of
 * WRITE_POINT_STATES(); the table is NULL terminated.
 */
static const char * const bch2_write_point_states[] = {
#define x(state)	#state,
	WRITE_POINT_STATES()
#undef x
	NULL
};
2023-08-12 12:13:19 -04:00
static void bch2_write_point_to_text ( struct printbuf * out , struct bch_fs * c ,
struct write_point * wp )
{
struct open_bucket * ob ;
unsigned i ;
prt_printf ( out , " %lu: " , wp - > write_point ) ;
prt_human_readable_u64 ( out , wp - > sectors_allocated ) ;
prt_printf ( out , " last wrote: " ) ;
bch2_pr_time_units ( out , sched_clock ( ) - wp - > last_used ) ;
for ( i = 0 ; i < WRITE_POINT_STATE_NR ; i + + ) {
prt_printf ( out , " %s: " , bch2_write_point_states [ i ] ) ;
bch2_pr_time_units ( out , wp - > time [ i ] ) ;
}
prt_newline ( out ) ;
printbuf_indent_add ( out , 2 ) ;
open_bucket_for_each ( c , & wp - > ptrs , ob , i )
bch2_open_bucket_to_text ( out , c , ob ) ;
printbuf_indent_sub ( out , 2 ) ;
}
2022-10-31 16:13:05 -04:00
void bch2_write_points_to_text ( struct printbuf * out , struct bch_fs * c )
{
struct write_point * wp ;
2023-08-12 12:13:19 -04:00
prt_str ( out , " Foreground write points \n " ) ;
2022-10-31 16:13:05 -04:00
for ( wp = c - > write_points ;
wp < c - > write_points + ARRAY_SIZE ( c - > write_points ) ;
2023-08-12 12:13:19 -04:00
wp + + )
bch2_write_point_to_text ( out , c , wp ) ;
2022-10-31 16:13:05 -04:00
2023-08-12 12:13:19 -04:00
prt_str ( out , " Copygc write point \n " ) ;
bch2_write_point_to_text ( out , c , & c - > copygc_write_point ) ;
2022-10-31 16:13:05 -04:00
2023-08-12 12:13:19 -04:00
prt_str ( out , " Rebalance write point \n " ) ;
bch2_write_point_to_text ( out , c , & c - > rebalance_write_point ) ;
2022-10-31 16:13:05 -04:00
2023-08-12 12:13:19 -04:00
prt_str ( out , " Btree write point \n " ) ;
bch2_write_point_to_text ( out , c , & c - > btree_write_point ) ;
2022-10-31 16:13:05 -04:00
}