2018-10-06 00:46:55 -04:00
// SPDX-License-Identifier: GPL-2.0
2017-03-16 22:18:50 -08:00
# include "bcachefs.h"
2018-10-06 00:46:55 -04:00
# include "alloc_background.h"
# include "alloc_foreground.h"
2017-03-16 22:18:50 -08:00
# include "btree_cache.h"
# include "btree_io.h"
2019-10-05 12:54:53 -04:00
# include "btree_key_cache.h"
2017-03-16 22:18:50 -08:00
# include "btree_update.h"
# include "btree_update_interior.h"
# include "btree_gc.h"
# include "buckets.h"
# include "clock.h"
# include "debug.h"
2018-11-01 15:13:19 -04:00
# include "ec.h"
2017-03-16 22:18:50 -08:00
# include "error.h"
2019-04-11 22:39:39 -04:00
# include "recovery.h"
2017-03-16 22:18:50 -08:00
# include "trace.h"
2021-01-22 18:01:07 -05:00
# include "varint.h"
2017-03-16 22:18:50 -08:00
# include <linux/kthread.h>
# include <linux/math64.h>
# include <linux/random.h>
# include <linux/rculist.h>
# include <linux/rcupdate.h>
# include <linux/sched/task.h>
# include <linux/sort.h>
2021-04-18 17:54:56 -04:00
/*
 * Printable names of the allocator thread states: one stringified entry per
 * ALLOC_THREAD_STATES() item, followed by a NULL terminator.
 */
const char * const bch2_allocator_states [ ] = {
# define x(n) #n,
ALLOC_THREAD_STATES ( )
# undef x
NULL
} ;
2021-01-22 18:01:07 -05:00
/*
 * Width in bytes of each v1 alloc field, indexed by BCH_ALLOC_FIELD_V1_<name>;
 * computed from the bit widths listed in BCH_ALLOC_FIELDS_V1().
 */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES [ ] = {
# define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
BCH_ALLOC_FIELDS_V1 ( )
2018-07-21 23:36:11 -04:00
# undef x
} ;
2017-03-16 22:18:50 -08:00
/* Persistent alloc info: */
2021-01-22 18:01:07 -05:00
static inline u64 alloc_field_v1_get ( const struct bch_alloc * a ,
const void * * p , unsigned field )
2018-07-21 23:36:11 -04:00
{
2021-01-22 18:01:07 -05:00
unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES [ field ] ;
2018-07-21 23:36:11 -04:00
u64 v ;
if ( ! ( a - > fields & ( 1 < < field ) ) )
return 0 ;
switch ( bytes ) {
case 1 :
v = * ( ( const u8 * ) * p ) ;
break ;
case 2 :
v = le16_to_cpup ( * p ) ;
break ;
case 4 :
v = le32_to_cpup ( * p ) ;
break ;
case 8 :
v = le64_to_cpup ( * p ) ;
break ;
default :
BUG ( ) ;
}
* p + = bytes ;
return v ;
}
2021-01-22 18:01:07 -05:00
/*
 * Encode one optional field into a v1 alloc key.
 *
 * @a:     key being built; the field's bit is set in a->v.fields when written
 * @p:     output cursor, advanced past the field when something is written
 * @field: field index, which is also the field's bit number
 * @v:     value to store; zero means "absent" and writes nothing
 */
static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
				      unsigned field, u64 v)
{
	const unsigned width = BCH_ALLOC_V1_FIELD_BYTES[field];
	void *dst;

	if (v == 0)
		return;

	dst = *p;
	a->v.fields |= 1 << field;

	switch (width) {
	case 1:
		*(u8 *) dst = v;
		break;
	case 2:
		*(__le16 *) dst = cpu_to_le16(v);
		break;
	case 4:
		*(__le32 *) dst = cpu_to_le32(v);
		break;
	case 8:
		*(__le64 *) dst = cpu_to_le64(v);
		break;
	default:
		/* the width table holds a size this encoder cannot write */
		BUG();
	}

	*p = dst + width;
}
2021-01-22 18:01:07 -05:00
static void bch2_alloc_unpack_v1 ( struct bkey_alloc_unpacked * out ,
struct bkey_s_c k )
2019-02-13 14:46:32 -05:00
{
2021-01-22 18:01:07 -05:00
const struct bch_alloc * in = bkey_s_c_to_alloc ( k ) . v ;
const void * d = in - > data ;
unsigned idx = 0 ;
2019-04-17 18:14:46 -04:00
2021-01-22 18:01:07 -05:00
out - > gen = in - > gen ;
# define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
BCH_ALLOC_FIELDS_V1 ( )
# undef x
}
static void bch2_alloc_pack_v1 ( struct bkey_alloc_buf * dst ,
const struct bkey_alloc_unpacked src )
{
struct bkey_i_alloc * a = bkey_alloc_init ( & dst - > k ) ;
void * d = a - > v . data ;
unsigned bytes , idx = 0 ;
2019-04-17 18:14:46 -04:00
2021-01-22 18:01:07 -05:00
a - > k . p = POS ( src . dev , src . bucket ) ;
a - > v . fields = 0 ;
a - > v . gen = src . gen ;
2019-02-13 14:46:32 -05:00
2021-01-22 18:01:07 -05:00
# define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name);
BCH_ALLOC_FIELDS_V1 ( )
2019-02-13 14:46:32 -05:00
# undef x
2021-01-22 18:01:07 -05:00
bytes = ( void * ) d - ( void * ) & a - > v ;
set_bkey_val_bytes ( & a - > k , bytes ) ;
memset_u64s_tail ( & a - > v , 0 , bytes ) ;
2019-02-13 14:46:32 -05:00
}
2021-01-22 18:01:07 -05:00
static int bch2_alloc_unpack_v2 ( struct bkey_alloc_unpacked * out ,
struct bkey_s_c k )
2019-02-13 14:46:32 -05:00
{
2021-01-22 18:01:07 -05:00
struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2 ( k ) ;
const u8 * in = a . v - > data ;
const u8 * end = bkey_val_end ( a ) ;
unsigned fieldnr = 0 ;
int ret ;
u64 v ;
out - > gen = a . v - > gen ;
out - > oldest_gen = a . v - > oldest_gen ;
out - > data_type = a . v - > data_type ;
# define x(_name, _bits) \
if ( fieldnr < a . v - > nr_fields ) { \
ret = bch2_varint_decode ( in , end , & v ) ; \
if ( ret < 0 ) \
return ret ; \
in + = ret ; \
} else { \
v = 0 ; \
} \
out - > _name = v ; \
if ( v ! = out - > _name ) \
return - 1 ; \
fieldnr + + ;
BCH_ALLOC_FIELDS_V2 ( )
# undef x
return 0 ;
}
static void bch2_alloc_pack_v2 ( struct bkey_alloc_buf * dst ,
const struct bkey_alloc_unpacked src )
{
struct bkey_i_alloc_v2 * a = bkey_alloc_v2_init ( & dst - > k ) ;
unsigned nr_fields = 0 , last_nonzero_fieldnr = 0 ;
u8 * out = a - > v . data ;
u8 * end = ( void * ) & dst [ 1 ] ;
u8 * last_nonzero_field = out ;
2019-10-11 17:20:30 -07:00
unsigned bytes ;
2019-02-13 14:46:32 -05:00
2021-01-22 18:01:07 -05:00
a - > k . p = POS ( src . dev , src . bucket ) ;
a - > v . gen = src . gen ;
a - > v . oldest_gen = src . oldest_gen ;
a - > v . data_type = src . data_type ;
# define x(_name, _bits) \
nr_fields + + ; \
\
if ( src . _name ) { \
out + = bch2_varint_encode ( out , src . _name ) ; \
\
last_nonzero_field = out ; \
last_nonzero_fieldnr = nr_fields ; \
} else { \
* out + + = 0 ; \
}
2019-02-13 14:46:32 -05:00
2021-01-22 18:01:07 -05:00
BCH_ALLOC_FIELDS_V2 ( )
2019-02-13 14:46:32 -05:00
# undef x
2021-01-22 18:01:07 -05:00
BUG_ON ( out > end ) ;
out = last_nonzero_field ;
a - > v . nr_fields = last_nonzero_fieldnr ;
bytes = ( u8 * ) out - ( u8 * ) & a - > v ;
set_bkey_val_bytes ( & a - > k , bytes ) ;
memset_u64s_tail ( & a - > v , 0 , bytes ) ;
}
struct bkey_alloc_unpacked bch2_alloc_unpack ( struct bkey_s_c k )
{
struct bkey_alloc_unpacked ret = {
. dev = k . k - > p . inode ,
. bucket = k . k - > p . offset ,
. gen = 0 ,
} ;
2019-02-13 14:46:32 -05:00
2021-01-22 18:01:07 -05:00
if ( k . k - > type = = KEY_TYPE_alloc_v2 )
bch2_alloc_unpack_v2 ( & ret , k ) ;
else if ( k . k - > type = = KEY_TYPE_alloc )
bch2_alloc_unpack_v1 ( & ret , k ) ;
return ret ;
}
void bch2_alloc_pack ( struct bch_fs * c ,
struct bkey_alloc_buf * dst ,
const struct bkey_alloc_unpacked src )
{
if ( c - > sb . features & ( 1ULL < < BCH_FEATURE_alloc_v2 ) )
bch2_alloc_pack_v2 ( dst , src ) ;
else
bch2_alloc_pack_v1 ( dst , src ) ;
2019-02-13 14:46:32 -05:00
}
2017-03-16 22:18:50 -08:00
static unsigned bch_alloc_val_u64s ( const struct bch_alloc * a )
{
2018-07-21 23:36:11 -04:00
unsigned i , bytes = offsetof ( struct bch_alloc , data ) ;
2017-03-16 22:18:50 -08:00
2021-01-22 18:01:07 -05:00
for ( i = 0 ; i < ARRAY_SIZE ( BCH_ALLOC_V1_FIELD_BYTES ) ; i + + )
2018-07-21 23:36:11 -04:00
if ( a - > fields & ( 1 < < i ) )
2021-01-22 18:01:07 -05:00
bytes + = BCH_ALLOC_V1_FIELD_BYTES [ i ] ;
2017-03-16 22:18:50 -08:00
return DIV_ROUND_UP ( bytes , sizeof ( u64 ) ) ;
}
2021-01-22 18:01:07 -05:00
const char * bch2_alloc_v1_invalid ( const struct bch_fs * c , struct bkey_s_c k )
2017-03-16 22:18:50 -08:00
{
2018-11-01 15:10:01 -04:00
struct bkey_s_c_alloc a = bkey_s_c_to_alloc ( k ) ;
2017-03-16 22:18:50 -08:00
if ( k . k - > p . inode > = c - > sb . nr_devices | |
! c - > devs [ k . k - > p . inode ] )
return " invalid device " ;
2018-11-01 15:10:01 -04:00
/* allow for unknown fields */
if ( bkey_val_u64s ( a . k ) < bch_alloc_val_u64s ( a . v ) )
return " incorrect value size " ;
2017-03-16 22:18:50 -08:00
return NULL ;
}
2021-01-22 18:01:07 -05:00
const char * bch2_alloc_v2_invalid ( const struct bch_fs * c , struct bkey_s_c k )
2017-03-16 22:18:50 -08:00
{
2021-01-22 18:01:07 -05:00
struct bkey_alloc_unpacked u ;
if ( k . k - > p . inode > = c - > sb . nr_devices | |
! c - > devs [ k . k - > p . inode ] )
return " invalid device " ;
2018-11-09 01:24:07 -05:00
2021-01-22 18:01:07 -05:00
if ( bch2_alloc_unpack_v2 ( & u , k ) )
return " unpack error " ;
2018-07-21 23:36:11 -04:00
2021-01-22 18:01:07 -05:00
return NULL ;
}
void bch2_alloc_to_text ( struct printbuf * out , struct bch_fs * c ,
struct bkey_s_c k )
{
struct bkey_alloc_unpacked u = bch2_alloc_unpack ( k ) ;
pr_buf ( out , " gen %u oldest_gen %u data_type %u " ,
u . gen , u . oldest_gen , u . data_type ) ;
# define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name);
BCH_ALLOC_FIELDS_V2 ( )
# undef x
2017-03-16 22:18:50 -08:00
}
2020-05-24 14:06:10 -04:00
static int bch2_alloc_read_fn ( struct bch_fs * c , enum btree_id id ,
unsigned level , struct bkey_s_c k )
2017-03-16 22:18:50 -08:00
{
2020-10-16 21:32:02 -04:00
struct bch_dev * ca ;
struct bucket * g ;
struct bkey_alloc_unpacked u ;
2021-01-22 18:01:07 -05:00
if ( level | |
( k . k - > type ! = KEY_TYPE_alloc & &
k . k - > type ! = KEY_TYPE_alloc_v2 ) )
2020-10-16 21:32:02 -04:00
return 0 ;
ca = bch_dev_bkey_exists ( c , k . k - > p . inode ) ;
2021-01-21 20:51:51 -05:00
g = bucket ( ca , k . k - > p . offset ) ;
2020-10-16 21:32:02 -04:00
u = bch2_alloc_unpack ( k ) ;
g - > _mark . gen = u . gen ;
g - > _mark . data_type = u . data_type ;
g - > _mark . dirty_sectors = u . dirty_sectors ;
g - > _mark . cached_sectors = u . cached_sectors ;
g - > io_time [ READ ] = u . read_time ;
g - > io_time [ WRITE ] = u . write_time ;
g - > oldest_gen = u . oldest_gen ;
g - > gen_valid = 1 ;
2017-03-16 22:18:50 -08:00
2020-05-24 14:06:10 -04:00
return 0 ;
}
int bch2_alloc_read ( struct bch_fs * c , struct journal_keys * journal_keys )
{
2021-01-21 15:28:59 -05:00
int ret ;
2019-12-27 20:51:35 -05:00
2020-10-16 21:32:02 -04:00
down_read ( & c - > gc_lock ) ;
2021-02-20 19:27:37 -05:00
ret = bch2_btree_and_journal_walk ( c , journal_keys , BTREE_ID_alloc ,
2020-05-24 14:06:10 -04:00
NULL , bch2_alloc_read_fn ) ;
2020-10-16 21:32:02 -04:00
up_read ( & c - > gc_lock ) ;
2019-04-17 15:49:28 -04:00
if ( ret ) {
bch_err ( c , " error reading alloc info: %i " , ret ) ;
2017-03-16 22:18:50 -08:00
return ret ;
2019-04-17 15:49:28 -04:00
}
2017-03-16 22:18:50 -08:00
return 0 ;
}
2019-08-27 17:45:42 -04:00
static int bch2_alloc_write_key ( struct btree_trans * trans ,
struct btree_iter * iter ,
unsigned flags )
2019-03-13 20:49:16 -04:00
{
2019-08-27 17:45:42 -04:00
struct bch_fs * c = trans - > c ;
struct bkey_s_c k ;
2019-03-13 20:49:16 -04:00
struct bch_dev * ca ;
2019-08-27 17:45:42 -04:00
struct bucket * g ;
struct bucket_mark m ;
struct bkey_alloc_unpacked old_u , new_u ;
2021-01-22 18:01:07 -05:00
struct bkey_alloc_buf a ;
2019-03-13 20:49:16 -04:00
int ret ;
2019-08-27 17:45:42 -04:00
retry :
2019-10-05 12:54:53 -04:00
bch2_trans_begin ( trans ) ;
ret = bch2_btree_key_cache_flush ( trans ,
2021-02-20 19:27:37 -05:00
BTREE_ID_alloc , iter - > pos ) ;
2019-10-05 12:54:53 -04:00
if ( ret )
goto err ;
2019-08-27 17:45:42 -04:00
k = bch2_btree_iter_peek_slot ( iter ) ;
ret = bkey_err ( k ) ;
if ( ret )
goto err ;
2019-03-13 20:49:16 -04:00
2019-08-27 17:45:42 -04:00
old_u = bch2_alloc_unpack ( k ) ;
2019-03-13 20:49:16 -04:00
2019-08-27 17:45:42 -04:00
percpu_down_read ( & c - > mark_lock ) ;
ca = bch_dev_bkey_exists ( c , iter - > pos . inode ) ;
2021-01-21 20:51:51 -05:00
g = bucket ( ca , iter - > pos . offset ) ;
2019-08-27 17:45:42 -04:00
m = READ_ONCE ( g - > mark ) ;
2021-01-22 18:01:07 -05:00
new_u = alloc_mem_to_key ( iter , g , m ) ;
2019-08-27 17:45:42 -04:00
percpu_up_read ( & c - > mark_lock ) ;
2019-03-13 20:49:16 -04:00
2019-08-27 17:45:42 -04:00
if ( ! bkey_alloc_unpacked_cmp ( old_u , new_u ) )
2020-10-16 21:36:26 -04:00
return 0 ;
2019-03-13 20:49:16 -04:00
2021-01-22 18:01:07 -05:00
bch2_alloc_pack ( c , & a , new_u ) ;
bch2_trans_update ( trans , iter , & a . k ,
2019-12-31 16:17:42 -05:00
BTREE_TRIGGER_NORUN ) ;
2019-08-27 17:45:42 -04:00
ret = bch2_trans_commit ( trans , NULL , NULL ,
2020-12-21 17:17:18 -05:00
BTREE_INSERT_NOFAIL | flags ) ;
2019-03-13 20:49:16 -04:00
err :
2019-08-27 17:45:42 -04:00
if ( ret = = - EINTR )
goto retry ;
2019-03-13 20:49:16 -04:00
return ret ;
}
2021-01-22 17:56:34 -05:00
int bch2_alloc_write ( struct bch_fs * c , unsigned flags )
2017-03-16 22:18:50 -08:00
{
2019-04-16 14:42:05 -04:00
struct btree_trans trans ;
struct btree_iter * iter ;
2021-01-22 17:56:34 -05:00
struct bch_dev * ca ;
unsigned i ;
2017-03-16 22:18:50 -08:00
int ret = 0 ;
2019-10-05 12:54:53 -04:00
bch2_trans_init ( & trans , c , BTREE_ITER_MAX , 0 ) ;
2019-03-13 20:49:16 -04:00
2021-02-20 19:27:37 -05:00
iter = bch2_trans_get_iter ( & trans , BTREE_ID_alloc , POS_MIN ,
2019-04-16 14:42:05 -04:00
BTREE_ITER_SLOTS | BTREE_ITER_INTENT ) ;
2017-03-16 22:18:50 -08:00
2021-01-08 21:20:58 -05:00
for_each_member_device ( ca , c , i ) {
2021-01-22 17:56:34 -05:00
bch2_btree_iter_set_pos ( iter ,
POS ( ca - > dev_idx , ca - > mi . first_bucket ) ) ;
while ( iter - > pos . offset < ca - > mi . nbuckets ) {
bch2_trans_cond_resched ( & trans ) ;
ret = bch2_alloc_write_key ( & trans , iter , flags ) ;
if ( ret ) {
percpu_ref_put ( & ca - > io_ref ) ;
goto err ;
}
bch2_btree_iter_next_slot ( iter ) ;
2017-03-16 22:18:50 -08:00
}
}
2021-01-22 17:56:34 -05:00
err :
bch2_trans_exit ( & trans ) ;
2020-10-16 21:36:26 -04:00
return ret ;
2019-08-27 17:45:42 -04:00
}
2017-03-16 22:18:50 -08:00
/* Bucket IO clocks: */
2020-10-16 21:39:16 -04:00
/*
 * Set a bucket's last-read or last-write time to the current IO clock.
 *
 * @trans:     transaction used for the update and commit
 * @dev:       device index of the bucket
 * @bucket_nr: bucket number on that device
 * @rw:        READ or WRITE; selects both the timestamp and the IO clock
 *
 * The key is built from the in-memory bucket (sampled under c->mark_lock).
 * Nothing is written if the timestamp already equals the clock value.
 *
 * Returns 0 on success or if no update was needed, otherwise the error from
 * iterator traversal, allocation, update or commit.
 */
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
			      size_t bucket_nr, int rw)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
	struct bkey_alloc_unpacked u;
	struct bkey_alloc_buf *packed;
	struct btree_iter *iter;
	struct bucket *g;
	u64 *stamp, clock_now;
	int ret = 0;

	iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr),
				   BTREE_ITER_CACHED|
				   BTREE_ITER_CACHED_NOFILL|
				   BTREE_ITER_INTENT);

	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		goto out;

	packed = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
	ret = PTR_ERR_OR_ZERO(packed);
	if (ret)
		goto out;

	percpu_down_read(&c->mark_lock);
	g = bucket(ca, bucket_nr);
	u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
	percpu_up_read(&c->mark_lock);

	if (rw == READ)
		stamp = &u.read_time;
	else
		stamp = &u.write_time;

	clock_now = atomic64_read(&c->io_clock[rw].now);
	if (*stamp == clock_now)
		goto out;

	*stamp = clock_now;

	bch2_alloc_pack(c, packed, u);

	ret = bch2_trans_update(trans, iter, &packed->k, 0);
	if (!ret)
		ret = bch2_trans_commit(trans, NULL, NULL, 0);
out:
	bch2_trans_iter_put(trans, iter);
	return ret;
}
2017-03-16 22:18:50 -08:00
/* Background allocator thread: */
/*
 * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
 * (marking them as invalidated on disk), then optionally issues discard
 * commands to the newly free buckets, then puts them on the various freelists.
 */
/**
* wait_buckets_available - wait on reclaimable buckets
*
* If there aren ' t enough available buckets to fill up free_inc , wait until
* there are .
*/
static int wait_buckets_available ( struct bch_fs * c , struct bch_dev * ca )
{
unsigned long gc_count = c - > gc_count ;
2021-01-21 20:51:51 -05:00
s64 available ;
2017-03-16 22:18:50 -08:00
int ret = 0 ;
2021-04-18 17:54:56 -04:00
ca - > allocator_state = ALLOCATOR_blocked ;
2019-04-16 15:13:16 -04:00
closure_wake_up ( & c - > freelist_wait ) ;
2017-03-16 22:18:50 -08:00
while ( 1 ) {
set_current_state ( TASK_INTERRUPTIBLE ) ;
if ( kthread_should_stop ( ) ) {
ret = 1 ;
break ;
}
if ( gc_count ! = c - > gc_count )
ca - > inc_gen_really_needs_gc = 0 ;
2021-04-13 09:49:23 -04:00
available = dev_buckets_reclaimable ( ca ) ;
2021-01-21 20:51:51 -05:00
available - = ca - > inc_gen_really_needs_gc ;
available = max ( available , 0LL ) ;
2020-06-15 17:38:26 -04:00
2021-04-13 09:49:23 -04:00
if ( available )
2017-03-16 22:18:50 -08:00
break ;
up_read ( & c - > gc_lock ) ;
schedule ( ) ;
try_to_freeze ( ) ;
down_read ( & c - > gc_lock ) ;
}
__set_current_state ( TASK_RUNNING ) ;
2021-04-18 17:54:56 -04:00
ca - > allocator_state = ALLOCATOR_running ;
2019-04-16 15:13:16 -04:00
closure_wake_up ( & c - > freelist_wait ) ;
2017-03-16 22:18:50 -08:00
return ret ;
}
2021-01-21 15:28:59 -05:00
/*
 * Decide whether bucket @b on @ca, with mark @m, may be invalidated.
 *
 * A bucket is rejected if it is not available, is already owned by the
 * allocator, or is flagged in ca->buckets_nouse. Otherwise the answer depends
 * on its gc gen: it is usable only while that is below BUCKET_GC_GEN_MAX.
 *
 * Side effects: increments ca->inc_gen_needs_gc when the gc gen has reached
 * half of the maximum, and ca->inc_gen_really_needs_gc when it has reached
 * the maximum.
 */
static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
				       struct bucket_mark m)
{
	u8 gen_age;

	if (!is_available_bucket(m) || m.owned_by_allocator)
		return false;

	if (ca->buckets_nouse && test_bit(b, ca->buckets_nouse))
		return false;

	gen_age = bucket_gc_gen(bucket(ca, b));

	if (gen_age >= BUCKET_GC_GEN_MAX / 2)
		ca->inc_gen_needs_gc++;

	if (gen_age < BUCKET_GC_GEN_MAX)
		return true;

	ca->inc_gen_really_needs_gc++;
	return false;
}
/*
* Determines what order we ' re going to reuse buckets , smallest bucket_key ( )
* first .
*/
2021-01-21 15:28:59 -05:00
static unsigned bucket_sort_key ( struct bucket * g , struct bucket_mark m ,
u64 now , u64 last_seq_ondisk )
2017-03-16 22:18:50 -08:00
{
2021-01-21 15:28:59 -05:00
unsigned used = bucket_sectors_used ( m ) ;
2017-03-16 22:18:50 -08:00
2021-01-21 15:28:59 -05:00
if ( used ) {
/*
* Prefer to keep buckets that have been read more recently , and
* buckets that have more data in them :
*/
u64 last_read = max_t ( s64 , 0 , now - g - > io_time [ READ ] ) ;
u32 last_read_scaled = max_t ( u64 , U32_MAX , div_u64 ( last_read , used ) ) ;
2017-03-16 22:18:50 -08:00
2021-01-21 15:28:59 -05:00
return - last_read_scaled ;
} else {
/*
* Prefer to use buckets with smaller gc_gen so that we don ' t
* have to walk the btree and recalculate oldest_gen - but shift
* off the low bits so that buckets will still have equal sort
* keys when there ' s only a small difference , so that we can
* keep sequential buckets together :
*/
return ( bucket_needs_journal_commit ( m , last_seq_ondisk ) < < 4 ) |
( bucket_gc_gen ( g ) > > 4 ) ;
}
2017-03-16 22:18:50 -08:00
}
/*
 * Heap ordering for alloc_heap entries: by ascending sort key, then by
 * descending run length (nr), then by ascending starting bucket.
 */
static inline int bucket_alloc_cmp(alloc_heap *h,
				   struct alloc_heap_entry l,
				   struct alloc_heap_entry r)
{
	int d;

	d = cmp_int(l.key, r.key);
	if (d)
		return d;

	/* operands swapped on purpose: longer runs sort first */
	d = cmp_int(r.nr, l.nr);
	if (d)
		return d;

	return cmp_int(l.bucket, r.bucket);
}
2018-07-22 10:43:01 -04:00
static inline int bucket_idx_cmp ( const void * _l , const void * _r )
{
const struct alloc_heap_entry * l = _l , * r = _r ;
2019-04-12 04:54:12 -04:00
return cmp_int ( l - > bucket , r - > bucket ) ;
2018-07-22 10:43:01 -04:00
}
2017-03-16 22:18:50 -08:00
static void find_reclaimable_buckets_lru ( struct bch_fs * c , struct bch_dev * ca )
{
struct bucket_array * buckets ;
struct alloc_heap_entry e = { 0 } ;
2021-01-21 15:28:59 -05:00
u64 now , last_seq_ondisk ;
2018-07-22 10:43:01 -04:00
size_t b , i , nr = 0 ;
2017-03-16 22:18:50 -08:00
down_read ( & ca - > bucket_lock ) ;
buckets = bucket_array ( ca ) ;
2021-01-21 15:28:59 -05:00
ca - > alloc_heap . used = 0 ;
now = atomic64_read ( & c - > io_clock [ READ ] . now ) ;
last_seq_ondisk = c - > journal . last_seq_ondisk ;
2017-03-16 22:18:50 -08:00
/*
* Find buckets with lowest read priority , by building a maxheap sorted
* by read priority and repeatedly replacing the maximum element until
* all buckets have been visited .
*/
for ( b = ca - > mi . first_bucket ; b < ca - > mi . nbuckets ; b + + ) {
2021-01-21 15:28:59 -05:00
struct bucket * g = & buckets - > b [ b ] ;
struct bucket_mark m = READ_ONCE ( g - > mark ) ;
unsigned key = bucket_sort_key ( g , m , now , last_seq_ondisk ) ;
2017-03-16 22:18:50 -08:00
if ( ! bch2_can_invalidate_bucket ( ca , b , m ) )
continue ;
if ( e . nr & & e . bucket + e . nr = = b & & e . key = = key ) {
e . nr + + ;
} else {
if ( e . nr )
2018-10-21 16:32:51 -04:00
heap_add_or_replace ( & ca - > alloc_heap , e ,
- bucket_alloc_cmp , NULL ) ;
2017-03-16 22:18:50 -08:00
e = ( struct alloc_heap_entry ) {
. bucket = b ,
. nr = 1 ,
. key = key ,
} ;
}
cond_resched ( ) ;
}
if ( e . nr )
2018-10-21 16:32:51 -04:00
heap_add_or_replace ( & ca - > alloc_heap , e ,
- bucket_alloc_cmp , NULL ) ;
2017-03-16 22:18:50 -08:00
2018-07-22 10:43:01 -04:00
for ( i = 0 ; i < ca - > alloc_heap . used ; i + + )
nr + = ca - > alloc_heap . data [ i ] . nr ;
2017-03-16 22:18:50 -08:00
2018-07-22 10:43:01 -04:00
while ( nr - ca - > alloc_heap . data [ 0 ] . nr > = ALLOC_SCAN_BATCH ( ca ) ) {
nr - = ca - > alloc_heap . data [ 0 ] . nr ;
2018-10-21 16:32:51 -04:00
heap_pop ( & ca - > alloc_heap , e , - bucket_alloc_cmp , NULL ) ;
2017-03-16 22:18:50 -08:00
}
2018-07-22 10:43:01 -04:00
up_read ( & ca - > bucket_lock ) ;
2017-03-16 22:18:50 -08:00
}
static void find_reclaimable_buckets_fifo ( struct bch_fs * c , struct bch_dev * ca )
{
struct bucket_array * buckets = bucket_array ( ca ) ;
struct bucket_mark m ;
2018-07-22 10:43:01 -04:00
size_t b , start ;
2017-03-16 22:18:50 -08:00
2018-07-22 10:43:01 -04:00
if ( ca - > fifo_last_bucket < ca - > mi . first_bucket | |
ca - > fifo_last_bucket > = ca - > mi . nbuckets )
ca - > fifo_last_bucket = ca - > mi . first_bucket ;
start = ca - > fifo_last_bucket ;
2017-03-16 22:18:50 -08:00
2018-07-22 10:43:01 -04:00
do {
ca - > fifo_last_bucket + + ;
if ( ca - > fifo_last_bucket = = ca - > mi . nbuckets )
ca - > fifo_last_bucket = ca - > mi . first_bucket ;
2017-03-16 22:18:50 -08:00
2018-07-22 10:43:01 -04:00
b = ca - > fifo_last_bucket ;
2017-03-16 22:18:50 -08:00
m = READ_ONCE ( buckets - > b [ b ] . mark ) ;
2018-07-22 10:43:01 -04:00
if ( bch2_can_invalidate_bucket ( ca , b , m ) ) {
struct alloc_heap_entry e = { . bucket = b , . nr = 1 , } ;
2018-10-21 16:32:51 -04:00
heap_add ( & ca - > alloc_heap , e , bucket_alloc_cmp , NULL ) ;
2018-07-22 10:43:01 -04:00
if ( heap_full ( & ca - > alloc_heap ) )
break ;
}
2017-03-16 22:18:50 -08:00
cond_resched ( ) ;
2018-07-22 10:43:01 -04:00
} while ( ca - > fifo_last_bucket ! = start ) ;
2017-03-16 22:18:50 -08:00
}
static void find_reclaimable_buckets_random ( struct bch_fs * c , struct bch_dev * ca )
{
struct bucket_array * buckets = bucket_array ( ca ) ;
struct bucket_mark m ;
2018-07-22 10:43:01 -04:00
size_t checked , i ;
2017-03-16 22:18:50 -08:00
for ( checked = 0 ;
2018-07-22 10:43:01 -04:00
checked < ca - > mi . nbuckets / 2 ;
2017-03-16 22:18:50 -08:00
checked + + ) {
size_t b = bch2_rand_range ( ca - > mi . nbuckets -
ca - > mi . first_bucket ) +
ca - > mi . first_bucket ;
m = READ_ONCE ( buckets - > b [ b ] . mark ) ;
2018-07-22 10:43:01 -04:00
if ( bch2_can_invalidate_bucket ( ca , b , m ) ) {
struct alloc_heap_entry e = { . bucket = b , . nr = 1 , } ;
2018-10-21 16:32:51 -04:00
heap_add ( & ca - > alloc_heap , e , bucket_alloc_cmp , NULL ) ;
2018-07-22 10:43:01 -04:00
if ( heap_full ( & ca - > alloc_heap ) )
break ;
}
2017-03-16 22:18:50 -08:00
cond_resched ( ) ;
}
2018-07-22 10:43:01 -04:00
sort ( ca - > alloc_heap . data ,
ca - > alloc_heap . used ,
sizeof ( ca - > alloc_heap . data [ 0 ] ) ,
bucket_idx_cmp , NULL ) ;
/* remove duplicates: */
for ( i = 0 ; i + 1 < ca - > alloc_heap . used ; i + + )
if ( ca - > alloc_heap . data [ i ] . bucket = =
ca - > alloc_heap . data [ i + 1 ] . bucket )
ca - > alloc_heap . data [ i ] . nr = 0 ;
2017-03-16 22:18:50 -08:00
}
2018-07-22 10:43:01 -04:00
static size_t find_reclaimable_buckets ( struct bch_fs * c , struct bch_dev * ca )
2017-03-16 22:18:50 -08:00
{
2018-07-22 10:43:01 -04:00
size_t i , nr = 0 ;
2017-03-16 22:18:50 -08:00
ca - > inc_gen_needs_gc = 0 ;
switch ( ca - > mi . replacement ) {
2021-02-20 19:47:58 -05:00
case BCH_CACHE_REPLACEMENT_lru :
2017-03-16 22:18:50 -08:00
find_reclaimable_buckets_lru ( c , ca ) ;
break ;
2021-02-20 19:47:58 -05:00
case BCH_CACHE_REPLACEMENT_fifo :
2017-03-16 22:18:50 -08:00
find_reclaimable_buckets_fifo ( c , ca ) ;
break ;
2021-02-20 19:47:58 -05:00
case BCH_CACHE_REPLACEMENT_random :
2017-03-16 22:18:50 -08:00
find_reclaimable_buckets_random ( c , ca ) ;
break ;
}
2018-07-22 10:43:01 -04:00
2018-10-21 16:32:51 -04:00
heap_resort ( & ca - > alloc_heap , bucket_alloc_cmp , NULL ) ;
2018-07-22 10:43:01 -04:00
for ( i = 0 ; i < ca - > alloc_heap . used ; i + + )
nr + = ca - > alloc_heap . data [ i ] . nr ;
return nr ;
2017-03-16 22:18:50 -08:00
}
2018-07-22 10:43:01 -04:00
static inline long next_alloc_bucket ( struct bch_dev * ca )
2017-03-16 22:18:50 -08:00
{
2018-07-22 10:43:01 -04:00
struct alloc_heap_entry e , * top = ca - > alloc_heap . data ;
while ( ca - > alloc_heap . used ) {
if ( top - > nr ) {
size_t b = top - > bucket ;
top - > bucket + + ;
top - > nr - - ;
return b ;
}
2017-03-16 22:18:50 -08:00
2018-10-21 16:32:51 -04:00
heap_pop ( & ca - > alloc_heap , e , bucket_alloc_cmp , NULL ) ;
2018-07-22 10:43:01 -04:00
}
return - 1 ;
2017-03-16 22:18:50 -08:00
}
2019-02-13 14:46:32 -05:00
/*
* returns sequence number of most recent journal entry that updated this
* bucket :
*/
static u64 bucket_journal_seq ( struct bch_fs * c , struct bucket_mark m )
{
if ( m . journal_seq_valid ) {
u64 journal_seq = atomic64_read ( & c - > journal . seq ) ;
u64 bucket_seq = journal_seq ;
bucket_seq & = ~ ( ( u64 ) U16_MAX ) ;
bucket_seq | = m . journal_seq ;
if ( bucket_seq > journal_seq )
bucket_seq - = 1 < < 16 ;
return bucket_seq ;
} else {
return 0 ;
}
}
2019-03-13 20:49:16 -04:00
/*
 * Invalidate the bucket at the top of ca->alloc_heap and move it to
 * ca->free_inc.
 *
 * @trans:       transaction used for the alloc btree update
 * @ca:          device whose alloc_heap supplies the bucket
 * @iter:        alloc btree iterator, repositioned to the bucket
 * @journal_seq: in/out; raised to the journal sequence number that must be
 *               flushed before the bucket is written to again
 * @flags:       extra BTREE_INSERT_* flags for the commit
 *
 * The bucket is first pushed onto free_inc and marked as owned by the
 * allocator. If it has no cached data and needs no journal commit, only the
 * in-memory gen is bumped. Otherwise a new alloc key with an incremented gen
 * and cleared sector counts is committed to the btree.
 *
 * On success the bucket is removed from alloc_heap. On any failure the
 * free_inc push and the allocator ownership mark are undone.
 *
 * Returns 0 on success or when stopping because the allocator is shutting
 * down, or a negative error code.
 */
static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
				       struct bch_dev *ca,
				       struct btree_iter *iter,
				       u64 *journal_seq, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_alloc_buf a;
	struct bkey_alloc_unpacked u;
	struct bucket *g;
	struct bucket_mark m;
	bool invalidating_cached_data;
	size_t b;
	int ret = 0;

	BUG_ON(!ca->alloc_heap.used ||
	       !ca->alloc_heap.data[0].nr);
	b = ca->alloc_heap.data[0].bucket;

	/* first, put on free_inc and mark as owned by allocator: */
	percpu_down_read(&c->mark_lock);
	g = bucket(ca, b);
	m = READ_ONCE(g->mark);

	BUG_ON(m.dirty_sectors);

	bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);

	spin_lock(&c->freelist_lock);
	verify_not_on_freelist(c, ca, b);
	BUG_ON(!fifo_push(&ca->free_inc, b));
	spin_unlock(&c->freelist_lock);

	/*
	 * If we're not invalidating cached data, we only increment the bucket
	 * gen in memory here, the incremented gen will be updated in the btree
	 * by bch2_trans_mark_pointer():
	 */
	if (!m.cached_sectors &&
	    !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
		BUG_ON(m.data_type);
		bucket_cmpxchg(g, m, m.gen++);
		percpu_up_read(&c->mark_lock);
		goto out;
	}

	percpu_up_read(&c->mark_lock);

	/*
	 * If the read-only path is trying to shut down, we can't be generating
	 * new btree updates:
	 */
	if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) {
		ret = 1;
		goto out;
	}

	bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
retry:
	ret = bch2_btree_iter_traverse(iter);
	/*
	 * Must go through the cleanup at out: rather than returning directly.
	 * The bucket is already on free_inc and marked allocator-owned, and a
	 * bare return would leave it there without having been invalidated.
	 */
	if (ret)
		goto out;

	percpu_down_read(&c->mark_lock);
	g = bucket(ca, iter->pos.offset);
	m = READ_ONCE(g->mark);
	u = alloc_mem_to_key(iter, g, m);

	percpu_up_read(&c->mark_lock);

	invalidating_cached_data = u.cached_sectors != 0;

	u.gen++;
	u.data_type	= 0;
	u.dirty_sectors	= 0;
	u.cached_sectors = 0;
	u.read_time	= atomic64_read(&c->io_clock[READ].now);
	u.write_time	= atomic64_read(&c->io_clock[WRITE].now);

	bch2_alloc_pack(c, &a, u);
	bch2_trans_update(trans, iter, &a.k,
			  BTREE_TRIGGER_BUCKET_INVALIDATE);

	/*
	 * XXX:
	 * when using deferred btree updates, we have journal reclaim doing
	 * btree updates and thus requiring the allocator to make forward
	 * progress, and here the allocator is requiring space in the journal -
	 * so we need a journal pre-reservation:
	 */
	ret = bch2_trans_commit(trans, NULL,
				invalidating_cached_data ? journal_seq : NULL,
				BTREE_INSERT_NOUNLOCK|
				BTREE_INSERT_NOCHECK_RW|
				BTREE_INSERT_NOFAIL|
				BTREE_INSERT_JOURNAL_RESERVED|
				flags);
	if (ret == -EINTR)
		goto retry;
out:
	if (!ret) {
		/* remove from alloc_heap: */
		struct alloc_heap_entry e, *top = ca->alloc_heap.data;

		top->bucket++;
		top->nr--;

		if (!top->nr)
			heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);

		/*
		 * Make sure we flush the last journal entry that updated this
		 * bucket (i.e. deleting the last reference) before writing to
		 * this bucket again:
		 */
		*journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
	} else {
		size_t b2;

		/* remove from free_inc: */
		percpu_down_read(&c->mark_lock);
		spin_lock(&c->freelist_lock);

		bch2_mark_alloc_bucket(c, ca, b, false,
				       gc_pos_alloc(c, NULL), 0);

		BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
		BUG_ON(b != b2);

		spin_unlock(&c->freelist_lock);
		percpu_up_read(&c->mark_lock);
	}

	/* the positive "allocator stopping" value is not an error */
	return ret < 0 ? ret : 0;
}
2018-07-22 10:43:01 -04:00
/*
* Pull buckets off ca - > alloc_heap , invalidate them , move them to ca - > free_inc :
*/
static int bch2_invalidate_buckets ( struct bch_fs * c , struct bch_dev * ca )
2017-03-16 22:18:50 -08:00
{
2019-03-13 20:49:16 -04:00
struct btree_trans trans ;
struct btree_iter * iter ;
2018-07-22 10:43:01 -04:00
u64 journal_seq = 0 ;
2017-03-16 22:18:50 -08:00
int ret = 0 ;
2019-05-15 10:54:43 -04:00
bch2_trans_init ( & trans , c , 0 , 0 ) ;
2019-03-13 20:49:16 -04:00
2021-02-20 19:27:37 -05:00
iter = bch2_trans_get_iter ( & trans , BTREE_ID_alloc ,
2019-03-13 20:49:16 -04:00
POS ( ca - > dev_idx , 0 ) ,
2019-10-05 12:54:53 -04:00
BTREE_ITER_CACHED |
BTREE_ITER_CACHED_NOFILL |
BTREE_ITER_INTENT ) ;
2017-03-16 22:18:50 -08:00
/* Only use nowait if we've already invalidated at least one bucket: */
2018-07-22 10:43:01 -04:00
while ( ! ret & &
! fifo_full ( & ca - > free_inc ) & &
2019-02-13 14:46:32 -05:00
ca - > alloc_heap . used )
2019-03-13 20:49:16 -04:00
ret = bch2_invalidate_one_bucket2 ( & trans , ca , iter , & journal_seq ,
2019-02-11 19:04:40 -05:00
BTREE_INSERT_GC_LOCK_HELD |
2019-02-13 15:17:23 -05:00
( ! fifo_empty ( & ca - > free_inc )
? BTREE_INSERT_NOWAIT : 0 ) ) ;
2017-03-16 22:18:50 -08:00
2019-03-13 20:49:16 -04:00
bch2_trans_exit ( & trans ) ;
2017-03-16 22:18:50 -08:00
/* If we used NOWAIT, don't return the error: */
2018-07-22 10:43:01 -04:00
if ( ! fifo_empty ( & ca - > free_inc ) )
ret = 0 ;
if ( ret ) {
bch_err ( ca , " error invalidating buckets: %i " , ret ) ;
return ret ;
}
2017-03-16 22:18:50 -08:00
2018-07-22 10:43:01 -04:00
if ( journal_seq )
ret = bch2_journal_flush_seq ( & c - > journal , journal_seq ) ;
if ( ret ) {
bch_err ( ca , " journal error: %i " , ret ) ;
return ret ;
}
2017-03-16 22:18:50 -08:00
2018-07-22 10:43:01 -04:00
return 0 ;
2017-03-16 22:18:50 -08:00
}
/*
 * Move @bucket from ca->free_inc onto the first freelist in ca->free[] that
 * has room, sleeping (interruptibly, freezable) until one does.
 *
 * While no freelist has room, the allocator state is set to
 * ALLOCATOR_blocked_full and waiters on c->freelist_wait are woken.
 *
 * Returns 0 once the bucket has been pushed, or 1 if the calling kthread has
 * been asked to stop before that happened.
 */
static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
{
	unsigned reserve;
	int ret = 0;

	for (;;) {
		bool pushed = false;

		set_current_state(TASK_INTERRUPTIBLE);

		spin_lock(&c->freelist_lock);
		for (reserve = 0; reserve < RESERVE_NR && !pushed; reserve++) {
			/*
			 * Don't strand buckets on the copygc freelist until
			 * after recovery is finished:
			 */
			if (reserve == RESERVE_MOVINGGC &&
			    !test_bit(BCH_FS_STARTED, &c->flags))
				continue;

			if (!fifo_push(&ca->free[reserve], bucket))
				continue;

			/* The bucket now lives on a freelist; drop it from free_inc */
			fifo_pop(&ca->free_inc, bucket);

			closure_wake_up(&c->freelist_wait);
			ca->allocator_state = ALLOCATOR_running;
			pushed = true;
		}

		if (pushed) {
			spin_unlock(&c->freelist_lock);
			break;
		}

		/* Every eligible freelist was full: note it once and wake waiters */
		if (ca->allocator_state != ALLOCATOR_blocked_full) {
			ca->allocator_state = ALLOCATOR_blocked_full;
			closure_wake_up(&c->freelist_wait);
		}
		spin_unlock(&c->freelist_lock);

		if ((current->flags & PF_KTHREAD) &&
		    kthread_should_stop()) {
			ret = 1;
			break;
		}

		schedule();
		try_to_freeze();
	}

	__set_current_state(TASK_RUNNING);
	return ret;
}
/*
2018-07-22 10:43:01 -04:00
* Pulls buckets off free_inc , discards them ( if enabled ) , then adds them to
* freelists , waiting until there ' s room if necessary :
2017-03-16 22:18:50 -08:00
*/
static int discard_invalidated_buckets ( struct bch_fs * c , struct bch_dev * ca )
{
2018-07-22 10:43:01 -04:00
while ( ! fifo_empty ( & ca - > free_inc ) ) {
2017-03-16 22:18:50 -08:00
size_t bucket = fifo_peek ( & ca - > free_inc ) ;
if ( ca - > mi . discard & &
bdev_max_discard_sectors ( ca - > disk_sb . bdev ) )
blkdev_issue_discard ( ca - > disk_sb . bdev ,
bucket_to_sector ( ca , bucket ) ,
ca - > mi . bucket_size , GFP_NOIO ) ;
if ( push_invalidated_bucket ( c , ca , bucket ) )
return 1 ;
}
return 0 ;
}
2021-03-05 18:00:55 -05:00
static inline bool allocator_thread_running ( struct bch_dev * ca )
{
2021-02-20 19:47:58 -05:00
return ca - > mi . state = = BCH_MEMBER_STATE_rw & &
2021-03-05 18:00:55 -05:00
test_bit ( BCH_FS_ALLOCATOR_RUNNING , & ca - > fs - > flags ) ;
}
2017-03-16 22:18:50 -08:00
/**
* bch_allocator_thread - move buckets from free_inc to reserves
*
* The free_inc FIFO is populated by find_reclaimable_buckets ( ) , and
* the reserves are depleted by bucket allocation . When we run out
* of free_inc , try to invalidate some buckets and write out
* prios and gens .
*/
static int bch2_allocator_thread ( void * arg )
{
struct bch_dev * ca = arg ;
struct bch_fs * c = ca - > fs ;
2018-07-22 10:43:01 -04:00
size_t nr ;
2017-03-16 22:18:50 -08:00
int ret ;
set_freezable ( ) ;
while ( 1 ) {
2021-03-05 18:00:55 -05:00
if ( ! allocator_thread_running ( ca ) ) {
2021-04-18 17:54:56 -04:00
ca - > allocator_state = ALLOCATOR_stopped ;
2021-03-05 18:00:55 -05:00
if ( kthread_wait_freezable ( allocator_thread_running ( ca ) ) )
break ;
}
2021-04-18 17:54:56 -04:00
ca - > allocator_state = ALLOCATOR_running ;
2021-03-05 18:00:55 -05:00
2018-07-22 10:43:01 -04:00
cond_resched ( ) ;
2020-06-17 18:20:26 -04:00
if ( kthread_should_stop ( ) )
break ;
2017-03-16 22:18:50 -08:00
2018-07-22 10:43:01 -04:00
pr_debug ( " discarding %zu invalidated buckets " ,
fifo_used ( & ca - > free_inc ) ) ;
2017-03-16 22:18:50 -08:00
2018-07-22 10:43:01 -04:00
ret = discard_invalidated_buckets ( c , ca ) ;
if ( ret )
goto stop ;
2017-03-16 22:18:50 -08:00
2018-07-24 19:45:22 -04:00
down_read ( & c - > gc_lock ) ;
2018-07-22 10:43:01 -04:00
ret = bch2_invalidate_buckets ( c , ca ) ;
2018-07-24 19:45:22 -04:00
if ( ret ) {
up_read ( & c - > gc_lock ) ;
2018-07-22 10:43:01 -04:00
goto stop ;
2018-07-24 19:45:22 -04:00
}
2017-03-16 22:18:50 -08:00
2018-07-24 19:45:22 -04:00
if ( ! fifo_empty ( & ca - > free_inc ) ) {
up_read ( & c - > gc_lock ) ;
2018-07-22 10:43:01 -04:00
continue ;
2018-07-24 19:45:22 -04:00
}
2017-03-16 22:18:50 -08:00
pr_debug ( " free_inc now empty " ) ;
2018-07-22 10:43:01 -04:00
do {
2021-04-05 00:53:42 -04:00
cond_resched ( ) ;
2017-03-16 22:18:50 -08:00
/*
* Find some buckets that we can invalidate , either
* they ' re completely unused , or only contain clean data
* that ' s been written back to the backing device or
* another cache tier
*/
pr_debug ( " scanning for reclaimable buckets " ) ;
2018-07-22 10:43:01 -04:00
nr = find_reclaimable_buckets ( c , ca ) ;
2017-03-16 22:18:50 -08:00
2018-07-22 10:43:01 -04:00
pr_debug ( " found %zu buckets " , nr ) ;
2017-03-16 22:18:50 -08:00
2018-07-22 10:43:01 -04:00
trace_alloc_batch ( ca , nr , ca - > alloc_heap . size ) ;
2017-03-16 22:18:50 -08:00
2018-07-22 10:43:01 -04:00
if ( ( ca - > inc_gen_needs_gc > = ALLOC_SCAN_BATCH ( ca ) | |
ca - > inc_gen_really_needs_gc ) & &
2017-03-16 22:18:50 -08:00
c - > gc_thread ) {
atomic_inc ( & c - > kick_gc ) ;
wake_up_process ( c - > gc_thread ) ;
}
/*
2018-07-22 10:43:01 -04:00
* If we found any buckets , we have to invalidate them
* before we scan for more - but if we didn ' t find very
* many we may want to wait on more buckets being
* available so we don ' t spin :
2017-03-16 22:18:50 -08:00
*/
2018-07-22 10:43:01 -04:00
if ( ! nr | |
( nr < ALLOC_SCAN_BATCH ( ca ) & &
2019-09-18 19:33:12 -04:00
! fifo_empty ( & ca - > free [ RESERVE_NONE ] ) ) ) {
2018-07-22 10:43:01 -04:00
ret = wait_buckets_available ( c , ca ) ;
if ( ret ) {
up_read ( & c - > gc_lock ) ;
goto stop ;
}
2017-03-16 22:18:50 -08:00
}
2018-07-22 10:43:01 -04:00
} while ( ! nr ) ;
2017-03-16 22:18:50 -08:00
up_read ( & c - > gc_lock ) ;
2018-07-22 10:43:01 -04:00
pr_debug ( " %zu buckets to invalidate " , nr ) ;
2017-03-16 22:18:50 -08:00
/*
2018-07-22 10:43:01 -04:00
* alloc_heap is now full of newly - invalidated buckets : next ,
2017-03-16 22:18:50 -08:00
* write out the new bucket gens :
*/
}
stop :
pr_debug ( " alloc thread stopping (ret %i) " , ret ) ;
2021-04-18 17:54:56 -04:00
ca - > allocator_state = ALLOCATOR_stopped ;
2019-04-16 15:13:16 -04:00
closure_wake_up ( & c - > freelist_wait ) ;
2017-03-16 22:18:50 -08:00
return 0 ;
}
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity ( struct bch_fs * c )
{
struct bch_dev * ca ;
2021-04-13 09:49:23 -04:00
u64 capacity = 0 , reserved_sectors = 0 , gc_reserve ;
2018-11-04 21:55:35 -05:00
unsigned bucket_size_max = 0 ;
2017-03-16 22:18:50 -08:00
unsigned long ra_pages = 0 ;
unsigned i , j ;
lockdep_assert_held ( & c - > state_lock ) ;
for_each_online_member ( ca , c , i ) {
struct backing_dev_info * bdi = ca - > disk_sb . bdev - > bd_disk - > bdi ;
ra_pages + = bdi - > ra_pages ;
}
bch2_set_ra_pages ( c , ra_pages ) ;
for_each_rw_member ( ca , c , i ) {
2018-07-24 14:55:05 -04:00
u64 dev_reserve = 0 ;
2017-03-16 22:18:50 -08:00
/*
* We need to reserve buckets ( from the number
* of currently available buckets ) against
* foreground writes so that mainly copygc can
* make forward progress .
*
* We need enough to refill the various reserves
* from scratch - copygc will use its entire
* reserve all at once , then run against when
* its reserve is refilled ( from the formerly
* available buckets ) .
*
* This reserve is just used when considering if
* allocations for foreground writes must wait -
* not - ENOSPC calculations .
*/
for ( j = 0 ; j < RESERVE_NONE ; j + + )
2018-08-01 14:26:55 -04:00
dev_reserve + = ca - > free [ j ] . size ;
2017-03-16 22:18:50 -08:00
2018-08-01 14:26:55 -04:00
dev_reserve + = 1 ; /* btree write point */
dev_reserve + = 1 ; /* copygc write point */
dev_reserve + = 1 ; /* rebalance write point */
2017-03-16 22:18:50 -08:00
2018-08-01 14:26:55 -04:00
dev_reserve * = ca - > mi . bucket_size ;
2017-03-16 22:18:50 -08:00
2018-07-24 14:55:05 -04:00
capacity + = bucket_to_sector ( ca , ca - > mi . nbuckets -
ca - > mi . first_bucket ) ;
2017-03-16 22:18:50 -08:00
2018-07-24 14:55:05 -04:00
reserved_sectors + = dev_reserve * 2 ;
2018-11-04 21:55:35 -05:00
bucket_size_max = max_t ( unsigned , bucket_size_max ,
ca - > mi . bucket_size ) ;
2018-08-01 14:26:55 -04:00
}
2017-03-16 22:18:50 -08:00
2018-07-24 14:55:05 -04:00
gc_reserve = c - > opts . gc_reserve_bytes
? c - > opts . gc_reserve_bytes > > 9
: div64_u64 ( capacity * c - > opts . gc_reserve_percent , 100 ) ;
reserved_sectors = max ( gc_reserve , reserved_sectors ) ;
2017-03-16 22:18:50 -08:00
2018-07-24 14:55:05 -04:00
reserved_sectors = min ( reserved_sectors , capacity ) ;
2017-03-16 22:18:50 -08:00
2018-08-01 14:26:55 -04:00
c - > capacity = capacity - reserved_sectors ;
2017-03-16 22:18:50 -08:00
2018-11-04 21:55:35 -05:00
c - > bucket_size_max = bucket_size_max ;
2017-03-16 22:18:50 -08:00
/* Wake up case someone was waiting for buckets */
closure_wake_up ( & c - > freelist_wait ) ;
}
static bool bch2_dev_has_open_write_point ( struct bch_fs * c , struct bch_dev * ca )
{
struct open_bucket * ob ;
bool ret = false ;
for ( ob = c - > open_buckets ;
ob < c - > open_buckets + ARRAY_SIZE ( c - > open_buckets ) ;
ob + + ) {
spin_lock ( & ob - > lock ) ;
if ( ob - > valid & & ! ob - > on_partial_list & &
ob - > ptr . dev = = ca - > dev_idx )
ret = true ;
spin_unlock ( & ob - > lock ) ;
}
return ret ;
}
/* device goes ro: */
void bch2_dev_allocator_remove ( struct bch_fs * c , struct bch_dev * ca )
{
unsigned i ;
BUG_ON ( ca - > alloc_thread ) ;
/* First, remove device from allocation groups: */
for ( i = 0 ; i < ARRAY_SIZE ( c - > rw_devs ) ; i + + )
clear_bit ( ca - > dev_idx , c - > rw_devs [ i ] . d ) ;
/*
* Capacity is calculated based off of devices in allocation groups :
*/
bch2_recalc_capacity ( c ) ;
/* Next, close write points that point to this device... */
for ( i = 0 ; i < ARRAY_SIZE ( c - > write_points ) ; i + + )
2018-10-06 00:46:55 -04:00
bch2_writepoint_stop ( c , ca , & c - > write_points [ i ] ) ;
2017-03-16 22:18:50 -08:00
2020-07-11 16:28:54 -04:00
bch2_writepoint_stop ( c , ca , & c - > copygc_write_point ) ;
2018-10-06 00:46:55 -04:00
bch2_writepoint_stop ( c , ca , & c - > rebalance_write_point ) ;
bch2_writepoint_stop ( c , ca , & c - > btree_write_point ) ;
2017-03-16 22:18:50 -08:00
mutex_lock ( & c - > btree_reserve_cache_lock ) ;
while ( c - > btree_reserve_cache_nr ) {
struct btree_alloc * a =
& c - > btree_reserve_cache [ - - c - > btree_reserve_cache_nr ] ;
2018-10-06 04:12:42 -04:00
bch2_open_buckets_put ( c , & a - > ob ) ;
2017-03-16 22:18:50 -08:00
}
mutex_unlock ( & c - > btree_reserve_cache_lock ) ;
2018-11-01 15:13:19 -04:00
while ( 1 ) {
struct open_bucket * ob ;
spin_lock ( & c - > freelist_lock ) ;
if ( ! ca - > open_buckets_partial_nr ) {
spin_unlock ( & c - > freelist_lock ) ;
break ;
}
ob = c - > open_buckets +
ca - > open_buckets_partial [ - - ca - > open_buckets_partial_nr ] ;
ob - > on_partial_list = false ;
spin_unlock ( & c - > freelist_lock ) ;
bch2_open_bucket_put ( c , ob ) ;
}
bch2_ec_stop_dev ( c , ca ) ;
2017-03-16 22:18:50 -08:00
/*
* Wake up threads that were blocked on allocation , so they can notice
* the device can no longer be removed and the capacity has changed :
*/
closure_wake_up ( & c - > freelist_wait ) ;
/*
* journal_res_get ( ) can block waiting for free space in the journal -
* it needs to notice there may not be devices to allocate from anymore :
*/
wake_up ( & c - > journal . wait ) ;
/* Now wait for any in flight writes: */
closure_wait_event ( & c - > open_buckets_wait ,
! bch2_dev_has_open_write_point ( c , ca ) ) ;
}
/* device goes rw: */
void bch2_dev_allocator_add ( struct bch_fs * c , struct bch_dev * ca )
{
unsigned i ;
for ( i = 0 ; i < ARRAY_SIZE ( c - > rw_devs ) ; i + + )
if ( ca - > mi . data_allowed & ( 1 < < i ) )
set_bit ( ca - > dev_idx , c - > rw_devs [ i ] . d ) ;
}
2018-11-19 01:31:41 -05:00
void bch2_dev_allocator_quiesce ( struct bch_fs * c , struct bch_dev * ca )
{
2019-02-08 14:43:53 -05:00
if ( ca - > alloc_thread )
2019-04-16 15:13:16 -04:00
closure_wait_event ( & c - > freelist_wait ,
2021-04-18 17:54:56 -04:00
ca - > allocator_state ! = ALLOCATOR_running ) ;
2018-11-19 01:31:41 -05:00
}
2017-03-16 22:18:50 -08:00
/* stop allocator thread: */
void bch2_dev_allocator_stop ( struct bch_dev * ca )
{
struct task_struct * p ;
p = rcu_dereference_protected ( ca - > alloc_thread , 1 ) ;
ca - > alloc_thread = NULL ;
/*
* We need an rcu barrier between setting ca - > alloc_thread = NULL and
* the thread shutting down to avoid bch2_wake_allocator ( ) racing :
*
* XXX : it would be better to have the rcu barrier be asynchronous
* instead of blocking us here
*/
synchronize_rcu ( ) ;
if ( p ) {
kthread_stop ( p ) ;
put_task_struct ( p ) ;
}
}
/* start allocator thread: */
int bch2_dev_allocator_start ( struct bch_dev * ca )
{
struct task_struct * p ;
/*
* allocator thread already started ?
*/
if ( ca - > alloc_thread )
return 0 ;
p = kthread_create ( bch2_allocator_thread , ca ,
2020-11-19 20:55:33 -05:00
" bch-alloc/%s " , ca - > name ) ;
2021-02-23 15:16:41 -05:00
if ( IS_ERR ( p ) ) {
bch_err ( ca - > fs , " error creating allocator thread: %li " ,
PTR_ERR ( p ) ) ;
2017-03-16 22:18:50 -08:00
return PTR_ERR ( p ) ;
2021-02-23 15:16:41 -05:00
}
2017-03-16 22:18:50 -08:00
get_task_struct ( p ) ;
rcu_assign_pointer ( ca - > alloc_thread , p ) ;
wake_up_process ( p ) ;
return 0 ;
}
2018-11-04 21:55:35 -05:00
void bch2_fs_allocator_background_init ( struct bch_fs * c )
2017-03-16 22:18:50 -08:00
{
spin_lock_init ( & c - > freelist_lock ) ;
}