2018-10-06 07:46:55 +03:00
// SPDX-License-Identifier: GPL-2.0
2017-03-17 09:18:50 +03:00
# include "bcachefs.h"
2018-10-06 07:46:55 +03:00
# include "alloc_background.h"
# include "alloc_foreground.h"
2017-03-17 09:18:50 +03:00
# include "btree_cache.h"
# include "btree_io.h"
2019-10-05 19:54:53 +03:00
# include "btree_key_cache.h"
2017-03-17 09:18:50 +03:00
# include "btree_update.h"
# include "btree_update_interior.h"
# include "btree_gc.h"
# include "buckets.h"
# include "clock.h"
# include "debug.h"
2018-11-01 22:13:19 +03:00
# include "ec.h"
2017-03-17 09:18:50 +03:00
# include "error.h"
2019-04-12 05:39:39 +03:00
# include "recovery.h"
2017-03-17 09:18:50 +03:00
# include "trace.h"
2021-01-23 02:01:07 +03:00
# include "varint.h"
2017-03-17 09:18:50 +03:00
# include <linux/kthread.h>
# include <linux/math64.h>
# include <linux/random.h>
# include <linux/rculist.h>
# include <linux/rcupdate.h>
# include <linux/sched/task.h>
# include <linux/sort.h>
2021-01-23 02:01:07 +03:00
/*
 * Size in bytes of each optional field of a v1 alloc key, indexed by
 * BCH_ALLOC_FIELD_V1_*.  Generated from the BCH_ALLOC_FIELDS_V1() x-macro,
 * whose second argument is divided by 8 here.
 */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(_name, _bits) [BCH_ALLOC_FIELD_V1_##_name] = _bits / 8,
	BCH_ALLOC_FIELDS_V1()
#undef x
};
2017-03-17 09:18:50 +03:00
/* Forward declaration; the definition lives with the bucket IO clock code. */
static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw);
/* Ratelimiting/PD controllers */
static void pd_controllers_update ( struct work_struct * work )
{
struct bch_fs * c = container_of ( to_delayed_work ( work ) ,
struct bch_fs ,
pd_controllers_update ) ;
struct bch_dev * ca ;
2020-07-11 23:28:54 +03:00
s64 free = 0 , fragmented = 0 ;
2017-03-17 09:18:50 +03:00
unsigned i ;
for_each_member_device ( ca , c , i ) {
2020-07-22 20:27:00 +03:00
struct bch_dev_usage stats = bch2_dev_usage_read ( ca ) ;
2017-03-17 09:18:50 +03:00
2020-07-11 23:28:54 +03:00
free + = bucket_to_sector ( ca ,
2017-03-17 09:18:50 +03:00
__dev_buckets_free ( ca , stats ) ) < < 9 ;
/*
* Bytes of internal fragmentation , which can be
* reclaimed by copy GC
*/
2020-07-11 23:28:54 +03:00
fragmented + = max_t ( s64 , 0 , ( bucket_to_sector ( ca ,
2021-01-22 04:51:51 +03:00
stats . d [ BCH_DATA_user ] . buckets +
stats . d [ BCH_DATA_cached ] . buckets ) -
( stats . d [ BCH_DATA_user ] . sectors +
stats . d [ BCH_DATA_cached ] . sectors ) ) < < 9 ) ;
2017-03-17 09:18:50 +03:00
}
2020-07-11 23:28:54 +03:00
bch2_pd_controller_update ( & c - > copygc_pd , free , fragmented , - 1 ) ;
2017-03-17 09:18:50 +03:00
schedule_delayed_work ( & c - > pd_controllers_update ,
c - > pd_controllers_update_seconds * HZ ) ;
}
/* Persistent alloc info: */
2021-01-23 02:01:07 +03:00
static inline u64 alloc_field_v1_get ( const struct bch_alloc * a ,
const void * * p , unsigned field )
2018-07-22 06:36:11 +03:00
{
2021-01-23 02:01:07 +03:00
unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES [ field ] ;
2018-07-22 06:36:11 +03:00
u64 v ;
if ( ! ( a - > fields & ( 1 < < field ) ) )
return 0 ;
switch ( bytes ) {
case 1 :
v = * ( ( const u8 * ) * p ) ;
break ;
case 2 :
v = le16_to_cpup ( * p ) ;
break ;
case 4 :
v = le32_to_cpup ( * p ) ;
break ;
case 8 :
v = le64_to_cpup ( * p ) ;
break ;
default :
BUG ( ) ;
}
* p + = bytes ;
return v ;
}
2021-01-23 02:01:07 +03:00
/*
 * Append one optional v1 alloc field.
 *
 * @a:     key being built; the field's bit is set in a->v.fields
 * @p:     output cursor; advanced past the bytes written
 * @field: index of the field (BCH_ALLOC_FIELD_V1_*)
 * @v:     value to store; zero values are not stored at all
 */
static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
				      unsigned field, u64 v)
{
	const unsigned width = BCH_ALLOC_V1_FIELD_BYTES[field];
	void *dst = *p;

	/* Zero is encoded by leaving the field out. */
	if (!v)
		return;

	a->v.fields |= 1 << field;

	switch (width) {
	case 1:
		*((u8 *) dst) = v;
		break;
	case 2:
		*((__le16 *) dst) = cpu_to_le16(v);
		break;
	case 4:
		*((__le32 *) dst) = cpu_to_le32(v);
		break;
	case 8:
		*((__le64 *) dst) = cpu_to_le64(v);
		break;
	default:
		/* the width table only holds 1, 2, 4 or 8 */
		BUG();
	}

	*p = dst + width;
}
2021-01-23 02:01:07 +03:00
static void bch2_alloc_unpack_v1 ( struct bkey_alloc_unpacked * out ,
struct bkey_s_c k )
2019-02-13 22:46:32 +03:00
{
2021-01-23 02:01:07 +03:00
const struct bch_alloc * in = bkey_s_c_to_alloc ( k ) . v ;
const void * d = in - > data ;
unsigned idx = 0 ;
2019-04-18 01:14:46 +03:00
2021-01-23 02:01:07 +03:00
out - > gen = in - > gen ;
# define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
BCH_ALLOC_FIELDS_V1 ( )
# undef x
}
static void bch2_alloc_pack_v1 ( struct bkey_alloc_buf * dst ,
const struct bkey_alloc_unpacked src )
{
struct bkey_i_alloc * a = bkey_alloc_init ( & dst - > k ) ;
void * d = a - > v . data ;
unsigned bytes , idx = 0 ;
2019-04-18 01:14:46 +03:00
2021-01-23 02:01:07 +03:00
a - > k . p = POS ( src . dev , src . bucket ) ;
a - > v . fields = 0 ;
a - > v . gen = src . gen ;
2019-02-13 22:46:32 +03:00
2021-01-23 02:01:07 +03:00
# define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name);
BCH_ALLOC_FIELDS_V1 ( )
2019-02-13 22:46:32 +03:00
# undef x
2021-01-23 02:01:07 +03:00
bytes = ( void * ) d - ( void * ) & a - > v ;
set_bkey_val_bytes ( & a - > k , bytes ) ;
memset_u64s_tail ( & a - > v , 0 , bytes ) ;
2019-02-13 22:46:32 +03:00
}
2021-01-23 02:01:07 +03:00
static int bch2_alloc_unpack_v2 ( struct bkey_alloc_unpacked * out ,
struct bkey_s_c k )
2019-02-13 22:46:32 +03:00
{
2021-01-23 02:01:07 +03:00
struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2 ( k ) ;
const u8 * in = a . v - > data ;
const u8 * end = bkey_val_end ( a ) ;
unsigned fieldnr = 0 ;
int ret ;
u64 v ;
out - > gen = a . v - > gen ;
out - > oldest_gen = a . v - > oldest_gen ;
out - > data_type = a . v - > data_type ;
# define x(_name, _bits) \
if ( fieldnr < a . v - > nr_fields ) { \
ret = bch2_varint_decode ( in , end , & v ) ; \
if ( ret < 0 ) \
return ret ; \
in + = ret ; \
} else { \
v = 0 ; \
} \
out - > _name = v ; \
if ( v ! = out - > _name ) \
return - 1 ; \
fieldnr + + ;
BCH_ALLOC_FIELDS_V2 ( )
# undef x
return 0 ;
}
static void bch2_alloc_pack_v2 ( struct bkey_alloc_buf * dst ,
const struct bkey_alloc_unpacked src )
{
struct bkey_i_alloc_v2 * a = bkey_alloc_v2_init ( & dst - > k ) ;
unsigned nr_fields = 0 , last_nonzero_fieldnr = 0 ;
u8 * out = a - > v . data ;
u8 * end = ( void * ) & dst [ 1 ] ;
u8 * last_nonzero_field = out ;
2019-10-12 03:20:30 +03:00
unsigned bytes ;
2019-02-13 22:46:32 +03:00
2021-01-23 02:01:07 +03:00
a - > k . p = POS ( src . dev , src . bucket ) ;
a - > v . gen = src . gen ;
a - > v . oldest_gen = src . oldest_gen ;
a - > v . data_type = src . data_type ;
# define x(_name, _bits) \
nr_fields + + ; \
\
if ( src . _name ) { \
out + = bch2_varint_encode ( out , src . _name ) ; \
\
last_nonzero_field = out ; \
last_nonzero_fieldnr = nr_fields ; \
} else { \
* out + + = 0 ; \
}
2019-02-13 22:46:32 +03:00
2021-01-23 02:01:07 +03:00
BCH_ALLOC_FIELDS_V2 ( )
2019-02-13 22:46:32 +03:00
# undef x
2021-01-23 02:01:07 +03:00
BUG_ON ( out > end ) ;
out = last_nonzero_field ;
a - > v . nr_fields = last_nonzero_fieldnr ;
bytes = ( u8 * ) out - ( u8 * ) & a - > v ;
set_bkey_val_bytes ( & a - > k , bytes ) ;
memset_u64s_tail ( & a - > v , 0 , bytes ) ;
}
struct bkey_alloc_unpacked bch2_alloc_unpack ( struct bkey_s_c k )
{
struct bkey_alloc_unpacked ret = {
. dev = k . k - > p . inode ,
. bucket = k . k - > p . offset ,
. gen = 0 ,
} ;
2019-02-13 22:46:32 +03:00
2021-01-23 02:01:07 +03:00
if ( k . k - > type = = KEY_TYPE_alloc_v2 )
bch2_alloc_unpack_v2 ( & ret , k ) ;
else if ( k . k - > type = = KEY_TYPE_alloc )
bch2_alloc_unpack_v1 ( & ret , k ) ;
return ret ;
}
void bch2_alloc_pack ( struct bch_fs * c ,
struct bkey_alloc_buf * dst ,
const struct bkey_alloc_unpacked src )
{
if ( c - > sb . features & ( 1ULL < < BCH_FEATURE_alloc_v2 ) )
bch2_alloc_pack_v2 ( dst , src ) ;
else
bch2_alloc_pack_v1 ( dst , src ) ;
2019-02-13 22:46:32 +03:00
}
2017-03-17 09:18:50 +03:00
static unsigned bch_alloc_val_u64s ( const struct bch_alloc * a )
{
2018-07-22 06:36:11 +03:00
unsigned i , bytes = offsetof ( struct bch_alloc , data ) ;
2017-03-17 09:18:50 +03:00
2021-01-23 02:01:07 +03:00
for ( i = 0 ; i < ARRAY_SIZE ( BCH_ALLOC_V1_FIELD_BYTES ) ; i + + )
2018-07-22 06:36:11 +03:00
if ( a - > fields & ( 1 < < i ) )
2021-01-23 02:01:07 +03:00
bytes + = BCH_ALLOC_V1_FIELD_BYTES [ i ] ;
2017-03-17 09:18:50 +03:00
return DIV_ROUND_UP ( bytes , sizeof ( u64 ) ) ;
}
2021-01-23 02:01:07 +03:00
const char * bch2_alloc_v1_invalid ( const struct bch_fs * c , struct bkey_s_c k )
2017-03-17 09:18:50 +03:00
{
2018-11-01 22:10:01 +03:00
struct bkey_s_c_alloc a = bkey_s_c_to_alloc ( k ) ;
2017-03-17 09:18:50 +03:00
if ( k . k - > p . inode > = c - > sb . nr_devices | |
! c - > devs [ k . k - > p . inode ] )
return " invalid device " ;
2018-11-01 22:10:01 +03:00
/* allow for unknown fields */
if ( bkey_val_u64s ( a . k ) < bch_alloc_val_u64s ( a . v ) )
return " incorrect value size " ;
2017-03-17 09:18:50 +03:00
return NULL ;
}
2021-01-23 02:01:07 +03:00
const char * bch2_alloc_v2_invalid ( const struct bch_fs * c , struct bkey_s_c k )
2017-03-17 09:18:50 +03:00
{
2021-01-23 02:01:07 +03:00
struct bkey_alloc_unpacked u ;
if ( k . k - > p . inode > = c - > sb . nr_devices | |
! c - > devs [ k . k - > p . inode ] )
return " invalid device " ;
2018-11-09 09:24:07 +03:00
2021-01-23 02:01:07 +03:00
if ( bch2_alloc_unpack_v2 ( & u , k ) )
return " unpack error " ;
2018-07-22 06:36:11 +03:00
2021-01-23 02:01:07 +03:00
return NULL ;
}
void bch2_alloc_to_text ( struct printbuf * out , struct bch_fs * c ,
struct bkey_s_c k )
{
struct bkey_alloc_unpacked u = bch2_alloc_unpack ( k ) ;
pr_buf ( out , " gen %u oldest_gen %u data_type %u " ,
u . gen , u . oldest_gen , u . data_type ) ;
# define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name);
BCH_ALLOC_FIELDS_V2 ( )
# undef x
2017-03-17 09:18:50 +03:00
}
2020-05-24 21:06:10 +03:00
static int bch2_alloc_read_fn ( struct bch_fs * c , enum btree_id id ,
unsigned level , struct bkey_s_c k )
2017-03-17 09:18:50 +03:00
{
2020-10-17 04:32:02 +03:00
struct bch_dev * ca ;
struct bucket * g ;
struct bkey_alloc_unpacked u ;
2021-01-23 02:01:07 +03:00
if ( level | |
( k . k - > type ! = KEY_TYPE_alloc & &
k . k - > type ! = KEY_TYPE_alloc_v2 ) )
2020-10-17 04:32:02 +03:00
return 0 ;
ca = bch_dev_bkey_exists ( c , k . k - > p . inode ) ;
2021-01-22 04:51:51 +03:00
g = bucket ( ca , k . k - > p . offset ) ;
2020-10-17 04:32:02 +03:00
u = bch2_alloc_unpack ( k ) ;
g - > _mark . gen = u . gen ;
g - > _mark . data_type = u . data_type ;
g - > _mark . dirty_sectors = u . dirty_sectors ;
g - > _mark . cached_sectors = u . cached_sectors ;
g - > io_time [ READ ] = u . read_time ;
g - > io_time [ WRITE ] = u . write_time ;
g - > oldest_gen = u . oldest_gen ;
g - > gen_valid = 1 ;
2017-03-17 09:18:50 +03:00
2020-05-24 21:06:10 +03:00
return 0 ;
}
int bch2_alloc_read ( struct bch_fs * c , struct journal_keys * journal_keys )
{
struct bch_dev * ca ;
unsigned i ;
int ret = 0 ;
2019-12-28 04:51:35 +03:00
2020-10-17 04:32:02 +03:00
down_read ( & c - > gc_lock ) ;
2020-05-24 21:06:10 +03:00
ret = bch2_btree_and_journal_walk ( c , journal_keys , BTREE_ID_ALLOC ,
NULL , bch2_alloc_read_fn ) ;
2020-10-17 04:32:02 +03:00
up_read ( & c - > gc_lock ) ;
2019-04-17 22:49:28 +03:00
if ( ret ) {
bch_err ( c , " error reading alloc info: %i " , ret ) ;
2017-03-17 09:18:50 +03:00
return ret ;
2019-04-17 22:49:28 +03:00
}
2017-03-17 09:18:50 +03:00
2022-12-25 06:44:56 +03:00
percpu_down_write ( & c - > mark_lock ) ;
2019-03-30 05:22:45 +03:00
bch2_dev_usage_from_buckets ( c ) ;
2022-12-25 06:44:56 +03:00
percpu_up_write ( & c - > mark_lock ) ;
2018-11-19 09:31:41 +03:00
2017-03-17 09:18:50 +03:00
mutex_lock ( & c - > bucket_clock [ READ ] . lock ) ;
for_each_member_device ( ca , c , i ) {
down_read ( & ca - > bucket_lock ) ;
bch2_recalc_oldest_io ( c , ca , READ ) ;
up_read ( & ca - > bucket_lock ) ;
}
mutex_unlock ( & c - > bucket_clock [ READ ] . lock ) ;
mutex_lock ( & c - > bucket_clock [ WRITE ] . lock ) ;
for_each_member_device ( ca , c , i ) {
down_read ( & ca - > bucket_lock ) ;
bch2_recalc_oldest_io ( c , ca , WRITE ) ;
up_read ( & ca - > bucket_lock ) ;
}
mutex_unlock ( & c - > bucket_clock [ WRITE ] . lock ) ;
return 0 ;
}
2019-08-28 00:45:42 +03:00
static int bch2_alloc_write_key ( struct btree_trans * trans ,
struct btree_iter * iter ,
unsigned flags )
2019-03-14 03:49:16 +03:00
{
2019-08-28 00:45:42 +03:00
struct bch_fs * c = trans - > c ;
struct bkey_s_c k ;
2019-03-14 03:49:16 +03:00
struct bch_dev * ca ;
2019-08-28 00:45:42 +03:00
struct bucket * g ;
struct bucket_mark m ;
struct bkey_alloc_unpacked old_u , new_u ;
2021-01-23 02:01:07 +03:00
struct bkey_alloc_buf a ;
2019-03-14 03:49:16 +03:00
int ret ;
2019-08-28 00:45:42 +03:00
retry :
2019-10-05 19:54:53 +03:00
bch2_trans_begin ( trans ) ;
ret = bch2_btree_key_cache_flush ( trans ,
BTREE_ID_ALLOC , iter - > pos ) ;
if ( ret )
goto err ;
2019-08-28 00:45:42 +03:00
k = bch2_btree_iter_peek_slot ( iter ) ;
ret = bkey_err ( k ) ;
if ( ret )
goto err ;
2019-03-14 03:49:16 +03:00
2019-08-28 00:45:42 +03:00
old_u = bch2_alloc_unpack ( k ) ;
2019-03-14 03:49:16 +03:00
2019-08-28 00:45:42 +03:00
percpu_down_read ( & c - > mark_lock ) ;
ca = bch_dev_bkey_exists ( c , iter - > pos . inode ) ;
2021-01-22 04:51:51 +03:00
g = bucket ( ca , iter - > pos . offset ) ;
2019-08-28 00:45:42 +03:00
m = READ_ONCE ( g - > mark ) ;
2021-01-23 02:01:07 +03:00
new_u = alloc_mem_to_key ( iter , g , m ) ;
2019-08-28 00:45:42 +03:00
percpu_up_read ( & c - > mark_lock ) ;
2019-03-14 03:49:16 +03:00
2019-08-28 00:45:42 +03:00
if ( ! bkey_alloc_unpacked_cmp ( old_u , new_u ) )
2020-10-17 04:36:26 +03:00
return 0 ;
2019-03-14 03:49:16 +03:00
2021-01-23 02:01:07 +03:00
bch2_alloc_pack ( c , & a , new_u ) ;
bch2_trans_update ( trans , iter , & a . k ,
2020-01-01 00:17:42 +03:00
BTREE_TRIGGER_NORUN ) ;
2019-08-28 00:45:42 +03:00
ret = bch2_trans_commit ( trans , NULL , NULL ,
2020-12-22 01:17:18 +03:00
BTREE_INSERT_NOFAIL | flags ) ;
2019-03-14 03:49:16 +03:00
err :
2019-08-28 00:45:42 +03:00
if ( ret = = - EINTR )
goto retry ;
2019-03-14 03:49:16 +03:00
return ret ;
}
2021-01-23 01:56:34 +03:00
int bch2_alloc_write ( struct bch_fs * c , unsigned flags )
2017-03-17 09:18:50 +03:00
{
2019-04-16 21:42:05 +03:00
struct btree_trans trans ;
struct btree_iter * iter ;
2021-01-23 01:56:34 +03:00
struct bch_dev * ca ;
unsigned i ;
2017-03-17 09:18:50 +03:00
int ret = 0 ;
2019-10-05 19:54:53 +03:00
bch2_trans_init ( & trans , c , BTREE_ITER_MAX , 0 ) ;
2019-03-14 03:49:16 +03:00
2021-01-23 01:56:34 +03:00
iter = bch2_trans_get_iter ( & trans , BTREE_ID_ALLOC , POS_MIN ,
2019-04-16 21:42:05 +03:00
BTREE_ITER_SLOTS | BTREE_ITER_INTENT ) ;
2017-03-17 09:18:50 +03:00
2021-01-09 05:20:58 +03:00
for_each_member_device ( ca , c , i ) {
2021-01-23 01:56:34 +03:00
bch2_btree_iter_set_pos ( iter ,
POS ( ca - > dev_idx , ca - > mi . first_bucket ) ) ;
while ( iter - > pos . offset < ca - > mi . nbuckets ) {
bch2_trans_cond_resched ( & trans ) ;
ret = bch2_alloc_write_key ( & trans , iter , flags ) ;
if ( ret ) {
percpu_ref_put ( & ca - > io_ref ) ;
goto err ;
}
bch2_btree_iter_next_slot ( iter ) ;
2017-03-17 09:18:50 +03:00
}
}
2021-01-23 01:56:34 +03:00
err :
bch2_trans_exit ( & trans ) ;
2020-10-17 04:36:26 +03:00
return ret ;
2019-08-28 00:45:42 +03:00
}
2017-03-17 09:18:50 +03:00
/* Bucket IO clocks: */
static void bch2_recalc_oldest_io ( struct bch_fs * c , struct bch_dev * ca , int rw )
{
struct bucket_clock * clock = & c - > bucket_clock [ rw ] ;
struct bucket_array * buckets = bucket_array ( ca ) ;
struct bucket * g ;
u16 max_last_io = 0 ;
unsigned i ;
lockdep_assert_held ( & c - > bucket_clock [ rw ] . lock ) ;
/* Recalculate max_last_io for this device: */
for_each_bucket ( g , buckets )
max_last_io = max ( max_last_io , bucket_last_io ( c , g , rw ) ) ;
ca - > max_last_bucket_io [ rw ] = max_last_io ;
/* Recalculate global max_last_io: */
max_last_io = 0 ;
for_each_member_device ( ca , c , i )
max_last_io = max ( max_last_io , ca - > max_last_bucket_io [ rw ] ) ;
clock - > max_last_io = max_last_io ;
}
static void bch2_rescale_bucket_io_times ( struct bch_fs * c , int rw )
{
struct bucket_clock * clock = & c - > bucket_clock [ rw ] ;
struct bucket_array * buckets ;
struct bch_dev * ca ;
struct bucket * g ;
unsigned i ;
trace_rescale_prios ( c ) ;
for_each_member_device ( ca , c , i ) {
down_read ( & ca - > bucket_lock ) ;
buckets = bucket_array ( ca ) ;
for_each_bucket ( g , buckets )
g - > io_time [ rw ] = clock - > hand -
bucket_last_io ( c , g , rw ) / 2 ;
bch2_recalc_oldest_io ( c , ca , rw ) ;
up_read ( & ca - > bucket_lock ) ;
}
}
2018-11-05 06:09:51 +03:00
/*
 * Interval between bucket clock ticks: capacity / 1024, but never less than
 * 2028.
 *
 * NOTE(review): 2028 looks like it might have been meant as 2048 - confirm
 * before changing, since it alters timing.
 */
static inline u64 bucket_clock_freq(u64 capacity)
{
	u64 scaled = capacity >> 10;

	return max(scaled, 2028ULL);
}
2017-03-17 09:18:50 +03:00
static void bch2_inc_clock_hand ( struct io_timer * timer )
{
struct bucket_clock * clock = container_of ( timer ,
struct bucket_clock , rescale ) ;
struct bch_fs * c = container_of ( clock ,
struct bch_fs , bucket_clock [ clock - > rw ] ) ;
struct bch_dev * ca ;
u64 capacity ;
unsigned i ;
mutex_lock ( & clock - > lock ) ;
/* if clock cannot be advanced more, rescale prio */
if ( clock - > max_last_io > = U16_MAX - 2 )
bch2_rescale_bucket_io_times ( c , clock - > rw ) ;
BUG_ON ( clock - > max_last_io > = U16_MAX - 2 ) ;
for_each_member_device ( ca , c , i )
ca - > max_last_bucket_io [ clock - > rw ] + + ;
clock - > max_last_io + + ;
clock - > hand + + ;
mutex_unlock ( & clock - > lock ) ;
capacity = READ_ONCE ( c - > capacity ) ;
if ( ! capacity )
return ;
/*
* we only increment when 0.1 % of the filesystem capacity has been read
* or written too , this determines if it ' s time
*
* XXX : we shouldn ' t really be going off of the capacity of devices in
* RW mode ( that will be 0 when we ' re RO , yet we can still service
* reads )
*/
2018-11-05 06:09:51 +03:00
timer - > expire + = bucket_clock_freq ( capacity ) ;
2017-03-17 09:18:50 +03:00
bch2_io_timer_add ( & c - > io_clock [ clock - > rw ] , timer ) ;
}
static void bch2_bucket_clock_init ( struct bch_fs * c , int rw )
{
struct bucket_clock * clock = & c - > bucket_clock [ rw ] ;
clock - > hand = 1 ;
clock - > rw = rw ;
clock - > rescale . fn = bch2_inc_clock_hand ;
2018-11-05 06:09:51 +03:00
clock - > rescale . expire = bucket_clock_freq ( c - > capacity ) ;
2017-03-17 09:18:50 +03:00
mutex_init ( & clock - > lock ) ;
}
2020-10-17 04:39:16 +03:00
/*
 * Set the read or write time of a bucket to the current bucket clock hand
 * and commit the updated alloc key.
 *
 * @trans:     transaction to run the update in
 * @dev:       device index
 * @bucket_nr: bucket number on that device
 * @rw:        READ or WRITE - which timestamp to reset
 *
 * Returns 0 on success or if the time was already current, otherwise a
 * negative error code.
 */
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
			      size_t bucket_nr, int rw)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
	struct btree_iter *iter;
	struct bkey_alloc_buf *packed;
	struct bkey_alloc_unpacked u;
	struct bucket_mark mark;
	struct bucket *g;
	u64 *stamp;
	int ret = 0;

	iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
				   BTREE_ITER_CACHED|
				   BTREE_ITER_CACHED_NOFILL|
				   BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		goto out;

	packed = bch2_trans_kmalloc(trans, sizeof(*packed));
	ret = PTR_ERR_OR_ZERO(packed);
	if (ret)
		goto out;

	/* Build the key from the in-memory bucket, under mark_lock. */
	percpu_down_read(&c->mark_lock);
	g = bucket(ca, bucket_nr);
	mark = READ_ONCE(g->mark);
	u = alloc_mem_to_key(iter, g, mark);
	percpu_up_read(&c->mark_lock);

	if (rw == READ)
		stamp = &u.read_time;
	else
		stamp = &u.write_time;

	/* Already up to date: skip the btree update entirely. */
	if (*stamp == c->bucket_clock[rw].hand)
		goto out;

	*stamp = c->bucket_clock[rw].hand;

	bch2_alloc_pack(c, packed, u);

	ret = bch2_trans_update(trans, iter, &packed->k, 0);
	if (!ret)
		ret = bch2_trans_commit(trans, NULL, NULL, 0);
out:
	bch2_trans_iter_put(trans, iter);
	return ret;
}
2017-03-17 09:18:50 +03:00
/* Background allocator thread: */
/*
 * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
 * (marking them as invalidated on disk), then optionally issues discard
 * commands to the newly free buckets, then puts them on the various freelists.
 */
/**
 * wait_buckets_available - wait on reclaimable buckets
 *
 * If there aren't enough available buckets to fill up free_inc, wait until
 * there are.
 */
static int wait_buckets_available ( struct bch_fs * c , struct bch_dev * ca )
{
unsigned long gc_count = c - > gc_count ;
2021-01-22 04:51:51 +03:00
s64 available ;
unsigned i ;
2017-03-17 09:18:50 +03:00
int ret = 0 ;
2019-04-16 22:13:16 +03:00
ca - > allocator_state = ALLOCATOR_BLOCKED ;
closure_wake_up ( & c - > freelist_wait ) ;
2017-03-17 09:18:50 +03:00
while ( 1 ) {
set_current_state ( TASK_INTERRUPTIBLE ) ;
if ( kthread_should_stop ( ) ) {
ret = 1 ;
break ;
}
if ( gc_count ! = c - > gc_count )
ca - > inc_gen_really_needs_gc = 0 ;
2021-01-22 04:51:51 +03:00
available = dev_buckets_available ( ca ) ;
available - = ca - > inc_gen_really_needs_gc ;
spin_lock ( & c - > freelist_lock ) ;
for ( i = 0 ; i < RESERVE_NR ; i + + )
available - = fifo_used ( & ca - > free [ i ] ) ;
spin_unlock ( & c - > freelist_lock ) ;
available = max ( available , 0LL ) ;
2020-06-16 00:38:26 +03:00
if ( available > fifo_free ( & ca - > free_inc ) | |
2020-07-22 00:12:39 +03:00
( available & &
2020-12-22 01:17:18 +03:00
! fifo_full ( & ca - > free [ RESERVE_MOVINGGC ] ) ) )
2017-03-17 09:18:50 +03:00
break ;
up_read ( & c - > gc_lock ) ;
schedule ( ) ;
try_to_freeze ( ) ;
down_read ( & c - > gc_lock ) ;
}
__set_current_state ( TASK_RUNNING ) ;
2019-04-16 22:13:16 +03:00
ca - > allocator_state = ALLOCATOR_RUNNING ;
closure_wake_up ( & c - > freelist_wait ) ;
2017-03-17 09:18:50 +03:00
return ret ;
}
/*
 * Decide whether @bucket (with mark snapshot @mark) may be invalidated now.
 *
 * For buckets that pass the basic checks this also bumps the device's
 * inc_gen_needs_gc / inc_gen_really_needs_gc counters when the bucket's gc
 * gen reaches half of, or all of, BUCKET_GC_GEN_MAX.
 */
static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
				       size_t bucket,
				       struct bucket_mark mark)
{
	u8 gen_age;

	if (!is_available_bucket(mark) ||
	    mark.owned_by_allocator ||
	    (ca->buckets_nouse &&
	     test_bit(bucket, ca->buckets_nouse)))
		return false;

	gen_age = bucket_gc_gen(ca, bucket);

	if (gen_age >= BUCKET_GC_GEN_MAX / 2)
		ca->inc_gen_needs_gc++;

	if (gen_age >= BUCKET_GC_GEN_MAX) {
		ca->inc_gen_really_needs_gc++;
		return false;
	}

	return true;
}
/*
 * Determines what order we're going to reuse buckets, smallest
 * bucket_sort_key() first.
 *
 * - We take into account the read prio of the bucket, which gives us an
 *   indication of how hot the data is -- we scale the prio so that the prio
 *   farthest from the clock is worth 1/8th of the closest.
 *
 * - The number of sectors of cached data in the bucket, which gives us an
 *   indication of the cost in cache misses this eviction will cause.
 *
 * - If hotness * sectors used compares equal, we pick the bucket with the
 *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
 *   number repeatedly forces us to run mark and sweep gc to avoid generation
 *   number wraparound.
 */
/*
 * Compute the LRU sort key for bucket @b with mark snapshot @m; buckets with
 * smaller keys are reused first.
 *
 * Key layout, most to least significant: (hotness + 1) * sectors used,
 * shifted up by 9; one bit at position 8 for "needs journal commit"; and
 * the bucket's gc gen divided by 16 in the low bits.
 */
static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
				     size_t b, struct bucket_mark m)
{
	unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
	unsigned max_last_io = ca->max_last_bucket_io[READ];

	/*
	 * Time since last read, scaled to [0, 8) where larger value indicates
	 * more recently read data.
	 *
	 * max_last_io is 0 when no bucket on the device has a nonzero
	 * last-IO age; avoid dividing by zero and use the value the formula
	 * yields for last_io == 0.
	 */
	unsigned long hotness = max_last_io
		? (max_last_io - last_io) * 7 / max_last_io
		: 7;

	/* How much we want to keep the data in this bucket: */
	unsigned long data_wantness =
		(hotness + 1) * bucket_sectors_used(m);

	unsigned long needs_journal_commit =
		bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);

	return  (data_wantness << 9) |
		(needs_journal_commit << 8) |
		(bucket_gc_gen(ca, b) / 16);
}
/*
 * Heap ordering for alloc_heap entries: ascending sort key, then descending
 * run length (nr), then ascending starting bucket.
 */
static inline int bucket_alloc_cmp(alloc_heap *h,
				   struct alloc_heap_entry l,
				   struct alloc_heap_entry r)
{
	int order;

	order = cmp_int(l.key, r.key);
	if (order)
		return order;

	/* operands swapped on purpose: longer runs sort first */
	order = cmp_int(r.nr, l.nr);
	if (order)
		return order;

	return cmp_int(l.bucket, r.bucket);
}
2018-07-22 17:43:01 +03:00
static inline int bucket_idx_cmp ( const void * _l , const void * _r )
{
const struct alloc_heap_entry * l = _l , * r = _r ;
2019-04-12 11:54:12 +03:00
return cmp_int ( l - > bucket , r - > bucket ) ;
2018-07-22 17:43:01 +03:00
}
2017-03-17 09:18:50 +03:00
static void find_reclaimable_buckets_lru ( struct bch_fs * c , struct bch_dev * ca )
{
struct bucket_array * buckets ;
struct alloc_heap_entry e = { 0 } ;
2018-07-22 17:43:01 +03:00
size_t b , i , nr = 0 ;
2017-03-17 09:18:50 +03:00
ca - > alloc_heap . used = 0 ;
mutex_lock ( & c - > bucket_clock [ READ ] . lock ) ;
down_read ( & ca - > bucket_lock ) ;
buckets = bucket_array ( ca ) ;
bch2_recalc_oldest_io ( c , ca , READ ) ;
/*
* Find buckets with lowest read priority , by building a maxheap sorted
* by read priority and repeatedly replacing the maximum element until
* all buckets have been visited .
*/
for ( b = ca - > mi . first_bucket ; b < ca - > mi . nbuckets ; b + + ) {
struct bucket_mark m = READ_ONCE ( buckets - > b [ b ] . mark ) ;
unsigned long key = bucket_sort_key ( c , ca , b , m ) ;
if ( ! bch2_can_invalidate_bucket ( ca , b , m ) )
continue ;
if ( e . nr & & e . bucket + e . nr = = b & & e . key = = key ) {
e . nr + + ;
} else {
if ( e . nr )
2018-10-21 23:32:51 +03:00
heap_add_or_replace ( & ca - > alloc_heap , e ,
- bucket_alloc_cmp , NULL ) ;
2017-03-17 09:18:50 +03:00
e = ( struct alloc_heap_entry ) {
. bucket = b ,
. nr = 1 ,
. key = key ,
} ;
}
cond_resched ( ) ;
}
if ( e . nr )
2018-10-21 23:32:51 +03:00
heap_add_or_replace ( & ca - > alloc_heap , e ,
- bucket_alloc_cmp , NULL ) ;
2017-03-17 09:18:50 +03:00
2018-07-22 17:43:01 +03:00
for ( i = 0 ; i < ca - > alloc_heap . used ; i + + )
nr + = ca - > alloc_heap . data [ i ] . nr ;
2017-03-17 09:18:50 +03:00
2018-07-22 17:43:01 +03:00
while ( nr - ca - > alloc_heap . data [ 0 ] . nr > = ALLOC_SCAN_BATCH ( ca ) ) {
nr - = ca - > alloc_heap . data [ 0 ] . nr ;
2018-10-21 23:32:51 +03:00
heap_pop ( & ca - > alloc_heap , e , - bucket_alloc_cmp , NULL ) ;
2017-03-17 09:18:50 +03:00
}
2018-07-22 17:43:01 +03:00
up_read ( & ca - > bucket_lock ) ;
mutex_unlock ( & c - > bucket_clock [ READ ] . lock ) ;
2017-03-17 09:18:50 +03:00
}
static void find_reclaimable_buckets_fifo ( struct bch_fs * c , struct bch_dev * ca )
{
struct bucket_array * buckets = bucket_array ( ca ) ;
struct bucket_mark m ;
2018-07-22 17:43:01 +03:00
size_t b , start ;
2017-03-17 09:18:50 +03:00
2018-07-22 17:43:01 +03:00
if ( ca - > fifo_last_bucket < ca - > mi . first_bucket | |
ca - > fifo_last_bucket > = ca - > mi . nbuckets )
ca - > fifo_last_bucket = ca - > mi . first_bucket ;
start = ca - > fifo_last_bucket ;
2017-03-17 09:18:50 +03:00
2018-07-22 17:43:01 +03:00
do {
ca - > fifo_last_bucket + + ;
if ( ca - > fifo_last_bucket = = ca - > mi . nbuckets )
ca - > fifo_last_bucket = ca - > mi . first_bucket ;
2017-03-17 09:18:50 +03:00
2018-07-22 17:43:01 +03:00
b = ca - > fifo_last_bucket ;
2017-03-17 09:18:50 +03:00
m = READ_ONCE ( buckets - > b [ b ] . mark ) ;
2018-07-22 17:43:01 +03:00
if ( bch2_can_invalidate_bucket ( ca , b , m ) ) {
struct alloc_heap_entry e = { . bucket = b , . nr = 1 , } ;
2018-10-21 23:32:51 +03:00
heap_add ( & ca - > alloc_heap , e , bucket_alloc_cmp , NULL ) ;
2018-07-22 17:43:01 +03:00
if ( heap_full ( & ca - > alloc_heap ) )
break ;
}
2017-03-17 09:18:50 +03:00
cond_resched ( ) ;
2018-07-22 17:43:01 +03:00
} while ( ca - > fifo_last_bucket ! = start ) ;
2017-03-17 09:18:50 +03:00
}
static void find_reclaimable_buckets_random ( struct bch_fs * c , struct bch_dev * ca )
{
struct bucket_array * buckets = bucket_array ( ca ) ;
struct bucket_mark m ;
2018-07-22 17:43:01 +03:00
size_t checked , i ;
2017-03-17 09:18:50 +03:00
for ( checked = 0 ;
2018-07-22 17:43:01 +03:00
checked < ca - > mi . nbuckets / 2 ;
2017-03-17 09:18:50 +03:00
checked + + ) {
size_t b = bch2_rand_range ( ca - > mi . nbuckets -
ca - > mi . first_bucket ) +
ca - > mi . first_bucket ;
m = READ_ONCE ( buckets - > b [ b ] . mark ) ;
2018-07-22 17:43:01 +03:00
if ( bch2_can_invalidate_bucket ( ca , b , m ) ) {
struct alloc_heap_entry e = { . bucket = b , . nr = 1 , } ;
2018-10-21 23:32:51 +03:00
heap_add ( & ca - > alloc_heap , e , bucket_alloc_cmp , NULL ) ;
2018-07-22 17:43:01 +03:00
if ( heap_full ( & ca - > alloc_heap ) )
break ;
}
2017-03-17 09:18:50 +03:00
cond_resched ( ) ;
}
2018-07-22 17:43:01 +03:00
sort ( ca - > alloc_heap . data ,
ca - > alloc_heap . used ,
sizeof ( ca - > alloc_heap . data [ 0 ] ) ,
bucket_idx_cmp , NULL ) ;
/* remove duplicates: */
for ( i = 0 ; i + 1 < ca - > alloc_heap . used ; i + + )
if ( ca - > alloc_heap . data [ i ] . bucket = =
ca - > alloc_heap . data [ i + 1 ] . bucket )
ca - > alloc_heap . data [ i ] . nr = 0 ;
2017-03-17 09:18:50 +03:00
}
2018-07-22 17:43:01 +03:00
static size_t find_reclaimable_buckets ( struct bch_fs * c , struct bch_dev * ca )
2017-03-17 09:18:50 +03:00
{
2018-07-22 17:43:01 +03:00
size_t i , nr = 0 ;
2017-03-17 09:18:50 +03:00
ca - > inc_gen_needs_gc = 0 ;
switch ( ca - > mi . replacement ) {
case CACHE_REPLACEMENT_LRU :
find_reclaimable_buckets_lru ( c , ca ) ;
break ;
case CACHE_REPLACEMENT_FIFO :
find_reclaimable_buckets_fifo ( c , ca ) ;
break ;
case CACHE_REPLACEMENT_RANDOM :
find_reclaimable_buckets_random ( c , ca ) ;
break ;
}
2018-07-22 17:43:01 +03:00
2018-10-21 23:32:51 +03:00
heap_resort ( & ca - > alloc_heap , bucket_alloc_cmp , NULL ) ;
2018-07-22 17:43:01 +03:00
for ( i = 0 ; i < ca - > alloc_heap . used ; i + + )
nr + = ca - > alloc_heap . data [ i ] . nr ;
return nr ;
2017-03-17 09:18:50 +03:00
}
2018-07-22 17:43:01 +03:00
static inline long next_alloc_bucket ( struct bch_dev * ca )
2017-03-17 09:18:50 +03:00
{
2018-07-22 17:43:01 +03:00
struct alloc_heap_entry e , * top = ca - > alloc_heap . data ;
while ( ca - > alloc_heap . used ) {
if ( top - > nr ) {
size_t b = top - > bucket ;
top - > bucket + + ;
top - > nr - - ;
return b ;
}
2017-03-17 09:18:50 +03:00
2018-10-21 23:32:51 +03:00
heap_pop ( & ca - > alloc_heap , e , bucket_alloc_cmp , NULL ) ;
2018-07-22 17:43:01 +03:00
}
return - 1 ;
2017-03-17 09:18:50 +03:00
}
2019-02-13 22:46:32 +03:00
/*
 * Returns the sequence number of the most recent journal entry that updated
 * this bucket:
 */
static u64 bucket_journal_seq ( struct bch_fs * c , struct bucket_mark m )
{
if ( m . journal_seq_valid ) {
u64 journal_seq = atomic64_read ( & c - > journal . seq ) ;
u64 bucket_seq = journal_seq ;
bucket_seq & = ~ ( ( u64 ) U16_MAX ) ;
bucket_seq | = m . journal_seq ;
if ( bucket_seq > journal_seq )
bucket_seq - = 1 < < 16 ;
return bucket_seq ;
} else {
return 0 ;
}
}
2019-03-14 03:49:16 +03:00
/*
 * Invalidate the bucket at the front of ca->alloc_heap and move it onto
 * ca->free_inc.
 *
 * @trans:		btree transaction used for the alloc key update
 * @ca:			device the bucket belongs to
 * @iter:		alloc btree iterator; repositioned to the bucket here
 * @journal_seq:	in/out: raised to the journal sequence returned by
 *			bucket_journal_seq() for this bucket on success; also
 *			handed to bch2_trans_commit() when cached data is
 *			being invalidated
 * @flags:		additional flags OR'd into the bch2_trans_commit() flags
 *
 * The bucket is first pushed onto free_inc and marked as owned by the
 * allocator.  If anything afterwards fails (or the allocator is stopping),
 * that is undone again: the bucket is popped back off free_inc and unmarked.
 *
 * Returns 0 on success, and also 0 when the update was skipped because
 * BCH_FS_ALLOCATOR_STOPPING is set; a negative error code otherwise.
 */
static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
				       struct bch_dev *ca,
				       struct btree_iter *iter,
				       u64 *journal_seq, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_alloc_buf a;
	struct bkey_alloc_unpacked u;
	struct bucket *g;
	struct bucket_mark m;
	bool invalidating_cached_data;
	size_t b;
	int ret = 0;

	BUG_ON(!ca->alloc_heap.used ||
	       !ca->alloc_heap.data[0].nr);
	b = ca->alloc_heap.data[0].bucket;

	/* first, put on free_inc and mark as owned by allocator: */
	percpu_down_read(&c->mark_lock);
	g = bucket(ca, b);
	m = READ_ONCE(g->mark);

	BUG_ON(m.dirty_sectors);

	bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);

	spin_lock(&c->freelist_lock);
	verify_not_on_freelist(c, ca, b);
	BUG_ON(!fifo_push(&ca->free_inc, b));
	spin_unlock(&c->freelist_lock);

	/*
	 * If we're not invalidating cached data, we only increment the bucket
	 * gen in memory here, the incremented gen will be updated in the btree
	 * by bch2_trans_mark_pointer():
	 */
	if (!m.cached_sectors &&
	    !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
		BUG_ON(m.data_type);
		bucket_cmpxchg(g, m, m.gen++);
		percpu_up_read(&c->mark_lock);
		goto out;
	}

	percpu_up_read(&c->mark_lock);

	/*
	 * If the read-only path is trying to shut down, we can't be generating
	 * new btree updates:
	 */
	if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) {
		/* positive: take the undo path below, but report no error */
		ret = 1;
		goto out;
	}

	bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
retry:
	ret = bch2_btree_iter_traverse(iter);
	if (ret) {
		/*
		 * The bucket is already on free_inc and marked as owned by
		 * the allocator: go through the common exit path so that is
		 * undone, instead of returning with the bucket stranded on
		 * free_inc without having been invalidated.
		 */
		goto out;
	}

	percpu_down_read(&c->mark_lock);
	g = bucket(ca, iter->pos.offset);
	m = READ_ONCE(g->mark);
	u = alloc_mem_to_key(iter, g, m);

	percpu_up_read(&c->mark_lock);

	invalidating_cached_data = u.cached_sectors != 0;

	/* New alloc key: bumped gen, no data, fresh read/write times. */
	u.gen++;
	u.data_type	= 0;
	u.dirty_sectors	= 0;
	u.cached_sectors = 0;
	u.read_time	= c->bucket_clock[READ].hand;
	u.write_time	= c->bucket_clock[WRITE].hand;

	bch2_alloc_pack(c, &a, u);
	bch2_trans_update(trans, iter, &a.k,
			  BTREE_TRIGGER_BUCKET_INVALIDATE);

	/*
	 * XXX:
	 * when using deferred btree updates, we have journal reclaim doing
	 * btree updates and thus requiring the allocator to make forward
	 * progress, and here the allocator is requiring space in the journal -
	 * so we need a journal pre-reservation:
	 */
	ret = bch2_trans_commit(trans, NULL,
				invalidating_cached_data ? journal_seq : NULL,
				BTREE_INSERT_NOUNLOCK|
				BTREE_INSERT_NOCHECK_RW|
				BTREE_INSERT_NOFAIL|
				BTREE_INSERT_JOURNAL_RESERVED|
				flags);
	if (ret == -EINTR)
		goto retry;
out:
	if (!ret) {
		/* remove from alloc_heap: */
		struct alloc_heap_entry e, *top = ca->alloc_heap.data;

		top->bucket++;
		top->nr--;

		if (!top->nr)
			heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);

		/*
		 * Make sure we flush the last journal entry that updated this
		 * bucket (i.e. deleting the last reference) before writing to
		 * this bucket again:
		 */
		*journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
	} else {
		size_t b2;

		/* remove from free_inc: */
		percpu_down_read(&c->mark_lock);
		spin_lock(&c->freelist_lock);

		bch2_mark_alloc_bucket(c, ca, b, false,
				       gc_pos_alloc(c, NULL), 0);

		/* The bucket pushed above must still be the last entry. */
		BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
		BUG_ON(b != b2);

		spin_unlock(&c->freelist_lock);
		percpu_up_read(&c->mark_lock);
	}

	/* Positive values are internal "skipped" markers, not errors. */
	return ret < 0 ? ret : 0;
}
2018-07-22 17:43:01 +03:00
/*
* Pull buckets off ca - > alloc_heap , invalidate them , move them to ca - > free_inc :
*/
static int bch2_invalidate_buckets ( struct bch_fs * c , struct bch_dev * ca )
2017-03-17 09:18:50 +03:00
{
2019-03-14 03:49:16 +03:00
struct btree_trans trans ;
struct btree_iter * iter ;
2018-07-22 17:43:01 +03:00
u64 journal_seq = 0 ;
2017-03-17 09:18:50 +03:00
int ret = 0 ;
2019-05-15 17:54:43 +03:00
bch2_trans_init ( & trans , c , 0 , 0 ) ;
2019-03-14 03:49:16 +03:00
iter = bch2_trans_get_iter ( & trans , BTREE_ID_ALLOC ,
POS ( ca - > dev_idx , 0 ) ,
2019-10-05 19:54:53 +03:00
BTREE_ITER_CACHED |
BTREE_ITER_CACHED_NOFILL |
BTREE_ITER_INTENT ) ;
2017-03-17 09:18:50 +03:00
/* Only use nowait if we've already invalidated at least one bucket: */
2018-07-22 17:43:01 +03:00
while ( ! ret & &
! fifo_full ( & ca - > free_inc ) & &
2019-02-13 22:46:32 +03:00
ca - > alloc_heap . used )
2019-03-14 03:49:16 +03:00
ret = bch2_invalidate_one_bucket2 ( & trans , ca , iter , & journal_seq ,
2019-02-12 03:04:40 +03:00
BTREE_INSERT_GC_LOCK_HELD |
2019-02-13 23:17:23 +03:00
( ! fifo_empty ( & ca - > free_inc )
? BTREE_INSERT_NOWAIT : 0 ) ) ;
2017-03-17 09:18:50 +03:00
2019-03-14 03:49:16 +03:00
bch2_trans_exit ( & trans ) ;
2017-03-17 09:18:50 +03:00
/* If we used NOWAIT, don't return the error: */
2018-07-22 17:43:01 +03:00
if ( ! fifo_empty ( & ca - > free_inc ) )
ret = 0 ;
if ( ret ) {
bch_err ( ca , " error invalidating buckets: %i " , ret ) ;
return ret ;
}
2017-03-17 09:18:50 +03:00
2018-07-22 17:43:01 +03:00
if ( journal_seq )
ret = bch2_journal_flush_seq ( & c - > journal , journal_seq ) ;
if ( ret ) {
bch_err ( ca , " journal error: %i " , ret ) ;
return ret ;
}
2017-03-17 09:18:50 +03:00
2018-07-22 17:43:01 +03:00
return 0 ;
2017-03-17 09:18:50 +03:00
}
/*
 * Move @bucket from ca->free_inc onto the first ca->free[] reserve that has
 * room, sleeping (interruptibly) and retrying while all of them are full.
 *
 * While blocked, ca->allocator_state is set to ALLOCATOR_BLOCKED_FULL; it is
 * set back to ALLOCATOR_RUNNING once the bucket has been pushed.
 *
 * Returns 0 once the bucket was pushed, 1 if the calling kthread was asked to
 * stop before that happened.
 */
static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
{
	unsigned reserve;

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);

		spin_lock(&c->freelist_lock);

		for (reserve = 0; reserve < RESERVE_NR; reserve++) {
			/*
			 * Don't strand buckets on the copygc freelist until
			 * after recovery is finished:
			 */
			if (reserve == RESERVE_MOVINGGC &&
			    !test_bit(BCH_FS_STARTED, &c->flags))
				continue;

			if (!fifo_push(&ca->free[reserve], bucket))
				continue;

			/* Pushed: take it off free_inc and wake any waiters. */
			fifo_pop(&ca->free_inc, bucket);

			closure_wake_up(&c->freelist_wait);
			ca->allocator_state = ALLOCATOR_RUNNING;

			spin_unlock(&c->freelist_lock);
			__set_current_state(TASK_RUNNING);
			return 0;
		}

		/* Every eligible reserve is full: record that we're blocked. */
		if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) {
			ca->allocator_state = ALLOCATOR_BLOCKED_FULL;
			closure_wake_up(&c->freelist_wait);
		}

		spin_unlock(&c->freelist_lock);

		if ((current->flags & PF_KTHREAD) &&
		    kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			return 1;
		}

		schedule();
		try_to_freeze();
	}
}
/*
2018-07-22 17:43:01 +03:00
* Pulls buckets off free_inc , discards them ( if enabled ) , then adds them to
* freelists , waiting until there ' s room if necessary :
2017-03-17 09:18:50 +03:00
*/
static int discard_invalidated_buckets ( struct bch_fs * c , struct bch_dev * ca )
{
2018-07-22 17:43:01 +03:00
while ( ! fifo_empty ( & ca - > free_inc ) ) {
2017-03-17 09:18:50 +03:00
size_t bucket = fifo_peek ( & ca - > free_inc ) ;
if ( ca - > mi . discard & &
bdev_max_discard_sectors ( ca - > disk_sb . bdev ) )
blkdev_issue_discard ( ca - > disk_sb . bdev ,
bucket_to_sector ( ca , bucket ) ,
ca - > mi . bucket_size , GFP_NOIO ) ;
if ( push_invalidated_bucket ( c , ca , bucket ) )
return 1 ;
}
return 0 ;
}
/**
* bch_allocator_thread - move buckets from free_inc to reserves
*
* The free_inc FIFO is populated by find_reclaimable_buckets ( ) , and
* the reserves are depleted by bucket allocation . When we run out
* of free_inc , try to invalidate some buckets and write out
* prios and gens .
*/
static int bch2_allocator_thread ( void * arg )
{
struct bch_dev * ca = arg ;
struct bch_fs * c = ca - > fs ;
2018-07-22 17:43:01 +03:00
size_t nr ;
2017-03-17 09:18:50 +03:00
int ret ;
set_freezable ( ) ;
2019-04-16 22:13:16 +03:00
ca - > allocator_state = ALLOCATOR_RUNNING ;
2017-03-17 09:18:50 +03:00
while ( 1 ) {
2018-07-22 17:43:01 +03:00
cond_resched ( ) ;
2020-06-18 01:20:26 +03:00
if ( kthread_should_stop ( ) )
break ;
2017-03-17 09:18:50 +03:00
2018-07-22 17:43:01 +03:00
pr_debug ( " discarding %zu invalidated buckets " ,
fifo_used ( & ca - > free_inc ) ) ;
2017-03-17 09:18:50 +03:00
2018-07-22 17:43:01 +03:00
ret = discard_invalidated_buckets ( c , ca ) ;
if ( ret )
goto stop ;
2017-03-17 09:18:50 +03:00
2018-07-25 02:45:22 +03:00
down_read ( & c - > gc_lock ) ;
2018-07-22 17:43:01 +03:00
ret = bch2_invalidate_buckets ( c , ca ) ;
2018-07-25 02:45:22 +03:00
if ( ret ) {
up_read ( & c - > gc_lock ) ;
2018-07-22 17:43:01 +03:00
goto stop ;
2018-07-25 02:45:22 +03:00
}
2017-03-17 09:18:50 +03:00
2018-07-25 02:45:22 +03:00
if ( ! fifo_empty ( & ca - > free_inc ) ) {
up_read ( & c - > gc_lock ) ;
2018-07-22 17:43:01 +03:00
continue ;
2018-07-25 02:45:22 +03:00
}
2017-03-17 09:18:50 +03:00
pr_debug ( " free_inc now empty " ) ;
2018-07-22 17:43:01 +03:00
do {
2017-03-17 09:18:50 +03:00
/*
* Find some buckets that we can invalidate , either
* they ' re completely unused , or only contain clean data
* that ' s been written back to the backing device or
* another cache tier
*/
pr_debug ( " scanning for reclaimable buckets " ) ;
2018-07-22 17:43:01 +03:00
nr = find_reclaimable_buckets ( c , ca ) ;
2017-03-17 09:18:50 +03:00
2018-07-22 17:43:01 +03:00
pr_debug ( " found %zu buckets " , nr ) ;
2017-03-17 09:18:50 +03:00
2018-07-22 17:43:01 +03:00
trace_alloc_batch ( ca , nr , ca - > alloc_heap . size ) ;
2017-03-17 09:18:50 +03:00
2018-07-22 17:43:01 +03:00
if ( ( ca - > inc_gen_needs_gc > = ALLOC_SCAN_BATCH ( ca ) | |
ca - > inc_gen_really_needs_gc ) & &
2017-03-17 09:18:50 +03:00
c - > gc_thread ) {
atomic_inc ( & c - > kick_gc ) ;
wake_up_process ( c - > gc_thread ) ;
}
/*
2018-07-22 17:43:01 +03:00
* If we found any buckets , we have to invalidate them
* before we scan for more - but if we didn ' t find very
* many we may want to wait on more buckets being
* available so we don ' t spin :
2017-03-17 09:18:50 +03:00
*/
2018-07-22 17:43:01 +03:00
if ( ! nr | |
( nr < ALLOC_SCAN_BATCH ( ca ) & &
2019-09-19 02:33:12 +03:00
! fifo_empty ( & ca - > free [ RESERVE_NONE ] ) ) ) {
2018-07-22 17:43:01 +03:00
ret = wait_buckets_available ( c , ca ) ;
if ( ret ) {
up_read ( & c - > gc_lock ) ;
goto stop ;
}
2017-03-17 09:18:50 +03:00
}
2018-07-22 17:43:01 +03:00
} while ( ! nr ) ;
2017-03-17 09:18:50 +03:00
up_read ( & c - > gc_lock ) ;
2018-07-22 17:43:01 +03:00
pr_debug ( " %zu buckets to invalidate " , nr ) ;
2017-03-17 09:18:50 +03:00
/*
2018-07-22 17:43:01 +03:00
* alloc_heap is now full of newly - invalidated buckets : next ,
2017-03-17 09:18:50 +03:00
* write out the new bucket gens :
*/
}
stop :
pr_debug ( " alloc thread stopping (ret %i) " , ret ) ;
2019-04-16 22:13:16 +03:00
ca - > allocator_state = ALLOCATOR_STOPPED ;
closure_wake_up ( & c - > freelist_wait ) ;
2017-03-17 09:18:50 +03:00
return 0 ;
}
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity ( struct bch_fs * c )
{
struct bch_dev * ca ;
2020-07-11 23:28:54 +03:00
u64 capacity = 0 , reserved_sectors = 0 , gc_reserve , copygc_threshold = 0 ;
2018-11-05 05:55:35 +03:00
unsigned bucket_size_max = 0 ;
2017-03-17 09:18:50 +03:00
unsigned long ra_pages = 0 ;
unsigned i , j ;
lockdep_assert_held ( & c - > state_lock ) ;
for_each_online_member ( ca , c , i ) {
struct backing_dev_info * bdi = ca - > disk_sb . bdev - > bd_disk - > bdi ;
ra_pages + = bdi - > ra_pages ;
}
bch2_set_ra_pages ( c , ra_pages ) ;
for_each_rw_member ( ca , c , i ) {
2018-07-24 21:55:05 +03:00
u64 dev_reserve = 0 ;
2017-03-17 09:18:50 +03:00
/*
* We need to reserve buckets ( from the number
* of currently available buckets ) against
* foreground writes so that mainly copygc can
* make forward progress .
*
* We need enough to refill the various reserves
* from scratch - copygc will use its entire
* reserve all at once , then run against when
* its reserve is refilled ( from the formerly
* available buckets ) .
*
* This reserve is just used when considering if
* allocations for foreground writes must wait -
* not - ENOSPC calculations .
*/
for ( j = 0 ; j < RESERVE_NONE ; j + + )
2018-08-01 21:26:55 +03:00
dev_reserve + = ca - > free [ j ] . size ;
2017-03-17 09:18:50 +03:00
2018-08-01 21:26:55 +03:00
dev_reserve + = 1 ; /* btree write point */
dev_reserve + = 1 ; /* copygc write point */
dev_reserve + = 1 ; /* rebalance write point */
2017-03-17 09:18:50 +03:00
2018-08-01 21:26:55 +03:00
dev_reserve * = ca - > mi . bucket_size ;
2017-03-17 09:18:50 +03:00
2020-07-11 23:28:54 +03:00
copygc_threshold + = dev_reserve ;
2018-08-01 21:26:55 +03:00
2018-07-24 21:55:05 +03:00
capacity + = bucket_to_sector ( ca , ca - > mi . nbuckets -
ca - > mi . first_bucket ) ;
2017-03-17 09:18:50 +03:00
2018-07-24 21:55:05 +03:00
reserved_sectors + = dev_reserve * 2 ;
2018-11-05 05:55:35 +03:00
bucket_size_max = max_t ( unsigned , bucket_size_max ,
ca - > mi . bucket_size ) ;
2018-08-01 21:26:55 +03:00
}
2017-03-17 09:18:50 +03:00
2018-07-24 21:55:05 +03:00
gc_reserve = c - > opts . gc_reserve_bytes
? c - > opts . gc_reserve_bytes > > 9
: div64_u64 ( capacity * c - > opts . gc_reserve_percent , 100 ) ;
reserved_sectors = max ( gc_reserve , reserved_sectors ) ;
2017-03-17 09:18:50 +03:00
2018-07-24 21:55:05 +03:00
reserved_sectors = min ( reserved_sectors , capacity ) ;
2017-03-17 09:18:50 +03:00
2020-07-11 23:28:54 +03:00
c - > copygc_threshold = copygc_threshold ;
2018-08-01 21:26:55 +03:00
c - > capacity = capacity - reserved_sectors ;
2017-03-17 09:18:50 +03:00
2018-11-05 05:55:35 +03:00
c - > bucket_size_max = bucket_size_max ;
2017-03-17 09:18:50 +03:00
/* Wake up case someone was waiting for buckets */
closure_wake_up ( & c - > freelist_wait ) ;
}
static bool bch2_dev_has_open_write_point ( struct bch_fs * c , struct bch_dev * ca )
{
struct open_bucket * ob ;
bool ret = false ;
for ( ob = c - > open_buckets ;
ob < c - > open_buckets + ARRAY_SIZE ( c - > open_buckets ) ;
ob + + ) {
spin_lock ( & ob - > lock ) ;
if ( ob - > valid & & ! ob - > on_partial_list & &
ob - > ptr . dev = = ca - > dev_idx )
ret = true ;
spin_unlock ( & ob - > lock ) ;
}
return ret ;
}
/* device goes ro: */
void bch2_dev_allocator_remove ( struct bch_fs * c , struct bch_dev * ca )
{
unsigned i ;
BUG_ON ( ca - > alloc_thread ) ;
/* First, remove device from allocation groups: */
for ( i = 0 ; i < ARRAY_SIZE ( c - > rw_devs ) ; i + + )
clear_bit ( ca - > dev_idx , c - > rw_devs [ i ] . d ) ;
/*
* Capacity is calculated based off of devices in allocation groups :
*/
bch2_recalc_capacity ( c ) ;
/* Next, close write points that point to this device... */
for ( i = 0 ; i < ARRAY_SIZE ( c - > write_points ) ; i + + )
2018-10-06 07:46:55 +03:00
bch2_writepoint_stop ( c , ca , & c - > write_points [ i ] ) ;
2017-03-17 09:18:50 +03:00
2020-07-11 23:28:54 +03:00
bch2_writepoint_stop ( c , ca , & c - > copygc_write_point ) ;
2018-10-06 07:46:55 +03:00
bch2_writepoint_stop ( c , ca , & c - > rebalance_write_point ) ;
bch2_writepoint_stop ( c , ca , & c - > btree_write_point ) ;
2017-03-17 09:18:50 +03:00
mutex_lock ( & c - > btree_reserve_cache_lock ) ;
while ( c - > btree_reserve_cache_nr ) {
struct btree_alloc * a =
& c - > btree_reserve_cache [ - - c - > btree_reserve_cache_nr ] ;
2018-10-06 11:12:42 +03:00
bch2_open_buckets_put ( c , & a - > ob ) ;
2017-03-17 09:18:50 +03:00
}
mutex_unlock ( & c - > btree_reserve_cache_lock ) ;
2018-11-01 22:13:19 +03:00
while ( 1 ) {
struct open_bucket * ob ;
spin_lock ( & c - > freelist_lock ) ;
if ( ! ca - > open_buckets_partial_nr ) {
spin_unlock ( & c - > freelist_lock ) ;
break ;
}
ob = c - > open_buckets +
ca - > open_buckets_partial [ - - ca - > open_buckets_partial_nr ] ;
ob - > on_partial_list = false ;
spin_unlock ( & c - > freelist_lock ) ;
bch2_open_bucket_put ( c , ob ) ;
}
bch2_ec_stop_dev ( c , ca ) ;
2017-03-17 09:18:50 +03:00
/*
* Wake up threads that were blocked on allocation , so they can notice
* the device can no longer be removed and the capacity has changed :
*/
closure_wake_up ( & c - > freelist_wait ) ;
/*
* journal_res_get ( ) can block waiting for free space in the journal -
* it needs to notice there may not be devices to allocate from anymore :
*/
wake_up ( & c - > journal . wait ) ;
/* Now wait for any in flight writes: */
closure_wait_event ( & c - > open_buckets_wait ,
! bch2_dev_has_open_write_point ( c , ca ) ) ;
}
/* device goes rw: */
void bch2_dev_allocator_add ( struct bch_fs * c , struct bch_dev * ca )
{
unsigned i ;
for ( i = 0 ; i < ARRAY_SIZE ( c - > rw_devs ) ; i + + )
if ( ca - > mi . data_allowed & ( 1 < < i ) )
set_bit ( ca - > dev_idx , c - > rw_devs [ i ] . d ) ;
}
2018-11-19 09:31:41 +03:00
void bch2_dev_allocator_quiesce ( struct bch_fs * c , struct bch_dev * ca )
{
2019-02-08 22:43:53 +03:00
if ( ca - > alloc_thread )
2019-04-16 22:13:16 +03:00
closure_wait_event ( & c - > freelist_wait ,
ca - > allocator_state ! = ALLOCATOR_RUNNING ) ;
2018-11-19 09:31:41 +03:00
}
2017-03-17 09:18:50 +03:00
/* stop allocator thread: */
void bch2_dev_allocator_stop ( struct bch_dev * ca )
{
struct task_struct * p ;
p = rcu_dereference_protected ( ca - > alloc_thread , 1 ) ;
ca - > alloc_thread = NULL ;
/*
* We need an rcu barrier between setting ca - > alloc_thread = NULL and
* the thread shutting down to avoid bch2_wake_allocator ( ) racing :
*
* XXX : it would be better to have the rcu barrier be asynchronous
* instead of blocking us here
*/
synchronize_rcu ( ) ;
if ( p ) {
kthread_stop ( p ) ;
put_task_struct ( p ) ;
}
}
/* start allocator thread: */
int bch2_dev_allocator_start ( struct bch_dev * ca )
{
struct task_struct * p ;
/*
* allocator thread already started ?
*/
if ( ca - > alloc_thread )
return 0 ;
p = kthread_create ( bch2_allocator_thread , ca ,
2020-11-20 04:55:33 +03:00
" bch-alloc/%s " , ca - > name ) ;
2017-03-17 09:18:50 +03:00
if ( IS_ERR ( p ) )
return PTR_ERR ( p ) ;
get_task_struct ( p ) ;
rcu_assign_pointer ( ca - > alloc_thread , p ) ;
wake_up_process ( p ) ;
return 0 ;
}
2018-11-05 05:55:35 +03:00
void bch2_fs_allocator_background_init ( struct bch_fs * c )
2017-03-17 09:18:50 +03:00
{
spin_lock_init ( & c - > freelist_lock ) ;
bch2_bucket_clock_init ( c , READ ) ;
bch2_bucket_clock_init ( c , WRITE ) ;
c - > pd_controllers_update_seconds = 5 ;
INIT_DELAYED_WORK ( & c - > pd_controllers_update , pd_controllers_update ) ;
}