2023-12-27 18:31:46 -05:00
// SPDX-License-Identifier: GPL-2.0
# include "bcachefs.h"
2023-11-09 14:22:46 -05:00
# include "bcachefs_ioctl.h"
2023-12-27 18:31:46 -05:00
# include "btree_update.h"
2023-11-09 14:22:46 -05:00
# include "btree_write_buffer.h"
2023-12-27 18:31:46 -05:00
# include "buckets.h"
# include "disk_accounting.h"
2023-11-09 14:22:46 -05:00
# include "error.h"
# include "journal_io.h"
2023-12-27 18:31:46 -05:00
# include "replicas.h"
2023-11-09 14:22:46 -05:00
/*
* Notes on disk accounting :
*
* We have two parallel sets of counters to be concerned with , and both must be
* kept in sync .
*
* - Persistent / on disk accounting , stored in the accounting btree and updated
* via btree write buffer updates that treat new accounting keys as deltas to
* apply to existing values . But reading from a write buffer btree is
* expensive , so we also have
*
* - In memory accounting, where accounting is stored as an array of percpu
* counters, indexed by an eytzinger array of disk accounting keys/bpos (which
* are the same thing, excepting byte swabbing on big endian).
*
* Cheap to read , but non persistent .
*
* Disk accounting updates are generated by transactional triggers ; these run as
* keys enter and leave the btree , and can compare old and new versions of keys ;
* the output of these triggers are deltas to the various counters .
*
* Disk accounting updates are done as btree write buffer updates , where the
* counters in the disk accounting key are deltas that will be applied to the
* counter in the btree when the key is flushed by the write buffer ( or journal
* replay ) .
*
* To do a disk accounting update:
* - initialize a disk_accounting_pos, to specify which counter is being updated
* - initialize counter deltas, as an array of 1-3 s64s
* - call bch2_disk_accounting_mod()
*
* This queues up the accounting update to be done at transaction commit time.
* Underneath, it's a normal btree write buffer update.
*
* The transaction commit path is responsible for propagating updates to the in
* memory counters , with bch2_accounting_mem_mod ( ) .
*
* The commit path also assigns every disk accounting update a unique version
* number , based on the journal sequence number and offset within that journal
* buffer ; this is used by journal replay to determine which updates have been
* done .
*
* The transaction commit path also ensures that replicas entry accounting
* updates are properly marked in the superblock ( so that we know whether we can
* mount without data being unavailable ) ; it will update the superblock if
* bch2_accounting_mem_mod ( ) tells it to .
*/
2023-12-27 18:31:46 -05:00
/*
 * Printable names of the disk accounting types: entry [n] is the stringified
 * type name that BCH_DISK_ACCOUNTING_TYPES() associates with number n. A NULL
 * entry follows the listed types.
 */
static const char * const disk_accounting_type_strs[] = {
#define x(t, n, ...) [n] = #t,
	BCH_DISK_ACCOUNTING_TYPES()
#undef x
	NULL
};
2023-11-09 14:22:46 -05:00
/*
 * Queue a delta to a disk accounting counter as a buffered btree update.
 *
 * @trans: transaction the update is attached to
 * @k:     accounting position naming the counter; a replicas entry has its
 *         device list sorted in place before being encoded
 * @d:     array of @nr counter deltas
 * @nr:    number of deltas; must not exceed BCH_ACCOUNTING_MAX_COUNTERS
 *
 * Returns the result of bch2_trans_update_buffered().
 */
int bch2_disk_accounting_mod(struct btree_trans *trans,
			     struct disk_accounting_pos *k,
			     s64 *d, unsigned nr)
{
	/* On-stack key with room for the maximum number of counters */
	struct {
		__BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS);
	} update;
	struct bkey_i_accounting *delta;

	/* Normalize: keep the device list of a replicas entry sorted */
	if (k->type == BCH_DISK_ACCOUNTING_replicas)
		bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp);

	BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);

	delta = bkey_accounting_init(&update.k);
	delta->k.p = disk_accounting_pos_to_bpos(k);

	/* Value size: the fixed bch_accounting header plus one u64 per delta */
	set_bkey_val_u64s(&delta->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);
	memcpy_u64s_small(delta->v.d, d, nr);

	return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &delta->k_i);
}
/*
 * Apply a delta of @sectors to the replicas accounting counter for the
 * entry that bch2_replicas_entry_cached() builds for device @dev.
 *
 * Returns the result of bch2_disk_accounting_mod().
 */
int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
				unsigned dev, s64 sectors)
{
	struct disk_accounting_pos pos = {
		.type = BCH_DISK_ACCOUNTING_replicas,
	};

	bch2_replicas_entry_cached(&pos.replicas, dev);

	return bch2_disk_accounting_mod(trans, &pos, &sectors, 1);
}
2023-12-27 18:31:46 -05:00
/*
 * Validation hook for accounting keys.
 *
 * No checks are performed at present: every key is accepted and 0 is
 * returned; @c, @k, @flags and @err are unused.
 */
int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k,
			    enum bch_validate_flags flags,
			    struct printbuf *err)
{
	return 0;
}
void bch2_accounting_key_to_text ( struct printbuf * out , struct disk_accounting_pos * k )
{
if ( k - > type > = BCH_DISK_ACCOUNTING_TYPE_NR ) {
prt_printf ( out , " unknown type %u " , k - > type ) ;
return ;
}
prt_str ( out , disk_accounting_type_strs [ k - > type ] ) ;
prt_str ( out , " " ) ;
switch ( k - > type ) {
case BCH_DISK_ACCOUNTING_nr_inodes :
break ;
case BCH_DISK_ACCOUNTING_persistent_reserved :
prt_printf ( out , " replicas=%u " , k - > persistent_reserved . nr_replicas ) ;
break ;
case BCH_DISK_ACCOUNTING_replicas :
bch2_replicas_entry_to_text ( out , & k - > replicas ) ;
break ;
case BCH_DISK_ACCOUNTING_dev_data_type :
prt_printf ( out , " dev=%u data_type= " , k - > dev_data_type . dev ) ;
bch2_prt_data_type ( out , k - > dev_data_type . data_type ) ;
break ;
}
}
void bch2_accounting_to_text ( struct printbuf * out , struct bch_fs * c , struct bkey_s_c k )
{
struct bkey_s_c_accounting acc = bkey_s_c_to_accounting ( k ) ;
struct disk_accounting_pos acc_k ;
bpos_to_disk_accounting_pos ( & acc_k , k . k - > p ) ;
bch2_accounting_key_to_text ( out , & acc_k ) ;
for ( unsigned i = 0 ; i < bch2_accounting_counters ( k . k ) ; i + + )
prt_printf ( out , " %lli " , acc . v - > d [ i ] ) ;
}
void bch2_accounting_swab ( struct bkey_s k )
{
for ( u64 * p = ( u64 * ) k . v ;
p < ( u64 * ) bkey_val_end ( k ) ;
p + + )
* p = swab64 ( * p ) ;
}
2023-11-09 14:22:46 -05:00
static inline bool accounting_to_replicas ( struct bch_replicas_entry_v1 * r , struct bpos p )
{
struct disk_accounting_pos acc_k ;
bpos_to_disk_accounting_pos ( & acc_k , p ) ;
switch ( acc_k . type ) {
case BCH_DISK_ACCOUNTING_replicas :
unsafe_memcpy ( r , & acc_k . replicas ,
replicas_entry_bytes ( & acc_k . replicas ) ,
" variable length struct " ) ;
return true ;
default :
return false ;
}
}
/*
 * If accounting position @p refers to a replicas entry, mark that entry
 * with bch2_mark_replicas() and return its result; other accounting types
 * need nothing and return 0.
 */
static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
{
	struct bch_replicas_padded r;

	if (!accounting_to_replicas(&r.e, p))
		return 0;

	return bch2_mark_replicas(c, &r.e);
}
/*
* Ensure accounting keys being updated are present in the superblock , when
* applicable ( i . e . replicas updates )
*/
int bch2_accounting_update_sb ( struct btree_trans * trans )
{
for ( struct jset_entry * i = trans - > journal_entries ;
i ! = ( void * ) ( ( u64 * ) trans - > journal_entries + trans - > journal_entries_u64s ) ;
i = vstruct_next ( i ) )
if ( jset_entry_is_key ( i ) & & i - > start - > k . type = = KEY_TYPE_accounting ) {
int ret = bch2_accounting_update_sb_one ( trans - > c , i - > start - > k . p ) ;
if ( ret )
return ret ;
}
return 0 ;
}
static int __bch2_accounting_mem_mod_slowpath ( struct bch_fs * c , struct bkey_s_c_accounting a )
{
struct bch_replicas_padded r ;
if ( accounting_to_replicas ( & r . e , a . k - > p ) & &
! bch2_replicas_marked_locked ( c , & r . e ) )
return - BCH_ERR_btree_insert_need_mark_replicas ;
struct bch_accounting_mem * acc = & c - > accounting ;
unsigned new_nr_counters = acc - > nr_counters + bch2_accounting_counters ( a . k ) ;
u64 __percpu * new_counters = __alloc_percpu_gfp ( new_nr_counters * sizeof ( u64 ) ,
sizeof ( u64 ) , GFP_KERNEL ) ;
if ( ! new_counters )
return - BCH_ERR_ENOMEM_disk_accounting ;
preempt_disable ( ) ;
memcpy ( this_cpu_ptr ( new_counters ) ,
bch2_acc_percpu_u64s ( acc - > v , acc - > nr_counters ) ,
acc - > nr_counters * sizeof ( u64 ) ) ;
preempt_enable ( ) ;
struct accounting_pos_offset n = {
. pos = a . k - > p ,
. version = a . k - > version ,
. offset = acc - > nr_counters ,
. nr_counters = bch2_accounting_counters ( a . k ) ,
} ;
if ( darray_push ( & acc - > k , n ) ) {
free_percpu ( new_counters ) ;
return - BCH_ERR_ENOMEM_disk_accounting ;
}
eytzinger0_sort ( acc - > k . data , acc - > k . nr , sizeof ( acc - > k . data [ 0 ] ) , accounting_pos_cmp , NULL ) ;
free_percpu ( acc - > v ) ;
acc - > v = new_counters ;
acc - > nr_counters = new_nr_counters ;
for ( unsigned i = 0 ; i < n . nr_counters ; i + + )
this_cpu_add ( acc - > v [ n . offset + i ] , a . v - > d [ i ] ) ;
return 0 ;
}
int bch2_accounting_mem_mod_slowpath ( struct bch_fs * c , struct bkey_s_c_accounting a )
{
percpu_up_read ( & c - > mark_lock ) ;
percpu_down_write ( & c - > mark_lock ) ;
int ret = __bch2_accounting_mem_mod_slowpath ( c , a ) ;
percpu_up_write ( & c - > mark_lock ) ;
percpu_down_read ( & c - > mark_lock ) ;
return ret ;
}
/*
 * Read out accounting keys for replicas entries, as an array of
 * bch_replicas_usage entries.
 *
 * @usage is initialized here and filled with variable length entries packed
 * back to back; on error it is freed again and the error is returned.
 *
 * Note: this may be deprecated/removed at some point in the future and
 * replaced with something more general, it exists to support the ioctl used
 * by the 'bcachefs fs usage' command.
 */
int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
{
	struct bch_accounting_mem *acc = &c->accounting;
	int ret = 0;

	darray_init(usage);

	percpu_down_read(&c->mark_lock);

	darray_for_each(acc->k, ent) {
		/* Padding gives room for the entry's variable length tail */
		struct {
			struct bch_replicas_usage r;
			u8			  pad[BCH_BKEY_PTRS_MAX];
		} u;

		/* Only replicas accounting is reported here */
		if (!accounting_to_replicas(&u.r.r, ent->pos))
			continue;

		u64 sectors;
		bch2_accounting_mem_read(c, ent->pos, &sectors, 1);
		u.r.sectors = sectors;

		unsigned bytes = replicas_usage_bytes(&u.r);

		ret = darray_make_room(usage, bytes);
		if (ret)
			break;

		memcpy(&darray_top(*usage), &u.r, bytes);
		usage->nr += bytes;
	}

	percpu_up_read(&c->mark_lock);

	if (ret)
		darray_exit(usage);
	return ret;
}
static int accounting_read_key ( struct bch_fs * c , struct bkey_s_c k )
{
struct printbuf buf = PRINTBUF ;
if ( k . k - > type ! = KEY_TYPE_accounting )
return 0 ;
percpu_down_read ( & c - > mark_lock ) ;
int ret = __bch2_accounting_mem_mod ( c , bkey_s_c_to_accounting ( k ) ) ;
percpu_up_read ( & c - > mark_lock ) ;
if ( bch2_accounting_key_is_zero ( bkey_s_c_to_accounting ( k ) ) & &
ret = = - BCH_ERR_btree_insert_need_mark_replicas )
ret = 0 ;
struct disk_accounting_pos acc ;
bpos_to_disk_accounting_pos ( & acc , k . k - > p ) ;
if ( fsck_err_on ( ret = = - BCH_ERR_btree_insert_need_mark_replicas ,
c , accounting_replicas_not_marked ,
" accounting not marked in superblock replicas \n %s " ,
( bch2_accounting_key_to_text ( & buf , & acc ) ,
buf . buf ) ) )
ret = bch2_accounting_update_sb_one ( c , k . k - > p ) ;
fsck_err :
printbuf_exit ( & buf ) ;
return ret ;
}
/*
 * At startup time, initialize the in memory accounting from the btree (and
 * journal).
 *
 * Three steps:
 *  1) feed every key in the accounting btree to accounting_read_key()
 *  2) feed accounting keys from c->journal_keys that are not already
 *     reflected in memory to accounting_read_key()
 *  3) fold the resulting counters into c->usage_base
 *
 * Returns 0 on success or the first error encountered.
 */
int bch2_accounting_read ( struct bch_fs * c )
{
struct bch_accounting_mem * acc = & c - > accounting ;
/*
 * Step 1: walk the accounting btree. Each key is looked up again with
 * bch2_btree_path_peek_slot_exact() and that result, which shadows the
 * iterator's k, is what gets applied.
 * NOTE(review): presumably this bypasses the journal keys overlay so that
 * journal updates are only applied in step 2 - confirm.
 */
int ret = bch2_trans_run ( c ,
for_each_btree_key ( trans , iter ,
BTREE_ID_accounting , POS_MIN ,
BTREE_ITER_prefetch | BTREE_ITER_all_snapshots , k , ( {
struct bkey u ;
struct bkey_s_c k = bch2_btree_path_peek_slot_exact ( btree_iter_path ( trans , & iter ) , & u ) ;
accounting_read_key ( c , k ) ;
} ) ) ) ;
if ( ret )
goto err ;
/* Step 2: apply accounting updates found in the journal */
struct journal_keys * keys = & c - > journal_keys ;
/* NOTE(review): presumably moves the gap to the end so the array can be walked linearly - confirm */
move_gap ( keys , keys - > nr ) ;
darray_for_each ( * keys , i ) {
if ( i - > k - > k . type = = KEY_TYPE_accounting ) {
struct bkey_s_c k = bkey_i_to_s_c ( i - > k ) ;
/* Look up the in memory entry for this position, if there is one */
unsigned idx = eytzinger0_find ( acc - > k . data , acc - > k . nr ,
sizeof ( acc - > k . data [ 0 ] ) ,
accounting_pos_cmp , & k . k - > p ) ;
/*
 * An in memory entry whose version is at least the journal key's
 * version is treated as already containing this update; skip it.
 */
bool applied = idx < acc - > k . nr & &
bversion_cmp ( acc - > k . data [ idx ] . version , k . k - > version ) > = 0 ;
if ( applied )
continue ;
ret = accounting_read_key ( c , k ) ;
if ( ret )
goto err ;
}
}
/*
 * Step 3: accumulate totals into c->usage_base, with mark_lock held for
 * read and preemption disabled.
 */
percpu_down_read ( & c - > mark_lock ) ;
preempt_disable ( ) ;
struct bch_fs_usage_base * usage = & c - > usage_base - > b ;
for ( unsigned i = 0 ; i < acc - > k . nr ; i + + ) {
struct disk_accounting_pos k ;
bpos_to_disk_accounting_pos ( & k , acc - > k . data [ i ] . pos ) ;
u64 v [ BCH_ACCOUNTING_MAX_COUNTERS ] ;
bch2_accounting_mem_read_counters ( c , i , v , ARRAY_SIZE ( v ) ) ;
switch ( k . type ) {
case BCH_DISK_ACCOUNTING_persistent_reserved :
/* First counter is scaled by the entry's replica count */
usage - > reserved + = v [ 0 ] * k . persistent_reserved . nr_replicas ;
break ;
case BCH_DISK_ACCOUNTING_replicas :
fs_usage_data_type_to_base ( usage , k . replicas . data_type , v [ 0 ] ) ;
break ;
}
}
preempt_enable ( ) ;
percpu_up_read ( & c - > mark_lock ) ;
err :
/* Reached on success as well; NOTE(review): bch_err_fn() is presumably a no-op when ret == 0 - confirm */
bch_err_fn ( c , ret ) ;
return ret ;
}
/*
 * Clear the accounting btree entries of type dev_data_type that belong to
 * device @dev.
 *
 * The body is one ?: chain, so each step only runs if the previous one
 * returned 0:
 *  - flush the btree write buffer
 *  - walk the accounting btree, committing as it goes; for each key whose
 *    decoded position is a dev_data_type entry for @dev, call
 *    bch2_btree_bit_mod_buffered() on it with a final argument of 0
 *    (NOTE(review): presumably this clears/deletes the key - confirm)
 *  - flush the btree write buffer again
 *
 * Returns 0 on success or the first nonzero result.
 */
int bch2_dev_usage_remove ( struct bch_fs * c , unsigned dev )
{
return bch2_trans_run ( c ,
bch2_btree_write_buffer_flush_sync ( trans ) ? :
for_each_btree_key_commit ( trans , iter , BTREE_ID_accounting , POS_MIN ,
BTREE_ITER_all_snapshots , k , NULL , NULL , 0 , ( {
struct disk_accounting_pos acc ;
bpos_to_disk_accounting_pos ( & acc , k . k - > p ) ;
acc . type = = BCH_DISK_ACCOUNTING_dev_data_type & &
acc . dev_data_type . dev = = dev
? bch2_btree_bit_mod_buffered ( trans , BTREE_ID_accounting , k . k - > p , 0 )
: 0 ;
} ) ) ? :
bch2_btree_write_buffer_flush_sync ( trans ) ) ;
}
int bch2_dev_usage_init ( struct bch_dev * ca )
{
struct disk_accounting_pos acc = {
. type = BCH_DISK_ACCOUNTING_dev_data_type ,
. dev_data_type . dev = ca - > dev_idx ,
. dev_data_type . data_type = BCH_DATA_free ,
} ;
u64 v [ 3 ] = { ca - > mi . nbuckets - ca - > mi . first_bucket , 0 , 0 } ;
return bch2_trans_do ( ca - > fs , NULL , NULL , 0 ,
bch2_disk_accounting_mod ( trans , & acc , v , ARRAY_SIZE ( v ) ) ) ;
}
void bch2_fs_accounting_exit ( struct bch_fs * c )
{
struct bch_accounting_mem * acc = & c - > accounting ;
darray_exit ( & acc - > k ) ;
free_percpu ( acc - > v ) ;
}