2017-03-16 22:18:50 -08:00
// SPDX-License-Identifier: GPL-2.0
/*
 * bcachefs setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
# include "bcachefs.h"
2018-10-06 00:46:55 -04:00
# include "alloc_background.h"
# include "alloc_foreground.h"
2018-11-27 18:30:56 -05:00
# include "bkey_sort.h"
2017-03-16 22:18:50 -08:00
# include "btree_cache.h"
# include "btree_gc.h"
2019-03-07 19:46:10 -05:00
# include "btree_key_cache.h"
2017-03-16 22:18:50 -08:00
# include "btree_update_interior.h"
# include "btree_io.h"
2022-01-04 22:32:09 -05:00
# include "buckets_waiting_for_journal.h"
2017-03-16 22:18:50 -08:00
# include "chardev.h"
# include "checksum.h"
# include "clock.h"
# include "compress.h"
# include "debug.h"
# include "disk_groups.h"
2018-11-01 15:13:19 -04:00
# include "ec.h"
2017-03-16 22:18:50 -08:00
# include "error.h"
# include "fs.h"
# include "fs-io.h"
# include "fsck.h"
# include "inode.h"
# include "io.h"
# include "journal.h"
# include "journal_reclaim.h"
2019-04-04 21:53:12 -04:00
# include "journal_seq_blacklist.h"
2017-03-16 22:18:50 -08:00
# include "move.h"
# include "migrate.h"
# include "movinggc.h"
# include "quota.h"
# include "rebalance.h"
# include "recovery.h"
# include "replicas.h"
2021-03-16 00:42:25 -04:00
# include "subvolume.h"
2017-03-16 22:18:50 -08:00
# include "super.h"
# include "super-io.h"
# include "sysfs.h"
# include "trace.h"
# include <linux/backing-dev.h>
# include <linux/blkdev.h>
# include <linux/debugfs.h>
# include <linux/device.h>
# include <linux/idr.h>
# include <linux/module.h>
# include <linux/percpu.h>
# include <linux/random.h>
# include <linux/sysfs.h>
# include <crypto/hash.h>
MODULE_LICENSE ( " GPL " ) ;
MODULE_AUTHOR ( " Kent Overstreet <kent.overstreet@gmail.com> " ) ;
/*
 * KTYPE(type) - define the sysfs kobj_type for @type.
 *
 * Expands to three objects:
 *   type##_group  - attribute_group wrapping type##_files
 *   type##_groups - NULL terminated list holding that one group
 *   type##_ktype  - the kobj_type itself, wired to type##_release and
 *                   type##_sysfs_ops
 *
 * type##_files, type##_release and type##_sysfs_ops must already be declared
 * wherever the macro is used. The expansion deliberately has no trailing
 * semicolon; the user supplies it.
 */
#define KTYPE(type)							\
static const struct attribute_group type ## _group = {			\
	.attrs			= type ## _files			\
};									\
									\
static const struct attribute_group *type ## _groups[] = {		\
	&type ## _group,						\
	NULL								\
};									\
									\
static const struct kobj_type type ## _ktype = {			\
	.release		= type ## _release,			\
	.sysfs_ops		= &type ## _sysfs_ops,			\
	.default_groups		= type ## _groups			\
}
/*
 * Release callbacks picked up by KTYPE(bch2_fs) and KTYPE(bch2_dev) below via
 * type##_release; declared here because the KTYPE() expansions reference them
 * before they are defined.
 */
static void bch2_fs_release ( struct kobject * ) ;
static void bch2_dev_release ( struct kobject * ) ;
/*
 * kobject release hook for the "internal" sysfs directory.
 *
 * Intentionally a no-op: the kobject is a member of struct bch_fs
 * (c->internal), so there is no separate allocation to free here.
 */
static void bch2_fs_internal_release(struct kobject *k)
{
	/* nothing to release */
}
/*
 * kobject release hook for the "options" sysfs directory.
 *
 * Intentionally a no-op: the kobject is a member of struct bch_fs
 * (c->opts_dir), so there is no separate allocation to free here.
 */
static void bch2_fs_opts_dir_release(struct kobject *k)
{
	/* nothing to release */
}
/*
 * kobject release hook for the "time_stats" sysfs directory.
 *
 * Intentionally a no-op: the kobject is a member of struct bch_fs
 * (c->time_stats), so there is no separate allocation to free here.
 */
static void bch2_fs_time_stats_release(struct kobject *k)
{
	/* nothing to release */
}
/* Instantiate the kobj_types (type##_ktype) used for the sysfs objects: */
KTYPE ( bch2_fs ) ;
KTYPE ( bch2_fs_internal ) ;
KTYPE ( bch2_fs_opts_dir ) ;
KTYPE ( bch2_fs_time_stats ) ;
KTYPE ( bch2_dev ) ;
/* kset assigned to every filesystem kobject in bch2_fs_alloc() */
static struct kset * bcachefs_kset ;
/* List of filesystems added by bch2_fs_online(); guarded by bch_fs_list_lock */
static LIST_HEAD ( bch_fs_list ) ;
static DEFINE_MUTEX ( bch_fs_list_lock ) ;
/*
 * Woken when a filesystem's writes ref has been released
 * (bch2_writes_disabled()) and on emergency read-only; waited on by
 * bch2_fs_read_only().
 */
static DECLARE_WAIT_QUEUE_HEAD ( bch_read_only_wait ) ;
/* Forward declarations of per-device helpers used before their definition: */
static void bch2_dev_free ( struct bch_dev * ) ;
static int bch2_dev_alloc ( struct bch_fs * , unsigned ) ;
static int bch2_dev_sysfs_online ( struct bch_fs * , struct bch_dev * ) ;
static void __bch2_dev_read_only ( struct bch_fs * , struct bch_dev * ) ;
/*
 * bch2_dev_to_fs - find the filesystem that has block device @dev as a member
 *
 * Walks every filesystem on bch_fs_list and each of its member devices,
 * comparing the member's bdev->bd_dev against @dev.
 *
 * Returns the matching filesystem with a reference taken on its closure
 * (closure_get(&c->cl)), or NULL if no member device matches.
 */
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
	struct bch_fs *fs;
	struct bch_fs *match = NULL;
	struct bch_dev *member;
	unsigned idx;

	mutex_lock(&bch_fs_list_lock);
	rcu_read_lock();

	list_for_each_entry(fs, &bch_fs_list, list)
		for_each_member_device_rcu(member, fs, idx, NULL)
			if (member->disk_sb.bdev &&
			    member->disk_sb.bdev->bd_dev == dev) {
				/* pin the filesystem for the caller */
				closure_get(&fs->cl);
				match = fs;
				goto unlock;
			}
unlock:
	rcu_read_unlock();
	mutex_unlock(&bch_fs_list_lock);

	return match;
}
/*
 * Look up a filesystem on bch_fs_list by the UUID stored in its superblock.
 *
 * Caller must hold bch_fs_list_lock. No reference is taken on the result.
 * Returns NULL if no listed filesystem has a matching UUID.
 */
static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
{
	struct bch_fs *fs;

	lockdep_assert_held(&bch_fs_list_lock);

	list_for_each_entry(fs, &bch_fs_list, list) {
		if (memcmp(&fs->disk_sb.sb->uuid, &uuid, sizeof(uuid)) == 0)
			return fs;
	}

	return NULL;
}
/*
 * bch2_uuid_to_fs - locked lookup of a filesystem by superblock UUID
 *
 * Takes bch_fs_list_lock around __bch2_uuid_to_fs(). On success a reference
 * is taken on the filesystem's closure before the lock is dropped.
 *
 * Returns the filesystem, or NULL if none matches.
 */
struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
{
	struct bch_fs *fs;

	mutex_lock(&bch_fs_list_lock);

	fs = __bch2_uuid_to_fs(uuid);
	if (fs != NULL)
		closure_get(&fs->cl);

	mutex_unlock(&bch_fs_list_lock);

	return fs;
}
2021-01-21 21:52:06 -05:00
static void bch2_dev_usage_journal_reserve ( struct bch_fs * c )
{
struct bch_dev * ca ;
unsigned i , nr = 0 , u64s =
2021-02-03 13:10:55 -05:00
( ( sizeof ( struct jset_entry_dev_usage ) +
sizeof ( struct jset_entry_dev_usage_type ) * BCH_DATA_NR ) ) /
sizeof ( u64 ) ;
2021-01-21 21:52:06 -05:00
rcu_read_lock ( ) ;
for_each_member_device_rcu ( ca , c , i , NULL )
nr + + ;
rcu_read_unlock ( ) ;
bch2_journal_entry_res_resize ( & c - > journal ,
& c - > dev_usage_journal_res , u64s * nr ) ;
}
2017-03-16 22:18:50 -08:00
/* Filesystem RO/RW: */

/*
 * For startup/shutdown of RW stuff, the dependencies are:
 *
 * - foreground writes depend on copygc and rebalance (to free up space)
 *
 * - copygc and rebalance depend on mark and sweep gc (they actually probably
 *   don't because they either reserve ahead of time or don't block if
 *   allocations fail, but allocations can require mark and sweep gc to run
 *   because of generation number wraparound)
 *
 * - all of the above depends on the allocator threads
 *
 * - allocator depends on the journal (when it rewrites prios and gens)
 */
static void __bch2_fs_read_only ( struct bch_fs * c )
{
struct bch_dev * ca ;
2019-03-28 03:40:39 -04:00
unsigned i , clean_passes = 0 ;
2017-03-16 22:18:50 -08:00
bch2_rebalance_stop ( c ) ;
2020-07-11 16:28:54 -04:00
bch2_copygc_stop ( c ) ;
2017-03-16 22:18:50 -08:00
bch2_gc_thread_stop ( c ) ;
/*
* Flush journal before stopping allocators , because flushing journal
* blacklist entries involves allocating new btree nodes :
*/
bch2_journal_flush_all_pins ( & c - > journal ) ;
2020-05-24 13:37:44 -04:00
/*
* If the allocator threads didn ' t all start up , the btree updates to
* write out alloc info aren ' t going to work :
*/
2019-02-09 16:15:29 -05:00
if ( ! test_bit ( BCH_FS_ALLOCATOR_RUNNING , & c - > flags ) )
2020-05-24 13:37:44 -04:00
goto nowrote_alloc ;
2019-02-09 16:15:29 -05:00
2020-05-28 16:06:13 -04:00
bch_verbose ( c , " flushing journal and stopping allocators " ) ;
2019-04-17 18:14:46 -04:00
2020-05-28 16:06:13 -04:00
bch2_journal_flush_all_pins ( & c - > journal ) ;
set_bit ( BCH_FS_ALLOCATOR_STOPPING , & c - > flags ) ;
2018-11-25 20:53:51 -05:00
2020-05-28 16:06:13 -04:00
do {
clean_passes + + ;
2018-11-19 01:31:41 -05:00
2020-05-28 16:06:13 -04:00
if ( bch2_journal_flush_all_pins ( & c - > journal ) )
clean_passes = 0 ;
2018-11-19 01:31:41 -05:00
/*
2020-05-28 16:06:13 -04:00
* In flight interior btree updates will generate more journal
* updates and btree updates ( alloc btree ) :
2018-11-19 01:31:41 -05:00
*/
2020-05-28 16:06:13 -04:00
if ( bch2_btree_interior_updates_nr_pending ( c ) ) {
closure_wait_event ( & c - > btree_interior_update_wait ,
! bch2_btree_interior_updates_nr_pending ( c ) ) ;
clean_passes = 0 ;
}
2020-05-25 14:57:06 -04:00
flush_work ( & c - > btree_interior_update_work ) ;
2019-03-28 03:40:39 -04:00
2020-05-28 16:06:13 -04:00
if ( bch2_journal_flush_all_pins ( & c - > journal ) )
clean_passes = 0 ;
2019-03-28 03:40:39 -04:00
} while ( clean_passes < 2 ) ;
2020-05-28 16:06:13 -04:00
bch_verbose ( c , " flushing journal and stopping allocators complete " ) ;
2020-05-24 13:37:44 -04:00
set_bit ( BCH_FS_ALLOC_CLEAN , & c - > flags ) ;
nowrote_alloc :
2020-05-25 14:57:06 -04:00
closure_wait_event ( & c - > btree_interior_update_wait ,
! bch2_btree_interior_updates_nr_pending ( c ) ) ;
flush_work ( & c - > btree_interior_update_work ) ;
2017-03-16 22:18:50 -08:00
for_each_member_device ( ca , c , i )
bch2_dev_allocator_stop ( ca ) ;
2019-02-09 16:15:29 -05:00
clear_bit ( BCH_FS_ALLOCATOR_RUNNING , & c - > flags ) ;
2020-05-28 16:06:13 -04:00
clear_bit ( BCH_FS_ALLOCATOR_STOPPING , & c - > flags ) ;
2019-02-09 16:15:29 -05:00
2017-03-16 22:18:50 -08:00
bch2_fs_journal_stop ( & c - > journal ) ;
/*
* the journal kicks off btree writes via reclaim - wait for in flight
* writes after stopping journal :
*/
2021-01-20 17:31:31 -05:00
bch2_btree_flush_all_writes ( c ) ;
2017-03-16 22:18:50 -08:00
/*
* After stopping journal :
*/
for_each_member_device ( ca , c , i )
bch2_dev_allocator_remove ( c , ca ) ;
}
static void bch2_writes_disabled ( struct percpu_ref * writes )
{
struct bch_fs * c = container_of ( writes , struct bch_fs , writes ) ;
set_bit ( BCH_FS_WRITE_DISABLE_COMPLETE , & c - > flags ) ;
wake_up ( & bch_read_only_wait ) ;
}
void bch2_fs_read_only ( struct bch_fs * c )
{
2019-03-21 22:19:57 -04:00
if ( ! test_bit ( BCH_FS_RW , & c - > flags ) ) {
2021-06-21 16:30:52 -04:00
bch2_journal_reclaim_stop ( & c - > journal ) ;
2017-03-16 22:18:50 -08:00
return ;
2019-03-21 22:19:57 -04:00
}
2017-03-16 22:18:50 -08:00
BUG_ON ( test_bit ( BCH_FS_WRITE_DISABLE_COMPLETE , & c - > flags ) ) ;
/*
* Block new foreground - end write operations from starting - any new
* writes will return - EROFS :
*
* ( This is really blocking new _allocations_ , writes to previously
* allocated space can still happen until stopping the allocator in
* bch2_dev_allocator_stop ( ) ) .
*/
percpu_ref_kill ( & c - > writes ) ;
2019-07-11 12:45:59 -04:00
cancel_work_sync ( & c - > ec_stripe_delete_work ) ;
2017-03-16 22:18:50 -08:00
/*
* If we ' re not doing an emergency shutdown , we want to wait on
* outstanding writes to complete so they don ' t see spurious errors due
* to shutting down the allocator :
*
* If we are doing an emergency shutdown outstanding writes may
* hang until we shutdown the allocator so we don ' t want to wait
* on outstanding writes before shutting everything down - but
* we do need to wait on them before returning and signalling
* that going RO is complete :
*/
wait_event ( bch_read_only_wait ,
test_bit ( BCH_FS_WRITE_DISABLE_COMPLETE , & c - > flags ) | |
test_bit ( BCH_FS_EMERGENCY_RO , & c - > flags ) ) ;
__bch2_fs_read_only ( c ) ;
wait_event ( bch_read_only_wait ,
test_bit ( BCH_FS_WRITE_DISABLE_COMPLETE , & c - > flags ) ) ;
clear_bit ( BCH_FS_WRITE_DISABLE_COMPLETE , & c - > flags ) ;
if ( ! bch2_journal_error ( & c - > journal ) & &
! test_bit ( BCH_FS_ERROR , & c - > flags ) & &
2019-03-09 14:53:03 -05:00
! test_bit ( BCH_FS_EMERGENCY_RO , & c - > flags ) & &
2019-04-06 15:12:21 -04:00
test_bit ( BCH_FS_STARTED , & c - > flags ) & &
2020-05-24 13:37:44 -04:00
test_bit ( BCH_FS_ALLOC_CLEAN , & c - > flags ) & &
2020-05-24 14:06:10 -04:00
! c - > opts . norecovery ) {
bch_verbose ( c , " marking filesystem clean " ) ;
2019-03-21 22:19:57 -04:00
bch2_fs_mark_clean ( c ) ;
2020-05-24 14:06:10 -04:00
}
2017-03-16 22:18:50 -08:00
2019-03-21 22:19:57 -04:00
clear_bit ( BCH_FS_RW , & c - > flags ) ;
2017-03-16 22:18:50 -08:00
}
static void bch2_fs_read_only_work ( struct work_struct * work )
{
struct bch_fs * c =
container_of ( work , struct bch_fs , read_only_work ) ;
2020-06-15 14:58:47 -04:00
down_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
bch2_fs_read_only ( c ) ;
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
}
static void bch2_fs_read_only_async ( struct bch_fs * c )
{
queue_work ( system_long_wq , & c - > read_only_work ) ;
}
bool bch2_fs_emergency_read_only ( struct bch_fs * c )
{
bool ret = ! test_and_set_bit ( BCH_FS_EMERGENCY_RO , & c - > flags ) ;
bch2_journal_halt ( & c - > journal ) ;
2020-08-04 23:10:08 -04:00
bch2_fs_read_only_async ( c ) ;
2017-03-16 22:18:50 -08:00
wake_up ( & bch_read_only_wait ) ;
return ret ;
}
2019-03-21 22:19:57 -04:00
static int bch2_fs_read_write_late ( struct bch_fs * c )
2017-03-16 22:18:50 -08:00
{
2019-03-21 22:19:57 -04:00
int ret ;
2017-03-16 22:18:50 -08:00
2019-03-21 22:19:57 -04:00
ret = bch2_gc_thread_start ( c ) ;
if ( ret ) {
bch_err ( c , " error starting gc thread " ) ;
return ret ;
}
2020-07-11 16:28:54 -04:00
ret = bch2_copygc_start ( c ) ;
if ( ret ) {
bch_err ( c , " error starting copygc thread " ) ;
return ret ;
2019-03-21 22:19:57 -04:00
}
ret = bch2_rebalance_start ( c ) ;
if ( ret ) {
bch_err ( c , " error starting rebalance thread " ) ;
return ret ;
}
2019-07-10 16:04:58 -04:00
schedule_work ( & c - > ec_stripe_delete_work ) ;
2019-03-21 22:19:57 -04:00
return 0 ;
}
2017-03-16 22:18:50 -08:00
2019-12-26 14:54:43 -05:00
/*
 * Take the filesystem read-write.
 *
 * @early: true when called on the early path (bch2_fs_read_write_early());
 *         the background threads from bch2_fs_read_write_late() are then
 *         not started.
 *
 * Returns 0 on success or if already read-write, -EROFS when going rw is not
 * permitted, or the error from a failing step. On failure after the dirty
 * mark was attempted, __bch2_fs_read_only() is run to undo partial startup.
 */
static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
	struct bch_dev *ca;
	unsigned dev_idx;
	int err;

	if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) {
		bch_err(c, " cannot go rw, unfixed btree errors ");
		return -EROFS;
	}

	if (test_bit(BCH_FS_RW, &c->flags))
		return 0;

	if (c->opts.norecovery)
		return -EROFS;

	/*
	 * nochanges is used for fsck -n mode - we have to allow going rw
	 * during recovery for that to work:
	 */
	if (c->opts.nochanges && (!early || c->opts.read_only))
		return -EROFS;

	bch_info(c, " going read-write ");

	err = bch2_fs_mark_dirty(c);
	if (err)
		goto undo;

	clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);

	for_each_rw_member(ca, c, dev_idx)
		bch2_dev_allocator_add(c, ca);
	bch2_recalc_capacity(c);

	for_each_rw_member(ca, c, dev_idx) {
		err = bch2_dev_allocator_start(ca);
		if (err) {
			bch_err(c, " error starting allocator threads ");
			/*
			 * Leaving the loop early: drop the io_ref on the
			 * current device (presumably held by the iterator -
			 * confirm against for_each_rw_member()).
			 */
			percpu_ref_put(&ca->io_ref);
			goto undo;
		}
	}

	set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);

	for_each_rw_member(ca, c, dev_idx)
		bch2_wake_allocator(ca);

	if (!early) {
		err = bch2_fs_read_write_late(c);
		if (err)
			goto undo;
	}

	percpu_ref_reinit(&c->writes);
	set_bit(BCH_FS_RW, &c->flags);
	set_bit(BCH_FS_WAS_RW, &c->flags);
	return 0;
undo:
	__bch2_fs_read_only(c);
	return err;
}
int bch2_fs_read_write ( struct bch_fs * c )
{
return __bch2_fs_read_write ( c , false ) ;
}
int bch2_fs_read_write_early ( struct bch_fs * c )
{
lockdep_assert_held ( & c - > state_lock ) ;
return __bch2_fs_read_write ( c , true ) ;
2017-03-16 22:18:50 -08:00
}
/* Filesystem startup/shutdown: */
2020-09-08 18:30:32 -04:00
static void __bch2_fs_free ( struct bch_fs * c )
2017-03-16 22:18:50 -08:00
{
unsigned i ;
2020-11-05 20:02:01 -05:00
int cpu ;
2017-03-16 22:18:50 -08:00
for ( i = 0 ; i < BCH_TIME_STAT_NR ; i + + )
bch2_time_stats_exit ( & c - > times [ i ] ) ;
2021-03-16 00:42:25 -04:00
bch2_fs_snapshots_exit ( c ) ;
2017-03-16 22:18:50 -08:00
bch2_fs_quota_exit ( c ) ;
bch2_fs_fsio_exit ( c ) ;
2018-11-01 15:13:19 -04:00
bch2_fs_ec_exit ( c ) ;
2017-03-16 22:18:50 -08:00
bch2_fs_encryption_exit ( c ) ;
bch2_fs_io_exit ( c ) ;
2022-01-04 22:32:09 -05:00
bch2_fs_buckets_waiting_for_journal_exit ( c ) ;
2020-05-25 20:35:53 -04:00
bch2_fs_btree_interior_update_exit ( c ) ;
2019-09-07 14:16:00 -04:00
bch2_fs_btree_iter_exit ( c ) ;
2019-03-07 19:46:10 -05:00
bch2_fs_btree_key_cache_exit ( & c - > btree_key_cache ) ;
2017-03-16 22:18:50 -08:00
bch2_fs_btree_cache_exit ( c ) ;
2021-04-24 00:24:25 -04:00
bch2_fs_replicas_exit ( c ) ;
2017-03-16 22:18:50 -08:00
bch2_fs_journal_exit ( & c - > journal ) ;
bch2_io_clock_exit ( & c - > io_clock [ WRITE ] ) ;
bch2_io_clock_exit ( & c - > io_clock [ READ ] ) ;
bch2_fs_compress_exit ( c ) ;
2020-03-25 16:12:33 -04:00
bch2_journal_keys_free ( & c - > journal_keys ) ;
bch2_journal_entries_free ( & c - > journal_entries ) ;
2018-11-26 00:13:33 -05:00
percpu_free_rwsem ( & c - > mark_lock ) ;
2019-02-10 19:34:47 -05:00
free_percpu ( c - > online_reserved ) ;
2020-11-05 20:02:01 -05:00
2021-08-30 15:18:31 -04:00
if ( c - > btree_paths_bufs )
2020-11-05 20:02:01 -05:00
for_each_possible_cpu ( cpu )
2021-08-30 15:18:31 -04:00
kfree ( per_cpu_ptr ( c - > btree_paths_bufs , cpu ) - > path ) ;
2020-11-05 20:02:01 -05:00
2021-08-30 15:18:31 -04:00
free_percpu ( c - > btree_paths_bufs ) ;
2018-11-27 08:23:22 -05:00
free_percpu ( c - > pcpu ) ;
2019-11-09 16:01:15 -05:00
mempool_exit ( & c - > large_bkey_pool ) ;
2017-03-16 22:18:50 -08:00
mempool_exit ( & c - > btree_bounce_pool ) ;
bioset_exit ( & c - > btree_bio ) ;
mempool_exit ( & c - > fill_iter ) ;
percpu_ref_exit ( & c - > writes ) ;
kfree ( rcu_dereference_protected ( c - > disk_groups , 1 ) ) ;
2019-04-04 21:53:12 -04:00
kfree ( c - > journal_seq_blacklist_table ) ;
2020-11-02 23:51:33 -05:00
kfree ( c - > unused_inode_hints ) ;
2020-07-11 16:28:54 -04:00
free_heap ( & c - > copygc_heap ) ;
2017-03-16 22:18:50 -08:00
2021-05-22 17:37:25 -04:00
if ( c - > io_complete_wq )
destroy_workqueue ( c - > io_complete_wq ) ;
2017-03-16 22:18:50 -08:00
if ( c - > copygc_wq )
destroy_workqueue ( c - > copygc_wq ) ;
2021-07-10 13:44:42 -04:00
if ( c - > btree_io_complete_wq )
destroy_workqueue ( c - > btree_io_complete_wq ) ;
2021-05-22 17:37:25 -04:00
if ( c - > btree_update_wq )
destroy_workqueue ( c - > btree_update_wq ) ;
2017-03-16 22:18:50 -08:00
2021-04-06 14:00:56 -04:00
bch2_free_super ( & c - > disk_sb ) ;
2017-03-16 22:18:50 -08:00
kvpfree ( c , sizeof ( * c ) ) ;
module_put ( THIS_MODULE ) ;
}
/*
 * kobject release hook for the filesystem kobject (c->kobj): frees the
 * containing struct bch_fs.
 */
static void bch2_fs_release(struct kobject *kobj)
{
	__bch2_fs_free(container_of(kobj, struct bch_fs, kobj));
}
2020-09-08 18:30:32 -04:00
void __bch2_fs_stop ( struct bch_fs * c )
2017-03-16 22:18:50 -08:00
{
struct bch_dev * ca ;
unsigned i ;
2018-07-21 03:56:57 -04:00
bch_verbose ( c , " shutting down " ) ;
2019-04-04 21:53:12 -04:00
set_bit ( BCH_FS_STOPPING , & c - > flags ) ;
2022-01-04 19:41:23 -05:00
cancel_work_sync ( & c - > journal_seq_blacklist_gc_work ) ;
2020-06-15 14:58:47 -04:00
down_write ( & c - > state_lock ) ;
2020-02-27 15:03:53 -05:00
bch2_fs_read_only ( c ) ;
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2020-02-27 15:03:53 -05:00
2017-03-16 22:18:50 -08:00
for_each_member_device ( ca , c , i )
if ( ca - > kobj . state_in_sysfs & &
ca - > disk_sb . bdev )
sysfs_remove_link ( bdev_kobj ( ca - > disk_sb . bdev ) , " bcachefs " ) ;
if ( c - > kobj . state_in_sysfs )
kobject_del ( & c - > kobj ) ;
bch2_fs_debug_exit ( c ) ;
bch2_fs_chardev_exit ( c ) ;
kobject_put ( & c - > time_stats ) ;
kobject_put ( & c - > opts_dir ) ;
kobject_put ( & c - > internal ) ;
/* btree prefetch might have kicked off reads in the background: */
bch2_btree_flush_all_reads ( c ) ;
for_each_member_device ( ca , c , i )
cancel_work_sync ( & ca - > io_error_work ) ;
cancel_work_sync ( & c - > read_only_work ) ;
2020-09-08 18:30:32 -04:00
}
void bch2_fs_free ( struct bch_fs * c )
{
unsigned i ;
mutex_lock ( & bch_fs_list_lock ) ;
list_del ( & c - > list ) ;
mutex_unlock ( & bch_fs_list_lock ) ;
closure_sync ( & c - > cl ) ;
closure_debug_destroy ( & c - > cl ) ;
for ( i = 0 ; i < c - > sb . nr_devices ; i + + ) {
struct bch_dev * ca = rcu_dereference_protected ( c - > devs [ i ] , true ) ;
2017-03-16 22:18:50 -08:00
2020-09-08 18:30:32 -04:00
if ( ca ) {
bch2_free_super ( & ca - > disk_sb ) ;
bch2_dev_free ( ca ) ;
}
}
2017-03-16 22:18:50 -08:00
2018-07-21 03:56:57 -04:00
bch_verbose ( c , " shutdown complete " ) ;
2017-03-16 22:18:50 -08:00
kobject_put ( & c - > kobj ) ;
}
2020-09-08 18:30:32 -04:00
/*
 * bch2_fs_stop - shut a filesystem down completely: quiesce it with
 * __bch2_fs_stop(), then release it with bch2_fs_free().
 */
void bch2_fs_stop(struct bch_fs *c)
{
	/* stop all activity first ... */
	__bch2_fs_stop(c);
	/* ... then tear down and drop the final reference */
	bch2_fs_free(c);
}
2021-11-05 21:28:17 -04:00
static int bch2_fs_online ( struct bch_fs * c )
2017-03-16 22:18:50 -08:00
{
struct bch_dev * ca ;
unsigned i ;
2021-11-05 21:28:17 -04:00
int ret = 0 ;
2017-03-16 22:18:50 -08:00
lockdep_assert_held ( & bch_fs_list_lock ) ;
2021-11-05 21:28:17 -04:00
if ( __bch2_uuid_to_fs ( c - > sb . uuid ) ) {
bch_err ( c , " filesystem UUID already open " ) ;
return - EINVAL ;
}
2017-03-16 22:18:50 -08:00
ret = bch2_fs_chardev_init ( c ) ;
2021-11-05 21:28:17 -04:00
if ( ret ) {
bch_err ( c , " error creating character device " ) ;
return ret ;
}
2017-03-16 22:18:50 -08:00
bch2_fs_debug_init ( c ) ;
2021-11-05 21:28:17 -04:00
ret = kobject_add ( & c - > kobj , NULL , " %pU " , c - > sb . user_uuid . b ) ? :
kobject_add ( & c - > internal , & c - > kobj , " internal " ) ? :
kobject_add ( & c - > opts_dir , & c - > kobj , " options " ) ? :
kobject_add ( & c - > time_stats , & c - > kobj , " time_stats " ) ? :
bch2_opts_create_sysfs_files ( & c - > opts_dir ) ;
if ( ret ) {
bch_err ( c , " error creating sysfs objects " ) ;
return ret ;
}
2017-03-16 22:18:50 -08:00
2020-06-15 14:58:47 -04:00
down_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
2021-11-05 21:28:17 -04:00
for_each_member_device ( ca , c , i ) {
ret = bch2_dev_sysfs_online ( c , ca ) ;
if ( ret ) {
bch_err ( c , " error creating sysfs objects " ) ;
2021-05-07 20:43:43 -04:00
percpu_ref_put ( & ca - > ref ) ;
2017-03-16 22:18:50 -08:00
goto err ;
2021-05-07 20:43:43 -04:00
}
2021-11-05 21:28:17 -04:00
}
2017-03-16 22:18:50 -08:00
2021-11-05 21:28:17 -04:00
BUG_ON ( ! list_empty ( & c - > list ) ) ;
2017-03-16 22:18:50 -08:00
list_add ( & c - > list , & bch_fs_list ) ;
err :
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2021-11-05 21:28:17 -04:00
return ret ;
2017-03-16 22:18:50 -08:00
}
static struct bch_fs * bch2_fs_alloc ( struct bch_sb * sb , struct bch_opts opts )
{
struct bch_sb_field_members * mi ;
struct bch_fs * c ;
2019-02-14 20:39:17 -05:00
unsigned i , iter_size ;
2021-11-04 21:03:16 +00:00
int ret = 0 ;
2017-03-16 22:18:50 -08:00
pr_verbose_init ( opts , " " ) ;
c = kvpmalloc ( sizeof ( struct bch_fs ) , GFP_KERNEL | __GFP_ZERO ) ;
2021-11-04 21:03:16 +00:00
if ( ! c ) {
c = ERR_PTR ( - ENOMEM ) ;
2017-03-16 22:18:50 -08:00
goto out ;
2021-11-04 21:03:16 +00:00
}
2017-03-16 22:18:50 -08:00
__module_get ( THIS_MODULE ) ;
2020-10-15 15:58:36 -04:00
closure_init ( & c - > cl , NULL ) ;
c - > kobj . kset = bcachefs_kset ;
kobject_init ( & c - > kobj , & bch2_fs_ktype ) ;
kobject_init ( & c - > internal , & bch2_fs_internal_ktype ) ;
kobject_init ( & c - > opts_dir , & bch2_fs_opts_dir_ktype ) ;
kobject_init ( & c - > time_stats , & bch2_fs_time_stats_ktype ) ;
2017-03-16 22:18:50 -08:00
c - > minor = - 1 ;
c - > disk_sb . fs_sb = true ;
2020-06-15 14:58:47 -04:00
init_rwsem ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
mutex_init ( & c - > sb_lock ) ;
mutex_init ( & c - > replicas_gc_lock ) ;
mutex_init ( & c - > btree_root_lock ) ;
INIT_WORK ( & c - > read_only_work , bch2_fs_read_only_work ) ;
init_rwsem ( & c - > gc_lock ) ;
2021-12-24 04:51:10 -05:00
mutex_init ( & c - > gc_gens_lock ) ;
2017-03-16 22:18:50 -08:00
for ( i = 0 ; i < BCH_TIME_STAT_NR ; i + + )
bch2_time_stats_init ( & c - > times [ i ] ) ;
2020-07-11 16:28:54 -04:00
bch2_fs_copygc_init ( c ) ;
2019-03-07 19:46:10 -05:00
bch2_fs_btree_key_cache_init_early ( & c - > btree_key_cache ) ;
2018-11-04 21:55:35 -05:00
bch2_fs_allocator_background_init ( c ) ;
bch2_fs_allocator_foreground_init ( c ) ;
2017-03-16 22:18:50 -08:00
bch2_fs_rebalance_init ( c ) ;
bch2_fs_quota_init ( c ) ;
INIT_LIST_HEAD ( & c - > list ) ;
2019-03-15 18:20:46 -04:00
mutex_init ( & c - > usage_scratch_lock ) ;
2017-03-16 22:18:50 -08:00
mutex_init ( & c - > bio_bounce_pages_lock ) ;
2021-03-16 00:42:25 -04:00
mutex_init ( & c - > snapshot_table_lock ) ;
2017-03-16 22:18:50 -08:00
spin_lock_init ( & c - > btree_write_error_lock ) ;
2022-01-04 19:41:23 -05:00
INIT_WORK ( & c - > journal_seq_blacklist_gc_work ,
bch2_blacklist_entries_gc ) ;
2020-03-25 16:12:33 -04:00
INIT_LIST_HEAD ( & c - > journal_entries ) ;
2021-01-26 20:15:46 -05:00
INIT_LIST_HEAD ( & c - > journal_iters ) ;
2020-03-25 16:12:33 -04:00
2017-03-16 22:18:50 -08:00
INIT_LIST_HEAD ( & c - > fsck_errors ) ;
mutex_init ( & c - > fsck_error_lock ) ;
2020-07-06 20:59:46 -04:00
INIT_LIST_HEAD ( & c - > ec_stripe_head_list ) ;
mutex_init ( & c - > ec_stripe_head_lock ) ;
INIT_LIST_HEAD ( & c - > ec_stripe_new_list ) ;
mutex_init ( & c - > ec_stripe_new_lock ) ;
2021-07-23 13:57:19 -06:00
INIT_LIST_HEAD ( & c - > data_progress_list ) ;
mutex_init ( & c - > data_progress_lock ) ;
2018-11-01 15:13:19 -04:00
spin_lock_init ( & c - > ec_stripes_heap_lock ) ;
2017-03-16 22:18:50 -08:00
seqcount_init ( & c - > gc_pos_lock ) ;
2019-02-10 19:34:47 -05:00
seqcount_init ( & c - > usage_lock ) ;
2021-05-18 23:53:43 -04:00
sema_init ( & c - > io_in_flight , 128 ) ;
2017-03-16 22:18:50 -08:00
c - > copy_gc_enabled = 1 ;
c - > rebalance . enabled = 1 ;
c - > promote_whole_extents = true ;
2021-12-10 15:41:38 -05:00
c - > journal . flush_write_time = & c - > times [ BCH_TIME_journal_flush_write ] ;
c - > journal . noflush_write_time = & c - > times [ BCH_TIME_journal_noflush_write ] ;
c - > journal . blocked_time = & c - > times [ BCH_TIME_blocked_journal ] ;
c - > journal . flush_seq_time = & c - > times [ BCH_TIME_journal_flush_seq ] ;
2017-03-16 22:18:50 -08:00
bch2_fs_btree_cache_init_early ( & c - > btree_cache ) ;
2020-12-03 14:17:33 -05:00
mutex_init ( & c - > sectors_available_lock ) ;
2021-11-05 21:28:17 -04:00
ret = percpu_init_rwsem ( & c - > mark_lock ) ;
if ( ret )
2018-12-01 10:32:48 -05:00
goto err ;
2017-03-16 22:18:50 -08:00
mutex_lock ( & c - > sb_lock ) ;
2021-11-05 21:28:17 -04:00
ret = bch2_sb_to_fs ( c , sb ) ;
mutex_unlock ( & c - > sb_lock ) ;
2017-03-16 22:18:50 -08:00
2021-11-05 21:28:17 -04:00
if ( ret )
2017-03-16 22:18:50 -08:00
goto err ;
2022-01-04 19:05:08 -05:00
uuid_unparse_lower ( c - > sb . user_uuid . b , c - > name ) ;
2017-03-16 22:18:50 -08:00
2021-12-04 20:07:19 -05:00
/* Compat: */
if ( sb - > version < = bcachefs_metadata_version_inode_v2 & &
! BCH_SB_JOURNAL_FLUSH_DELAY ( sb ) )
SET_BCH_SB_JOURNAL_FLUSH_DELAY ( sb , 1000 ) ;
if ( sb - > version < = bcachefs_metadata_version_inode_v2 & &
! BCH_SB_JOURNAL_RECLAIM_DELAY ( sb ) )
SET_BCH_SB_JOURNAL_RECLAIM_DELAY ( sb , 100 ) ;
2017-03-16 22:18:50 -08:00
c - > opts = bch2_opts_default ;
2021-12-14 14:24:41 -05:00
ret = bch2_opts_from_sb ( & c - > opts , sb ) ;
if ( ret )
goto err ;
2017-03-16 22:18:50 -08:00
bch2_opts_apply ( & c - > opts , opts ) ;
2022-01-12 02:13:21 -05:00
c - > btree_key_cache_btrees | = 1U < < BTREE_ID_alloc ;
if ( c - > opts . inodes_use_key_cache )
c - > btree_key_cache_btrees | = 1U < < BTREE_ID_inodes ;
2021-12-14 14:24:41 -05:00
c - > block_bits = ilog2 ( block_sectors ( c ) ) ;
2017-03-16 22:18:50 -08:00
c - > btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD ( c ) ;
2021-11-04 21:03:16 +00:00
if ( bch2_fs_init_fault ( " fs_alloc " ) ) {
2021-11-05 21:28:17 -04:00
bch_err ( c , " fs_alloc fault injected " ) ;
ret = - EFAULT ;
2017-03-16 22:18:50 -08:00
goto err ;
2021-11-04 21:03:16 +00:00
}
2017-03-16 22:18:50 -08:00
2019-12-14 16:20:33 -05:00
iter_size = sizeof ( struct sort_iter ) +
2017-03-16 22:18:50 -08:00
( btree_blocks ( c ) + 1 ) * 2 *
2019-12-14 16:20:33 -05:00
sizeof ( struct sort_iter_set ) ;
2017-03-16 22:18:50 -08:00
2020-11-02 23:51:33 -05:00
c - > inode_shard_bits = ilog2 ( roundup_pow_of_two ( num_possible_cpus ( ) ) ) ;
2021-05-22 17:37:25 -04:00
if ( ! ( c - > btree_update_wq = alloc_workqueue ( " bcachefs " ,
2022-10-31 16:13:05 -04:00
WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM , 512 ) ) | |
2021-07-10 13:44:42 -04:00
! ( c - > btree_io_complete_wq = alloc_workqueue ( " bcachefs_btree_io " ,
2021-05-27 21:38:00 -04:00
WQ_FREEZABLE | WQ_MEM_RECLAIM , 1 ) ) | |
2020-11-02 17:51:38 -05:00
! ( c - > copygc_wq = alloc_workqueue ( " bcachefs_copygc " ,
WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE , 1 ) ) | |
2021-05-22 17:37:25 -04:00
! ( c - > io_complete_wq = alloc_workqueue ( " bcachefs_io " ,
WQ_FREEZABLE | WQ_HIGHPRI | WQ_MEM_RECLAIM , 1 ) ) | |
2019-03-21 22:19:57 -04:00
percpu_ref_init ( & c - > writes , bch2_writes_disabled ,
PERCPU_REF_INIT_DEAD , GFP_KERNEL ) | |
2017-03-16 22:18:50 -08:00
mempool_init_kmalloc_pool ( & c - > fill_iter , 1 , iter_size ) | |
bioset_init ( & c - > btree_bio , 1 ,
max ( offsetof ( struct btree_read_bio , bio ) ,
offsetof ( struct btree_write_bio , wbio . bio ) ) ,
BIOSET_NEED_BVECS ) | |
2018-11-27 08:23:22 -05:00
! ( c - > pcpu = alloc_percpu ( struct bch_fs_pcpu ) ) | |
2019-02-10 19:34:47 -05:00
! ( c - > online_reserved = alloc_percpu ( u64 ) ) | |
2021-08-30 15:18:31 -04:00
! ( c - > btree_paths_bufs = alloc_percpu ( struct btree_path_buf ) ) | |
2017-03-16 22:18:50 -08:00
mempool_init_kvpmalloc_pool ( & c - > btree_bounce_pool , 1 ,
btree_bytes ( c ) ) | |
2019-11-09 16:01:15 -05:00
mempool_init_kmalloc_pool ( & c - > large_bkey_pool , 1 , 2048 ) | |
2020-11-02 23:51:33 -05:00
! ( c - > unused_inode_hints = kcalloc ( 1U < < c - > inode_shard_bits ,
2021-11-05 21:28:17 -04:00
sizeof ( u64 ) , GFP_KERNEL ) ) ) {
2021-11-04 21:03:16 +00:00
ret = - ENOMEM ;
goto err ;
}
2021-11-05 21:28:17 -04:00
ret = bch2_io_clock_init ( & c - > io_clock [ READ ] ) ? :
bch2_io_clock_init ( & c - > io_clock [ WRITE ] ) ? :
bch2_fs_journal_init ( & c - > journal ) ? :
bch2_fs_replicas_init ( c ) ? :
bch2_fs_btree_cache_init ( c ) ? :
bch2_fs_btree_key_cache_init ( & c - > btree_key_cache ) ? :
bch2_fs_btree_iter_init ( c ) ? :
bch2_fs_btree_interior_update_init ( c ) ? :
2022-01-04 22:32:09 -05:00
bch2_fs_buckets_waiting_for_journal_init ( c ) ;
2021-11-05 21:28:17 -04:00
bch2_fs_subvolumes_init ( c ) ? :
bch2_fs_io_init ( c ) ? :
bch2_fs_encryption_init ( c ) ? :
bch2_fs_compress_init ( c ) ? :
bch2_fs_ec_init ( c ) ? :
bch2_fs_fsio_init ( c ) ;
2021-11-04 21:03:16 +00:00
if ( ret )
2017-03-16 22:18:50 -08:00
goto err ;
mi = bch2_sb_get_members ( c - > disk_sb . sb ) ;
for ( i = 0 ; i < c - > sb . nr_devices ; i + + )
if ( bch2_dev_exists ( c - > disk_sb . sb , mi , i ) & &
2021-11-04 21:03:16 +00:00
bch2_dev_alloc ( c , i ) ) {
2021-11-05 21:28:17 -04:00
ret = - EEXIST ;
2017-03-16 22:18:50 -08:00
goto err ;
2021-11-04 21:03:16 +00:00
}
2017-03-16 22:18:50 -08:00
2021-02-03 13:10:55 -05:00
bch2_journal_entry_res_resize ( & c - > journal ,
& c - > btree_root_journal_res ,
BTREE_ID_NR * ( JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX ) ) ;
bch2_dev_usage_journal_reserve ( c ) ;
bch2_journal_entry_res_resize ( & c - > journal ,
& c - > clock_journal_res ,
( sizeof ( struct jset_entry_clock ) / sizeof ( u64 ) ) * 2 ) ;
2017-03-16 22:18:50 -08:00
mutex_lock ( & bch_fs_list_lock ) ;
2021-11-05 21:28:17 -04:00
ret = bch2_fs_online ( c ) ;
2017-03-16 22:18:50 -08:00
mutex_unlock ( & bch_fs_list_lock ) ;
2021-11-05 21:28:17 -04:00
if ( ret )
2017-03-16 22:18:50 -08:00
goto err ;
out :
2021-11-04 21:03:16 +00:00
pr_verbose_init ( opts , " ret %i " , PTR_ERR_OR_ZERO ( c ) ) ;
2017-03-16 22:18:50 -08:00
return c ;
err :
bch2_fs_free ( c ) ;
2021-11-04 21:03:16 +00:00
c = ERR_PTR ( ret ) ;
2017-03-16 22:18:50 -08:00
goto out ;
}
2019-04-17 18:21:19 -04:00
noinline_for_stack
static void print_mount_opts ( struct bch_fs * c )
{
enum bch_opt_id i ;
2022-02-25 13:18:19 -05:00
struct printbuf p = PRINTBUF ;
2019-04-17 18:21:19 -04:00
bool first = true ;
if ( c - > opts . read_only ) {
pr_buf ( & p , " ro " ) ;
first = false ;
}
for ( i = 0 ; i < bch2_opts_nr ; i + + ) {
const struct bch_option * opt = & bch2_opt_table [ i ] ;
u64 v = bch2_opt_get_by_id ( & c - > opts , i ) ;
2021-12-14 14:24:41 -05:00
if ( ! ( opt - > flags & OPT_MOUNT ) )
2019-04-17 18:21:19 -04:00
continue ;
if ( v = = bch2_opt_get_by_id ( & bch2_opts_default , i ) )
continue ;
if ( ! first )
pr_buf ( & p , " , " ) ;
first = false ;
2022-03-05 12:01:16 -05:00
bch2_opt_to_text ( & p , c , c - > disk_sb . sb , opt , v , OPT_SHOW_MOUNT_STYLE ) ;
2019-04-17 18:21:19 -04:00
}
2022-02-25 13:18:19 -05:00
if ( ! p . pos )
pr_buf ( & p , " (null) " ) ;
2022-03-20 23:34:11 -04:00
bch_info ( c , " mounted version=%s opts=%s " , bch2_metadata_versions [ c - > sb . version ] , p . buf ) ;
2022-02-25 13:18:19 -05:00
printbuf_exit ( & p ) ;
2019-04-17 18:21:19 -04:00
}
int bch2_fs_start ( struct bch_fs * c )
2017-03-16 22:18:50 -08:00
{
struct bch_sb_field_members * mi ;
struct bch_dev * ca ;
2018-11-04 20:14:46 -08:00
time64_t now = ktime_get_real_seconds ( ) ;
2017-03-16 22:18:50 -08:00
unsigned i ;
int ret = - EINVAL ;
2020-06-15 14:58:47 -04:00
down_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
2019-03-21 22:19:57 -04:00
BUG_ON ( test_bit ( BCH_FS_STARTED , & c - > flags ) ) ;
2017-03-16 22:18:50 -08:00
mutex_lock ( & c - > sb_lock ) ;
for_each_online_member ( ca , c , i )
bch2_sb_from_fs ( c , ca ) ;
mi = bch2_sb_get_members ( c - > disk_sb . sb ) ;
for_each_online_member ( ca , c , i )
mi - > members [ ca - > dev_idx ] . last_mount = cpu_to_le64 ( now ) ;
mutex_unlock ( & c - > sb_lock ) ;
for_each_rw_member ( ca , c , i )
bch2_dev_allocator_add ( c , ca ) ;
bch2_recalc_capacity ( c ) ;
ret = BCH_SB_INITIALIZED ( c - > disk_sb . sb )
? bch2_fs_recovery ( c )
: bch2_fs_initialize ( c ) ;
if ( ret )
goto err ;
2018-11-01 15:13:19 -04:00
ret = bch2_opts_check_may_set ( c ) ;
if ( ret )
goto err ;
2017-03-16 22:18:50 -08:00
2019-04-17 18:21:19 -04:00
ret = - EINVAL ;
2021-11-05 21:28:17 -04:00
if ( bch2_fs_init_fault ( " fs_start " ) ) {
bch_err ( c , " fs_start fault injected " ) ;
2017-03-16 22:18:50 -08:00
goto err ;
2021-11-05 21:28:17 -04:00
}
2017-03-16 22:18:50 -08:00
2020-05-11 20:01:07 -04:00
set_bit ( BCH_FS_STARTED , & c - > flags ) ;
2020-10-15 22:50:48 -04:00
/*
* Allocator threads don ' t start filling copygc reserve until after we
* set BCH_FS_STARTED - wake them now :
2021-04-18 18:01:49 -04:00
*
* XXX ugly hack :
* Need to set ca - > allocator_state here instead of relying on the
* allocator threads to do it to avoid racing with the copygc threads
* checking it and thinking they have no alloc reserve :
2020-10-15 22:50:48 -04:00
*/
2021-04-18 18:01:49 -04:00
for_each_online_member ( ca , c , i ) {
ca - > allocator_state = ALLOCATOR_running ;
2020-10-15 22:50:48 -04:00
bch2_wake_allocator ( ca ) ;
2021-04-18 18:01:49 -04:00
}
2020-10-15 22:50:48 -04:00
2019-04-17 18:21:19 -04:00
if ( c - > opts . read_only | | c - > opts . nochanges ) {
2017-03-16 22:18:50 -08:00
bch2_fs_read_only ( c ) ;
} else {
2019-04-17 18:21:19 -04:00
ret = ! test_bit ( BCH_FS_RW , & c - > flags )
? bch2_fs_read_write ( c )
: bch2_fs_read_write_late ( c ) ;
if ( ret )
2017-03-16 22:18:50 -08:00
goto err ;
}
2019-04-17 18:21:19 -04:00
print_mount_opts ( c ) ;
ret = 0 ;
2017-03-16 22:18:50 -08:00
out :
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2019-04-17 18:21:19 -04:00
return ret ;
2017-03-16 22:18:50 -08:00
err :
switch ( ret ) {
case BCH_FSCK_ERRORS_NOT_FIXED :
bch_err ( c , " filesystem contains errors: please report this to the developers " ) ;
pr_cont ( " mount with -o fix_errors to repair \n " ) ;
break ;
case BCH_FSCK_REPAIR_UNIMPLEMENTED :
bch_err ( c , " filesystem contains errors: please report this to the developers " ) ;
pr_cont ( " repair unimplemented: inform the developers so that it can be added \n " ) ;
break ;
case BCH_FSCK_REPAIR_IMPOSSIBLE :
bch_err ( c , " filesystem contains errors, but repair impossible " ) ;
break ;
case BCH_FSCK_UNKNOWN_VERSION :
2021-11-05 21:28:17 -04:00
bch_err ( c , " unknown metadata version " ) ;
2017-03-16 22:18:50 -08:00
break ;
case - ENOMEM :
2021-11-05 21:28:17 -04:00
bch_err ( c , " cannot allocate memory " ) ;
2017-03-16 22:18:50 -08:00
break ;
case - EIO :
2021-11-05 21:28:17 -04:00
bch_err ( c , " IO error " ) ;
2017-03-16 22:18:50 -08:00
break ;
}
2019-04-22 17:47:49 -04:00
if ( ret > = 0 )
ret = - EIO ;
2017-03-16 22:18:50 -08:00
goto out ;
}
static const char * bch2_dev_may_add ( struct bch_sb * sb , struct bch_fs * c )
{
struct bch_sb_field_members * sb_mi ;
sb_mi = bch2_sb_get_members ( sb ) ;
if ( ! sb_mi )
return " Invalid superblock: member info area missing " ;
2021-12-14 14:24:41 -05:00
if ( le16_to_cpu ( sb - > block_size ) ! = block_sectors ( c ) )
2017-03-16 22:18:50 -08:00
return " mismatched block size " ;
if ( le16_to_cpu ( sb_mi - > members [ sb - > dev_idx ] . bucket_size ) <
BCH_SB_BTREE_NODE_SIZE ( c - > disk_sb . sb ) )
return " new cache bucket size is too small " ;
return NULL ;
}
static const char * bch2_dev_in_fs ( struct bch_sb * fs , struct bch_sb * sb )
{
struct bch_sb * newest =
le64_to_cpu ( fs - > seq ) > le64_to_cpu ( sb - > seq ) ? fs : sb ;
struct bch_sb_field_members * mi = bch2_sb_get_members ( newest ) ;
if ( ! uuid_equal ( & fs - > uuid , & sb - > uuid ) )
return " device not a member of filesystem " ;
if ( ! bch2_dev_exists ( newest , mi , sb - > dev_idx ) )
return " device has been removed " ;
if ( fs - > block_size ! = sb - > block_size )
return " mismatched block size " ;
return NULL ;
}
/* Device startup/shutdown: */
/* kobject release hook: frees the bch_dev that embeds @kobj. */
static void bch2_dev_release(struct kobject *kobj)
{
	kfree(container_of(kobj, struct bch_dev, kobj));
}
static void bch2_dev_free ( struct bch_dev * ca )
{
2021-03-05 18:00:55 -05:00
bch2_dev_allocator_stop ( ca ) ;
2017-03-16 22:18:50 -08:00
cancel_work_sync ( & ca - > io_error_work ) ;
if ( ca - > kobj . state_in_sysfs & &
ca - > disk_sb . bdev )
sysfs_remove_link ( bdev_kobj ( ca - > disk_sb . bdev ) , " bcachefs " ) ;
if ( ca - > kobj . state_in_sysfs )
kobject_del ( & ca - > kobj ) ;
bch2_free_super ( & ca - > disk_sb ) ;
bch2_dev_journal_exit ( ca ) ;
free_percpu ( ca - > io_done ) ;
bioset_exit ( & ca - > replica_set ) ;
bch2_dev_buckets_free ( ca ) ;
2019-04-06 14:32:06 -04:00
free_page ( ( unsigned long ) ca - > sb_read_scratch ) ;
2017-03-16 22:18:50 -08:00
bch2_time_stats_exit ( & ca - > io_latency [ WRITE ] ) ;
bch2_time_stats_exit ( & ca - > io_latency [ READ ] ) ;
percpu_ref_exit ( & ca - > io_ref ) ;
percpu_ref_exit ( & ca - > ref ) ;
kobject_put ( & ca - > kobj ) ;
}
static void __bch2_dev_offline ( struct bch_fs * c , struct bch_dev * ca )
{
lockdep_assert_held ( & c - > state_lock ) ;
if ( percpu_ref_is_zero ( & ca - > io_ref ) )
return ;
__bch2_dev_read_only ( c , ca ) ;
reinit_completion ( & ca - > io_ref_completion ) ;
percpu_ref_kill ( & ca - > io_ref ) ;
wait_for_completion ( & ca - > io_ref_completion ) ;
if ( ca - > kobj . state_in_sysfs ) {
sysfs_remove_link ( bdev_kobj ( ca - > disk_sb . bdev ) , " bcachefs " ) ;
sysfs_remove_link ( & ca - > kobj , " block " ) ;
}
bch2_free_super ( & ca - > disk_sb ) ;
bch2_dev_journal_exit ( ca ) ;
}
/* percpu_ref release callback for ca->ref: signals ca->ref_completion. */
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
	complete(&container_of(ref, struct bch_dev, ref)->ref_completion);
}
/* percpu_ref release callback for ca->io_ref: signals ca->io_ref_completion. */
static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
	complete(&container_of(ref, struct bch_dev, io_ref)->io_ref_completion);
}
static int bch2_dev_sysfs_online ( struct bch_fs * c , struct bch_dev * ca )
{
int ret ;
if ( ! c - > kobj . state_in_sysfs )
return 0 ;
if ( ! ca - > kobj . state_in_sysfs ) {
ret = kobject_add ( & ca - > kobj , & c - > kobj ,
" dev-%u " , ca - > dev_idx ) ;
if ( ret )
return ret ;
}
if ( ca - > disk_sb . bdev ) {
struct kobject * block = bdev_kobj ( ca - > disk_sb . bdev ) ;
ret = sysfs_create_link ( block , & ca - > kobj , " bcachefs " ) ;
if ( ret )
return ret ;
ret = sysfs_create_link ( & ca - > kobj , block , " block " ) ;
if ( ret )
return ret ;
}
return 0 ;
}
static struct bch_dev * __bch2_dev_alloc ( struct bch_fs * c ,
struct bch_member * member )
{
struct bch_dev * ca ;
ca = kzalloc ( sizeof ( * ca ) , GFP_KERNEL ) ;
if ( ! ca )
return NULL ;
kobject_init ( & ca - > kobj , & bch2_dev_ktype ) ;
init_completion ( & ca - > ref_completion ) ;
init_completion ( & ca - > io_ref_completion ) ;
init_rwsem ( & ca - > bucket_lock ) ;
INIT_WORK ( & ca - > io_error_work , bch2_io_error_work ) ;
bch2_time_stats_init ( & ca - > io_latency [ READ ] ) ;
bch2_time_stats_init ( & ca - > io_latency [ WRITE ] ) ;
ca - > mi = bch2_mi_to_cpu ( member ) ;
ca - > uuid = member - > uuid ;
if ( percpu_ref_init ( & ca - > ref , bch2_dev_ref_complete ,
0 , GFP_KERNEL ) | |
percpu_ref_init ( & ca - > io_ref , bch2_dev_io_ref_complete ,
PERCPU_REF_INIT_DEAD , GFP_KERNEL ) | |
2019-04-06 14:32:06 -04:00
! ( ca - > sb_read_scratch = ( void * ) __get_free_page ( GFP_KERNEL ) ) | |
2017-03-16 22:18:50 -08:00
bch2_dev_buckets_alloc ( c , ca ) | |
bioset_init ( & ca - > replica_set , 4 ,
offsetof ( struct bch_write_bio , bio ) , 0 ) | |
! ( ca - > io_done = alloc_percpu ( * ca - > io_done ) ) )
goto err ;
return ca ;
err :
bch2_dev_free ( ca ) ;
return NULL ;
}
static void bch2_dev_attach ( struct bch_fs * c , struct bch_dev * ca ,
unsigned dev_idx )
{
ca - > dev_idx = dev_idx ;
__set_bit ( ca - > dev_idx , ca - > self . d ) ;
scnprintf ( ca - > name , sizeof ( ca - > name ) , " dev-%u " , dev_idx ) ;
ca - > fs = c ;
rcu_assign_pointer ( c - > devs [ ca - > dev_idx ] , ca ) ;
if ( bch2_dev_sysfs_online ( c , ca ) )
pr_warn ( " error creating sysfs objects " ) ;
}
static int bch2_dev_alloc ( struct bch_fs * c , unsigned dev_idx )
{
struct bch_member * member =
bch2_sb_get_members ( c - > disk_sb . sb ) - > members + dev_idx ;
struct bch_dev * ca = NULL ;
int ret = 0 ;
pr_verbose_init ( c - > opts , " " ) ;
if ( bch2_fs_init_fault ( " dev_alloc " ) )
goto err ;
ca = __bch2_dev_alloc ( c , member ) ;
if ( ! ca )
goto err ;
2021-03-11 21:46:23 -05:00
ca - > fs = c ;
2021-02-20 19:47:58 -05:00
if ( ca - > mi . state = = BCH_MEMBER_STATE_rw & &
2021-03-05 18:00:55 -05:00
bch2_dev_allocator_start ( ca ) ) {
bch2_dev_free ( ca ) ;
goto err ;
}
2017-03-16 22:18:50 -08:00
bch2_dev_attach ( c , ca , dev_idx ) ;
out :
pr_verbose_init ( c - > opts , " ret %i " , ret ) ;
return ret ;
err :
if ( ca )
bch2_dev_free ( ca ) ;
ret = - ENOMEM ;
goto out ;
}
static int __bch2_dev_attach_bdev ( struct bch_dev * ca , struct bch_sb_handle * sb )
{
unsigned ret ;
if ( bch2_dev_is_online ( ca ) ) {
bch_err ( ca , " already have device online in slot %u " ,
sb - > sb - > dev_idx ) ;
return - EINVAL ;
}
if ( get_capacity ( sb - > bdev - > bd_disk ) <
ca - > mi . bucket_size * ca - > mi . nbuckets ) {
bch_err ( ca , " cannot online: device too small " ) ;
return - EINVAL ;
}
BUG_ON ( ! percpu_ref_is_zero ( & ca - > io_ref ) ) ;
if ( get_capacity ( sb - > bdev - > bd_disk ) <
ca - > mi . bucket_size * ca - > mi . nbuckets ) {
bch_err ( ca , " device too small " ) ;
return - EINVAL ;
}
ret = bch2_dev_journal_init ( ca , sb - > sb ) ;
if ( ret )
return ret ;
/* Commit: */
ca - > disk_sb = * sb ;
memset ( sb , 0 , sizeof ( * sb ) ) ;
2022-01-02 21:45:35 -05:00
ca - > dev = ca - > disk_sb . bdev - > bd_dev ;
2017-03-16 22:18:50 -08:00
percpu_ref_reinit ( & ca - > io_ref ) ;
return 0 ;
}
static int bch2_dev_attach_bdev ( struct bch_fs * c , struct bch_sb_handle * sb )
{
struct bch_dev * ca ;
int ret ;
lockdep_assert_held ( & c - > state_lock ) ;
if ( le64_to_cpu ( sb - > sb - > seq ) >
le64_to_cpu ( c - > disk_sb . sb - > seq ) )
bch2_sb_to_fs ( c , sb - > sb ) ;
BUG_ON ( sb - > sb - > dev_idx > = c - > sb . nr_devices | |
! c - > devs [ sb - > sb - > dev_idx ] ) ;
ca = bch_dev_locked ( c , sb - > sb - > dev_idx ) ;
ret = __bch2_dev_attach_bdev ( ca , sb ) ;
if ( ret )
return ret ;
bch2_dev_sysfs_online ( c , ca ) ;
if ( c - > sb . nr_devices = = 1 )
snprintf ( c - > name , sizeof ( c - > name ) , " %pg " , ca - > disk_sb . bdev ) ;
snprintf ( ca - > name , sizeof ( ca - > name ) , " %pg " , ca - > disk_sb . bdev ) ;
rebalance_wakeup ( c ) ;
return 0 ;
}
/* Device management: */
/*
 * Note: this function is also used by the error paths - when a particular
 * device sees an error, we call it to determine whether we can just set the
 * device RO, or - if this function returns false - we'll set the whole
 * filesystem RO:
 *
 * XXX: maybe we should be more explicit about whether we're changing state
 * because we got an error or what have you?
 */
bool bch2_dev_state_allowed ( struct bch_fs * c , struct bch_dev * ca ,
enum bch_member_state new_state , int flags )
{
struct bch_devs_mask new_online_devs ;
struct bch_dev * ca2 ;
int i , nr_rw = 0 , required ;
lockdep_assert_held ( & c - > state_lock ) ;
switch ( new_state ) {
2021-02-20 19:47:58 -05:00
case BCH_MEMBER_STATE_rw :
2017-03-16 22:18:50 -08:00
return true ;
2021-02-20 19:47:58 -05:00
case BCH_MEMBER_STATE_ro :
if ( ca - > mi . state ! = BCH_MEMBER_STATE_rw )
2017-03-16 22:18:50 -08:00
return true ;
/* do we have enough devices to write to? */
for_each_member_device ( ca2 , c , i )
if ( ca2 ! = ca )
2021-02-20 19:47:58 -05:00
nr_rw + = ca2 - > mi . state = = BCH_MEMBER_STATE_rw ;
2017-03-16 22:18:50 -08:00
required = max ( ! ( flags & BCH_FORCE_IF_METADATA_DEGRADED )
? c - > opts . metadata_replicas
: c - > opts . metadata_replicas_required ,
! ( flags & BCH_FORCE_IF_DATA_DEGRADED )
? c - > opts . data_replicas
: c - > opts . data_replicas_required ) ;
return nr_rw > = required ;
2021-02-20 19:47:58 -05:00
case BCH_MEMBER_STATE_failed :
case BCH_MEMBER_STATE_spare :
if ( ca - > mi . state ! = BCH_MEMBER_STATE_rw & &
ca - > mi . state ! = BCH_MEMBER_STATE_ro )
2017-03-16 22:18:50 -08:00
return true ;
/* do we have enough devices to read from? */
new_online_devs = bch2_online_devs ( c ) ;
__clear_bit ( ca - > dev_idx , new_online_devs . d ) ;
2021-02-06 23:17:26 -05:00
return bch2_have_enough_devs ( c , new_online_devs , flags , false ) ;
2017-03-16 22:18:50 -08:00
default :
BUG ( ) ;
}
}
static bool bch2_fs_may_start ( struct bch_fs * c )
{
struct bch_sb_field_members * mi ;
struct bch_dev * ca ;
2021-02-06 23:17:26 -05:00
unsigned i , flags = 0 ;
if ( c - > opts . very_degraded )
flags | = BCH_FORCE_IF_DEGRADED | BCH_FORCE_IF_LOST ;
2017-03-16 22:18:50 -08:00
2021-02-06 23:17:26 -05:00
if ( c - > opts . degraded )
flags | = BCH_FORCE_IF_DEGRADED ;
if ( ! c - > opts . degraded & &
! c - > opts . very_degraded ) {
2017-03-16 22:18:50 -08:00
mutex_lock ( & c - > sb_lock ) ;
mi = bch2_sb_get_members ( c - > disk_sb . sb ) ;
for ( i = 0 ; i < c - > disk_sb . sb - > nr_devices ; i + + ) {
if ( ! bch2_dev_exists ( c - > disk_sb . sb , mi , i ) )
continue ;
ca = bch_dev_locked ( c , i ) ;
if ( ! bch2_dev_is_online ( ca ) & &
2021-02-20 19:47:58 -05:00
( ca - > mi . state = = BCH_MEMBER_STATE_rw | |
ca - > mi . state = = BCH_MEMBER_STATE_ro ) ) {
2017-03-16 22:18:50 -08:00
mutex_unlock ( & c - > sb_lock ) ;
return false ;
}
}
mutex_unlock ( & c - > sb_lock ) ;
}
2021-02-06 23:17:26 -05:00
return bch2_have_enough_devs ( c , bch2_online_devs ( c ) , flags , true ) ;
2017-03-16 22:18:50 -08:00
}
static void __bch2_dev_read_only ( struct bch_fs * c , struct bch_dev * ca )
{
2020-07-21 17:12:39 -04:00
/*
* Device going read only means the copygc reserve get smaller , so we
* don ' t want that happening while copygc is in progress :
*/
bch2_copygc_stop ( c ) ;
2017-03-16 22:18:50 -08:00
/*
* The allocator thread itself allocates btree nodes , so stop it first :
*/
bch2_dev_allocator_stop ( ca ) ;
bch2_dev_allocator_remove ( c , ca ) ;
bch2_dev_journal_stop ( & c - > journal , ca ) ;
2020-07-21 17:12:39 -04:00
bch2_copygc_start ( c ) ;
2017-03-16 22:18:50 -08:00
}
2021-11-05 21:28:17 -04:00
static int __bch2_dev_read_write ( struct bch_fs * c , struct bch_dev * ca )
2017-03-16 22:18:50 -08:00
{
lockdep_assert_held ( & c - > state_lock ) ;
2021-02-20 19:47:58 -05:00
BUG_ON ( ca - > mi . state ! = BCH_MEMBER_STATE_rw ) ;
2017-03-16 22:18:50 -08:00
bch2_dev_allocator_add ( c , ca ) ;
bch2_recalc_capacity ( c ) ;
2021-11-05 21:28:17 -04:00
return bch2_dev_allocator_start ( ca ) ;
2017-03-16 22:18:50 -08:00
}
int __bch2_dev_set_state ( struct bch_fs * c , struct bch_dev * ca ,
enum bch_member_state new_state , int flags )
{
struct bch_sb_field_members * mi ;
int ret = 0 ;
if ( ca - > mi . state = = new_state )
return 0 ;
if ( ! bch2_dev_state_allowed ( c , ca , new_state , flags ) )
return - EINVAL ;
2021-02-20 19:47:58 -05:00
if ( new_state ! = BCH_MEMBER_STATE_rw )
2017-03-16 22:18:50 -08:00
__bch2_dev_read_only ( c , ca ) ;
2021-02-20 19:47:58 -05:00
bch_notice ( ca , " %s " , bch2_member_states [ new_state ] ) ;
2017-03-16 22:18:50 -08:00
mutex_lock ( & c - > sb_lock ) ;
mi = bch2_sb_get_members ( c - > disk_sb . sb ) ;
SET_BCH_MEMBER_STATE ( & mi - > members [ ca - > dev_idx ] , new_state ) ;
bch2_write_super ( c ) ;
mutex_unlock ( & c - > sb_lock ) ;
2021-11-05 21:28:17 -04:00
if ( new_state = = BCH_MEMBER_STATE_rw )
ret = __bch2_dev_read_write ( c , ca ) ;
2017-03-16 22:18:50 -08:00
rebalance_wakeup ( c ) ;
return ret ;
}
int bch2_dev_set_state ( struct bch_fs * c , struct bch_dev * ca ,
enum bch_member_state new_state , int flags )
{
int ret ;
2020-06-15 14:58:47 -04:00
down_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
ret = __bch2_dev_set_state ( c , ca , new_state , flags ) ;
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
return ret ;
}
/* Device add/removal: */
2021-05-23 17:04:13 -04:00
static int bch2_dev_remove_alloc ( struct bch_fs * c , struct bch_dev * ca )
2019-10-05 12:54:53 -04:00
{
2021-12-11 17:13:09 -05:00
struct bpos start = POS ( ca - > dev_idx , 0 ) ;
struct bpos end = POS ( ca - > dev_idx , U64_MAX ) ;
2019-10-05 12:54:53 -04:00
int ret ;
2021-12-11 17:13:09 -05:00
ret = bch2_btree_delete_range ( c , BTREE_ID_alloc , start , end ,
BTREE_TRIGGER_NORUN , NULL ) ? :
bch2_btree_delete_range ( c , BTREE_ID_freespace , start , end ,
BTREE_TRIGGER_NORUN , NULL ) ? :
bch2_btree_delete_range ( c , BTREE_ID_need_discard , start , end ,
BTREE_TRIGGER_NORUN , NULL ) ;
if ( ret )
2021-10-07 14:53:21 -04:00
bch_err ( c , " error %i removing dev alloc info " , ret ) ;
2019-10-05 12:54:53 -04:00
2021-12-11 17:13:09 -05:00
return ret ;
2019-10-05 12:54:53 -04:00
}
2017-03-16 22:18:50 -08:00
int bch2_dev_remove ( struct bch_fs * c , struct bch_dev * ca , int flags )
{
struct bch_sb_field_members * mi ;
unsigned dev_idx = ca - > dev_idx , data ;
int ret = - EINVAL ;
2020-06-15 14:58:47 -04:00
down_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
2020-01-03 22:38:14 -05:00
/*
* We consume a reference to ca - > ref , regardless of whether we succeed
* or fail :
*/
percpu_ref_put ( & ca - > ref ) ;
2017-03-16 22:18:50 -08:00
2021-02-20 19:47:58 -05:00
if ( ! bch2_dev_state_allowed ( c , ca , BCH_MEMBER_STATE_failed , flags ) ) {
2017-03-16 22:18:50 -08:00
bch_err ( ca , " Cannot remove without losing data " ) ;
goto err ;
}
__bch2_dev_read_only ( c , ca ) ;
ret = bch2_dev_data_drop ( c , ca - > dev_idx , flags ) ;
if ( ret ) {
bch_err ( ca , " Remove failed: error %i dropping data " , ret ) ;
goto err ;
}
ret = bch2_journal_flush_device_pins ( & c - > journal , ca - > dev_idx ) ;
if ( ret ) {
bch_err ( ca , " Remove failed: error %i flushing journal " , ret ) ;
goto err ;
}
2019-10-05 12:54:53 -04:00
ret = bch2_dev_remove_alloc ( c , ca ) ;
2017-03-16 22:18:50 -08:00
if ( ret ) {
bch_err ( ca , " Remove failed, error deleting alloc info " ) ;
goto err ;
}
/*
* must flush all existing journal entries , they might have
* ( overwritten ) keys that point to the device we ' re removing :
*/
bch2_journal_flush_all_pins ( & c - > journal ) ;
2020-01-03 22:38:14 -05:00
/*
* hack to ensure bch2_replicas_gc2 ( ) clears out entries to this device
*/
bch2_journal_meta ( & c - > journal ) ;
2017-03-16 22:18:50 -08:00
ret = bch2_journal_error ( & c - > journal ) ;
if ( ret ) {
bch_err ( ca , " Remove failed, journal error " ) ;
goto err ;
}
2020-01-03 22:38:14 -05:00
ret = bch2_replicas_gc2 ( c ) ;
if ( ret ) {
bch_err ( ca , " Remove failed: error %i from replicas gc " , ret ) ;
goto err ;
}
data = bch2_dev_has_data ( c , ca ) ;
if ( data ) {
2022-02-25 13:18:19 -05:00
struct printbuf data_has = PRINTBUF ;
2020-01-03 22:38:14 -05:00
2022-02-25 13:18:19 -05:00
bch2_flags_to_text ( & data_has , bch2_data_types , data ) ;
bch_err ( ca , " Remove failed, still has data (%s) " , data_has . buf ) ;
printbuf_exit ( & data_has ) ;
2020-01-03 22:38:14 -05:00
ret = - EBUSY ;
goto err ;
}
2017-03-16 22:18:50 -08:00
__bch2_dev_offline ( c , ca ) ;
mutex_lock ( & c - > sb_lock ) ;
rcu_assign_pointer ( c - > devs [ ca - > dev_idx ] , NULL ) ;
mutex_unlock ( & c - > sb_lock ) ;
percpu_ref_kill ( & ca - > ref ) ;
wait_for_completion ( & ca - > ref_completion ) ;
bch2_dev_free ( ca ) ;
/*
* Free this device ' s slot in the bch_member array - all pointers to
* this device must be gone :
*/
mutex_lock ( & c - > sb_lock ) ;
mi = bch2_sb_get_members ( c - > disk_sb . sb ) ;
memset ( & mi - > members [ dev_idx ] . uuid , 0 , sizeof ( mi - > members [ dev_idx ] . uuid ) ) ;
bch2_write_super ( c ) ;
mutex_unlock ( & c - > sb_lock ) ;
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2021-01-21 21:52:06 -05:00
bch2_dev_usage_journal_reserve ( c ) ;
2017-03-16 22:18:50 -08:00
return 0 ;
err :
2021-02-20 19:47:58 -05:00
if ( ca - > mi . state = = BCH_MEMBER_STATE_rw & &
2018-12-18 08:41:58 -05:00
! percpu_ref_is_zero ( & ca - > io_ref ) )
2017-03-16 22:18:50 -08:00
__bch2_dev_read_write ( c , ca ) ;
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
return ret ;
}
/* Add new device to running filesystem: */
int bch2_dev_add ( struct bch_fs * c , const char * path )
{
struct bch_opts opts = bch2_opts_empty ( ) ;
struct bch_sb_handle sb ;
const char * err ;
struct bch_dev * ca = NULL ;
struct bch_sb_field_members * mi ;
struct bch_member dev_mi ;
unsigned dev_idx , nr_devices , u64s ;
2022-02-25 13:18:19 -05:00
struct printbuf errbuf = PRINTBUF ;
2017-03-16 22:18:50 -08:00
int ret ;
ret = bch2_read_super ( path , & opts , & sb ) ;
2021-12-28 16:31:57 -05:00
if ( ret ) {
bch_err ( c , " device add error: error reading super: %i " , ret ) ;
2022-01-03 23:38:50 -05:00
goto err ;
2021-12-28 16:31:57 -05:00
}
2017-03-16 22:18:50 -08:00
dev_mi = bch2_sb_get_members ( sb . sb ) - > members [ sb . sb - > dev_idx ] ;
err = bch2_dev_may_add ( sb . sb , c ) ;
2021-12-28 16:31:57 -05:00
if ( err ) {
bch_err ( c , " device add error: %s " , err ) ;
2022-01-03 23:38:50 -05:00
ret = - EINVAL ;
goto err ;
2021-12-28 16:31:57 -05:00
}
2017-03-16 22:18:50 -08:00
ca = __bch2_dev_alloc ( c , & dev_mi ) ;
if ( ! ca ) {
bch2_free_super ( & sb ) ;
2022-01-03 23:38:50 -05:00
ret = - ENOMEM ;
goto err ;
2017-03-16 22:18:50 -08:00
}
ret = __bch2_dev_attach_bdev ( ca , & sb ) ;
if ( ret ) {
bch2_dev_free ( ca ) ;
2022-01-03 23:38:50 -05:00
goto err ;
2017-03-16 22:18:50 -08:00
}
ret = bch2_dev_journal_alloc ( ca ) ;
2021-12-28 16:31:57 -05:00
if ( ret ) {
bch_err ( c , " device add error: journal alloc failed " ) ;
2017-03-16 22:18:50 -08:00
goto err ;
2021-12-28 16:31:57 -05:00
}
2017-03-16 22:18:50 -08:00
2020-06-15 14:58:47 -04:00
down_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
mutex_lock ( & c - > sb_lock ) ;
ret = bch2_sb_from_fs ( c , ca ) ;
2021-12-28 16:31:57 -05:00
if ( ret ) {
bch_err ( c , " device add error: new device superblock too small " ) ;
2017-03-16 22:18:50 -08:00
goto err_unlock ;
2021-12-28 16:31:57 -05:00
}
2017-03-16 22:18:50 -08:00
mi = bch2_sb_get_members ( ca - > disk_sb . sb ) ;
if ( ! bch2_sb_resize_members ( & ca - > disk_sb ,
le32_to_cpu ( mi - > field . u64s ) +
sizeof ( dev_mi ) / sizeof ( u64 ) ) ) {
2021-12-28 16:31:57 -05:00
bch_err ( c , " device add error: new device superblock too small " ) ;
2017-03-16 22:18:50 -08:00
ret = - ENOSPC ;
goto err_unlock ;
}
if ( dynamic_fault ( " bcachefs:add:no_slot " ) )
goto no_slot ;
mi = bch2_sb_get_members ( c - > disk_sb . sb ) ;
for ( dev_idx = 0 ; dev_idx < BCH_SB_MEMBERS_MAX ; dev_idx + + )
if ( ! bch2_dev_exists ( c - > disk_sb . sb , mi , dev_idx ) )
goto have_slot ;
no_slot :
2021-12-28 16:31:57 -05:00
bch_err ( c , " device add error: already have maximum number of devices " ) ;
2017-03-16 22:18:50 -08:00
ret = - ENOSPC ;
goto err_unlock ;
have_slot :
nr_devices = max_t ( unsigned , dev_idx + 1 , c - > sb . nr_devices ) ;
u64s = ( sizeof ( struct bch_sb_field_members ) +
sizeof ( struct bch_member ) * nr_devices ) / sizeof ( u64 ) ;
mi = bch2_sb_resize_members ( & c - > disk_sb , u64s ) ;
2021-12-28 16:31:57 -05:00
if ( ! mi ) {
bch_err ( c , " device add error: no room in superblock for member info " ) ;
ret = - ENOSPC ;
2017-03-16 22:18:50 -08:00
goto err_unlock ;
2021-12-28 16:31:57 -05:00
}
2017-03-16 22:18:50 -08:00
/* success: */
mi - > members [ dev_idx ] = dev_mi ;
2018-11-04 20:14:46 -08:00
mi - > members [ dev_idx ] . last_mount = cpu_to_le64 ( ktime_get_real_seconds ( ) ) ;
2017-03-16 22:18:50 -08:00
c - > disk_sb . sb - > nr_devices = nr_devices ;
ca - > disk_sb . sb - > dev_idx = dev_idx ;
bch2_dev_attach ( c , ca , dev_idx ) ;
bch2_write_super ( c ) ;
mutex_unlock ( & c - > sb_lock ) ;
2021-01-21 21:52:06 -05:00
bch2_dev_usage_journal_reserve ( c ) ;
2021-04-14 20:25:33 -04:00
ret = bch2_trans_mark_dev_sb ( c , ca ) ;
2021-12-28 16:31:57 -05:00
if ( ret ) {
bch_err ( c , " device add error: error marking new superblock: %i " , ret ) ;
2021-01-22 17:56:34 -05:00
goto err_late ;
2021-12-28 16:31:57 -05:00
}
2020-10-16 21:36:26 -04:00
2021-12-11 17:13:09 -05:00
ret = bch2_fs_freespace_init ( c ) ;
if ( ret ) {
bch_err ( c , " device add error: error initializing free space: %i " , ret ) ;
goto err_late ;
}
2021-12-24 04:22:20 -05:00
ca - > new_fs_bucket_idx = 0 ;
2021-02-20 19:47:58 -05:00
if ( ca - > mi . state = = BCH_MEMBER_STATE_rw ) {
2021-11-05 21:28:17 -04:00
ret = __bch2_dev_read_write ( c , ca ) ;
2021-12-28 16:31:57 -05:00
if ( ret ) {
bch_err ( c , " device add error: error going RW on new device: %i " , ret ) ;
2017-03-16 22:18:50 -08:00
goto err_late ;
2021-12-28 16:31:57 -05:00
}
2017-03-16 22:18:50 -08:00
}
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
return 0 ;
err_unlock :
mutex_unlock ( & c - > sb_lock ) ;
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
err :
if ( ca )
bch2_dev_free ( ca ) ;
bch2_free_super ( & sb ) ;
2022-02-25 13:18:19 -05:00
printbuf_exit ( & errbuf ) ;
2017-03-16 22:18:50 -08:00
return ret ;
err_late :
2021-01-22 17:56:34 -05:00
up_write ( & c - > state_lock ) ;
2022-01-03 23:38:50 -05:00
ca = NULL ;
goto err ;
2017-03-16 22:18:50 -08:00
}
/* Hot add existing device to running filesystem: */
int bch2_dev_online ( struct bch_fs * c , const char * path )
{
struct bch_opts opts = bch2_opts_empty ( ) ;
struct bch_sb_handle sb = { NULL } ;
struct bch_sb_field_members * mi ;
struct bch_dev * ca ;
unsigned dev_idx ;
const char * err ;
int ret ;
2020-06-15 14:58:47 -04:00
down_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
ret = bch2_read_super ( path , & opts , & sb ) ;
if ( ret ) {
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
return ret ;
}
dev_idx = sb . sb - > dev_idx ;
err = bch2_dev_in_fs ( c - > disk_sb . sb , sb . sb ) ;
2021-11-05 21:28:17 -04:00
if ( err ) {
bch_err ( c , " error bringing %s online: %s " , path , err ) ;
2017-03-16 22:18:50 -08:00
goto err ;
2021-11-05 21:28:17 -04:00
}
2017-03-16 22:18:50 -08:00
2021-11-05 21:28:17 -04:00
ret = bch2_dev_attach_bdev ( c , & sb ) ;
if ( ret )
2017-03-16 22:18:50 -08:00
goto err ;
ca = bch_dev_locked ( c , dev_idx ) ;
2021-01-22 17:56:34 -05:00
2021-11-05 21:28:17 -04:00
ret = bch2_trans_mark_dev_sb ( c , ca ) ;
if ( ret ) {
bch_err ( c , " error bringing %s online: error %i from bch2_trans_mark_dev_sb " ,
path , ret ) ;
2021-01-22 17:56:34 -05:00
goto err ;
}
2021-02-20 19:47:58 -05:00
if ( ca - > mi . state = = BCH_MEMBER_STATE_rw ) {
2021-11-05 21:28:17 -04:00
ret = __bch2_dev_read_write ( c , ca ) ;
if ( ret )
2017-03-16 22:18:50 -08:00
goto err ;
}
mutex_lock ( & c - > sb_lock ) ;
mi = bch2_sb_get_members ( c - > disk_sb . sb ) ;
mi - > members [ ca - > dev_idx ] . last_mount =
2018-11-04 20:14:46 -08:00
cpu_to_le64 ( ktime_get_real_seconds ( ) ) ;
2017-03-16 22:18:50 -08:00
bch2_write_super ( c ) ;
mutex_unlock ( & c - > sb_lock ) ;
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
return 0 ;
err :
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
bch2_free_super ( & sb ) ;
return - EINVAL ;
}
int bch2_dev_offline ( struct bch_fs * c , struct bch_dev * ca , int flags )
{
2020-06-15 14:58:47 -04:00
down_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
if ( ! bch2_dev_is_online ( ca ) ) {
bch_err ( ca , " Already offline " ) ;
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
return 0 ;
}
2021-02-20 19:47:58 -05:00
if ( ! bch2_dev_state_allowed ( c , ca , BCH_MEMBER_STATE_failed , flags ) ) {
2017-03-16 22:18:50 -08:00
bch_err ( ca , " Cannot offline required disk " ) ;
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
return - EINVAL ;
}
__bch2_dev_offline ( c , ca ) ;
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
return 0 ;
}
/*
 * Grow device @ca to @nbuckets buckets.
 *
 * Shrinking is rejected, as is a size that exceeds the capacity of an online
 * block device.  On success the bucket arrays are resized, the superblock
 * buckets are re-marked, the member's nbuckets is written to the superblock
 * and filesystem capacity is recomputed.
 *
 * Returns 0 on success or a negative error code.
 */
int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
	struct bch_member *member;
	int ret = 0;

	down_write(&c->state_lock);

	if (nbuckets < ca->mi.nbuckets) {
		bch_err(ca, " Cannot shrink yet ");
		ret = -EINVAL;
		goto out;
	}

	/* The capacity check is only possible while the device is online: */
	if (bch2_dev_is_online(ca) &&
	    get_capacity(ca->disk_sb.bdev->bd_disk) <
	    ca->mi.bucket_size * nbuckets) {
		bch_err(ca, " New size larger than device ");
		ret = -EINVAL;
		goto out;
	}

	ret = bch2_dev_buckets_resize(c, ca, nbuckets);
	if (ret) {
		bch_err(ca, " Resize error: %i ", ret);
		goto out;
	}

	ret = bch2_trans_mark_dev_sb(c, ca);
	if (ret)
		goto out;

	mutex_lock(&c->sb_lock);
	member = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
	member->nbuckets = cpu_to_le64(nbuckets);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	bch2_recalc_capacity(c);
out:
	up_write(&c->state_lock);
	return ret;
}
/* return with ref on ca->ref: */
2022-02-16 06:23:06 -05:00
/*
 * Look up a member device of @c by name: walks the member devices under
 * rcu_read_lock() and returns the first one whose ->name equals @name, or
 * ERR_PTR(-ENOENT) if none matches.
 *
 * NOTE(review): the comment preceding this function says it returns with a
 * ref on ca->ref, but no reference is visibly taken in this body - confirm
 * whether for_each_member_device_rcu() acquires one, or whether callers
 * that later drop a ref are unbalanced.
 */
struct bch_dev * bch2_dev_lookup ( struct bch_fs * c , const char * name )
2017-03-16 22:18:50 -08:00
{
struct bch_dev * ca ;
unsigned i ;
2021-05-07 20:43:43 -04:00
rcu_read_lock ( ) ;
/* NULL is passed as the macro's fourth argument; presumably a device mask filter - TODO confirm */
for_each_member_device_rcu ( ca , c , i , NULL )
2022-02-16 06:23:06 -05:00
if ( ! strcmp ( name , ca - > name ) )
2017-03-16 22:18:50 -08:00
goto found ;
/* no device matched: the loop ran to completion */
ca = ERR_PTR ( - ENOENT ) ;
found :
2021-05-07 20:43:43 -04:00
rcu_read_unlock ( ) ;
2017-03-16 22:18:50 -08:00
return ca ;
}
/* Filesystem open: */
struct bch_fs * bch2_fs_open ( char * const * devices , unsigned nr_devices ,
struct bch_opts opts )
{
struct bch_sb_handle * sb = NULL ;
struct bch_fs * c = NULL ;
2020-09-06 22:58:28 -04:00
struct bch_sb_field_members * mi ;
2017-03-16 22:18:50 -08:00
unsigned i , best_sb = 0 ;
const char * err ;
2022-02-25 13:18:19 -05:00
struct printbuf errbuf = PRINTBUF ;
2021-11-05 21:28:17 -04:00
int ret = 0 ;
2017-03-16 22:18:50 -08:00
2022-01-03 23:38:50 -05:00
if ( ! try_module_get ( THIS_MODULE ) )
return ERR_PTR ( - ENODEV ) ;
2017-03-16 22:18:50 -08:00
pr_verbose_init ( opts , " " ) ;
if ( ! nr_devices ) {
2022-01-03 23:38:50 -05:00
ret = - EINVAL ;
goto err ;
2017-03-16 22:18:50 -08:00
}
sb = kcalloc ( nr_devices , sizeof ( * sb ) , GFP_KERNEL ) ;
2021-11-05 21:28:17 -04:00
if ( ! sb ) {
ret = - ENOMEM ;
2017-03-16 22:18:50 -08:00
goto err ;
2021-11-05 21:28:17 -04:00
}
2017-03-16 22:18:50 -08:00
for ( i = 0 ; i < nr_devices ; i + + ) {
ret = bch2_read_super ( devices [ i ] , & opts , & sb [ i ] ) ;
if ( ret )
goto err ;
}
for ( i = 1 ; i < nr_devices ; i + + )
if ( le64_to_cpu ( sb [ i ] . sb - > seq ) >
le64_to_cpu ( sb [ best_sb ] . sb - > seq ) )
best_sb = i ;
2020-09-06 22:58:28 -04:00
mi = bch2_sb_get_members ( sb [ best_sb ] . sb ) ;
i = 0 ;
while ( i < nr_devices ) {
if ( i ! = best_sb & &
! bch2_dev_exists ( sb [ best_sb ] . sb , mi , sb [ i ] . sb - > dev_idx ) ) {
pr_info ( " %pg has been removed, skipping " , sb [ i ] . bdev ) ;
bch2_free_super ( & sb [ i ] ) ;
array_remove_item ( sb , nr_devices , i ) ;
continue ;
}
2017-03-16 22:18:50 -08:00
err = bch2_dev_in_fs ( sb [ best_sb ] . sb , sb [ i ] . sb ) ;
if ( err )
goto err_print ;
2020-09-06 22:58:28 -04:00
i + + ;
2017-03-16 22:18:50 -08:00
}
c = bch2_fs_alloc ( sb [ best_sb ] . sb , opts ) ;
2021-11-04 21:03:16 +00:00
if ( IS_ERR ( c ) ) {
ret = PTR_ERR ( c ) ;
2017-03-16 22:18:50 -08:00
goto err ;
2021-11-04 21:03:16 +00:00
}
2017-03-16 22:18:50 -08:00
2020-06-15 14:58:47 -04:00
down_write ( & c - > state_lock ) ;
2021-11-05 21:28:17 -04:00
for ( i = 0 ; i < nr_devices ; i + + ) {
ret = bch2_dev_attach_bdev ( c , & sb [ i ] ) ;
if ( ret ) {
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2021-11-05 21:28:17 -04:00
goto err ;
2017-03-16 22:18:50 -08:00
}
2021-11-05 21:28:17 -04:00
}
2020-06-15 14:58:47 -04:00
up_write ( & c - > state_lock ) ;
2017-03-16 22:18:50 -08:00
err = " insufficient devices " ;
if ( ! bch2_fs_may_start ( c ) )
goto err_print ;
if ( ! c - > opts . nostart ) {
2019-04-17 18:21:19 -04:00
ret = bch2_fs_start ( c ) ;
if ( ret )
goto err ;
2017-03-16 22:18:50 -08:00
}
out :
kfree ( sb ) ;
2022-02-25 13:18:19 -05:00
printbuf_exit ( & errbuf ) ;
2017-03-16 22:18:50 -08:00
module_put ( THIS_MODULE ) ;
pr_verbose_init ( opts , " ret %i " , PTR_ERR_OR_ZERO ( c ) ) ;
return c ;
err_print :
pr_err ( " bch_fs_open err opening %s: %s " ,
devices [ 0 ] , err ) ;
ret = - EINVAL ;
err :
2021-11-04 21:03:16 +00:00
if ( ! IS_ERR_OR_NULL ( c ) )
2017-03-16 22:18:50 -08:00
bch2_fs_stop ( c ) ;
2021-11-05 21:28:17 -04:00
if ( sb )
for ( i = 0 ; i < nr_devices ; i + + )
bch2_free_super ( & sb [ i ] ) ;
2017-03-16 22:18:50 -08:00
c = ERR_PTR ( ret ) ;
goto out ;
}
/* Global interfaces/init */
static void bcachefs_exit ( void )
{
bch2_debug_exit ( ) ;
bch2_vfs_exit ( ) ;
bch2_chardev_exit ( ) ;
2020-11-18 14:09:33 -05:00
bch2_btree_key_cache_exit ( ) ;
2017-03-16 22:18:50 -08:00
if ( bcachefs_kset )
kset_unregister ( bcachefs_kset ) ;
}
/*
 * Module initialisation: run the bkey packing self test, create the
 * bcachefs kset under fs_kobj, then initialise the btree key cache,
 * chardev, VFS and debug subsystems in that order.
 *
 * The first step that fails stops the sequence; bcachefs_exit() is then
 * called to undo the work and -ENOMEM is returned regardless of which step
 * failed.  Returns 0 on success.
 */
static int __init bcachefs_init(void)
{
	bch2_bkey_pack_test();

	bcachefs_kset = kset_create_and_add(" bcachefs ", NULL, fs_kobj);
	if (!bcachefs_kset)
		goto fail;

	if (bch2_btree_key_cache_init())
		goto fail;

	if (bch2_chardev_init())
		goto fail;

	if (bch2_vfs_init())
		goto fail;

	if (bch2_debug_init())
		goto fail;

	return 0;
fail:
	bcachefs_exit();
	return -ENOMEM;
}
# define BCH_DEBUG_PARAM(name, description) \
bool bch2_ # # name ; \
module_param_named ( name , bch2_ # # name , bool , 0644 ) ; \
MODULE_PARM_DESC ( name , description ) ;
BCH_DEBUG_PARAMS ( )
# undef BCH_DEBUG_PARAM
2018-11-01 15:10:01 -04:00
unsigned bch2_metadata_version = bcachefs_metadata_version_current ;
2017-03-16 22:18:50 -08:00
module_param_named ( version , bch2_metadata_version , uint , 0400 ) ;
module_exit ( bcachefs_exit ) ;
module_init ( bcachefs_init ) ;