// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "error.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "trace.h"

struct journal_list {
	struct closure		cl;
	struct mutex		lock;
	struct list_head	*head;
	int			ret;
};

#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_list *jlist, struct jset *j)
{
	struct journal_replay *i, *pos;
	struct list_head *where;
	size_t bytes = vstruct_bytes(j);
	__le64 last_seq;
	int ret;

	last_seq = !list_empty(jlist->head)
		? list_last_entry(jlist->head, struct journal_replay,
				  list)->j.last_seq
		: 0;

	if (!c->opts.read_entire_journal) {
		/* Is this entry older than the range we need? */
		if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
			ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
			goto out;
		}

		/* Drop entries we don't need anymore */
		list_for_each_entry_safe(i, pos, jlist->head, list) {
			if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
				break;
			list_del(&i->list);
			kvpfree(i, offsetof(struct journal_replay, j) +
				vstruct_bytes(&i->j));
		}
	}

	list_for_each_entry_reverse(i, jlist->head, list) {
		/* Duplicate? */
		if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
			fsck_err_on(bytes != vstruct_bytes(&i->j) ||
				    memcmp(j, &i->j, bytes), c,
				    "found duplicate but non identical journal entries (seq %llu)",
				    le64_to_cpu(j->seq));
			goto found;
		}

		if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
			where = &i->list;
			goto add;
		}
	}

	where = jlist->head;
add:
	i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
	if (!i) {
		ret = -ENOMEM;
		goto out;
	}

	list_add(&i->list, where);
	i->devs.nr = 0;
	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
found:
	if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
		bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
	else
		fsck_err_on(1, c, "duplicate journal entries on same device");
	ret = JOURNAL_ENTRY_ADD_OK;
out:
fsck_err:
	return ret;
}

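/*
 * The checksum/encryption nonce for a journal entry is derived from its
 * sequence number, so every jset gets a distinct nonce:
 */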
static struct nonce journal_nonce(const struct jset *jset)
{
	return (struct nonce) {{
		[0] = 0,
		[1] = ((__le32 *) &jset->seq)[0],
		[2] = ((__le32 *) &jset->seq)[1],
		[3] = BCH_NONCE_JOURNAL,
	}};
}

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
	struct jset_entry *entry;

	for (entry = start; entry != end; entry = vstruct_next(entry))
		memset(entry, 0, sizeof(*entry));
}

#define JOURNAL_ENTRY_REREAD	5
#define JOURNAL_ENTRY_NONE	6
#define JOURNAL_ENTRY_BAD	7

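/*
 * journal_entry_err() relies on @write, @ret and an fsck_err label in the
 * calling scope: at read time the error is handled as a fixable fsck error,
 * at write time it means we are about to write corrupt metadata. The macro
 * evaluates to true so journal_entry_err_on() can be used in conditions.
 */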
#define journal_entry_err(c, msg, ...)					\
({									\
	switch (write) {						\
	case READ:							\
		mustfix_fsck_err(c, msg, ##__VA_ARGS__);		\
		break;							\
	case WRITE:							\
		bch_err(c, "corrupt metadata before write:\n"		\
			msg, ##__VA_ARGS__);				\
		if (bch2_fs_inconsistent(c)) {				\
			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
	true;								\
})

#define journal_entry_err_on(cond, c, msg, ...)				\
	((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)

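/*
 * Validate a single bkey from a journal entry; an invalid key is dropped (or
 * the rest of the entry nulled out) so that replay can continue:
 */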
static int journal_validate_key(struct bch_fs *c, struct jset *jset,
				struct jset_entry *entry,
				unsigned level, enum btree_id btree_id,
				struct bkey_i *k,
				const char *type, int write)
{
	void *next = vstruct_next(entry);
	const char *invalid;
	unsigned version = le32_to_cpu(jset->version);
	int ret = 0;

	if (journal_entry_err_on(!k->k.u64s, c,
			"invalid %s in journal: k->u64s 0", type)) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	if (journal_entry_err_on((void *) bkey_next(k) >
				 (void *) vstruct_next(entry), c,
			"invalid %s in journal: extends past end of journal entry",
			type)) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
			"invalid %s in journal: bad format %u",
			type, k->k.format)) {
		le16_add_cpu(&entry->u64s, -k->k.u64s);
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	if (!write)
		bch2_bkey_compat(level, btree_id, version,
				 JSET_BIG_ENDIAN(jset), write,
				 NULL, bkey_to_packed(k));

	invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
				    __btree_node_type(level, btree_id));
	if (invalid) {
		char buf[160];

		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
		mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
				 type, invalid, buf);

		le16_add_cpu(&entry->u64s, -k->k.u64s);
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	if (write)
		bch2_bkey_compat(level, btree_id, version,
				 JSET_BIG_ENDIAN(jset), write,
				 NULL, bkey_to_packed(k));
fsck_err:
	return ret;
}

static int journal_entry_validate_btree_keys(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     int write)
{
	struct bkey_i *k;

	vstruct_for_each(entry, k) {
		int ret = journal_validate_key(c, jset, entry,
					       entry->level,
					       entry->btree_id,
					       k, "key", write);
		if (ret)
			return ret;
	}

	return 0;
}

static int journal_entry_validate_btree_root(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     int write)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s, c,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
				    "btree root", write);
fsck_err:
	return ret;
}

static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    int write)
{
	/* obsolete, don't care: */
	return 0;
}

static int journal_entry_validate_blacklist(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    int write)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}

static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
					       struct jset *jset,
					       struct jset_entry *entry,
					       int write)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end), c,
		"invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}

static int journal_entry_validate_usage(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					int write)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static int journal_entry_validate_data_usage(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     int write)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

struct jset_entry_ops {
	int (*validate)(struct bch_fs *, struct jset *,
			struct jset_entry *, int);
};

static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)						\
	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
		.validate	= journal_entry_validate_##f,	\
	},
	BCH_JSET_ENTRY_TYPES()
#undef x
};

static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
				  struct jset_entry *entry, int write)
{
	return entry->type < BCH_JSET_ENTRY_NR
		? bch2_jset_entry_ops[entry->type].validate(c, jset,
							    entry, write)
		: 0;
}

static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 int write)
{
	struct jset_entry *entry;
	int ret = 0;

	vstruct_for_each(jset, entry) {
		if (journal_entry_err_on(vstruct_next(entry) >
					 vstruct_last(jset), c,
				"journal entry extends past end of jset")) {
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = journal_entry_validate(c, jset, entry, write);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}

static int jset_validate(struct bch_fs *c,
			 struct jset *jset, u64 sector,
			 unsigned bucket_sectors_left,
			 unsigned sectors_read,
			 int write)
{
	size_t bytes = vstruct_bytes(jset);
	struct bch_csum csum;
	unsigned version;
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if ((version != BCH_JSET_VERSION_OLD &&
	     version < bcachefs_metadata_version_min) ||
	    version >= bcachefs_metadata_version_max) {
		bch_err(c, "unknown journal entry version %u", version);
		return BCH_FSCK_UNKNOWN_VERSION;
	}

	if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
				 "journal entry too big (%zu bytes), sector %llu",
				 bytes, sector)) {
		/* XXX: note we might have missing journal entries */
		return JOURNAL_ENTRY_BAD;
	}

	if (bytes > sectors_read << 9)
		return JOURNAL_ENTRY_REREAD;

	if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
			"journal entry with unknown csum type %llu sector %llu",
			JSET_CSUM_TYPE(jset), sector))
		return JOURNAL_ENTRY_BAD;

	csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
	if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
				 "journal checksum bad, sector %llu", sector)) {
		/* XXX: retry IO, when we start retrying checksum errors */
		/* XXX: note we might have missing journal entries */
		return JOURNAL_ENTRY_BAD;
	}

	bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
		     jset->encrypted_start,
		     vstruct_end(jset) - (void *) jset->encrypted_start);

	if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
				 "invalid journal entry: last_seq > seq"))
		jset->last_seq = jset->seq;

	return 0;
fsck_err:
	return ret;
}

struct journal_read_buf {
	void		*data;
	size_t		size;
};

static int journal_read_buf_realloc(struct journal_read_buf *b,
				    size_t new_size)
{
	void *n;

	/* the bios are sized for this many pages, max: */
	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
		return -ENOMEM;

	new_size = roundup_pow_of_two(new_size);
	n = kvpmalloc(new_size, GFP_KERNEL);
	if (!n)
		return -ENOMEM;

	kvpfree(b->data, b->size);
	b->data = n;
	b->size = new_size;
	return 0;
}

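/*
 * Read every journal entry in a single journal bucket, validating each one
 * and adding it to the journal_list shared across devices:
 */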
static int journal_read_bucket(struct bch_dev *ca,
			       struct journal_read_buf *buf,
			       struct journal_list *jlist,
			       unsigned bucket)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	struct jset *j = NULL;
	unsigned sectors, sectors_read = 0;
	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
	    end = offset + ca->mi.bucket_size;
	bool saw_bad = false;
	int ret = 0;

	pr_debug("reading %u", bucket);

	while (offset < end) {
		if (!sectors_read) {
			struct bio *bio;
			unsigned nr_bvecs;
reread:
			sectors_read = min_t(unsigned,
					     end - offset, buf->size >> 9);
			nr_bvecs = buf_pages(buf->data, sectors_read << 9);

			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

			bio->bi_iter.bi_sector = offset;
			bch2_bio_map(bio, buf->data, sectors_read << 9);

			ret = submit_bio_wait(bio);
			kfree(bio);

			if (bch2_dev_io_err_on(ret, ca,
					       "journal read from sector %llu",
					       offset) ||
			    bch2_meta_read_fault("journal"))
				return -EIO;

			j = buf->data;
		}

		ret = jset_validate(c, j, offset,
				    end - offset, sectors_read,
				    READ);
		switch (ret) {
		case BCH_FSCK_OK:
			break;
		case JOURNAL_ENTRY_REREAD:
			if (vstruct_bytes(j) > buf->size) {
				ret = journal_read_buf_realloc(buf,
							vstruct_bytes(j));
				if (ret)
					return ret;
			}
			goto reread;
		case JOURNAL_ENTRY_NONE:
			if (!saw_bad)
				return 0;
			sectors = c->opts.block_size;
			goto next_block;
		case JOURNAL_ENTRY_BAD:
			saw_bad = true;
			sectors = c->opts.block_size;
			goto next_block;
		default:
			return ret;
		}

		/*
		 * This happens sometimes if we don't have discards on -
		 * when we've partially overwritten a bucket with new
		 * journal entries. We don't need the rest of the
		 * bucket:
		 */
		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
			return 0;

		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

		mutex_lock(&jlist->lock);
		ret = journal_entry_add(c, ca, jlist, j);
		mutex_unlock(&jlist->lock);

		switch (ret) {
		case JOURNAL_ENTRY_ADD_OK:
			break;
		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
			break;
		default:
			return ret;
		}

		sectors = vstruct_sectors(j, c->block_bits);
next_block:
		pr_debug("next");
		offset		+= sectors;
		sectors_read	-= sectors;
		j = ((void *) j) + (sectors << 9);
	}

	return 0;
}

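/*
 * Closure that reads the journal from one device: after reading every bucket
 * it positions cur_idx and the reclaim indices so that journal reclaim and
 * the next journal write start from a sane place:
 */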
static void bch2_journal_read_device(struct closure *cl)
{
	struct journal_device *ja =
		container_of(cl, struct journal_device, read);
	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
	struct journal_list *jlist =
		container_of(cl->parent, struct journal_list, cl);
	struct journal_read_buf buf = { NULL, 0 };
	u64 min_seq = U64_MAX;
	unsigned i;
	int ret;

	if (!ja->nr)
		goto out;

	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
	if (ret)
		goto err;

	pr_debug("%u journal buckets", ja->nr);

	for (i = 0; i < ja->nr; i++) {
		ret = journal_read_bucket(ca, &buf, jlist, i);
		if (ret)
			goto err;
	}

	/* Find the journal bucket with the highest sequence number: */
	for (i = 0; i < ja->nr; i++) {
		if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
			ja->cur_idx = i;

		min_seq = min(ja->bucket_seq[i], min_seq);
	}

	/*
	 * If there's duplicate journal entries in multiple buckets (which
	 * definitely isn't supposed to happen, but...) - make sure to start
	 * cur_idx at the last of those buckets, so we don't deadlock trying to
	 * allocate
	 */
	while (ja->bucket_seq[ja->cur_idx] > min_seq &&
	       ja->bucket_seq[ja->cur_idx] >
	       ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
		ja->cur_idx = (ja->cur_idx + 1) % ja->nr;

	ja->sectors_free = 0;

	/*
	 * Set dirty_idx to indicate the entire journal is full and needs to be
	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
	 * pinned when it first runs:
	 */
	ja->discard_idx = ja->dirty_idx_ondisk =
		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
	kvpfree(buf.data, buf.size);
	percpu_ref_put(&ca->io_ref);
	closure_return(cl);
	return;
err:
	mutex_lock(&jlist->lock);
	jlist->ret = ret;
	mutex_unlock(&jlist->lock);
	goto out;
}

int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
	struct journal_list jlist;
	struct journal_replay *i;
	struct bch_dev *ca;
	unsigned iter;
	size_t keys = 0, entries = 0;
	bool degraded = false;
	int ret = 0;

	closure_init_stack(&jlist.cl);
	mutex_init(&jlist.lock);
	jlist.head = list;
	jlist.ret = 0;

	for_each_member_device(ca, c, iter) {
		if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
			continue;

		if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
		     ca->mi.state == BCH_MEMBER_STATE_RO) &&
		    percpu_ref_tryget(&ca->io_ref))
			closure_call(&ca->journal.read,
				     bch2_journal_read_device,
				     system_unbound_wq,
				     &jlist.cl);
		else
			degraded = true;
	}

	closure_sync(&jlist.cl);

	if (jlist.ret)
		return jlist.ret;

	list_for_each_entry(i, list, list) {
		struct jset_entry *entry;
		struct bkey_i *k, *_n;
		struct bch_replicas_padded replicas;
		char buf[80];

		ret = jset_validate_entries(c, &i->j, READ);
		if (ret)
			goto fsck_err;

		/*
		 * If we're mounting in degraded mode - if we didn't read all
		 * the devices - this is wrong:
		 */

		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs);

		if (!degraded &&
		    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
		     fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
				 "superblock not marked as containing replicas %s",
				 (bch2_replicas_entry_to_text(&PBUF(buf),
							      &replicas.e), buf)))) {
			ret = bch2_mark_replicas(c, &replicas.e);
			if (ret)
				return ret;
		}

		for_each_jset_key(k, _n, entry, &i->j)
			keys++;
		entries++;
	}

	if (!list_empty(list)) {
		i = list_last_entry(list, struct journal_replay, list);

		bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
			 keys, entries, le64_to_cpu(i->j.seq));
	}
fsck_err:
	return ret;
}

/* journal write: */

static void __journal_write_alloc(struct journal *j,
				  struct journal_buf *w,
				  struct dev_alloc_list *devs_sorted,
				  unsigned sectors,
				  unsigned *replicas,
				  unsigned replicas_want)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_device *ja;
	struct bch_dev *ca;
	unsigned i;

	if (*replicas >= replicas_want)
		return;

	for (i = 0; i < devs_sorted->nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		/*
		 * Check that we can use this device, and aren't already using
		 * it:
		 */
		if (!ca->mi.durability ||
		    ca->mi.state != BCH_MEMBER_STATE_RW ||
		    !ja->nr ||
		    bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
					 ca->dev_idx) ||
		    sectors > ja->sectors_free)
			continue;

		bch2_dev_stripe_increment(ca, &j->wp.stripe);

		bch2_bkey_append_ptr(&w->key,
			(struct bch_extent_ptr) {
				  .offset = bucket_to_sector(ca,
					ja->buckets[ja->cur_idx]) +
					ca->mi.bucket_size -
					ja->sectors_free,
				  .dev = ca->dev_idx,
		});

		ja->sectors_free -= sectors;
		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

		*replicas += ca->mi.durability;

		if (*replicas >= replicas_want)
			break;
	}
}

/**
 * journal_write_alloc - allocate space for a journal write, moving on to the
 * next journal bucket on each device if necessary
 */
static int journal_write_alloc(struct journal *j, struct journal_buf *w,
			       unsigned sectors)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_device *ja;
	struct bch_dev *ca;
	struct dev_alloc_list devs_sorted;
	unsigned i, replicas = 0, replicas_want =
		READ_ONCE(c->opts.metadata_replicas);

	rcu_read_lock();

	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
					  &c->rw_devs[BCH_DATA_journal]);

	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas >= replicas_want)
		goto done;

	for (i = 0; i < devs_sorted.nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		if (sectors > ja->sectors_free &&
		    sectors <= ca->mi.bucket_size &&
		    bch2_journal_dev_buckets_available(j, ja,
					journal_space_discarded)) {
			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
			ja->sectors_free = ca->mi.bucket_size;

			/*
			 * ja->bucket_seq[ja->cur_idx] must always have
			 * something sensible:
			 */
			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
		}
	}

	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);
done:
	rcu_read_unlock();

	return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}

static void journal_write_compact(struct jset *jset)
{
	struct jset_entry *i, *next, *prev = NULL;

	/*
	 * Simple compaction, dropping empty jset_entries (from journal
	 * reservations that weren't fully used) and merging jset_entries that
	 * can be.
	 *
	 * If we wanted to be really fancy here, we could sort all the keys in
	 * the jset and drop keys that were overwritten - probably not worth it:
	 */
	vstruct_for_each_safe(jset, i, next) {
		unsigned u64s = le16_to_cpu(i->u64s);

		/* Empty entry: */
		if (!u64s)
			continue;

		/* Can we merge with previous entry? */
		if (prev &&
		    i->btree_id == prev->btree_id &&
		    i->level	== prev->level &&
		    i->type	== prev->type &&
		    i->type	== BCH_JSET_ENTRY_btree_keys &&
		    le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
			memmove_u64s_down(vstruct_next(prev),
					  i->_data,
					  u64s);
			le16_add_cpu(&prev->u64s, u64s);
			continue;
		}

		/* Couldn't merge, move i into new position (after prev): */
		prev = prev ? vstruct_next(prev) : jset->start;
		if (i != prev)
			memmove_u64s_down(prev, i, jset_u64s(u64s));
	}

	prev = prev ? vstruct_next(prev) : jset->start;
	jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
}

static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
	/* we aren't holding j->lock: */
	unsigned new_size = READ_ONCE(j->buf_size_want);
	void *new_buf;

	if (buf->buf_size >= new_size)
		return;

	new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
	if (!new_buf)
		return;

	memcpy(new_buf, buf->data, buf->buf_size);
	kvpfree(buf->data, buf->buf_size);
	buf->data	= new_buf;
	buf->buf_size	= new_size;
}

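/*
 * Journal write completion: mark the replicas the entry landed on, update the
 * on-disk sequence numbers, kick journal reclaim, and wake up anyone waiting
 * on this write:
 */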
static void journal_write_done(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_buf *w = journal_prev_buf(j);
	struct bch_devs_list devs =
		bch2_bkey_devs(bkey_i_to_s_c(&w->key));
	struct bch_replicas_padded replicas;
	u64 seq = le64_to_cpu(w->data->seq);
	u64 last_seq = le64_to_cpu(w->data->last_seq);

	bch2_time_stats_update(j->write_time, j->write_start_time);

	if (!devs.nr) {
		bch_err(c, "unable to write journal to sufficient devices");
		goto err;
	}

	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);

	if (bch2_mark_replicas(c, &replicas.e))
		goto err;

	spin_lock(&j->lock);
	if (seq >= j->pin.front)
		journal_seq_pin(j, seq)->devs = devs;

	j->seq_ondisk		= seq;
	j->last_seq_ondisk	= last_seq;
	bch2_journal_space_available(j);

	/*
	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
	 * more buckets:
	 *
	 * Must come before signaling write completion, for
	 * bch2_fs_journal_stop():
	 */
	mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
out:
	/* also must come before signalling write completion: */
	closure_debug_destroy(cl);

	BUG_ON(!j->reservations.prev_buf_unwritten);
	atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
		     &j->reservations.counter);

	closure_wake_up(&w->wait);
	journal_wake(j);

	if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
		mod_delayed_work(system_freezable_wq, &j->write_work, 0);
	spin_unlock(&j->lock);
	return;
err:
	bch2_fatal_error(c);
	spin_lock(&j->lock);
	goto out;
}

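/*
 * Per-device write completion; on an IO error the device is dropped from the
 * journal entry's replica list under err_lock:
 */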
static void journal_write_endio(struct bio *bio)
{
	struct bch_dev *ca = bio->bi_private;
	struct journal *j = &ca->fs->journal;

	if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s",
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("journal")) {
		struct journal_buf *w = journal_prev_buf(j);
		unsigned long flags;

		spin_lock_irqsave(&j->err_lock, flags);
		bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx);
		spin_unlock_irqrestore(&j->err_lock, flags);
	}

	closure_put(&j->io);
	percpu_ref_put(&ca->io_ref);
}

void bch2_journal_write(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	struct journal_buf *w = journal_prev_buf(j);
	struct jset_entry *start, *end;
	struct jset *jset;
	struct bio *bio;
	struct bch_extent_ptr *ptr;
	bool validate_before_checksum = false;
	unsigned i, sectors, bytes, u64s;
	int ret;

	bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));

	journal_buf_realloc(j, w);
	jset = w->data;

	j->write_start_time = local_clock();

	/*
	 * New btree roots are set by journalling them; when the journal entry
	 * gets written we have to propagate them to c->btree_roots
	 *
	 * But, every journal entry we write has to contain all the btree roots
	 * (at least for now); so after we copy btree roots to c->btree_roots we
	 * have to get any missing btree roots and add them to this journal
	 * entry:
	 */

	bch2_journal_entries_to_btree_roots(c, jset);

	start = end = vstruct_last(jset);

	end	= bch2_btree_roots_to_journal_entries(c, jset->start, end);

	end	= bch2_journal_super_entries_add_common(c, end,
						le64_to_cpu(jset->seq));
	u64s	= (u64 *) end - (u64 *) start;
	BUG_ON(u64s > j->entry_u64s_reserved);

	le32_add_cpu(&jset->u64s, u64s);
	BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);

	journal_write_compact(jset);

	jset->read_clock	= cpu_to_le16(c->bucket_clock[READ].hand);
	jset->write_clock	= cpu_to_le16(c->bucket_clock[WRITE].hand);
	jset->magic		= cpu_to_le64(jset_magic(c));

	jset->version		= c->sb.version < bcachefs_metadata_version_new_versioning
		? cpu_to_le32(BCH_JSET_VERSION_OLD)
		: cpu_to_le32(c->sb.version);

	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));

	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
		validate_before_checksum = true;

	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max)
		validate_before_checksum = true;

	if (validate_before_checksum &&
	    jset_validate_entries(c, jset, WRITE))
		goto err;

	bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
		     jset->encrypted_start,
		     vstruct_end(jset) - (void *) jset->encrypted_start);

	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
				  journal_nonce(jset), jset);

	if (!validate_before_checksum &&
	    jset_validate_entries(c, jset, WRITE))
		goto err;

	sectors = vstruct_sectors(jset, c->block_bits);
	BUG_ON(sectors > w->sectors);

	bytes = vstruct_bytes(jset);
	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);

retry_alloc:
	spin_lock(&j->lock);
	ret = journal_write_alloc(j, w, sectors);

	if (ret && j->can_discard) {
		spin_unlock(&j->lock);
		bch2_journal_do_discards(j);
		goto retry_alloc;
	}

	/*
	 * write is allocated, no longer need to account for it in
	 * bch2_journal_space_available():
	 */
	w->sectors = 0;

	/*
	 * journal entry has been compacted and allocated, recalculate space
	 * available:
	 */
	bch2_journal_space_available(j);
	spin_unlock(&j->lock);

	if (ret) {
		bch_err(c, "Unable to allocate journal write");
		bch2_fatal_error(c);
		continue_at(cl, journal_write_done, system_highpri_wq);
		return;
	}

	/*
	 * XXX: we really should just disable the entire journal in nochanges
	 * mode
	 */
	if (c->opts.nochanges)
		goto no_io;

	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
		ca = bch_dev_bkey_exists(c, ptr->dev);
		if (!percpu_ref_tryget(&ca->io_ref)) {
			/* XXX: fix this */
			bch_err(c, "missing device for journal write\n");
			continue;
		}

		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
			     sectors);

		bio = ca->journal.bio;
		bio_reset(bio, ca->disk_sb.bdev,
			  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
		bio->bi_iter.bi_sector	= ptr->offset;
		bio->bi_end_io		= journal_write_endio;
		bio->bi_private		= ca;
		bch2_bio_map(bio, jset, sectors << 9);

		trace_journal_write(bio);
		closure_bio_submit(bio, cl);

		ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
	}

	for_each_rw_member(ca, c, i)
		if (journal_flushes_device(ca) &&
		    !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
			percpu_ref_get(&ca->io_ref);

			bio = ca->journal.bio;
			bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
			bio->bi_end_io		= journal_write_endio;
			bio->bi_private		= ca;
			closure_bio_submit(bio, cl);
		}

no_io:
	bch2_bucket_seq_cleanup(c);

	continue_at(cl, journal_write_done, system_highpri_wq);
	return;
err:
	bch2_inconsistent_error(c);
	continue_at(cl, journal_write_done, system_highpri_wq);
}