2017-03-16 22:18:50 -08:00
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright ( C ) 2010 Kent Overstreet < kent . overstreet @ gmail . com >
*
* Code for managing the extent btree and dynamically updating the writeback
* dirty sector count .
*/
# include "bcachefs.h"
# include "bkey_methods.h"
# include "btree_gc.h"
2020-01-07 13:29:32 -05:00
# include "btree_io.h"
2019-11-15 15:52:28 -05:00
# include "btree_iter.h"
2017-03-16 22:18:50 -08:00
# include "buckets.h"
# include "checksum.h"
# include "debug.h"
# include "disk_groups.h"
# include "error.h"
# include "extents.h"
# include "inode.h"
# include "journal.h"
# include "replicas.h"
# include "super.h"
# include "super-io.h"
# include "trace.h"
# include "util.h"
2022-06-13 19:17:45 -04:00
/* Forward declaration: bch2_bkey_narrow_crcs() calls this before it is defined. */
static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s,
						    struct bch_extent_ptr *);
2019-11-16 16:25:58 -05:00
/*
 * Size limit for each crc entry type, indexed by the entry type constant.
 * bch2_extent_merge() refuses to merge when the combined uncompressed size of
 * a crc entry pair exceeds the limit for that entry's type.
 */
static unsigned bch2_crc_field_size_max[] = {
	[BCH_EXTENT_ENTRY_crc32]	= CRC32_SIZE_MAX,
	[BCH_EXTENT_ENTRY_crc64]	= CRC64_SIZE_MAX,
	[BCH_EXTENT_ENTRY_crc128]	= CRC128_SIZE_MAX,
};
2017-03-16 22:18:50 -08:00
2019-11-16 16:25:58 -05:00
/* Forward declaration: bch2_extent_merge() uses this before its definition. */
static void bch2_extent_crc_pack(union bch_extent_crc *dst,
				 struct bch_extent_crc_unpacked src,
				 enum bch_extent_entry_type type);
2017-03-16 22:18:50 -08:00
2018-11-01 15:10:01 -04:00
static struct bch_dev_io_failures * dev_io_failures ( struct bch_io_failures * f ,
unsigned dev )
{
struct bch_dev_io_failures * i ;
for ( i = f - > devs ; i < f - > devs + f - > nr ; i + + )
if ( i - > dev = = dev )
return i ;
return NULL ;
}
void bch2_mark_io_failure ( struct bch_io_failures * failed ,
struct extent_ptr_decoded * p )
{
struct bch_dev_io_failures * f = dev_io_failures ( failed , p - > ptr . dev ) ;
if ( ! f ) {
BUG_ON ( failed - > nr > = ARRAY_SIZE ( failed - > devs ) ) ;
f = & failed - > devs [ failed - > nr + + ] ;
f - > dev = p - > ptr . dev ;
f - > idx = p - > idx ;
f - > nr_failed = 1 ;
f - > nr_retries = 0 ;
} else if ( p - > idx ! = f - > idx ) {
f - > idx = p - > idx ;
f - > nr_failed = 1 ;
f - > nr_retries = 0 ;
} else {
f - > nr_failed + + ;
}
}
/*
* returns true if p1 is better than p2 :
*/
static inline bool ptr_better ( struct bch_fs * c ,
const struct extent_ptr_decoded p1 ,
const struct extent_ptr_decoded p2 )
{
if ( likely ( ! p1 . idx & & ! p2 . idx ) ) {
struct bch_dev * dev1 = bch_dev_bkey_exists ( c , p1 . ptr . dev ) ;
struct bch_dev * dev2 = bch_dev_bkey_exists ( c , p2 . ptr . dev ) ;
u64 l1 = atomic64_read ( & dev1 - > cur_latency [ READ ] ) ;
u64 l2 = atomic64_read ( & dev2 - > cur_latency [ READ ] ) ;
/* Pick at random, biased in favor of the faster device: */
return bch2_rand_range ( l1 + l2 ) > l1 ;
}
2020-11-02 18:20:44 -05:00
if ( bch2_force_reconstruct_read )
2018-11-01 15:10:01 -04:00
return p1 . idx > p2 . idx ;
return p1 . idx < p2 . idx ;
}
/*
* This picks a non - stale pointer , preferably from a device other than @ avoid .
* Avoid can be NULL , meaning pick any . If there are no non - stale pointers to
* other devices , it will still pick a pointer from avoid .
*/
int bch2_bkey_pick_read_device ( struct bch_fs * c , struct bkey_s_c k ,
struct bch_io_failures * failed ,
struct extent_ptr_decoded * pick )
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c ( k ) ;
const union bch_extent_entry * entry ;
struct extent_ptr_decoded p ;
struct bch_dev_io_failures * f ;
struct bch_dev * ca ;
int ret = 0 ;
if ( k . k - > type = = KEY_TYPE_error )
return - EIO ;
bkey_for_each_ptr_decode ( k . k , ptrs , p , entry ) {
2022-11-13 18:59:01 -05:00
/*
* Unwritten extent : no need to actually read , treat it as a
* hole and return 0 s :
*/
if ( p . ptr . unwritten )
return 0 ;
2018-11-01 15:10:01 -04:00
ca = bch_dev_bkey_exists ( c , p . ptr . dev ) ;
/*
* If there are any dirty pointers it ' s an error if we can ' t
* read :
*/
if ( ! ret & & ! p . ptr . cached )
ret = - EIO ;
if ( p . ptr . cached & & ptr_stale ( ca , & p . ptr ) )
continue ;
f = failed ? dev_io_failures ( failed , p . ptr . dev ) : NULL ;
if ( f )
p . idx = f - > nr_failed < f - > nr_retries
? f - > idx
: f - > idx + 1 ;
if ( ! p . idx & &
! bch2_dev_is_readable ( ca ) )
p . idx + + ;
2020-11-02 18:20:44 -05:00
if ( bch2_force_reconstruct_read & &
2019-10-08 18:45:29 -04:00
! p . idx & & p . has_ec )
2018-11-01 15:10:01 -04:00
p . idx + + ;
2019-10-08 18:45:29 -04:00
if ( p . idx > = ( unsigned ) p . has_ec + 1 )
2018-11-01 15:10:01 -04:00
continue ;
if ( ret > 0 & & ! ptr_better ( c , p , * pick ) )
continue ;
* pick = p ;
ret = 1 ;
}
return ret ;
}
2019-11-16 16:25:58 -05:00
/* KEY_TYPE_btree_ptr: */
2018-11-01 15:10:01 -04:00
2022-04-03 17:50:01 -04:00
int bch2_btree_ptr_invalid ( const struct bch_fs * c , struct bkey_s_c k ,
2022-12-20 19:58:16 -05:00
unsigned flags , struct printbuf * err )
2018-11-01 15:10:01 -04:00
{
2022-04-03 17:50:01 -04:00
if ( bkey_val_u64s ( k . k ) > BCH_REPLICAS_MAX ) {
2023-02-03 21:01:40 -05:00
prt_printf ( err , " value too big (%zu > %u) " ,
2022-04-03 17:50:01 -04:00
bkey_val_u64s ( k . k ) , BCH_REPLICAS_MAX ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-04-03 17:50:01 -04:00
}
2018-11-01 15:10:01 -04:00
2022-12-20 19:58:16 -05:00
return bch2_bkey_ptrs_invalid ( c , k , flags , err ) ;
2018-11-01 15:10:01 -04:00
}
2019-11-16 16:25:58 -05:00
/* Print a KEY_TYPE_btree_ptr key to @out; this is just its pointer list. */
void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
			    struct bkey_s_c k)
{
	bch2_bkey_ptrs_to_text(out, c, k);
}
2018-11-01 15:10:01 -04:00
2022-04-03 17:50:01 -04:00
int bch2_btree_ptr_v2_invalid ( const struct bch_fs * c , struct bkey_s_c k ,
2022-12-20 19:58:16 -05:00
unsigned flags , struct printbuf * err )
2021-03-22 17:23:30 -04:00
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2 ( k ) ;
2022-04-03 17:50:01 -04:00
if ( bkey_val_bytes ( k . k ) < = sizeof ( * bp . v ) ) {
2023-02-03 21:01:40 -05:00
prt_printf ( err , " value too small (%zu <= %zu) " ,
2022-04-03 17:50:01 -04:00
bkey_val_bytes ( k . k ) , sizeof ( * bp . v ) ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-04-03 17:50:01 -04:00
}
2021-03-22 17:23:30 -04:00
2022-04-03 17:50:01 -04:00
if ( bkey_val_u64s ( k . k ) > BKEY_BTREE_PTR_VAL_U64s_MAX ) {
2023-02-03 21:01:40 -05:00
prt_printf ( err , " value too big (%zu > %zu) " ,
2022-04-03 17:50:01 -04:00
bkey_val_u64s ( k . k ) , BKEY_BTREE_PTR_VAL_U64s_MAX ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-04-03 17:50:01 -04:00
}
2021-03-22 17:23:30 -04:00
bcachefs: Start using bpos.snapshot field
This patch starts treating the bpos.snapshot field like part of the key
in the btree code:
* bpos_successor() and bpos_predecessor() now include the snapshot field
* Keys in btrees that will be using snapshots (extents, inodes, dirents
and xattrs) now always have their snapshot field set to U32_MAX
The btree iterator code gets a new flag, BTREE_ITER_ALL_SNAPSHOTS, that
determines whether we're iterating over keys in all snapshots or not -
internally, this controlls whether bkey_(successor|predecessor)
increment/decrement the snapshot field, or only the higher bits of the
key.
We add a new member to struct btree_iter, iter->snapshot: when
BTREE_ITER_ALL_SNAPSHOTS is not set, iter->pos.snapshot should always
equal iter->snapshot, which will be 0 for btrees that don't use
snapshots, and alsways U32_MAX for btrees that will use snapshots
(until we enable snapshot creation).
This patch also introduces a new metadata version number, and compat
code for reading from/writing to older versions - this isn't a forced
upgrade (yet).
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2021-03-24 18:02:16 -04:00
if ( c - > sb . version < bcachefs_metadata_version_snapshot & &
2022-04-03 17:50:01 -04:00
bp . v - > min_key . snapshot ) {
2023-02-03 21:01:40 -05:00
prt_printf ( err , " invalid min_key.snapshot (%u != 0) " ,
2022-04-03 17:50:01 -04:00
bp . v - > min_key . snapshot ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-04-03 17:50:01 -04:00
}
2021-03-22 17:23:30 -04:00
2022-12-20 19:58:16 -05:00
return bch2_bkey_ptrs_invalid ( c , k , flags , err ) ;
2021-03-22 17:23:30 -04:00
}
2020-03-31 16:25:30 -04:00
void bch2_btree_ptr_v2_to_text ( struct printbuf * out , struct bch_fs * c ,
2022-04-03 21:50:25 -04:00
struct bkey_s_c k )
2020-03-31 16:25:30 -04:00
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2 ( k ) ;
2023-02-03 21:01:40 -05:00
prt_printf ( out , " seq %llx written %u min_key %s " ,
2020-03-31 16:25:30 -04:00
le64_to_cpu ( bp . v - > seq ) ,
2021-07-15 13:42:43 -04:00
le16_to_cpu ( bp . v - > sectors_written ) ,
BTREE_PTR_RANGE_UPDATED ( bp . v ) ? " R " : " " ) ;
2020-03-31 16:25:30 -04:00
bch2_bpos_to_text ( out , bp . v - > min_key ) ;
2023-02-03 21:01:40 -05:00
prt_printf ( out , " " ) ;
2020-03-31 16:25:30 -04:00
bch2_bkey_ptrs_to_text ( out , c , k ) ;
}
2020-01-07 13:29:32 -05:00
void bch2_btree_ptr_v2_compat ( enum btree_id btree_id , unsigned version ,
unsigned big_endian , int write ,
struct bkey_s k )
{
struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2 ( k ) ;
compat_bpos ( 0 , btree_id , version , big_endian , write , & bp . v - > min_key ) ;
if ( version < bcachefs_metadata_version_inode_btree_change & &
btree_node_type_is_extents ( btree_id ) & &
2022-11-24 03:12:22 -05:00
! bkey_eq ( bp . v - > min_key , POS_MIN ) )
2020-01-07 13:29:32 -05:00
bp . v - > min_key = write
bcachefs: Start using bpos.snapshot field
This patch starts treating the bpos.snapshot field like part of the key
in the btree code:
* bpos_successor() and bpos_predecessor() now include the snapshot field
* Keys in btrees that will be using snapshots (extents, inodes, dirents
and xattrs) now always have their snapshot field set to U32_MAX
The btree iterator code gets a new flag, BTREE_ITER_ALL_SNAPSHOTS, that
determines whether we're iterating over keys in all snapshots or not -
internally, this controlls whether bkey_(successor|predecessor)
increment/decrement the snapshot field, or only the higher bits of the
key.
We add a new member to struct btree_iter, iter->snapshot: when
BTREE_ITER_ALL_SNAPSHOTS is not set, iter->pos.snapshot should always
equal iter->snapshot, which will be 0 for btrees that don't use
snapshots, and alsways U32_MAX for btrees that will use snapshots
(until we enable snapshot creation).
This patch also introduces a new metadata version number, and compat
code for reading from/writing to older versions - this isn't a forced
upgrade (yet).
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2021-03-24 18:02:16 -04:00
? bpos_nosnap_predecessor ( bp . v - > min_key )
: bpos_nosnap_successor ( bp . v - > min_key ) ;
2020-01-07 13:29:32 -05:00
}
2019-11-16 16:25:58 -05:00
/* KEY_TYPE_extent: */
2018-11-01 15:10:01 -04:00
2021-05-15 15:04:08 -04:00
bool bch2_extent_merge ( struct bch_fs * c , struct bkey_s l , struct bkey_s_c r )
2017-03-16 22:18:50 -08:00
{
2021-05-15 15:04:08 -04:00
struct bkey_ptrs l_ptrs = bch2_bkey_ptrs ( l ) ;
struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c ( r ) ;
2021-05-15 00:37:37 -04:00
union bch_extent_entry * en_l ;
const union bch_extent_entry * en_r ;
struct extent_ptr_decoded lp , rp ;
bool use_right_ptr ;
struct bch_dev * ca ;
2018-09-30 18:28:23 -04:00
2021-05-15 15:04:08 -04:00
en_l = l_ptrs . start ;
en_r = r_ptrs . start ;
while ( en_l < l_ptrs . end & & en_r < r_ptrs . end ) {
2019-11-16 16:25:58 -05:00
if ( extent_entry_type ( en_l ) ! = extent_entry_type ( en_r ) )
2021-04-28 23:49:30 -04:00
return false ;
2021-05-15 15:04:08 -04:00
en_l = extent_entry_next ( en_l ) ;
en_r = extent_entry_next ( en_r ) ;
2021-05-15 00:37:37 -04:00
}
2019-11-16 16:25:58 -05:00
2021-05-15 15:04:08 -04:00
if ( en_l < l_ptrs . end | | en_r < r_ptrs . end )
return false ;
en_l = l_ptrs . start ;
en_r = r_ptrs . start ;
2021-05-15 00:37:37 -04:00
lp . crc = bch2_extent_crc_unpack ( l . k , NULL ) ;
rp . crc = bch2_extent_crc_unpack ( r . k , NULL ) ;
2021-05-15 15:04:08 -04:00
while ( __bkey_ptr_next_decode ( l . k , l_ptrs . end , lp , en_l ) & &
__bkey_ptr_next_decode ( r . k , r_ptrs . end , rp , en_r ) ) {
2021-05-15 00:37:37 -04:00
if ( lp . ptr . offset + lp . crc . offset + lp . crc . live_size ! =
rp . ptr . offset + rp . crc . offset | |
lp . ptr . dev ! = rp . ptr . dev | |
lp . ptr . gen ! = rp . ptr . gen | |
2022-11-13 18:59:01 -05:00
lp . ptr . unwritten ! = rp . ptr . unwritten | |
2021-05-15 00:37:37 -04:00
lp . has_ec ! = rp . has_ec )
return false ;
2019-11-16 16:25:58 -05:00
2021-05-15 00:37:37 -04:00
/* Extents may not straddle buckets: */
ca = bch_dev_bkey_exists ( c , lp . ptr . dev ) ;
if ( PTR_BUCKET_NR ( ca , & lp . ptr ) ! = PTR_BUCKET_NR ( ca , & rp . ptr ) )
return false ;
2019-11-16 16:25:58 -05:00
2021-05-15 00:37:37 -04:00
if ( lp . has_ec ! = rp . has_ec | |
( lp . has_ec & &
( lp . ec . block ! = rp . ec . block | |
lp . ec . redundancy ! = rp . ec . redundancy | |
lp . ec . idx ! = rp . ec . idx ) ) )
return false ;
2019-11-16 16:25:58 -05:00
2021-05-15 00:37:37 -04:00
if ( lp . crc . compression_type ! = rp . crc . compression_type | |
lp . crc . nonce ! = rp . crc . nonce )
return false ;
2019-11-16 16:25:58 -05:00
2021-05-15 00:37:37 -04:00
if ( lp . crc . offset + lp . crc . live_size + rp . crc . live_size < =
lp . crc . uncompressed_size ) {
/* can use left extent's crc entry */
2022-10-19 18:31:33 -04:00
} else if ( lp . crc . live_size < = rp . crc . offset ) {
2021-05-15 00:37:37 -04:00
/* can use right extent's crc entry */
} else {
/* check if checksums can be merged: */
if ( lp . crc . csum_type ! = rp . crc . csum_type | |
lp . crc . nonce ! = rp . crc . nonce | |
crc_is_compressed ( lp . crc ) | |
! bch2_checksum_mergeable ( lp . crc . csum_type ) )
2021-04-28 23:49:30 -04:00
return false ;
2019-11-16 16:25:58 -05:00
2021-05-15 00:37:37 -04:00
if ( lp . crc . offset + lp . crc . live_size ! = lp . crc . compressed_size | |
rp . crc . offset )
2021-04-28 23:49:30 -04:00
return false ;
2019-11-16 16:25:58 -05:00
2021-05-15 00:37:37 -04:00
if ( lp . crc . csum_type & &
lp . crc . uncompressed_size +
2021-12-14 14:34:03 -05:00
rp . crc . uncompressed_size > ( c - > opts . encoded_extent_max > > 9 ) )
2021-04-28 23:49:30 -04:00
return false ;
2022-04-24 23:03:02 -04:00
}
en_l = extent_entry_next ( en_l ) ;
en_r = extent_entry_next ( en_r ) ;
}
en_l = l_ptrs . start ;
en_r = r_ptrs . start ;
while ( en_l < l_ptrs . end & & en_r < r_ptrs . end ) {
if ( extent_entry_is_crc ( en_l ) ) {
struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack ( l . k , entry_to_crc ( en_l ) ) ;
struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack ( r . k , entry_to_crc ( en_r ) ) ;
2019-11-16 16:25:58 -05:00
2022-04-24 23:03:02 -04:00
if ( crc_l . uncompressed_size + crc_r . uncompressed_size >
2019-11-16 16:25:58 -05:00
bch2_crc_field_size_max [ extent_entry_type ( en_l ) ] )
2021-04-28 23:49:30 -04:00
return false ;
2018-09-30 18:28:23 -04:00
}
2021-05-15 00:37:37 -04:00
en_l = extent_entry_next ( en_l ) ;
en_r = extent_entry_next ( en_r ) ;
2019-11-16 16:25:58 -05:00
}
2018-11-01 15:13:19 -04:00
2021-05-15 00:37:37 -04:00
use_right_ptr = false ;
2021-05-15 15:04:08 -04:00
en_l = l_ptrs . start ;
en_r = r_ptrs . start ;
while ( en_l < l_ptrs . end ) {
2021-05-15 00:37:37 -04:00
if ( extent_entry_type ( en_l ) = = BCH_EXTENT_ENTRY_ptr & &
use_right_ptr )
en_l - > ptr = en_r - > ptr ;
2021-05-15 15:04:08 -04:00
if ( extent_entry_is_crc ( en_l ) ) {
struct bch_extent_crc_unpacked crc_l =
bch2_extent_crc_unpack ( l . k , entry_to_crc ( en_l ) ) ;
struct bch_extent_crc_unpacked crc_r =
bch2_extent_crc_unpack ( r . k , entry_to_crc ( en_r ) ) ;
use_right_ptr = false ;
if ( crc_l . offset + crc_l . live_size + crc_r . live_size < =
crc_l . uncompressed_size ) {
/* can use left extent's crc entry */
2022-10-19 18:31:33 -04:00
} else if ( crc_l . live_size < = crc_r . offset ) {
2021-05-15 15:04:08 -04:00
/* can use right extent's crc entry */
crc_r . offset - = crc_l . live_size ;
bch2_extent_crc_pack ( entry_to_crc ( en_l ) , crc_r ,
extent_entry_type ( en_l ) ) ;
use_right_ptr = true ;
} else {
crc_l . csum = bch2_checksum_merge ( crc_l . csum_type ,
crc_l . csum ,
crc_r . csum ,
crc_r . uncompressed_size < < 9 ) ;
crc_l . uncompressed_size + = crc_r . uncompressed_size ;
crc_l . compressed_size + = crc_r . compressed_size ;
bch2_extent_crc_pack ( entry_to_crc ( en_l ) , crc_l ,
extent_entry_type ( en_l ) ) ;
}
2021-05-15 00:37:37 -04:00
}
2021-05-15 15:04:08 -04:00
en_l = extent_entry_next ( en_l ) ;
en_r = extent_entry_next ( en_r ) ;
2018-09-30 18:28:23 -04:00
}
2019-11-16 16:25:58 -05:00
bch2_key_resize ( l . k , l . k - > size + r . k - > size ) ;
2021-04-28 23:49:30 -04:00
return true ;
2019-11-16 16:25:58 -05:00
}
/* KEY_TYPE_reservation: */
2022-04-03 17:50:01 -04:00
int bch2_reservation_invalid ( const struct bch_fs * c , struct bkey_s_c k ,
2022-12-20 19:58:16 -05:00
unsigned flags , struct printbuf * err )
2019-11-16 16:25:58 -05:00
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation ( k ) ;
2022-04-03 17:50:01 -04:00
if ( bkey_val_bytes ( k . k ) ! = sizeof ( struct bch_reservation ) ) {
2023-02-03 21:01:40 -05:00
prt_printf ( err , " incorrect value size (%zu != %zu) " ,
2022-04-03 17:50:01 -04:00
bkey_val_bytes ( k . k ) , sizeof ( * r . v ) ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-04-03 17:50:01 -04:00
}
2019-11-16 16:25:58 -05:00
2022-04-03 17:50:01 -04:00
if ( ! r . v - > nr_replicas | | r . v - > nr_replicas > BCH_REPLICAS_MAX ) {
2023-02-03 21:01:40 -05:00
prt_printf ( err , " invalid nr_replicas (%u) " ,
2022-04-03 17:50:01 -04:00
r . v - > nr_replicas ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-04-03 17:50:01 -04:00
}
2019-11-16 16:25:58 -05:00
2022-04-03 17:50:01 -04:00
return 0 ;
2019-11-16 16:25:58 -05:00
}
/* Print a reservation key's generation and replica count to @out. */
void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
			      struct bkey_s_c k)
{
	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);

	prt_printf(out, " generation %u replicas %u ",
		   le32_to_cpu(r.v->generation),
		   r.v->nr_replicas);
}
2021-04-28 23:49:30 -04:00
bool bch2_reservation_merge ( struct bch_fs * c , struct bkey_s _l , struct bkey_s_c _r )
2019-11-16 16:25:58 -05:00
{
struct bkey_s_reservation l = bkey_s_to_reservation ( _l ) ;
2021-04-28 23:49:30 -04:00
struct bkey_s_c_reservation r = bkey_s_c_to_reservation ( _r ) ;
2019-11-16 16:25:58 -05:00
if ( l . v - > generation ! = r . v - > generation | |
l . v - > nr_replicas ! = r . v - > nr_replicas )
2021-04-28 23:49:30 -04:00
return false ;
2019-11-16 16:25:58 -05:00
bch2_key_resize ( l . k , l . k - > size + r . k - > size ) ;
2021-04-28 23:49:30 -04:00
return true ;
2019-11-16 16:25:58 -05:00
}
/* Extent checksum entries: */
/*
 * Compare two unpacked crc entries field by field.
 * Returns true if they differ in any field, false if all fields are equal.
 */
static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
					 struct bch_extent_crc_unpacked r)
{
	return !(l.csum_type		== r.csum_type &&
		 l.compression_type	== r.compression_type &&
		 l.compressed_size	== r.compressed_size &&
		 l.uncompressed_size	== r.uncompressed_size &&
		 l.offset		== r.offset &&
		 l.live_size		== r.live_size &&
		 l.nonce		== r.nonce &&
		 !bch2_crc_cmp(l.csum, r.csum));
}
static inline bool can_narrow_crc ( struct bch_extent_crc_unpacked u ,
struct bch_extent_crc_unpacked n )
{
2018-02-23 16:26:10 -05:00
return ! crc_is_compressed ( u ) & &
2017-03-16 22:18:50 -08:00
u . csum_type & &
u . uncompressed_size > u . live_size & &
bch2_csum_type_is_encryption ( u . csum_type ) = =
bch2_csum_type_is_encryption ( n . csum_type ) ;
}
2019-07-25 13:52:14 -04:00
bool bch2_can_narrow_extent_crcs ( struct bkey_s_c k ,
2017-03-16 22:18:50 -08:00
struct bch_extent_crc_unpacked n )
{
2019-07-25 13:52:14 -04:00
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c ( k ) ;
2017-03-16 22:18:50 -08:00
struct bch_extent_crc_unpacked crc ;
const union bch_extent_entry * i ;
if ( ! n . csum_type )
return false ;
2019-07-25 13:52:14 -04:00
bkey_for_each_crc ( k . k , ptrs , crc , i )
2017-03-16 22:18:50 -08:00
if ( can_narrow_crc ( crc , n ) )
return true ;
return false ;
}
/*
* We ' re writing another replica for this extent , so while we ' ve got the data in
* memory we ' ll be computing a new checksum for the currently live data .
*
* If there are other replicas we aren ' t moving , and they are checksummed but
* not compressed , we can modify them to point to only the data that is
* currently live ( so that readers won ' t have to bounce ) while we ' ve got the
* checksum we need :
*/
2019-07-25 13:52:14 -04:00
bool bch2_bkey_narrow_crcs ( struct bkey_i * k , struct bch_extent_crc_unpacked n )
2017-03-16 22:18:50 -08:00
{
2019-07-25 13:52:14 -04:00
struct bkey_ptrs ptrs = bch2_bkey_ptrs ( bkey_i_to_s ( k ) ) ;
2017-03-16 22:18:50 -08:00
struct bch_extent_crc_unpacked u ;
2018-09-27 21:08:39 -04:00
struct extent_ptr_decoded p ;
2017-03-16 22:18:50 -08:00
union bch_extent_entry * i ;
2018-09-27 21:08:39 -04:00
bool ret = false ;
2017-03-16 22:18:50 -08:00
/* Find a checksum entry that covers only live data: */
2018-09-27 21:08:39 -04:00
if ( ! n . csum_type ) {
2019-07-25 13:52:14 -04:00
bkey_for_each_crc ( & k - > k , ptrs , u , i )
2018-02-23 16:26:10 -05:00
if ( ! crc_is_compressed ( u ) & &
2017-03-16 22:18:50 -08:00
u . csum_type & &
u . live_size = = u . uncompressed_size ) {
n = u ;
2018-09-27 21:08:39 -04:00
goto found ;
2017-03-16 22:18:50 -08:00
}
return false ;
2018-09-27 21:08:39 -04:00
}
found :
2018-02-23 16:26:10 -05:00
BUG_ON ( crc_is_compressed ( n ) ) ;
2017-03-16 22:18:50 -08:00
BUG_ON ( n . offset ) ;
2019-07-25 13:52:14 -04:00
BUG_ON ( n . live_size ! = k - > k . size ) ;
2017-03-16 22:18:50 -08:00
restart_narrow_pointers :
2019-08-21 18:55:07 -04:00
ptrs = bch2_bkey_ptrs ( bkey_i_to_s ( k ) ) ;
2019-07-25 13:52:14 -04:00
bkey_for_each_ptr_decode ( & k - > k , ptrs , p , i )
2018-09-27 21:08:39 -04:00
if ( can_narrow_crc ( p . crc , n ) ) {
2021-10-13 13:12:26 -04:00
__bch2_bkey_drop_ptr ( bkey_i_to_s ( k ) , & i - > ptr ) ;
2018-09-27 21:08:39 -04:00
p . ptr . offset + = p . crc . offset ;
p . crc = n ;
2019-07-25 13:52:14 -04:00
bch2_extent_ptr_decoded_append ( k , & p ) ;
2018-09-27 21:08:39 -04:00
ret = true ;
2017-03-16 22:18:50 -08:00
goto restart_narrow_pointers ;
}
2018-09-27 21:08:39 -04:00
return ret ;
2017-03-16 22:18:50 -08:00
}
2019-11-16 16:25:58 -05:00
static void bch2_extent_crc_pack ( union bch_extent_crc * dst ,
struct bch_extent_crc_unpacked src ,
enum bch_extent_entry_type type )
2017-03-16 22:18:50 -08:00
{
2019-11-16 16:25:58 -05:00
# define set_common_fields(_dst, _src) \
_dst . type = 1 < < type ; \
_dst . csum_type = _src . csum_type , \
_dst . compression_type = _src . compression_type , \
_dst . _compressed_size = _src . compressed_size - 1 , \
_dst . _uncompressed_size = _src . uncompressed_size - 1 , \
_dst . offset = _src . offset
switch ( type ) {
case BCH_EXTENT_ENTRY_crc32 :
set_common_fields ( dst - > crc32 , src ) ;
dst - > crc32 . csum = * ( ( __le32 * ) & src . csum . lo ) ;
break ;
case BCH_EXTENT_ENTRY_crc64 :
set_common_fields ( dst - > crc64 , src ) ;
dst - > crc64 . nonce = src . nonce ;
dst - > crc64 . csum_lo = src . csum . lo ;
dst - > crc64 . csum_hi = * ( ( __le16 * ) & src . csum . hi ) ;
break ;
case BCH_EXTENT_ENTRY_crc128 :
set_common_fields ( dst - > crc128 , src ) ;
dst - > crc128 . nonce = src . nonce ;
dst - > crc128 . csum = src . csum ;
break ;
default :
BUG ( ) ;
}
# undef set_common_fields
2017-03-16 22:18:50 -08:00
}
2019-11-16 16:25:58 -05:00
void bch2_extent_crc_append ( struct bkey_i * k ,
struct bch_extent_crc_unpacked new )
2017-03-16 22:18:50 -08:00
{
2019-11-16 16:25:58 -05:00
struct bkey_ptrs ptrs = bch2_bkey_ptrs ( bkey_i_to_s ( k ) ) ;
union bch_extent_crc * crc = ( void * ) ptrs . end ;
enum bch_extent_entry_type type ;
2017-03-16 22:18:50 -08:00
2019-11-16 16:25:58 -05:00
if ( bch_crc_bytes [ new . csum_type ] < = 4 & &
2020-03-05 17:06:15 -05:00
new . uncompressed_size < = CRC32_SIZE_MAX & &
2019-11-16 16:25:58 -05:00
new . nonce < = CRC32_NONCE_MAX )
type = BCH_EXTENT_ENTRY_crc32 ;
else if ( bch_crc_bytes [ new . csum_type ] < = 10 & &
2020-03-05 17:06:15 -05:00
new . uncompressed_size < = CRC64_SIZE_MAX & &
2019-11-16 16:25:58 -05:00
new . nonce < = CRC64_NONCE_MAX )
type = BCH_EXTENT_ENTRY_crc64 ;
else if ( bch_crc_bytes [ new . csum_type ] < = 16 & &
2020-03-05 17:06:15 -05:00
new . uncompressed_size < = CRC128_SIZE_MAX & &
2019-11-16 16:25:58 -05:00
new . nonce < = CRC128_NONCE_MAX )
type = BCH_EXTENT_ENTRY_crc128 ;
else
BUG ( ) ;
2017-03-16 22:18:50 -08:00
2019-11-16 16:25:58 -05:00
bch2_extent_crc_pack ( crc , new , type ) ;
k - > k . u64s + = extent_entry_u64s ( ptrs . end ) ;
EBUG_ON ( bkey_val_u64s ( & k - > k ) > BKEY_EXTENT_VAL_U64s_MAX ) ;
2017-03-16 22:18:50 -08:00
}
2019-11-16 16:25:58 -05:00
/* Generic code for keys with pointers: */
2017-03-16 22:18:50 -08:00
2019-11-16 16:25:58 -05:00
/* Returns the number of entries in the device list bch2_bkey_devs() builds for @k. */
unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
{
	return bch2_bkey_devs(k).nr;
}
2019-11-16 16:25:58 -05:00
unsigned bch2_bkey_nr_ptrs_allocated ( struct bkey_s_c k )
2019-05-11 17:32:07 -04:00
{
2019-11-16 16:25:58 -05:00
return k . k - > type = = KEY_TYPE_reservation
? bkey_s_c_to_reservation ( k ) . v - > nr_replicas
: bch2_bkey_dirty_devs ( k ) . nr ;
2019-05-11 17:32:07 -04:00
}
2019-11-16 16:25:58 -05:00
unsigned bch2_bkey_nr_ptrs_fully_allocated ( struct bkey_s_c k )
2017-03-16 22:18:50 -08:00
{
2019-11-16 16:25:58 -05:00
unsigned ret = 0 ;
2019-05-11 17:32:07 -04:00
2019-11-16 16:25:58 -05:00
if ( k . k - > type = = KEY_TYPE_reservation ) {
ret = bkey_s_c_to_reservation ( k ) . v - > nr_replicas ;
} else {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c ( k ) ;
const union bch_extent_entry * entry ;
struct extent_ptr_decoded p ;
2019-05-11 17:32:07 -04:00
2019-11-16 16:25:58 -05:00
bkey_for_each_ptr_decode ( k . k , ptrs , p , entry )
2018-02-23 16:26:10 -05:00
ret + = ! p . ptr . cached & & ! crc_is_compressed ( p . crc ) ;
2017-03-16 22:18:50 -08:00
}
2018-11-01 15:10:01 -04:00
2019-11-16 16:25:58 -05:00
return ret ;
2019-05-11 17:32:07 -04:00
}
2019-11-16 16:25:58 -05:00
unsigned bch2_bkey_sectors_compressed ( struct bkey_s_c k )
2017-03-16 22:18:50 -08:00
{
2018-11-01 15:10:01 -04:00
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c ( k ) ;
2019-11-16 16:25:58 -05:00
const union bch_extent_entry * entry ;
struct extent_ptr_decoded p ;
unsigned ret = 0 ;
2017-03-16 22:18:50 -08:00
2019-11-16 16:25:58 -05:00
bkey_for_each_ptr_decode ( k . k , ptrs , p , entry )
2018-02-23 16:26:10 -05:00
if ( ! p . ptr . cached & & crc_is_compressed ( p . crc ) )
2019-11-16 16:25:58 -05:00
ret + = p . crc . compressed_size ;
2017-03-16 22:18:50 -08:00
2019-11-16 16:25:58 -05:00
return ret ;
2017-03-16 22:18:50 -08:00
}
2018-02-23 16:26:10 -05:00
bool bch2_bkey_is_incompressible ( struct bkey_s_c k )
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c ( k ) ;
const union bch_extent_entry * entry ;
struct bch_extent_crc_unpacked crc ;
bkey_for_each_crc ( k . k , ptrs , crc , entry )
if ( crc . compression_type = = BCH_COMPRESSION_TYPE_incompressible )
return true ;
return false ;
}
bcachefs: Change when we allow overwrites
Originally, we'd check for -ENOSPC when getting a disk reservation
whenever the new extent took up more space on disk than the old extent.
Erasure coding screwed this up, because with erasure coding writes are
initially replicated, and then in the background the extra replicas are
dropped when the stripe is created. This means that with erasure coding
enabled, writes will always take up more space on disk than the data
they're overwriting - but, according to posix, overwrites aren't
supposed to return ENOSPC.
So, in this patch we fudge things: if the new extent has more replicas
than the _effective_ replicas of the old extent, or if the old extent is
compressed and the new one isn't, we check for ENOSPC when getting the
disk reservation - otherwise, we don't.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2020-12-14 21:59:33 -05:00
unsigned bch2_bkey_replicas ( struct bch_fs * c , struct bkey_s_c k )
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c ( k ) ;
const union bch_extent_entry * entry ;
2021-03-24 22:11:22 -04:00
struct extent_ptr_decoded p = { 0 } ;
bcachefs: Change when we allow overwrites
Originally, we'd check for -ENOSPC when getting a disk reservation
whenever the new extent took up more space on disk than the old extent.
Erasure coding screwed this up, because with erasure coding writes are
initially replicated, and then in the background the extra replicas are
dropped when the stripe is created. This means that with erasure coding
enabled, writes will always take up more space on disk than the data
they're overwriting - but, according to posix, overwrites aren't
supposed to return ENOSPC.
So, in this patch we fudge things: if the new extent has more replicas
than the _effective_ replicas of the old extent, or if the old extent is
compressed and the new one isn't, we check for ENOSPC when getting the
disk reservation - otherwise, we don't.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2020-12-14 21:59:33 -05:00
unsigned replicas = 0 ;
bkey_for_each_ptr_decode ( k . k , ptrs , p , entry ) {
if ( p . ptr . cached )
continue ;
2021-01-22 18:01:07 -05:00
if ( p . has_ec )
replicas + = p . ec . redundancy ;
bcachefs: Change when we allow overwrites
Originally, we'd check for -ENOSPC when getting a disk reservation
whenever the new extent took up more space on disk than the old extent.
Erasure coding screwed this up, because with erasure coding writes are
initially replicated, and then in the background the extra replicas are
dropped when the stripe is created. This means that with erasure coding
enabled, writes will always take up more space on disk than the data
they're overwriting - but, according to posix, overwrites aren't
supposed to return ENOSPC.
So, in this patch we fudge things: if the new extent has more replicas
than the _effective_ replicas of the old extent, or if the old extent is
compressed and the new one isn't, we check for ENOSPC when getting the
disk reservation - otherwise, we don't.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2020-12-14 21:59:33 -05:00
replicas + + ;
}
return replicas ;
}
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight), we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
unsigned bch2_extent_ptr_durability ( struct bch_fs * c , struct extent_ptr_decoded * p )
2017-03-16 22:18:50 -08:00
{
2019-11-16 16:25:58 -05:00
unsigned durability = 0 ;
struct bch_dev * ca ;
2017-03-16 22:18:50 -08:00
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight, we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
if ( p - > ptr . cached )
2019-11-09 19:02:48 -05:00
return 0 ;
2017-03-16 22:18:50 -08:00
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight, we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
ca = bch_dev_bkey_exists ( c , p - > ptr . dev ) ;
2017-03-16 22:18:50 -08:00
2021-02-20 19:47:58 -05:00
if ( ca - > mi . state ! = BCH_MEMBER_STATE_failed )
2019-11-16 16:25:58 -05:00
durability = max_t ( unsigned , durability , ca - > mi . durability ) ;
2017-03-16 22:18:50 -08:00
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight, we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
if ( p - > has_ec )
durability + = p - > ec . redundancy ;
2019-11-09 19:02:48 -05:00
2019-11-16 16:25:58 -05:00
return durability ;
2017-03-16 22:18:50 -08:00
}
2019-11-16 16:25:58 -05:00
unsigned bch2_bkey_durability ( struct bch_fs * c , struct bkey_s_c k )
2017-03-16 22:18:50 -08:00
{
2019-11-16 16:25:58 -05:00
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c ( k ) ;
2018-12-06 10:24:22 -05:00
const union bch_extent_entry * entry ;
struct extent_ptr_decoded p ;
2019-11-16 16:25:58 -05:00
unsigned durability = 0 ;
2018-12-06 10:24:22 -05:00
2019-11-16 16:25:58 -05:00
bkey_for_each_ptr_decode ( k . k , ptrs , p , entry )
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight, we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
durability + = bch2_extent_ptr_durability ( c , & p ) ;
2018-12-06 10:24:22 -05:00
2019-11-16 16:25:58 -05:00
return durability ;
2017-03-16 22:18:50 -08:00
}
2021-02-17 13:37:22 -05:00
void bch2_bkey_extent_entry_drop ( struct bkey_i * k , union bch_extent_entry * entry )
{
union bch_extent_entry * end = bkey_val_end ( bkey_i_to_s ( k ) ) ;
union bch_extent_entry * next = extent_entry_next ( entry ) ;
memmove_u64s ( entry , next , ( u64 * ) end - ( u64 * ) next ) ;
k - > k . u64s - = extent_entry_u64s ( entry ) ;
}
2019-07-25 13:52:14 -04:00
static inline void __extent_entry_insert ( struct bkey_i * k ,
2018-09-27 21:08:39 -04:00
union bch_extent_entry * dst ,
union bch_extent_entry * new )
{
2019-07-25 13:52:14 -04:00
union bch_extent_entry * end = bkey_val_end ( bkey_i_to_s ( k ) ) ;
2017-03-16 22:18:50 -08:00
2019-10-22 17:35:35 -04:00
memmove_u64s_up_small ( ( u64 * ) dst + extent_entry_u64s ( new ) ,
dst , ( u64 * ) end - ( u64 * ) dst ) ;
2019-07-25 13:52:14 -04:00
k - > k . u64s + = extent_entry_u64s ( new ) ;
2018-09-27 21:08:39 -04:00
memcpy_u64s_small ( dst , new , extent_entry_u64s ( new ) ) ;
}
2017-03-16 22:18:50 -08:00
2019-07-25 13:52:14 -04:00
void bch2_extent_ptr_decoded_append ( struct bkey_i * k ,
2018-09-27 21:08:39 -04:00
struct extent_ptr_decoded * p )
{
2019-07-25 13:52:14 -04:00
struct bkey_ptrs ptrs = bch2_bkey_ptrs ( bkey_i_to_s ( k ) ) ;
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack ( & k - > k , NULL ) ;
2018-09-27 21:08:39 -04:00
union bch_extent_entry * pos ;
2017-03-16 22:18:50 -08:00
2018-09-27 21:08:39 -04:00
if ( ! bch2_crc_unpacked_cmp ( crc , p - > crc ) ) {
2019-07-25 13:52:14 -04:00
pos = ptrs . start ;
2018-09-27 21:08:39 -04:00
goto found ;
}
2019-07-25 13:52:14 -04:00
bkey_for_each_crc ( & k - > k , ptrs , crc , pos )
2018-09-27 21:08:39 -04:00
if ( ! bch2_crc_unpacked_cmp ( crc , p - > crc ) ) {
pos = extent_entry_next ( pos ) ;
goto found ;
}
2019-07-25 13:52:14 -04:00
bch2_extent_crc_append ( k , p - > crc ) ;
pos = bkey_val_end ( bkey_i_to_s ( k ) ) ;
2018-09-27 21:08:39 -04:00
found :
p - > ptr . type = 1 < < BCH_EXTENT_ENTRY_ptr ;
2019-07-25 13:52:14 -04:00
__extent_entry_insert ( k , pos , to_entry ( & p - > ptr ) ) ;
2018-11-01 15:13:19 -04:00
2019-10-08 18:45:29 -04:00
if ( p - > has_ec ) {
p - > ec . type = 1 < < BCH_EXTENT_ENTRY_stripe_ptr ;
__extent_entry_insert ( k , pos , to_entry ( & p - > ec ) ) ;
2018-11-01 15:13:19 -04:00
}
2017-03-16 22:18:50 -08:00
}
2019-11-16 16:25:58 -05:00
/*
 * Find the entry immediately preceding @entry by walking forward from the
 * start of @ptrs. Returns NULL when @entry is the first entry.
 */
static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
						 union bch_extent_entry *entry)
{
	union bch_extent_entry *prev, *cur;

	if (entry == ptrs.start)
		return NULL;

	for (prev = ptrs.start;
	     (cur = extent_entry_next(prev)) != entry;
	     prev = cur)
		;

	return prev;
}
2021-10-13 13:12:26 -04:00
static void extent_entry_drop ( struct bkey_s k , union bch_extent_entry * entry )
{
union bch_extent_entry * next = extent_entry_next ( entry ) ;
/* stripes have ptrs, but their layout doesn't work with this code */
BUG_ON ( k . k - > type = = KEY_TYPE_stripe ) ;
memmove_u64s_down ( entry , next ,
( u64 * ) bkey_val_end ( k ) - ( u64 * ) next ) ;
k . k - > u64s - = ( u64 * ) next - ( u64 * ) entry ;
}
/*
 * Drop @ptr from @k, together with any entries in front of it that only
 * existed for its sake.
 *
 * Returns a pointer to the entry that now follows the dropped one: this starts
 * out as the dropped pointer's own position and is moved back by the size of
 * every preceding entry that gets dropped as well.
 *
 * Unlike bch2_bkey_drop_ptr(), the key type is left untouched here.
 */
2022-06-13 19:17:45 -04:00
static union bch_extent_entry * __bch2_bkey_drop_ptr ( struct bkey_s k ,
struct bch_extent_ptr * ptr )
2019-11-16 16:25:58 -05:00
{
/* Snapshot taken before any entry is dropped; ptrs.end goes stale below */
struct bkey_ptrs ptrs = bch2_bkey_ptrs ( k ) ;
2021-10-13 13:12:26 -04:00
union bch_extent_entry * entry = to_entry ( ptr ) , * next ;
union bch_extent_entry * ret = entry ;
2019-11-16 16:25:58 -05:00
bool drop_crc = true ;
EBUG_ON ( ptr < & ptrs . start - > ptr | |
ptr > = & ptrs . end - > ptr ) ;
EBUG_ON ( ptr - > type ! = 1 < < BCH_EXTENT_ENTRY_ptr ) ;
2021-10-13 13:12:26 -04:00
/*
 * Look at what follows @ptr: if another pointer comes before the next crc
 * entry (or the end), a crc entry in front of @ptr is kept - presumably
 * because that later pointer still depends on it.
 */
for ( next = extent_entry_next ( entry ) ;
next ! = ptrs . end ;
next = extent_entry_next ( next ) ) {
if ( extent_entry_is_crc ( next ) ) {
2019-11-16 16:25:58 -05:00
break ;
2021-10-13 13:12:26 -04:00
} else if ( extent_entry_is_ptr ( next ) ) {
drop_crc = false ;
2019-11-16 16:25:58 -05:00
break ;
}
2021-10-13 13:12:26 -04:00
}
extent_entry_drop ( k , entry ) ;
2019-11-16 16:25:58 -05:00
2021-10-13 13:12:26 -04:00
/*
 * Walk backwards from the dropped position until the previous pointer (or
 * the start of the value), dropping stripe pointer entries and - when
 * drop_crc is set - crc entries found on the way.
 */
while ( ( entry = extent_entry_prev ( ptrs , entry ) ) ) {
if ( extent_entry_is_ptr ( entry ) )
break ;
if ( ( extent_entry_is_crc ( entry ) & & drop_crc ) | |
extent_entry_is_stripe_ptr ( entry ) ) {
/* Everything after @entry shifts down by its size, so does ret */
ret = ( void * ) ret - extent_entry_bytes ( entry ) ;
extent_entry_drop ( k , entry ) ;
}
2019-11-16 16:25:58 -05:00
}
2021-10-13 13:12:26 -04:00
return ret ;
}
union bch_extent_entry * bch2_bkey_drop_ptr ( struct bkey_s k ,
struct bch_extent_ptr * ptr )
{
bool have_dirty = bch2_bkey_dirty_devs ( k . s_c ) . nr ;
union bch_extent_entry * ret =
__bch2_bkey_drop_ptr ( k , ptr ) ;
/*
* If we deleted all the dirty pointers and there ' s still cached
* pointers , we could set the cached pointers to dirty if they ' re not
* stale - but to do that correctly we ' d need to grab an open_bucket
* reference so that we don ' t race with bucket reuse :
*/
if ( have_dirty & &
! bch2_bkey_dirty_devs ( k . s_c ) . nr ) {
k . k - > type = KEY_TYPE_error ;
set_bkey_val_u64s ( k . k , 0 ) ;
ret = NULL ;
} else if ( ! bch2_bkey_nr_ptrs ( k . s_c ) ) {
k . k - > type = KEY_TYPE_deleted ;
set_bkey_val_u64s ( k . k , 0 ) ;
ret = NULL ;
}
2019-11-16 16:25:58 -05:00
2021-10-13 13:12:26 -04:00
return ret ;
2019-11-16 16:25:58 -05:00
}
void bch2_bkey_drop_device ( struct bkey_s k , unsigned dev )
{
struct bch_extent_ptr * ptr ;
bch2_bkey_drop_ptrs ( k , ptr , ptr - > dev = = dev ) ;
}
2022-06-13 19:17:45 -04:00
/*
 * Drop the first pointer to device @dev from @k, if there is one, using
 * __bch2_bkey_drop_ptr() directly - so the key type is never rewritten to
 * KEY_TYPE_error or KEY_TYPE_deleted here.
 */
void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
{
	const struct bch_extent_ptr *found = bch2_bkey_has_device(k.s_c, dev);

	if (!found)
		return;

	/* @k is mutable, so dropping the const from the lookup result is fine */
	__bch2_bkey_drop_ptr(k, (void *) found);
}
2019-11-16 16:25:58 -05:00
const struct bch_extent_ptr *
bch2_bkey_has_device ( struct bkey_s_c k , unsigned dev )
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c ( k ) ;
const struct bch_extent_ptr * ptr ;
bkey_for_each_ptr ( ptrs , ptr )
if ( ptr - > dev = = dev )
return ptr ;
return NULL ;
}
bool bch2_bkey_has_target ( struct bch_fs * c , struct bkey_s_c k , unsigned target )
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c ( k ) ;
const struct bch_extent_ptr * ptr ;
bkey_for_each_ptr ( ptrs , ptr )
if ( bch2_dev_in_target ( c , ptr - > dev , target ) & &
( ! ptr - > cached | |
! ptr_stale ( bch_dev_bkey_exists ( c , ptr - > dev ) , ptr ) ) )
return true ;
return false ;
}
/*
 * Returns true if @k has a pointer with the same device and generation as @m
 * whose on-disk offset, adjusted by its crc offset and taken relative to the
 * start of @k, equals @m's offset taken relative to @offset.
 */
bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
			   struct bch_extent_ptr m, u64 offset)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	u64 want = (s64) m.offset - offset;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.ptr.dev != m.dev || p.ptr.gen != m.gen)
			;
		else if ((s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == want)
			return true;
	}

	return false;
}
2022-06-13 19:17:45 -04:00
/*
* Returns true if two extents refer to the same data :
*/
bool bch2_extents_match ( struct bkey_s_c k1 , struct bkey_s_c k2 )
{
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight, we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
if ( k1 . k - > type ! = k2 . k - > type )
2022-11-13 18:59:01 -05:00
return false ;
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight, we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
if ( bkey_extent_is_direct_data ( k1 . k ) ) {
struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c ( k1 ) ;
struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c ( k2 ) ;
const union bch_extent_entry * entry1 , * entry2 ;
struct extent_ptr_decoded p1 , p2 ;
if ( bkey_extent_is_unwritten ( k1 ) ! = bkey_extent_is_unwritten ( k2 ) )
return false ;
bkey_for_each_ptr_decode ( k1 . k , ptrs1 , p1 , entry1 )
bkey_for_each_ptr_decode ( k2 . k , ptrs2 , p2 , entry2 )
2022-06-13 19:17:45 -04:00
if ( p1 . ptr . dev = = p2 . ptr . dev & &
p1 . ptr . gen = = p2 . ptr . gen & &
( s64 ) p1 . ptr . offset + p1 . crc . offset - bkey_start_offset ( k1 . k ) = =
( s64 ) p2 . ptr . offset + p2 . crc . offset - bkey_start_offset ( k2 . k ) )
return true ;
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight, we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
return false ;
} else {
/* KEY_TYPE_deleted, etc. */
return true ;
}
2022-06-13 19:17:45 -04:00
}
bool bch2_extent_has_ptr ( struct bkey_s_c k1 , struct extent_ptr_decoded p1 ,
struct bkey_s_c k2 )
{
struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c ( k2 ) ;
const union bch_extent_entry * entry2 ;
struct extent_ptr_decoded p2 ;
bkey_for_each_ptr_decode ( k2 . k , ptrs2 , p2 , entry2 )
if ( p1 . ptr . dev = = p2 . ptr . dev & &
p1 . ptr . gen = = p2 . ptr . gen & &
( s64 ) p1 . ptr . offset + p1 . crc . offset - bkey_start_offset ( k1 . k ) = =
( s64 ) p2 . ptr . offset + p2 . crc . offset - bkey_start_offset ( k2 . k ) )
return true ;
return false ;
}
2017-03-16 22:18:50 -08:00
/*
* bch_extent_normalize - clean up an extent , dropping stale pointers etc .
*
* Returns true if @ k should be dropped entirely
*
* For existing keys , only called when btree nodes are being rewritten , not when
* they ' re merely being compacted / resorted in memory .
*/
bool bch2_extent_normalize ( struct bch_fs * c , struct bkey_s k )
{
2018-11-01 15:10:01 -04:00
struct bch_extent_ptr * ptr ;
2017-03-16 22:18:50 -08:00
2018-11-01 15:10:01 -04:00
bch2_bkey_drop_ptrs ( k , ptr ,
ptr - > cached & &
ptr_stale ( bch_dev_bkey_exists ( c , ptr - > dev ) , ptr ) ) ;
2017-03-16 22:18:50 -08:00
2021-02-19 23:41:40 -05:00
return bkey_deleted ( k . k ) ;
2017-03-16 22:18:50 -08:00
}
2019-11-16 16:25:58 -05:00
void bch2_bkey_ptrs_to_text ( struct printbuf * out , struct bch_fs * c ,
struct bkey_s_c k )
2017-03-16 22:18:50 -08:00
{
2019-11-16 16:25:58 -05:00
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c ( k ) ;
const union bch_extent_entry * entry ;
struct bch_extent_crc_unpacked crc ;
const struct bch_extent_ptr * ptr ;
const struct bch_extent_stripe_ptr * ec ;
struct bch_dev * ca ;
bool first = true ;
2017-03-16 22:18:50 -08:00
2019-11-16 16:25:58 -05:00
bkey_extent_entry_for_each ( ptrs , entry ) {
if ( ! first )
2023-02-03 21:01:40 -05:00
prt_printf ( out , " " ) ;
2017-03-16 22:18:50 -08:00
2019-11-16 16:25:58 -05:00
switch ( __extent_entry_type ( entry ) ) {
case BCH_EXTENT_ENTRY_ptr :
ptr = entry_to_ptr ( entry ) ;
2022-03-30 23:40:19 -04:00
ca = c & & ptr - > dev < c - > sb . nr_devices & & c - > devs [ ptr - > dev ]
? bch_dev_bkey_exists ( c , ptr - > dev )
: NULL ;
if ( ! ca ) {
2023-02-03 21:01:40 -05:00
prt_printf ( out , " ptr: %u:%llu gen %u%s " , ptr - > dev ,
2022-03-30 23:40:19 -04:00
( u64 ) ptr - > offset , ptr - > gen ,
ptr - > cached ? " cached " : " " ) ;
} else {
u32 offset ;
u64 b = sector_to_bucket_and_offset ( ca , ptr - > offset , & offset ) ;
2017-03-16 22:18:50 -08:00
2022-11-13 18:59:01 -05:00
prt_printf ( out , " ptr: %u:%llu:%u gen %u " ,
ptr - > dev , b , offset , ptr - > gen ) ;
if ( ptr - > cached )
prt_str ( out , " cached " ) ;
if ( ptr - > unwritten )
prt_str ( out , " unwritten " ) ;
2022-02-20 05:00:45 -05:00
if ( ca & & ptr_stale ( ca , ptr ) )
2023-02-03 21:01:40 -05:00
prt_printf ( out , " stale " ) ;
2022-02-20 05:00:45 -05:00
}
2019-11-16 16:25:58 -05:00
break ;
case BCH_EXTENT_ENTRY_crc32 :
case BCH_EXTENT_ENTRY_crc64 :
case BCH_EXTENT_ENTRY_crc128 :
crc = bch2_extent_crc_unpack ( k . k , entry_to_crc ( entry ) ) ;
2017-03-16 22:18:50 -08:00
2023-02-03 21:01:40 -05:00
prt_printf ( out , " crc: c_size %u size %u offset %u nonce %u csum %s compress %s " ,
2019-11-16 16:25:58 -05:00
crc . compressed_size ,
crc . uncompressed_size ,
crc . offset , crc . nonce ,
2021-11-11 12:11:33 -05:00
bch2_csum_types [ crc . csum_type ] ,
bch2_compression_types [ crc . compression_type ] ) ;
2019-11-16 16:25:58 -05:00
break ;
case BCH_EXTENT_ENTRY_stripe_ptr :
ec = & entry - > stripe_ptr ;
2023-02-03 21:01:40 -05:00
prt_printf ( out , " ec: idx %llu block %u " ,
2019-11-16 16:25:58 -05:00
( u64 ) ec - > idx , ec - > block ) ;
break ;
default :
2023-02-03 21:01:40 -05:00
prt_printf ( out , " (invalid extent entry %.16llx) " , * ( ( u64 * ) entry ) ) ;
2019-11-16 16:25:58 -05:00
return ;
2017-03-16 22:18:50 -08:00
}
2019-11-16 16:25:58 -05:00
first = false ;
}
2017-03-16 22:18:50 -08:00
}
2022-04-03 17:50:01 -04:00
/*
 * Validate a single pointer of @k.
 *
 * @c:		filesystem the key belongs to
 * @k:		key containing @ptr; scanned for other pointers to the same device
 * @ptr:	the pointer being checked
 * @size_ondisk: number of sectors the pointed-to data occupies on disk
 * @metadata:	not used by this function
 * @err:	receives a description of the first problem found
 *
 * Returns 0 if the pointer is fine, -BCH_ERR_invalid_bkey otherwise. The
 * checks, in order: the device exists, no other pointer in @k uses the same
 * device, the bucket is below the device's bucket count, the sector is not
 * before the device's first bucket, and the data does not run past the end of
 * its bucket.
 */
static int extent_ptr_invalid(const struct bch_fs *c,
			      struct bkey_s_c k,
			      const struct bch_extent_ptr *ptr,
			      unsigned size_ondisk,
			      bool metadata,
			      struct printbuf *err)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const struct bch_extent_ptr *ptr2;
	u64 bucket;
	u32 bucket_offset;
	struct bch_dev *ca;

	if (!bch2_dev_exists2(c, ptr->dev)) {
		prt_printf(err, " pointer to invalid device (%u) ", ptr->dev);
		return -BCH_ERR_invalid_bkey;
	}

	ca = bch_dev_bkey_exists(c, ptr->dev);
	bkey_for_each_ptr(ptrs, ptr2)
		if (ptr != ptr2 && ptr->dev == ptr2->dev) {
			prt_printf(err, " multiple pointers to same device (%u) ", ptr->dev);
			return -BCH_ERR_invalid_bkey;
		}

	bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);

	if (bucket >= ca->mi.nbuckets) {
		prt_printf(err, " pointer past last bucket (%llu > %llu) ",
			   bucket, ca->mi.nbuckets);
		return -BCH_ERR_invalid_bkey;
	}

	if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) {
		prt_printf(err, " pointer before first bucket (%llu < %u) ",
			   bucket, ca->mi.first_bucket);
		return -BCH_ERR_invalid_bkey;
	}

	/*
	 * Both operands come from an unvalidated key, so do the addition in
	 * 64 bits: a huge size_ondisk must not wrap around and slip past the
	 * check.
	 */
	if ((u64) bucket_offset + size_ondisk > ca->mi.bucket_size) {
		prt_printf(err, " pointer spans multiple buckets (%u + %u > %u) ",
			   bucket_offset, size_ondisk, ca->mi.bucket_size);
		return -BCH_ERR_invalid_bkey;
	}

	return 0;
}
2017-03-16 22:18:50 -08:00
2022-04-03 17:50:01 -04:00
int bch2_bkey_ptrs_invalid ( const struct bch_fs * c , struct bkey_s_c k ,
2022-12-20 19:58:16 -05:00
unsigned flags , struct printbuf * err )
2019-11-16 16:25:58 -05:00
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c ( k ) ;
const union bch_extent_entry * entry ;
struct bch_extent_crc_unpacked crc ;
unsigned size_ondisk = k . k - > size ;
unsigned nonce = UINT_MAX ;
2022-06-16 22:38:10 -04:00
unsigned nr_ptrs = 0 ;
2022-11-13 18:59:01 -05:00
bool unwritten = false ;
2022-04-03 17:50:01 -04:00
int ret ;
2019-11-16 16:25:58 -05:00
2022-04-03 17:50:01 -04:00
if ( bkey_is_btree_ptr ( k . k ) )
2021-12-14 14:24:41 -05:00
size_ondisk = btree_sectors ( c ) ;
2019-11-16 16:25:58 -05:00
bkey_extent_entry_for_each ( ptrs , entry ) {
2022-04-03 17:50:01 -04:00
if ( __extent_entry_type ( entry ) > = BCH_EXTENT_ENTRY_MAX ) {
2023-02-03 21:01:40 -05:00
prt_printf ( err , " invalid extent entry type (got %u, max %u) " ,
2022-04-03 17:50:01 -04:00
__extent_entry_type ( entry ) , BCH_EXTENT_ENTRY_MAX ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-04-03 17:50:01 -04:00
}
2019-05-12 22:23:30 -04:00
2022-04-03 17:50:01 -04:00
if ( bkey_is_btree_ptr ( k . k ) & &
! extent_entry_is_ptr ( entry ) ) {
2023-02-03 21:01:40 -05:00
prt_printf ( err , " has non ptr field " ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-04-03 17:50:01 -04:00
}
2019-11-16 16:25:58 -05:00
switch ( extent_entry_type ( entry ) ) {
case BCH_EXTENT_ENTRY_ptr :
2022-04-03 17:50:01 -04:00
ret = extent_ptr_invalid ( c , k , & entry - > ptr , size_ondisk ,
false , err ) ;
if ( ret )
return ret ;
2022-11-13 18:59:01 -05:00
if ( nr_ptrs & & unwritten ! = entry - > ptr . unwritten ) {
prt_printf ( err , " extent with unwritten and written ptrs " ) ;
return - BCH_ERR_invalid_bkey ;
}
if ( k . k - > type ! = KEY_TYPE_extent & & entry - > ptr . unwritten ) {
prt_printf ( err , " has unwritten ptrs " ) ;
return - BCH_ERR_invalid_bkey ;
}
unwritten = entry - > ptr . unwritten ;
2022-06-16 22:38:10 -04:00
nr_ptrs + + ;
2019-05-12 22:23:30 -04:00
break ;
case BCH_EXTENT_ENTRY_crc32 :
case BCH_EXTENT_ENTRY_crc64 :
case BCH_EXTENT_ENTRY_crc128 :
2019-11-16 16:25:58 -05:00
crc = bch2_extent_crc_unpack ( k . k , entry_to_crc ( entry ) ) ;
2019-05-12 22:23:30 -04:00
2019-11-16 16:25:58 -05:00
if ( crc . offset + crc . live_size >
2022-04-03 17:50:01 -04:00
crc . uncompressed_size ) {
2023-02-03 21:01:40 -05:00
prt_printf ( err , " checksum offset + key size > uncompressed size " ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-04-03 17:50:01 -04:00
}
2019-05-12 22:23:30 -04:00
2019-11-16 16:25:58 -05:00
size_ondisk = crc . compressed_size ;
2019-05-12 22:23:30 -04:00
2022-04-03 17:50:01 -04:00
if ( ! bch2_checksum_type_valid ( c , crc . csum_type ) ) {
2023-02-03 21:01:40 -05:00
prt_printf ( err , " invalid checksum type " ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-04-03 17:50:01 -04:00
}
2019-05-12 22:23:30 -04:00
2022-04-03 17:50:01 -04:00
if ( crc . compression_type > = BCH_COMPRESSION_TYPE_NR ) {
2023-02-03 21:01:40 -05:00
prt_printf ( err , " invalid compression type " ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-04-03 17:50:01 -04:00
}
2019-05-12 22:23:30 -04:00
2019-11-16 16:25:58 -05:00
if ( bch2_csum_type_is_encryption ( crc . csum_type ) ) {
if ( nonce = = UINT_MAX )
nonce = crc . offset + crc . nonce ;
2022-04-03 17:50:01 -04:00
else if ( nonce ! = crc . offset + crc . nonce ) {
2023-02-03 21:01:40 -05:00
prt_printf ( err , " incorrect nonce " ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-04-03 17:50:01 -04:00
}
2019-11-16 16:25:58 -05:00
}
break ;
case BCH_EXTENT_ENTRY_stripe_ptr :
2019-05-12 22:23:30 -04:00
break ;
}
2017-03-16 22:18:50 -08:00
}
2022-06-16 22:38:10 -04:00
if ( nr_ptrs > = BCH_BKEY_PTRS_MAX ) {
prt_str ( err , " too many ptrs " ) ;
2022-11-19 22:39:08 -05:00
return - BCH_ERR_invalid_bkey ;
2022-06-16 22:38:10 -04:00
}
2022-04-03 17:50:01 -04:00
return 0 ;
2019-11-16 16:25:58 -05:00
}
2019-05-12 22:23:30 -04:00
2020-02-06 20:15:15 -05:00
void bch2_ptr_swab ( struct bkey_s k )
2019-11-16 16:25:58 -05:00
{
2020-02-06 20:15:15 -05:00
struct bkey_ptrs ptrs = bch2_bkey_ptrs ( k ) ;
2019-11-16 16:25:58 -05:00
union bch_extent_entry * entry ;
2020-02-06 20:15:15 -05:00
u64 * d ;
2019-05-12 22:23:30 -04:00
2020-02-06 20:15:15 -05:00
for ( d = ( u64 * ) ptrs . start ;
d ! = ( u64 * ) ptrs . end ;
d + + )
* d = swab64 ( * d ) ;
2019-05-12 22:23:30 -04:00
2020-02-06 20:15:15 -05:00
for ( entry = ptrs . start ;
entry < ptrs . end ;
2019-11-16 16:25:58 -05:00
entry = extent_entry_next ( entry ) ) {
switch ( extent_entry_type ( entry ) ) {
case BCH_EXTENT_ENTRY_ptr :
break ;
case BCH_EXTENT_ENTRY_crc32 :
entry - > crc32 . csum = swab32 ( entry - > crc32 . csum ) ;
break ;
case BCH_EXTENT_ENTRY_crc64 :
entry - > crc64 . csum_hi = swab16 ( entry - > crc64 . csum_hi ) ;
entry - > crc64 . csum_lo = swab64 ( entry - > crc64 . csum_lo ) ;
break ;
case BCH_EXTENT_ENTRY_crc128 :
entry - > crc128 . csum . hi = ( __force __le64 )
swab64 ( ( __force u64 ) entry - > crc128 . csum . hi ) ;
entry - > crc128 . csum . lo = ( __force __le64 )
swab64 ( ( __force u64 ) entry - > crc128 . csum . lo ) ;
break ;
case BCH_EXTENT_ENTRY_stripe_ptr :
break ;
}
2017-03-16 22:18:50 -08:00
}
}
2019-11-16 16:25:58 -05:00
/* Generic extent code: */
int bch2_cut_front_s ( struct bpos where , struct bkey_s k )
2017-03-16 22:18:50 -08:00
{
2019-11-16 16:25:58 -05:00
unsigned new_val_u64s = bkey_val_u64s ( k . k ) ;
int val_u64s_delta ;
u64 sub ;
2017-03-16 22:18:50 -08:00
2022-11-24 03:12:22 -05:00
if ( bkey_le ( where , bkey_start_pos ( k . k ) ) )
2019-11-16 16:25:58 -05:00
return 0 ;
2017-03-16 22:18:50 -08:00
2022-11-24 03:12:22 -05:00
EBUG_ON ( bkey_gt ( where , k . k - > p ) ) ;
2019-03-25 15:10:15 -04:00
2019-11-16 16:25:58 -05:00
sub = where . offset - bkey_start_offset ( k . k ) ;
2017-03-16 22:18:50 -08:00
2019-11-16 16:25:58 -05:00
k . k - > size - = sub ;
if ( ! k . k - > size ) {
k . k - > type = KEY_TYPE_deleted ;
new_val_u64s = 0 ;
2017-03-16 22:18:50 -08:00
}
2019-11-16 16:25:58 -05:00
switch ( k . k - > type ) {
case KEY_TYPE_extent :
case KEY_TYPE_reflink_v : {
struct bkey_ptrs ptrs = bch2_bkey_ptrs ( k ) ;
union bch_extent_entry * entry ;
bool seen_crc = false ;
2018-11-01 15:10:01 -04:00
2019-11-16 16:25:58 -05:00
bkey_extent_entry_for_each ( ptrs , entry ) {
switch ( extent_entry_type ( entry ) ) {
case BCH_EXTENT_ENTRY_ptr :
if ( ! seen_crc )
entry - > ptr . offset + = sub ;
break ;
case BCH_EXTENT_ENTRY_crc32 :
entry - > crc32 . offset + = sub ;
break ;
case BCH_EXTENT_ENTRY_crc64 :
entry - > crc64 . offset + = sub ;
break ;
case BCH_EXTENT_ENTRY_crc128 :
entry - > crc128 . offset + = sub ;
break ;
case BCH_EXTENT_ENTRY_stripe_ptr :
break ;
}
2019-01-21 15:32:13 -05:00
2019-11-16 16:25:58 -05:00
if ( extent_entry_is_crc ( entry ) )
seen_crc = true ;
}
2019-01-21 15:32:13 -05:00
break ;
}
2019-11-16 16:25:58 -05:00
case KEY_TYPE_reflink_p : {
struct bkey_s_reflink_p p = bkey_s_to_reflink_p ( k ) ;
le64_add_cpu ( & p . v - > idx , sub ) ;
2019-01-21 15:32:13 -05:00
break ;
}
2020-10-24 19:51:34 -04:00
case KEY_TYPE_inline_data :
case KEY_TYPE_indirect_inline_data : {
void * p = bkey_inline_data_p ( k ) ;
unsigned bytes = bkey_inline_data_bytes ( k . k ) ;
2019-01-21 15:32:13 -05:00
2020-10-24 19:51:34 -04:00
sub = min_t ( u64 , sub < < 9 , bytes ) ;
2018-11-01 15:10:01 -04:00
2020-10-24 19:51:34 -04:00
memmove ( p , p + sub , bytes - sub ) ;
2018-11-01 15:10:01 -04:00
2019-11-16 16:25:58 -05:00
new_val_u64s - = sub > > 3 ;
break ;
}
}
2018-11-01 15:10:01 -04:00
2019-11-16 16:25:58 -05:00
val_u64s_delta = bkey_val_u64s ( k . k ) - new_val_u64s ;
BUG_ON ( val_u64s_delta < 0 ) ;
2018-11-01 15:10:01 -04:00
2019-11-16 16:25:58 -05:00
set_bkey_val_u64s ( k . k , new_val_u64s ) ;
memset ( bkey_val_end ( k ) , 0 , val_u64s_delta * sizeof ( u64 ) ) ;
return - val_u64s_delta ;
2018-11-01 15:10:01 -04:00
}
2019-11-16 16:25:58 -05:00
int bch2_cut_back_s ( struct bpos where , struct bkey_s k )
2018-11-01 15:10:01 -04:00
{
2019-11-16 16:25:58 -05:00
unsigned new_val_u64s = bkey_val_u64s ( k . k ) ;
int val_u64s_delta ;
u64 len = 0 ;
2018-11-01 15:10:01 -04:00
2022-11-24 03:12:22 -05:00
if ( bkey_ge ( where , k . k - > p ) )
2019-11-16 16:25:58 -05:00
return 0 ;
2018-11-01 15:10:01 -04:00
2022-11-24 03:12:22 -05:00
EBUG_ON ( bkey_lt ( where , bkey_start_pos ( k . k ) ) ) ;
2018-11-01 15:10:01 -04:00
2019-11-16 16:25:58 -05:00
len = where . offset - bkey_start_offset ( k . k ) ;
2018-11-01 15:10:01 -04:00
2021-03-19 16:37:24 -04:00
k . k - > p . offset = where . offset ;
2019-11-16 16:25:58 -05:00
k . k - > size = len ;
if ( ! len ) {
k . k - > type = KEY_TYPE_deleted ;
new_val_u64s = 0 ;
2018-11-01 15:10:01 -04:00
}
2019-11-16 16:25:58 -05:00
switch ( k . k - > type ) {
case KEY_TYPE_inline_data :
2020-10-24 19:51:34 -04:00
case KEY_TYPE_indirect_inline_data :
new_val_u64s = ( bkey_inline_data_offset ( k . k ) +
min ( bkey_inline_data_bytes ( k . k ) , k . k - > size < < 9 ) ) > > 3 ;
2019-11-16 16:25:58 -05:00
break ;
}
2018-11-01 15:10:01 -04:00
2019-11-16 16:25:58 -05:00
val_u64s_delta = bkey_val_u64s ( k . k ) - new_val_u64s ;
BUG_ON ( val_u64s_delta < 0 ) ;
set_bkey_val_u64s ( k . k , new_val_u64s ) ;
memset ( bkey_val_end ( k ) , 0 , val_u64s_delta * sizeof ( u64 ) ) ;
return - val_u64s_delta ;
2018-11-01 15:10:01 -04:00
}