2018-04-03 20:16:55 +03:00
/* SPDX-License-Identifier: GPL-2.0 */
2013-01-30 03:40:14 +04:00
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp.  All rights reserved.
 */
2018-04-03 20:16:55 +03:00
# ifndef BTRFS_RAID56_H
# define BTRFS_RAID56_H
2022-06-01 12:46:59 +03:00
# include <linux/workqueue.h>
# include "volumes.h"
/*
 * Kind of operation carried out by a btrfs_raid_bio; stored in its
 * 'operation' member.
 *
 * The enumerators carry no explicit values, so they are numbered 0, 1, 2
 * in declaration order.
 */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
};
/*
 * State for one RAID56 operation on a full stripe. Which kind of operation
 * (see enum btrfs_rbio_ops) is recorded in the 'operation' member.
 */
struct btrfs_raid_bio {
/* The I/O context this rbio is associated with. */
struct btrfs_io_context * bioc ;
/*
 * While we're doing RMW on a stripe we put it into a hash table so we
 * can lock the stripe and merge more rbios into it.
 */
struct list_head hash_list ;
/* LRU list for the stripe cache */
struct list_head stripe_cache ;
/* For scheduling work in the helper threads */
struct work_struct work ;
/*
 * bio_list and bio_list_lock are used to add more bios into the stripe
 * in hopes of avoiding the full RMW.
 */
struct bio_list bio_list ;
spinlock_t bio_list_lock ;
/*
 * Also protected by the bio_list_lock, the plug list is used by the
 * plugging code to collect partial bios while plugged. The stripe
 * locking code also uses it to hand off the stripe lock to the next
 * pending IO.
 */
struct list_head plug_list ;
/* Flags that tell us if it is safe to merge with this bio. */
unsigned long flags ;
/*
 * The kind of operation this rbio carries out (enum btrfs_rbio_ops).
 *
 * Original note: set if we're doing a parity rebuild for a read from
 * higher up, which is handled differently from a parity rebuild as part
 * of RMW.
 * NOTE(review): that wording only covers the read-rebuild case, while the
 * enum also has write and parity-scrub values.
 */
enum btrfs_rbio_ops operation ;
/* How many pages there are for the full stripe including P/Q */
u16 nr_pages ;
/* How many sectors there are for the full stripe including P/Q */
u16 nr_sectors ;
/* Number of data stripes (no p/q) */
u8 nr_data ;
2023-01-17 13:03:21 +03:00
/* Number of all stripes (including P/Q) */
2022-06-01 12:46:59 +03:00
u8 real_stripes ;
/* How many pages there are for each stripe */
u8 stripe_npages ;
/* How many sectors there are for each stripe */
u8 stripe_nsectors ;
/* Stripe number that we're scrubbing */
u8 scrubp ;
/*
 * Size of all the bios in the bio_list. This helps us decide if the
 * rbio maps to a full stripe or not.
 */
int bio_list_bytes ;
/* Reference count of this rbio. */
refcount_t refs ;
/*
 * Atomic counter.
 * NOTE(review): presumably the number of stripe IOs still in flight -
 * confirm against the implementation.
 */
atomic_t stripes_pending ;
2022-11-01 14:16:05 +03:00
/*
 * Wait queue.
 * NOTE(review): presumably used to wait for submitted IO to finish -
 * confirm against the implementation.
 */
wait_queue_head_t io_wait ;
2022-06-01 12:46:59 +03:00
/* Bitmap to record which horizontal stripe has data */
unsigned long dbitmap ;
/*
 * Allocated with stripe_nsectors-many bits for finish_*() calls.
 * NOTE(review): this member is a plain unsigned long, not a separately
 * allocated bitmap, so it can hold at most one long's worth of bits.
 */
unsigned long finish_pbitmap ;
/*
 * These are two arrays of pointers. We allocate the rbio big enough
 * to hold them both and setup their locations when the rbio is
 * allocated.
 * NOTE(review): more than two pointer members follow, so the count
 * "two" looks stale; which of them share the rbio allocation cannot be
 * told from this header.
 */
/*
 * Pointers to pages that we allocated for reading/writing stripes
 * directly from the disk (including P/Q).
 */
struct page * * stripe_pages ;
/* Pointers to the sectors in the bio_list, for faster lookup */
struct sector_ptr * bio_sectors ;
/*
 * For subpage support, we need to map each sector to above
 * stripe_pages.
 */
struct sector_ptr * stripe_sectors ;
/* Allocated with real_stripes-many pointers for finish_*() calls */
void * * finish_pointers ;
btrfs: raid56: introduce btrfs_raid_bio::error_bitmap
Currently btrfs raid56 uses btrfs_raid_bio::faila and failb to indicate
which stripe(s) had IO errors.
But that has some problems:
- If one sector failed csum check, the whole stripe where the corruption
is will be marked error.
This can reduce the chance we do recover, like this:
0 4K 8K
Data 1 |XX| |
Data 2 | |XX|
Parity | | |
In above case, 0~4K in data 1 should be recovered using data 2 and
parity, while 4K~8K in data 2 should be recovered using data 1 and
parity.
Currently if we trigger read on 0~4K of data 1, we will also recover
4K~8K of data 1 using corrupted data 2 and parity, causing wrong
result in rbio cache.
- Harder to expand for future M-N scheme
As we're limited to just faila/b, two corruptions.
- Harder to expand to handle extra csum errors
This can be problematic if we start to do csum verification.
This patch will introduce an extra @error_bitmap, where one bit
represents error that happened for that sector.
The choice to introduce a new error bitmap other than reusing
sector_ptr, is to avoid extra search between rbio::stripe_sectors[] and
rbio::bio_sectors[].
Since we can submit bio using sectors from both sectors, doing proper
search on both array will more complex.
Although the new bitmap will take extra memory, later we can remove
things like @error and faila/b to save some memory.
Currently the new error bitmap and failab mechanism coexists, the error
bitmap is only updated at endio time and recover entrance.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-11-07 10:32:29 +03:00
/*
* The bitmap recording where IO errors happened .
* Each bit is corresponding to one sector in either bio_sectors [ ] or
* stripe_sectors [ ] array .
*
* The reason we don ' t use another bit in sector_ptr is , we have two
* arrays of sectors , and a lot of IO can use sectors in both arrays .
* Thus making it much harder to iterate .
*/
unsigned long * error_bitmap ;
2022-11-14 03:26:33 +03:00
/*
 * Checksum buffer if the rbio is for data. The buffer should cover
 * all data sectors (excluding P/Q sectors).
 */
u8 * csum_buf ;
/*
 * Each bit represents if the corresponding sector has data csum found.
 * Should only cover data sectors (excluding P/Q sectors).
 */
unsigned long * csum_bitmap ;
2022-06-01 12:46:59 +03:00
} ;
/*
 * Debug information recorded for every bio that RAID56 submits to a
 * physical device. Used by trace events only.
 *
 * A stripe number of (-1), whether read as signed or unsigned, always means
 * that the proper stripe number could not be determined.
 */
struct raid56_bio_trace_info {
	/* NOTE(review): presumably the id of the target device - confirm. */
	u64 devid;

	/* Offset inside the stripe (<= STRIPE_LEN). */
	u32 offset;

	/*
	 * Stripe number:
	 *   0			first data stripe
	 *   nr_data		P stripe
	 *   nr_data + 1	Q stripe
	 *
	 * NOTE(review): the original comment ends with an unfinished
	 * ">= real_stripes for"; such values presumably mark a stripe that
	 * could not be resolved - confirm against the tracing code.
	 */
	u8 stripe_nr;
};
btrfs: use a dedicated data structure for chunk maps
Currently we abuse the extent_map structure for two purposes:
1) To actually represent extents for inodes;
2) To represent chunk mappings.
This is odd and has several disadvantages:
1) To create a chunk map, we need to do two memory allocations: one for
an extent_map structure and another one for a map_lookup structure, so
more potential for an allocation failure and more complicated code to
manage and link two structures;
2) For a chunk map we actually only use 3 fields (24 bytes) of the
respective extent map structure: the 'start' field to have the logical
start address of the chunk, the 'len' field to have the chunk's size,
and the 'orig_block_len' field to contain the chunk's stripe size.
Besides wasting memory, it's also odd and not intuitive at all to
have the stripe size in a field named 'orig_block_len'.
We are also using 'block_len' of the extent_map structure to contain
the chunk size, so we have 2 fields for the same value, 'len' and
'block_len', which is pointless;
3) When an extent map is associated to a chunk mapping, we set the bit
EXTENT_FLAG_FS_MAPPING on its flags and then make its member named
'map_lookup' point to the associated map_lookup structure. This means
that for an extent map associated to an inode extent, we are not using
this 'map_lookup' pointer, so wasting 8 bytes (on a 64 bits platform);
4) Extent maps associated to a chunk mapping are never merged or split so
it's pointless to use the existing extent map infrastructure.
So add a dedicated data structure named 'btrfs_chunk_map' to represent
chunk mappings, this is basically the existing map_lookup structure with
some extra fields:
1) 'start' to contain the chunk logical address;
2) 'chunk_len' to contain the chunk's length;
3) 'stripe_size' for the stripe size;
4) 'rb_node' for insertion into a rb tree;
5) 'refs' for reference counting.
This way we do a single memory allocation for chunk mappings and we don't
waste memory for them with unused/unnecessary fields from an extent_map.
We also save 8 bytes from the extent_map structure by removing the
'map_lookup' pointer, so the size of struct extent_map is reduced from
144 bytes down to 136 bytes, and we can now have 30 extent maps per 4K
page instead of 28.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-11-21 16:38:38 +03:00
static inline int nr_data_stripes ( const struct btrfs_chunk_map * map )
2013-01-30 03:40:14 +04:00
{
2022-05-13 11:34:30 +03:00
return map - > num_stripes - btrfs_nr_parity_stripes ( map - > type ) ;
2013-01-30 03:40:14 +04:00
}
2022-06-01 12:46:59 +03:00
2023-03-20 05:12:49 +03:00
static inline int nr_bioc_data_stripes ( const struct btrfs_io_context * bioc )
{
return bioc - > num_stripes - btrfs_nr_parity_stripes ( bioc - > map_type ) ;
}
2013-01-30 03:40:14 +04:00
/*
 * Sentinel stripe values for the RAID5 P and RAID6 Q stripes. They are the
 * two largest u64 values: (u64)-2 and (u64)-1.
 */
#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)

/*
 * Evaluates to non-zero if @x equals one of the two sentinels above.
 * @x is evaluated twice, so it must not have side effects.
 */
#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||		\
			     ((x) == RAID6_Q_STRIPE))
Btrfs, raid56: support parity scrub on raid56
The implementation is:
- Read and check all the data with checksum in the same stripe.
All the data which has checksum is COW data, and we are sure
that it is not changed even though we don't lock the stripe, because
the space of that data can only be reclaimed after the current
transaction is committed, and only then can the fs use it to store
other data. But when doing scrub, we hold the current transaction,
so that space can not be reclaimed, and it is safe to read and check
it outside of the stripe lock.
- Lock the stripe
- Read out all the data without checksum and parity
The data without checksum and the parity may be changed if we don't
lock the stripe, so we need read it in the stripe lock context.
- Check the parity
- Re-calculate the new parity and write back it if the old parity
is not right
- Unlock the stripe
If we can not read out the data or the data we read is corrupted,
we will try to repair it. If the repair fails, we will mark the
horizontal sub-stripe (pages on the same horizontal) as a corrupted
sub-stripe, and we will skip the parity check and repair of that
horizontal sub-stripe.
And in order to skip the horizontal sub-stripes that have no data, we
introduce a bitmap. If there is some data on the horizontal sub-stripe,
we will set the relative bit to 1, and when we check and repair the
parity, we will skip those horizontal sub-stripes whose relative
bit is 0.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
2014-11-06 12:20:58 +03:00
/*
 * Forward declaration: this header only refers to struct btrfs_device
 * through a pointer parameter, so the full definition is not needed here.
 */
struct btrfs_device ;
2022-06-17 13:04:09 +03:00
/*
 * RAID56 recovery entry point for @bio, using the stripe layout in @bioc.
 * NOTE(review): judging by the name this rebuilds the data of @bio from
 * parity; the exact meaning of @mirror_num is defined by the implementation
 * and cannot be told from this header - confirm there.
 */
void raid56_parity_recover ( struct bio * bio , struct btrfs_io_context * bioc ,
2022-08-06 11:03:25 +03:00
int mirror_num ) ;
2022-06-17 13:04:08 +03:00
/*
 * RAID56 write entry point for @bio, using the stripe layout in @bioc.
 * NOTE(review): completion and error reporting are not visible in this
 * header (the function returns void) - confirm against the implementation.
 */
void raid56_parity_write ( struct bio * bio , struct btrfs_io_context * bioc ) ;
2013-01-30 03:40:14 +04:00
2021-09-23 09:00:09 +03:00
/*
 * Allocate an rbio for a parity scrub and return it.
 * NOTE(review): @dbitmap and @stripe_nsectors presumably correspond to the
 * btrfs_raid_bio members of the same names (bitmap of horizontal stripes
 * that have data, and sectors per stripe); how allocation failure is
 * reported is not visible here - confirm against the implementation.
 */
struct btrfs_raid_bio * raid56_parity_alloc_scrub_rbio ( struct bio * bio ,
2022-06-17 13:04:05 +03:00
struct btrfs_io_context * bioc ,
2021-09-23 09:00:09 +03:00
struct btrfs_device * scrub_dev ,
unsigned long * dbitmap , int stripe_nsectors ) ;
Btrfs, raid56: support parity scrub on raid56
The implementation is:
- Read and check all the data with checksum in the same stripe.
All the data which has checksum is COW data, and we are sure
that it is not changed even though we don't lock the stripe, because
the space of that data can only be reclaimed after the current
transaction is committed, and only then can the fs use it to store
other data. But when doing scrub, we hold the current transaction,
so that space can not be reclaimed, and it is safe to read and check
it outside of the stripe lock.
- Lock the stripe
- Read out all the data without checksum and parity
The data without checksum and the parity may be changed if we don't
lock the stripe, so we need read it in the stripe lock context.
- Check the parity
- Re-calculate the new parity and write back it if the old parity
is not right
- Unlock the stripe
If we can not read out the data or the data we read is corrupted,
we will try to repair it. If the repair fails, we will mark the
horizontal sub-stripe (pages on the same horizontal) as a corrupted
sub-stripe, and we will skip the parity check and repair of that
horizontal sub-stripe.
And in order to skip the horizontal sub-stripes that have no data, we
introduce a bitmap. If there is some data on the horizontal sub-stripe,
we will set the relative bit to 1, and when we check and repair the
parity, we will skip those horizontal sub-stripes whose relative
bit is 0.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
2014-11-06 12:20:58 +03:00
/*
 * Submit a scrub rbio for processing.
 * NOTE(review): @rbio is presumably one obtained from
 * raid56_parity_alloc_scrub_rbio() - confirm against the callers.
 */
void raid56_parity_submit_scrub_rbio ( struct btrfs_raid_bio * rbio ) ;
btrfs: scrub: use recovered data stripes as cache to avoid unnecessary read
For P/Q stripe scrub, we have quite some duplicated read IO:
- Data stripes read for verification
This is triggered by the scrub_submit_initial_read() inside
scrub_raid56_parity_stripe().
- Data stripes read (again) for P/Q stripe verification
This is triggered by scrub_assemble_read_bios() from scrub_rbio().
Although we could hit the rbio cache and avoid the unnecessary read, the
chance is very low, as scrub would easily flush the whole rbio cache.
This means that even if we're just scrubbing a single P/Q stripe, we
would read the data stripes twice in the best case scenario. If we need
to recover some data stripes, it would cause more reads on the same data
stripes, again and again.
However before we call raid56_parity_submit_scrub_rbio() we already
have all data stripes repaired and their contents ready to use.
But the RAID56 cache is unaware of the scrub cache, thus the RAID56 layer
itself still needs to re-read the data stripes.
To avoid such cache miss, this patch would:
- Introduce a new helper, raid56_parity_cache_data_pages()
This function would grab the pages from an array, and copy the content
to the rbio, marking all the involved sectors uptodate.
The page copy is unavoidable because of the cache pages of rbio are all
self managed, thus can not utilize outside pages without screwing up
the lifespan.
- Use the repaired data stripes as cache inside
scrub_raid56_parity_stripe()
By this, we ensure all the data sectors of the scrub rbio are already
uptodate, and no need to read them again from disk.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-04-13 08:57:18 +03:00
/*
 * Take the pages from the @data_pages array and copy their content into
 * @rbio, marking all the involved sectors uptodate, so that already
 * repaired data stripes need not be read from disk again.
 *
 * The content is copied rather than the pages being used directly, because
 * the rbio manages the lifespan of its own cache pages.
 * NOTE(review): @data_logical is presumably the logical address the first
 * page corresponds to - confirm against the implementation.
 */
void raid56_parity_cache_data_pages ( struct btrfs_raid_bio * rbio ,
struct page * * data_pages , u64 data_logical ) ;
2013-01-30 03:40:14 +04:00
/*
 * Allocate / free the stripe hash table belonging to @info.
 * NOTE(review): presumably the hash table that rbios are put into while a
 * stripe is locked (see btrfs_raid_bio::hash_list); the int return of the
 * allocator is presumably 0 or a negative errno - confirm against the
 * implementation.
 */
int btrfs_alloc_stripe_hash_table ( struct btrfs_fs_info * info ) ;
void btrfs_free_stripe_hash_table ( struct btrfs_fs_info * info ) ;
2018-04-03 20:16:55 +03:00
2013-01-30 03:40:14 +04:00
# endif