2022-11-15 10:44:05 +01:00
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright ( C ) 2007 Oracle . All rights reserved .
* Copyright ( C ) 2022 Christoph Hellwig .
*/
# include <linux/bio.h>
# include "bio.h"
# include "ctree.h"
# include "volumes.h"
# include "raid56.h"
# include "async-thread.h"
# include "dev-replace.h"
# include "zoned.h"
2023-01-21 07:50:05 +01:00
# include "file-item.h"
2023-09-14 09:06:58 -07:00
# include "raid-stripe-tree.h"
2022-11-15 10:44:05 +01:00
static struct bio_set btrfs_bioset ;
2023-01-21 07:50:20 +01:00
static struct bio_set btrfs_clone_bioset ;
2023-01-21 07:50:07 +01:00
static struct bio_set btrfs_repair_bioset ;
static mempool_t btrfs_failed_bio_pool ;
struct btrfs_failed_bio {
struct btrfs_bio * bbio ;
int num_copies ;
atomic_t repair_count ;
} ;
2022-11-15 10:44:05 +01:00
2023-05-31 09:54:00 +02:00
/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio ( struct btrfs_bio * bbio )
{
return bbio - > inode & & is_data_inode ( & bbio - > inode - > vfs_inode ) ;
}
2023-05-31 09:54:02 +02:00
static bool bbio_has_ordered_extent ( struct btrfs_bio * bbio )
{
return is_data_bbio ( bbio ) & & btrfs_op ( & bbio - > bio ) = = BTRFS_MAP_WRITE ;
}
2022-11-15 10:44:05 +01:00
/*
* Initialize a btrfs_bio structure . This skips the embedded bio itself as it
* is already initialized by the block layer .
*/
2023-03-23 17:01:20 +08:00
void btrfs_bio_init ( struct btrfs_bio * bbio , struct btrfs_fs_info * fs_info ,
2023-01-21 07:50:21 +01:00
btrfs_bio_end_io_t end_io , void * private )
2022-11-15 10:44:05 +01:00
{
memset ( bbio , 0 , offsetof ( struct btrfs_bio , bio ) ) ;
2023-03-23 17:01:20 +08:00
bbio - > fs_info = fs_info ;
2022-11-15 10:44:05 +01:00
bbio - > end_io = end_io ;
bbio - > private = private ;
2023-01-21 07:50:20 +01:00
atomic_set ( & bbio - > pending_ios , 1 ) ;
2022-11-15 10:44:05 +01:00
}
/*
* Allocate a btrfs_bio structure . The btrfs_bio is the main I / O container for
* btrfs , and is used for all I / O submitted through btrfs_submit_bio .
*
* Just like the underlying bio_alloc_bioset it will not fail as it is backed by
* a mempool .
*/
2023-03-07 17:39:44 +01:00
struct btrfs_bio * btrfs_bio_alloc ( unsigned int nr_vecs , blk_opf_t opf ,
2023-03-23 17:01:20 +08:00
struct btrfs_fs_info * fs_info ,
2023-03-07 17:39:44 +01:00
btrfs_bio_end_io_t end_io , void * private )
2022-11-15 10:44:05 +01:00
{
2023-03-07 17:39:44 +01:00
struct btrfs_bio * bbio ;
2022-11-15 10:44:05 +01:00
struct bio * bio ;
bio = bio_alloc_bioset ( NULL , nr_vecs , opf , GFP_NOFS , & btrfs_bioset ) ;
2023-03-07 17:39:44 +01:00
bbio = btrfs_bio ( bio ) ;
2023-03-23 17:01:20 +08:00
btrfs_bio_init ( bbio , fs_info , end_io , private ) ;
2023-03-07 17:39:44 +01:00
return bbio ;
2022-11-15 10:44:05 +01:00
}
2023-03-07 17:39:45 +01:00
/*
 * Split @map_length bytes off @orig_bbio into a new btrfs_bio allocated from
 * btrfs_clone_bioset.
 *
 * The split inherits the inode and the current file offset of @orig_bbio,
 * whose own file offset is advanced by @map_length.  For data writes the
 * split also takes its own reference on the ordered extent.  The original is
 * stored in ->private of the split and gains one pending I/O.
 */
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *orig_bbio,
					 u64 map_length, bool use_append)
{
	struct bio *orig = &orig_bbio->bio;
	struct btrfs_bio *split_bbio;
	struct bio *split;

	if (!use_append) {
		split = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS,
				  &btrfs_clone_bioset);
	} else {
		unsigned int nr_segs;

		split = bio_split_rw(orig, &fs_info->limits, &nr_segs,
				     &btrfs_clone_bioset, map_length);
	}

	split_bbio = btrfs_bio(split);
	btrfs_bio_init(split_bbio, fs_info, NULL, orig_bbio);
	split_bbio->inode = orig_bbio->inode;
	split_bbio->file_offset = orig_bbio->file_offset;
	orig_bbio->file_offset += map_length;

	if (bbio_has_ordered_extent(split_bbio)) {
		split_bbio->ordered = orig_bbio->ordered;
		refcount_inc(&split_bbio->ordered->refs);
	}

	atomic_inc(&orig_bbio->pending_ios);

	return split_bbio;
}
2023-05-31 09:54:02 +02:00
/* Free a bio that was never submitted to the underlying device. */
static void btrfs_cleanup_bio ( struct btrfs_bio * bbio )
{
if ( bbio_has_ordered_extent ( bbio ) )
btrfs_put_ordered_extent ( bbio - > ordered ) ;
bio_put ( & bbio - > bio ) ;
}
static void __btrfs_bio_end_io ( struct btrfs_bio * bbio )
{
if ( bbio_has_ordered_extent ( bbio ) ) {
struct btrfs_ordered_extent * ordered = bbio - > ordered ;
bbio - > end_io ( bbio ) ;
btrfs_put_ordered_extent ( ordered ) ;
} else {
bbio - > end_io ( bbio ) ;
}
}
/* Record @status in the bio of @bbio and complete it towards its owner. */
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
	struct bio *bio = &bbio->bio;

	bio->bi_status = status;
	__btrfs_bio_end_io(bbio);
}
2023-01-21 07:50:20 +01:00
static void btrfs_orig_write_end_io ( struct bio * bio ) ;
static void btrfs_bbio_propagate_error ( struct btrfs_bio * bbio ,
struct btrfs_bio * orig_bbio )
{
/*
* For writes we tolerate nr_mirrors - 1 write failures , so we can ' t
* just blindly propagate a write failure here . Instead increment the
* error count in the original I / O context so that it is guaranteed to
* be larger than the error tolerance .
*/
if ( bbio - > bio . bi_end_io = = & btrfs_orig_write_end_io ) {
struct btrfs_io_stripe * orig_stripe = orig_bbio - > bio . bi_private ;
struct btrfs_io_context * orig_bioc = orig_stripe - > bioc ;
atomic_add ( orig_bioc - > max_errors , & orig_bioc - > error ) ;
} else {
orig_bbio - > bio . bi_status = bbio - > bio . bi_status ;
}
}
static void btrfs_orig_bbio_end_io ( struct btrfs_bio * bbio )
{
if ( bbio - > bio . bi_pool = = & btrfs_clone_bioset ) {
struct btrfs_bio * orig_bbio = bbio - > private ;
if ( bbio - > bio . bi_status )
btrfs_bbio_propagate_error ( bbio , orig_bbio ) ;
2023-05-31 09:54:02 +02:00
btrfs_cleanup_bio ( bbio ) ;
2023-01-21 07:50:20 +01:00
bbio = orig_bbio ;
}
if ( atomic_dec_and_test ( & bbio - > pending_ios ) )
2023-05-31 09:54:02 +02:00
__btrfs_bio_end_io ( bbio ) ;
2023-01-21 07:50:20 +01:00
}
2023-01-21 07:50:07 +01:00
static int next_repair_mirror ( struct btrfs_failed_bio * fbio , int cur_mirror )
{
if ( cur_mirror = = fbio - > num_copies )
return cur_mirror + 1 - fbio - > num_copies ;
return cur_mirror + 1 ;
}
static int prev_repair_mirror ( struct btrfs_failed_bio * fbio , int cur_mirror )
{
if ( cur_mirror = = 1 )
return fbio - > num_copies ;
return cur_mirror - 1 ;
}
static void btrfs_repair_done ( struct btrfs_failed_bio * fbio )
{
if ( atomic_dec_and_test ( & fbio - > repair_count ) ) {
2023-01-21 07:50:20 +01:00
btrfs_orig_bbio_end_io ( fbio - > bbio ) ;
2023-01-21 07:50:07 +01:00
mempool_free ( fbio , & btrfs_failed_bio_pool ) ;
}
}
static void btrfs_end_repair_bio ( struct btrfs_bio * repair_bbio ,
struct btrfs_device * dev )
{
struct btrfs_failed_bio * fbio = repair_bbio - > private ;
struct btrfs_inode * inode = repair_bbio - > inode ;
struct btrfs_fs_info * fs_info = inode - > root - > fs_info ;
struct bio_vec * bv = bio_first_bvec_all ( & repair_bbio - > bio ) ;
int mirror = repair_bbio - > mirror_num ;
btrfs: migrate btrfs_repair_io_failure() to folio interfaces
[BUG]
Test case btrfs/124 failed if larger metadata folio is enabled, the
dying message looks like this:
BTRFS error (device dm-2): bad tree block start, mirror 2 want 31686656 have 0
BTRFS info (device dm-2): read error corrected: ino 0 off 31686656 (dev /dev/mapper/test-scratch2 sector 20928)
BUG: kernel NULL pointer dereference, address: 0000000000000020
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
CPU: 6 PID: 350881 Comm: btrfs Tainted: G OE 6.7.0-rc3-custom+ #128
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 2/2/2022
RIP: 0010:btrfs_read_extent_buffer+0x106/0x180 [btrfs]
PKRU: 55555554
Call Trace:
<TASK>
read_tree_block+0x33/0xb0 [btrfs]
read_block_for_search+0x23e/0x340 [btrfs]
btrfs_search_slot+0x2f9/0xe60 [btrfs]
btrfs_lookup_csum+0x75/0x160 [btrfs]
btrfs_lookup_bio_sums+0x21a/0x560 [btrfs]
btrfs_submit_chunk+0x152/0x680 [btrfs]
btrfs_submit_bio+0x1c/0x50 [btrfs]
submit_one_bio+0x40/0x80 [btrfs]
submit_extent_page+0x158/0x390 [btrfs]
btrfs_do_readpage+0x330/0x740 [btrfs]
extent_readahead+0x38d/0x6c0 [btrfs]
read_pages+0x94/0x2c0
page_cache_ra_unbounded+0x12d/0x190
relocate_file_extent_cluster+0x7c1/0x9d0 [btrfs]
relocate_block_group+0x2d3/0x560 [btrfs]
btrfs_relocate_block_group+0x2c7/0x4b0 [btrfs]
btrfs_relocate_chunk+0x4c/0x1a0 [btrfs]
btrfs_balance+0x925/0x13c0 [btrfs]
btrfs_ioctl+0x19f1/0x25d0 [btrfs]
__x64_sys_ioctl+0x90/0xd0
do_syscall_64+0x3f/0xf0
entry_SYSCALL_64_after_hwframe+0x6e/0x76
[CAUSE]
The dying line is at btrfs_repair_io_failure() call inside
btrfs_repair_eb_io_failure().
The function is still relying on the extent buffer using page sized
folios.
When the extent buffer is using larger folio, we go into the 2nd slot of
folios[], and triggered the NULL pointer dereference.
[FIX]
Migrate btrfs_repair_io_failure() to folio interfaces.
So that when we hit a larger folio, we just submit the whole folio in
one go.
This also affects data repair path through btrfs_end_repair_bio(),
thankfully data is still fully page based, we can just add an
ASSERT(), and use page_folio() to convert the page to folio.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-12-12 15:54:10 +10:30
/*
* We can only trigger this for data bio , which doesn ' t support larger
* folios yet .
*/
ASSERT ( folio_order ( page_folio ( bv - > bv_page ) ) = = 0 ) ;
2023-01-21 07:50:07 +01:00
if ( repair_bbio - > bio . bi_status | |
! btrfs_data_csum_ok ( repair_bbio , dev , 0 , bv ) ) {
bio_reset ( & repair_bbio - > bio , NULL , REQ_OP_READ ) ;
2023-01-21 07:50:13 +01:00
repair_bbio - > bio . bi_iter = repair_bbio - > saved_iter ;
2023-01-21 07:50:07 +01:00
mirror = next_repair_mirror ( fbio , mirror ) ;
if ( mirror = = fbio - > bbio - > mirror_num ) {
btrfs_debug ( fs_info , " no mirror left " ) ;
fbio - > bbio - > bio . bi_status = BLK_STS_IOERR ;
goto done ;
}
2023-03-07 17:39:39 +01:00
btrfs_submit_bio ( repair_bbio , mirror ) ;
2023-01-21 07:50:07 +01:00
return ;
}
do {
mirror = prev_repair_mirror ( fbio , mirror ) ;
btrfs_repair_io_failure ( fs_info , btrfs_ino ( inode ) ,
repair_bbio - > file_offset , fs_info - > sectorsize ,
2023-01-21 07:50:13 +01:00
repair_bbio - > saved_iter . bi_sector < < SECTOR_SHIFT ,
btrfs: migrate btrfs_repair_io_failure() to folio interfaces
[BUG]
Test case btrfs/124 failed if larger metadata folio is enabled, the
dying message looks like this:
BTRFS error (device dm-2): bad tree block start, mirror 2 want 31686656 have 0
BTRFS info (device dm-2): read error corrected: ino 0 off 31686656 (dev /dev/mapper/test-scratch2 sector 20928)
BUG: kernel NULL pointer dereference, address: 0000000000000020
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
CPU: 6 PID: 350881 Comm: btrfs Tainted: G OE 6.7.0-rc3-custom+ #128
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 2/2/2022
RIP: 0010:btrfs_read_extent_buffer+0x106/0x180 [btrfs]
PKRU: 55555554
Call Trace:
<TASK>
read_tree_block+0x33/0xb0 [btrfs]
read_block_for_search+0x23e/0x340 [btrfs]
btrfs_search_slot+0x2f9/0xe60 [btrfs]
btrfs_lookup_csum+0x75/0x160 [btrfs]
btrfs_lookup_bio_sums+0x21a/0x560 [btrfs]
btrfs_submit_chunk+0x152/0x680 [btrfs]
btrfs_submit_bio+0x1c/0x50 [btrfs]
submit_one_bio+0x40/0x80 [btrfs]
submit_extent_page+0x158/0x390 [btrfs]
btrfs_do_readpage+0x330/0x740 [btrfs]
extent_readahead+0x38d/0x6c0 [btrfs]
read_pages+0x94/0x2c0
page_cache_ra_unbounded+0x12d/0x190
relocate_file_extent_cluster+0x7c1/0x9d0 [btrfs]
relocate_block_group+0x2d3/0x560 [btrfs]
btrfs_relocate_block_group+0x2c7/0x4b0 [btrfs]
btrfs_relocate_chunk+0x4c/0x1a0 [btrfs]
btrfs_balance+0x925/0x13c0 [btrfs]
btrfs_ioctl+0x19f1/0x25d0 [btrfs]
__x64_sys_ioctl+0x90/0xd0
do_syscall_64+0x3f/0xf0
entry_SYSCALL_64_after_hwframe+0x6e/0x76
[CAUSE]
The dying line is at btrfs_repair_io_failure() call inside
btrfs_repair_eb_io_failure().
The function is still relying on the extent buffer using page sized
folios.
When the extent buffer is using larger folio, we go into the 2nd slot of
folios[], and triggered the NULL pointer dereference.
[FIX]
Migrate btrfs_repair_io_failure() to folio interfaces.
So that when we hit a larger folio, we just submit the whole folio in
one go.
This also affects data repair path through btrfs_end_repair_bio(),
thankfully data is still fully page based, we can just add an
ASSERT(), and use page_folio() to convert the page to folio.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-12-12 15:54:10 +10:30
page_folio ( bv - > bv_page ) , bv - > bv_offset , mirror ) ;
2023-01-21 07:50:07 +01:00
} while ( mirror ! = fbio - > bbio - > mirror_num ) ;
done :
btrfs_repair_done ( fbio ) ;
bio_put ( & repair_bbio - > bio ) ;
}
/*
 * Start the repair of one bad sector by reading it from the next mirror.
 *
 * The main goal is to obtain good data for the read request.  When a repair
 * read succeeds its completion handler also writes the good data back to the
 * bad mirror(s) to restore redundancy.
 *
 * @fbio is the repair tracking structure of @failed_bbio, or NULL if none
 * has been set up yet; in that case one is allocated here, unless there is
 * only a single copy and repair is impossible.  The structure in use (which
 * may still be NULL) is returned.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
						  u32 bio_offset,
						  struct bio_vec *bv,
						  struct btrfs_failed_bio *fbio)
{
	struct btrfs_inode *inode = failed_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
	struct btrfs_bio *read_bbio;
	struct bio *read_bio;
	int copies;
	int target_mirror;

	btrfs_debug(fs_info, " repair read error: read error at %llu ",
		    failed_bbio->file_offset + bio_offset);

	copies = btrfs_num_copies(fs_info, logical, sectorsize);
	if (copies == 1) {
		btrfs_debug(fs_info, " no copy to repair from ");
		failed_bbio->bio.bi_status = BLK_STS_IOERR;
		return fbio;
	}

	if (!fbio) {
		fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
		fbio->bbio = failed_bbio;
		fbio->num_copies = copies;
		atomic_set(&fbio->repair_count, 1);
	}

	/* One count per repair bio, dropped when that bio is done. */
	atomic_inc(&fbio->repair_count);

	read_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
				    &btrfs_repair_bioset);
	read_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
	__bio_add_page(read_bio, bv->bv_page, bv->bv_len, bv->bv_offset);

	read_bbio = btrfs_bio(read_bio);
	btrfs_bio_init(read_bbio, fs_info, NULL, fbio);
	read_bbio->inode = failed_bbio->inode;
	read_bbio->file_offset = failed_bbio->file_offset + bio_offset;

	target_mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
	btrfs_debug(fs_info, " submitting repair read to mirror %d ",
		    target_mirror);
	btrfs_submit_bio(read_bbio, target_mirror);

	return fbio;
}
static void btrfs_check_read_bio ( struct btrfs_bio * bbio , struct btrfs_device * dev )
{
struct btrfs_inode * inode = bbio - > inode ;
struct btrfs_fs_info * fs_info = inode - > root - > fs_info ;
u32 sectorsize = fs_info - > sectorsize ;
2023-01-21 07:50:13 +01:00
struct bvec_iter * iter = & bbio - > saved_iter ;
2023-01-21 07:50:07 +01:00
blk_status_t status = bbio - > bio . bi_status ;
struct btrfs_failed_bio * fbio = NULL ;
u32 offset = 0 ;
2023-03-23 17:01:20 +08:00
/* Read-repair requires the inode field to be set by the submitter. */
ASSERT ( inode ) ;
2023-01-21 07:50:07 +01:00
/*
* Hand off repair bios to the repair code as there is no upper level
* submitter for them .
*/
if ( bbio - > bio . bi_pool = = & btrfs_repair_bioset ) {
btrfs_end_repair_bio ( bbio , dev ) ;
return ;
}
/* Clear the I/O error. A failed repair will reset it. */
bbio - > bio . bi_status = BLK_STS_OK ;
while ( iter - > bi_size ) {
struct bio_vec bv = bio_iter_iovec ( & bbio - > bio , * iter ) ;
bv . bv_len = min ( bv . bv_len , sectorsize ) ;
if ( status | | ! btrfs_data_csum_ok ( bbio , dev , offset , & bv ) )
fbio = repair_one_sector ( bbio , offset , & bv , fbio ) ;
bio_advance_iter_single ( & bbio - > bio , iter , sectorsize ) ;
offset + = sectorsize ;
}
2023-01-21 07:50:08 +01:00
if ( bbio - > csum ! = bbio - > csum_inline )
kfree ( bbio - > csum ) ;
2023-01-21 07:50:07 +01:00
if ( fbio )
btrfs_repair_done ( fbio ) ;
else
2023-01-21 07:50:20 +01:00
btrfs_orig_bbio_end_io ( bbio ) ;
2023-01-21 07:50:07 +01:00
}
2022-11-15 10:44:05 +01:00
static void btrfs_log_dev_io_error ( struct bio * bio , struct btrfs_device * dev )
{
if ( ! dev | | ! dev - > bdev )
return ;
if ( bio - > bi_status ! = BLK_STS_IOERR & & bio - > bi_status ! = BLK_STS_TARGET )
return ;
if ( btrfs_op ( bio ) = = BTRFS_MAP_WRITE )
btrfs_dev_stat_inc_and_print ( dev , BTRFS_DEV_STAT_WRITE_ERRS ) ;
2023-02-13 14:10:38 +09:00
else if ( ! ( bio - > bi_opf & REQ_RAHEAD ) )
2022-11-15 10:44:05 +01:00
btrfs_dev_stat_inc_and_print ( dev , BTRFS_DEV_STAT_READ_ERRS ) ;
if ( bio - > bi_opf & REQ_PREFLUSH )
btrfs_dev_stat_inc_and_print ( dev , BTRFS_DEV_STAT_FLUSH_ERRS ) ;
}
static struct workqueue_struct * btrfs_end_io_wq ( struct btrfs_fs_info * fs_info ,
struct bio * bio )
{
if ( bio - > bi_opf & REQ_META )
return fs_info - > endio_meta_workers ;
return fs_info - > endio_workers ;
}
static void btrfs_end_bio_work ( struct work_struct * work )
{
struct btrfs_bio * bbio = container_of ( work , struct btrfs_bio , end_io_work ) ;
2023-01-21 07:50:07 +01:00
/* Metadata reads are checked and repaired by the submitter. */
2023-05-31 09:54:00 +02:00
if ( is_data_bbio ( bbio ) )
2023-01-21 07:50:11 +01:00
btrfs_check_read_bio ( bbio , bbio - > bio . bi_private ) ;
2023-03-23 17:01:20 +08:00
else
2023-05-15 11:18:21 +02:00
btrfs_orig_bbio_end_io ( bbio ) ;
2022-11-15 10:44:05 +01:00
}
static void btrfs_simple_end_io ( struct bio * bio )
{
struct btrfs_bio * bbio = btrfs_bio ( bio ) ;
2023-01-21 07:50:11 +01:00
struct btrfs_device * dev = bio - > bi_private ;
2023-03-23 17:01:20 +08:00
struct btrfs_fs_info * fs_info = bbio - > fs_info ;
2022-11-15 10:44:05 +01:00
btrfs_bio_counter_dec ( fs_info ) ;
if ( bio - > bi_status )
2023-01-21 07:50:11 +01:00
btrfs_log_dev_io_error ( bio , dev ) ;
2022-11-15 10:44:05 +01:00
if ( bio_op ( bio ) = = REQ_OP_READ ) {
INIT_WORK ( & bbio - > end_io_work , btrfs_end_bio_work ) ;
queue_work ( btrfs_end_io_wq ( fs_info , bio ) , & bbio - > end_io_work ) ;
} else {
2023-05-24 17:03:05 +02:00
if ( bio_op ( bio ) = = REQ_OP_ZONE_APPEND & & ! bio - > bi_status )
2023-01-21 07:50:18 +01:00
btrfs_record_physical_zoned ( bbio ) ;
2023-01-21 07:50:20 +01:00
btrfs_orig_bbio_end_io ( bbio ) ;
2022-11-15 10:44:05 +01:00
}
}
static void btrfs_raid56_end_io ( struct bio * bio )
{
struct btrfs_io_context * bioc = bio - > bi_private ;
struct btrfs_bio * bbio = btrfs_bio ( bio ) ;
btrfs_bio_counter_dec ( bioc - > fs_info ) ;
bbio - > mirror_num = bioc - > mirror_num ;
2023-05-31 09:54:00 +02:00
if ( bio_op ( bio ) = = REQ_OP_READ & & is_data_bbio ( bbio ) )
2023-01-21 07:50:07 +01:00
btrfs_check_read_bio ( bbio , NULL ) ;
else
2023-01-21 07:50:20 +01:00
btrfs_orig_bbio_end_io ( bbio ) ;
2022-11-15 10:44:05 +01:00
btrfs_put_bioc ( bioc ) ;
}
static void btrfs_orig_write_end_io ( struct bio * bio )
{
struct btrfs_io_stripe * stripe = bio - > bi_private ;
struct btrfs_io_context * bioc = stripe - > bioc ;
struct btrfs_bio * bbio = btrfs_bio ( bio ) ;
btrfs_bio_counter_dec ( bioc - > fs_info ) ;
if ( bio - > bi_status ) {
atomic_inc ( & bioc - > error ) ;
btrfs_log_dev_io_error ( bio , stripe - > dev ) ;
}
/*
* Only send an error to the higher layers if it is beyond the tolerance
* threshold .
*/
if ( atomic_read ( & bioc - > error ) > bioc - > max_errors )
bio - > bi_status = BLK_STS_IOERR ;
else
bio - > bi_status = BLK_STS_OK ;
2023-09-14 09:06:58 -07:00
if ( bio_op ( bio ) = = REQ_OP_ZONE_APPEND & & ! bio - > bi_status )
stripe - > physical = bio - > bi_iter . bi_sector < < SECTOR_SHIFT ;
2023-01-21 07:50:20 +01:00
btrfs_orig_bbio_end_io ( bbio ) ;
2022-11-15 10:44:05 +01:00
btrfs_put_bioc ( bioc ) ;
}
static void btrfs_clone_write_end_io ( struct bio * bio )
{
struct btrfs_io_stripe * stripe = bio - > bi_private ;
if ( bio - > bi_status ) {
atomic_inc ( & stripe - > bioc - > error ) ;
btrfs_log_dev_io_error ( bio , stripe - > dev ) ;
2023-09-14 09:06:58 -07:00
} else if ( bio_op ( bio ) = = REQ_OP_ZONE_APPEND ) {
stripe - > physical = bio - > bi_iter . bi_sector < < SECTOR_SHIFT ;
2022-11-15 10:44:05 +01:00
}
/* Pass on control to the original bio this one was cloned from */
bio_endio ( stripe - > bioc - > orig_bio ) ;
bio_put ( bio ) ;
}
static void btrfs_submit_dev_bio ( struct btrfs_device * dev , struct bio * bio )
{
if ( ! dev | | ! dev - > bdev | |
test_bit ( BTRFS_DEV_STATE_MISSING , & dev - > dev_state ) | |
( btrfs_op ( bio ) = = BTRFS_MAP_WRITE & &
! test_bit ( BTRFS_DEV_STATE_WRITEABLE , & dev - > dev_state ) ) ) {
bio_io_error ( bio ) ;
return ;
}
bio_set_dev ( bio , dev - > bdev ) ;
/*
* For zone append writing , bi_sector must point the beginning of the
* zone
*/
if ( bio_op ( bio ) = = REQ_OP_ZONE_APPEND ) {
u64 physical = bio - > bi_iter . bi_sector < < SECTOR_SHIFT ;
2023-01-21 07:50:30 +01:00
u64 zone_start = round_down ( physical , dev - > fs_info - > zone_size ) ;
2022-11-15 10:44:05 +01:00
2023-01-21 07:50:30 +01:00
ASSERT ( btrfs_dev_is_sequential ( dev , physical ) ) ;
bio - > bi_iter . bi_sector = zone_start > > SECTOR_SHIFT ;
2022-11-15 10:44:05 +01:00
}
btrfs_debug_in_rcu ( dev - > fs_info ,
" %s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u " ,
__func__ , bio_op ( bio ) , bio - > bi_opf , bio - > bi_iter . bi_sector ,
( unsigned long ) dev - > bdev - > bd_dev , btrfs_dev_name ( dev ) ,
dev - > devid , bio - > bi_iter . bi_size ) ;
2023-03-27 09:49:51 +09:00
if ( bio - > bi_opf & REQ_BTRFS_CGROUP_PUNT )
blkcg_punt_bio_submit ( bio ) ;
else
submit_bio ( bio ) ;
2022-11-15 10:44:05 +01:00
}
static void btrfs_submit_mirrored_bio ( struct btrfs_io_context * bioc , int dev_nr )
{
struct bio * orig_bio = bioc - > orig_bio , * bio ;
ASSERT ( bio_op ( orig_bio ) ! = REQ_OP_READ ) ;
/* Reuse the bio embedded into the btrfs_bio for the last mirror */
if ( dev_nr = = bioc - > num_stripes - 1 ) {
bio = orig_bio ;
bio - > bi_end_io = btrfs_orig_write_end_io ;
} else {
bio = bio_alloc_clone ( NULL , orig_bio , GFP_NOFS , & fs_bio_set ) ;
bio_inc_remaining ( orig_bio ) ;
bio - > bi_end_io = btrfs_clone_write_end_io ;
}
bio - > bi_private = & bioc - > stripes [ dev_nr ] ;
bio - > bi_iter . bi_sector = bioc - > stripes [ dev_nr ] . physical > > SECTOR_SHIFT ;
bioc - > stripes [ dev_nr ] . bioc = bioc ;
2023-09-14 09:06:58 -07:00
bioc - > size = bio - > bi_iter . bi_size ;
2022-11-15 10:44:05 +01:00
btrfs_submit_dev_bio ( bioc - > stripes [ dev_nr ] . dev , bio ) ;
}
2023-01-21 07:50:17 +01:00
static void __btrfs_submit_bio ( struct bio * bio , struct btrfs_io_context * bioc ,
struct btrfs_io_stripe * smap , int mirror_num )
{
if ( ! bioc ) {
/* Single mirror read/write fast path. */
btrfs_bio ( bio ) - > mirror_num = mirror_num ;
bio - > bi_iter . bi_sector = smap - > physical > > SECTOR_SHIFT ;
2023-06-09 07:27:04 +02:00
if ( bio_op ( bio ) ! = REQ_OP_READ )
btrfs_bio ( bio ) - > orig_physical = smap - > physical ;
2023-01-21 07:50:17 +01:00
bio - > bi_private = smap - > dev ;
bio - > bi_end_io = btrfs_simple_end_io ;
btrfs_submit_dev_bio ( smap - > dev , bio ) ;
} else if ( bioc - > map_type & BTRFS_BLOCK_GROUP_RAID56_MASK ) {
/* Parity RAID write or read recovery. */
bio - > bi_private = bioc ;
bio - > bi_end_io = btrfs_raid56_end_io ;
if ( bio_op ( bio ) = = REQ_OP_READ )
raid56_parity_recover ( bio , bioc , mirror_num ) ;
else
raid56_parity_write ( bio , bioc ) ;
} else {
/* Write to multiple mirrors. */
int total_devs = bioc - > num_stripes ;
bioc - > orig_bio = bio ;
for ( int dev_nr = 0 ; dev_nr < total_devs ; dev_nr + + )
btrfs_submit_mirrored_bio ( bioc , dev_nr ) ;
}
}
static blk_status_t btrfs_bio_csum ( struct btrfs_bio * bbio )
{
if ( bbio - > bio . bi_opf & REQ_META )
2023-01-21 07:50:19 +01:00
return btree_csum_one_bio ( bbio ) ;
2023-01-21 07:50:17 +01:00
return btrfs_csum_one_bio ( bbio ) ;
}
/*
 * Context of one asynchronous submission.  Async submission exists to move
 * the expensive checksum calculation onto the worker threads.
 */
struct async_submit_bio {
	/* The bio to checksum and submit. */
	struct btrfs_bio *bbio;
	/* I/O context the bio was mapped to, passed on to the submission. */
	struct btrfs_io_context *bioc;
	/* Private copy of the stripe mapping handed in by the submitter. */
	struct btrfs_io_stripe smap;
	/* Mirror number to submit to. */
	int mirror_num;
	/* Work item this submission is queued with. */
	struct btrfs_work work;
};
/*
* In order to insert checksums into the metadata in large chunks , we wait
* until bio submission time . All the pages in the bio are checksummed and
* sums are attached onto the ordered extent record .
*
* At IO completion time the csums attached on the ordered extent record are
* inserted into the btree .
*/
static void run_one_async_start ( struct btrfs_work * work )
{
struct async_submit_bio * async =
container_of ( work , struct async_submit_bio , work ) ;
blk_status_t ret ;
ret = btrfs_bio_csum ( async - > bbio ) ;
if ( ret )
async - > bbio - > bio . bi_status = ret ;
}
/*
* In order to insert checksums into the metadata in large chunks , we wait
* until bio submission time . All the pages in the bio are checksummed and
* sums are attached onto the ordered extent record .
*
* At IO completion time the csums attached on the ordered extent record are
* inserted into the tree .
2023-09-19 18:49:23 +02:00
*
* If called with @ do_free = = true , then it will free the work struct .
2023-01-21 07:50:17 +01:00
*/
2023-09-19 18:49:23 +02:00
static void run_one_async_done ( struct btrfs_work * work , bool do_free )
2023-01-21 07:50:17 +01:00
{
struct async_submit_bio * async =
container_of ( work , struct async_submit_bio , work ) ;
struct bio * bio = & async - > bbio - > bio ;
2023-09-19 18:49:23 +02:00
if ( do_free ) {
kfree ( container_of ( work , struct async_submit_bio , work ) ) ;
return ;
}
2023-01-21 07:50:17 +01:00
/* If an error occurred we just want to clean up the bio and move on. */
if ( bio - > bi_status ) {
2023-01-21 07:50:20 +01:00
btrfs_orig_bbio_end_io ( async - > bbio ) ;
2023-01-21 07:50:17 +01:00
return ;
}
/*
* All of the bios that pass through here are from async helpers .
2023-03-27 09:49:51 +09:00
* Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup ' s
* context . This changes nothing when cgroups aren ' t in use .
2023-01-21 07:50:17 +01:00
*/
2023-03-27 09:49:51 +09:00
bio - > bi_opf | = REQ_BTRFS_CGROUP_PUNT ;
2023-01-21 07:50:17 +01:00
__btrfs_submit_bio ( bio , async - > bioc , & async - > smap , async - > mirror_num ) ;
}
static bool should_async_write ( struct btrfs_bio * bbio )
{
2024-02-05 22:01:16 +09:00
bool auto_csum_mode = true ;
# ifdef CONFIG_BTRFS_DEBUG
struct btrfs_fs_devices * fs_devices = bbio - > fs_info - > fs_devices ;
enum btrfs_offload_csum_mode csum_mode = READ_ONCE ( fs_devices - > offload_csum_mode ) ;
if ( csum_mode = = BTRFS_OFFLOAD_CSUM_FORCE_OFF )
return false ;
auto_csum_mode = ( csum_mode = = BTRFS_OFFLOAD_CSUM_AUTO ) ;
# endif
2023-05-03 09:06:13 +02:00
/* Submit synchronously if the checksum implementation is fast. */
2024-02-05 22:01:16 +09:00
if ( auto_csum_mode & & test_bit ( BTRFS_FS_CSUM_IMPL_FAST , & bbio - > fs_info - > flags ) )
2023-05-03 09:06:13 +02:00
return false ;
2023-01-21 07:50:17 +01:00
/*
2023-05-03 09:06:14 +02:00
* Try to defer the submission to a workqueue to parallelize the
* checksum calculation unless the I / O is issued synchronously .
2023-01-21 07:50:17 +01:00
*/
2023-05-03 09:06:14 +02:00
if ( op_is_sync ( bbio - > bio . bi_opf ) )
2023-01-21 07:50:17 +01:00
return false ;
2023-05-03 09:06:13 +02:00
/* Zoned devices require I/O to be submitted in order. */
if ( ( bbio - > bio . bi_opf & REQ_META ) & & btrfs_is_zoned ( bbio - > fs_info ) )
return false ;
2023-01-21 07:50:17 +01:00
return true ;
}
/*
* Submit bio to an async queue .
*
2023-12-05 19:26:39 +01:00
* Return true if the work has been successfully submitted , else false .
2023-01-21 07:50:17 +01:00
*/
static bool btrfs_wq_submit_bio ( struct btrfs_bio * bbio ,
struct btrfs_io_context * bioc ,
struct btrfs_io_stripe * smap , int mirror_num )
{
2023-03-23 17:01:20 +08:00
struct btrfs_fs_info * fs_info = bbio - > fs_info ;
2023-01-21 07:50:17 +01:00
struct async_submit_bio * async ;
async = kmalloc ( sizeof ( * async ) , GFP_NOFS ) ;
if ( ! async )
return false ;
async - > bbio = bbio ;
async - > bioc = bioc ;
async - > smap = * smap ;
async - > mirror_num = mirror_num ;
2023-09-19 18:49:23 +02:00
btrfs_init_work ( & async - > work , run_one_async_start , run_one_async_done ) ;
2023-05-03 09:06:15 +02:00
btrfs_queue_work ( fs_info - > workers , & async - > work ) ;
2023-01-21 07:50:17 +01:00
return true ;
}
2023-03-07 17:39:39 +01:00
static bool btrfs_submit_chunk ( struct btrfs_bio * bbio , int mirror_num )
2022-11-15 10:44:05 +01:00
{
2023-01-21 07:50:30 +01:00
struct btrfs_inode * inode = bbio - > inode ;
2023-03-23 17:01:20 +08:00
struct btrfs_fs_info * fs_info = bbio - > fs_info ;
2023-01-21 07:50:20 +01:00
struct btrfs_bio * orig_bbio = bbio ;
2023-03-07 17:39:39 +01:00
struct bio * bio = & bbio - > bio ;
2023-04-15 19:32:38 +08:00
u64 logical = bio - > bi_iter . bi_sector < < SECTOR_SHIFT ;
2022-11-15 10:44:05 +01:00
u64 length = bio - > bi_iter . bi_size ;
u64 map_length = length ;
2022-12-12 08:37:21 +01:00
bool use_append = btrfs_use_zone_append ( bbio ) ;
2022-11-15 10:44:05 +01:00
struct btrfs_io_context * bioc = NULL ;
struct btrfs_io_stripe smap ;
2023-01-21 07:50:03 +01:00
blk_status_t ret ;
int error ;
2022-11-15 10:44:05 +01:00
2023-09-14 09:07:01 -07:00
smap . is_scrub = ! bbio - > inode ;
2022-11-15 10:44:05 +01:00
btrfs_bio_counter_inc_blocked ( fs_info ) ;
2023-05-31 06:17:37 +02:00
error = btrfs_map_block ( fs_info , btrfs_op ( bio ) , logical , & map_length ,
2023-09-17 19:36:21 +09:30
& bioc , & smap , & mirror_num ) ;
2023-01-21 07:50:03 +01:00
if ( error ) {
ret = errno_to_blk_status ( error ) ;
goto fail ;
2022-11-15 10:44:05 +01:00
}
2023-01-21 07:50:20 +01:00
map_length = min ( map_length , length ) ;
2023-01-21 07:50:30 +01:00
if ( use_append )
map_length = min ( map_length , fs_info - > max_zone_append_size ) ;
2022-11-15 10:44:05 +01:00
if ( map_length < length ) {
2023-03-07 17:39:45 +01:00
bbio = btrfs_split_bio ( fs_info , bbio , map_length , use_append ) ;
bio = & bbio - > bio ;
2022-11-15 10:44:05 +01:00
}
2023-01-21 07:50:05 +01:00
/*
* Save the iter for the end_io handler and preload the checksums for
* data reads .
*/
2023-05-31 09:54:00 +02:00
if ( bio_op ( bio ) = = REQ_OP_READ & & is_data_bbio ( bbio ) ) {
2023-01-21 07:50:13 +01:00
bbio - > saved_iter = bio - > bi_iter ;
2023-01-21 07:50:05 +01:00
ret = btrfs_lookup_bio_sums ( bbio ) ;
if ( ret )
2023-01-21 07:50:20 +01:00
goto fail_put_bio ;
2023-01-21 07:50:05 +01:00
}
2023-01-21 07:50:04 +01:00
2023-01-21 07:50:17 +01:00
if ( btrfs_op ( bio ) = = BTRFS_MAP_WRITE ) {
2023-01-21 07:50:30 +01:00
if ( use_append ) {
bio - > bi_opf & = ~ REQ_OP_WRITE ;
bio - > bi_opf | = REQ_OP_ZONE_APPEND ;
2023-01-21 07:50:18 +01:00
}
2023-09-14 09:06:58 -07:00
if ( is_data_bbio ( bbio ) & & bioc & &
btrfs_need_stripe_tree_update ( bioc - > fs_info , bioc - > map_type ) ) {
/*
* No locking for the list update , as we only add to
* the list in the I / O submission path , and list
* iteration only happens in the completion path , which
* can ' t happen until after the last submission .
*/
btrfs_get_bioc ( bioc ) ;
list_add_tail ( & bioc - > rst_ordered_entry , & bbio - > ordered - > bioc_list ) ;
}
2023-01-21 07:50:17 +01:00
/*
* Csum items for reloc roots have already been cloned at this
* point , so they are handled as part of the no - checksum case .
*/
2023-03-23 17:01:20 +08:00
if ( inode & & ! ( inode - > flags & BTRFS_INODE_NODATASUM ) & &
2023-01-21 07:50:17 +01:00
! test_bit ( BTRFS_FS_STATE_NO_CSUMS , & fs_info - > fs_state ) & &
2023-01-21 07:50:30 +01:00
! btrfs_is_data_reloc_root ( inode - > root ) ) {
2023-01-21 07:50:17 +01:00
if ( should_async_write ( bbio ) & &
btrfs_wq_submit_bio ( bbio , bioc , & smap , mirror_num ) )
2023-01-21 07:50:20 +01:00
goto done ;
2023-01-21 07:50:17 +01:00
ret = btrfs_bio_csum ( bbio ) ;
if ( ret )
2023-01-21 07:50:20 +01:00
goto fail_put_bio ;
2023-05-24 17:03:08 +02:00
} else if ( use_append ) {
ret = btrfs_alloc_dummy_sum ( bbio ) ;
if ( ret )
goto fail_put_bio ;
2023-01-21 07:50:17 +01:00
}
2022-11-15 10:44:05 +01:00
}
2023-01-21 07:50:17 +01:00
__btrfs_submit_bio ( bio , bioc , & smap , mirror_num ) ;
2023-01-21 07:50:20 +01:00
done :
return map_length = = length ;
2023-01-21 07:50:03 +01:00
2023-01-21 07:50:20 +01:00
fail_put_bio :
if ( map_length < length )
2023-05-31 09:54:02 +02:00
btrfs_cleanup_bio ( bbio ) ;
2023-01-21 07:50:03 +01:00
fail :
btrfs_bio_counter_dec ( fs_info ) ;
2023-01-21 07:50:20 +01:00
btrfs_bio_end_io ( orig_bbio , ret ) ;
/* Do not submit another chunk */
return true ;
}
2023-03-07 17:39:39 +01:00
void btrfs_submit_bio ( struct btrfs_bio * bbio , int mirror_num )
2023-01-21 07:50:20 +01:00
{
2023-03-23 17:01:20 +08:00
/* If bbio->inode is not populated, its file_offset must be 0. */
ASSERT ( bbio - > inode | | bbio - > file_offset = = 0 ) ;
2023-03-07 17:39:39 +01:00
while ( ! btrfs_submit_chunk ( bbio , mirror_num ) )
2023-01-21 07:50:20 +01:00
;
2022-11-15 10:44:05 +01:00
}
2022-11-15 10:44:06 +01:00
/*
 * Write a repaired block back to a single mirror.
 *
 * btrfs_submit_bio is deliberately not used: it writes all copies in a RAID
 * setup, while a repair only has to rewrite the one bad copy.  The block is
 * therefore mapped here and the bio is handed to the device directly.
 *
 * The write is issued synchronously, which blocks the repair read completion
 * from freeing the bio.
 *
 * @ino and @start are only used in the "read error corrected" message.  The
 * payload is @length bytes of @folio starting at @folio_offset, written to
 * copy @mirror_num of @logical.
 *
 * Return 0 if btrfs_repair_one_zone() returned non-zero or the write
 * succeeded, the negative value from btrfs_map_repair_block() if mapping
 * failed, -EIO if the mapped device has no bdev or is not writeable, or the
 * error returned by submit_bio_wait().
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
			    u64 length, u64 logical, struct folio *folio,
			    unsigned int folio_offset, int mirror_num)
{
	struct btrfs_io_stripe stripe = { 0 };
	struct bio_vec vec;
	struct bio wbio;
	int err = 0;

	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
	BUG_ON(mirror_num == 0);

	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * Hold the bio counter so we do not race with device replace and the
	 * devices behind the mapped stripe cannot go away while the read
	 * repair is in progress.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);

	err = btrfs_map_repair_block(fs_info, &stripe, logical, length, mirror_num);
	if (err < 0)
		goto out_dec_counter;

	/* Nothing can be written to a missing or non-writeable device. */
	if (!(stripe.dev->bdev &&
	      test_bit(BTRFS_DEV_STATE_WRITEABLE, &stripe.dev->dev_state))) {
		err = -EIO;
		goto out_dec_counter;
	}

	/* On-stack bio backed by a single bio_vec. */
	bio_init(&wbio, stripe.dev->bdev, &vec, 1, REQ_OP_WRITE | REQ_SYNC);
	wbio.bi_iter.bi_sector = stripe.physical >> SECTOR_SHIFT;
	err = bio_add_folio(&wbio, folio, length, folio_offset);
	ASSERT(err);

	err = submit_bio_wait(&wbio);
	if (err) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(stripe.dev, BTRFS_DEV_STAT_WRITE_ERRS);
	} else {
		btrfs_info_rl_in_rcu(fs_info,
			" read error corrected: ino %llu off %llu (dev %s sector %llu) ",
				     ino, start, btrfs_dev_name(stripe.dev),
				     stripe.physical >> SECTOR_SHIFT);
	}

	bio_uninit(&wbio);
out_dec_counter:
	btrfs_bio_counter_dec(fs_info);
	return err;
}
2023-03-20 10:12:49 +08:00
/*
* Submit a btrfs_bio based repair write .
*
* If @ dev_replace is true , the write would be submitted to dev - replace target .
*/
void btrfs_submit_repair_write ( struct btrfs_bio * bbio , int mirror_num , bool dev_replace )
{
struct btrfs_fs_info * fs_info = bbio - > fs_info ;
u64 logical = bbio - > bio . bi_iter . bi_sector < < SECTOR_SHIFT ;
u64 length = bbio - > bio . bi_iter . bi_size ;
struct btrfs_io_stripe smap = { 0 } ;
int ret ;
ASSERT ( fs_info ) ;
ASSERT ( mirror_num > 0 ) ;
ASSERT ( btrfs_op ( & bbio - > bio ) = = BTRFS_MAP_WRITE ) ;
ASSERT ( ! bbio - > inode ) ;
btrfs_bio_counter_inc_blocked ( fs_info ) ;
ret = btrfs_map_repair_block ( fs_info , & smap , logical , length , mirror_num ) ;
if ( ret < 0 )
goto fail ;
if ( dev_replace ) {
ASSERT ( smap . dev = = fs_info - > dev_replace . srcdev ) ;
smap . dev = fs_info - > dev_replace . tgtdev ;
}
__btrfs_submit_bio ( & bbio - > bio , NULL , & smap , mirror_num ) ;
return ;
fail :
btrfs_bio_counter_dec ( fs_info ) ;
btrfs_bio_end_io ( bbio , errno_to_blk_status ( ret ) ) ;
}
2022-11-15 10:44:05 +01:00
/*
 * Set up the three bio sets and the failed-bio mempool used by this file.
 *
 * Every bio set reserves offsetof(struct btrfs_bio, bio) bytes of front pad.
 * On any failure the pools that were already initialized are torn down again
 * in reverse order.
 *
 * Return 0 on success, -ENOMEM if any initialization step fails.
 */
int __init btrfs_bioset_init(void)
{
	const size_t front_pad = offsetof(struct btrfs_bio, bio);

	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, front_pad,
			BIOSET_NEED_BVECS))
		return -ENOMEM;

	if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, front_pad, 0))
		goto err_exit_bioset;

	if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, front_pad,
			BIOSET_NEED_BVECS))
		goto err_exit_clone_bioset;

	if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
				      sizeof(struct btrfs_failed_bio)))
		goto err_exit_repair_bioset;

	return 0;

err_exit_repair_bioset:
	bioset_exit(&btrfs_repair_bioset);
err_exit_clone_bioset:
	bioset_exit(&btrfs_clone_bioset);
err_exit_bioset:
	bioset_exit(&btrfs_bioset);
	return -ENOMEM;
}
/*
 * Tear down everything btrfs_bioset_init() set up: the failed-bio mempool
 * first, then the repair, clone and main bio sets, i.e. in the reverse order
 * of their initialization.
 */
void __cold btrfs_bioset_exit(void)
{
	mempool_exit(&btrfs_failed_bio_pool);

	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}