2018-04-03 20:16:55 +03:00
/* SPDX-License-Identifier: GPL-2.0 */
2007-06-12 17:07:21 +04:00
/*
* Copyright ( C ) 2007 Oracle . All rights reserved .
*/
2018-04-03 20:16:55 +03:00
# ifndef BTRFS_DISK_IO_H
# define BTRFS_DISK_IO_H
2007-02-02 17:18:22 +03:00
2024-01-27 02:53:06 +03:00
# include <linux/sizes.h>
2024-01-27 05:19:56 +03:00
# include <linux/compiler_types.h>
2024-01-27 02:53:06 +03:00
# include "ctree.h"
# include "fs.h"
2024-01-27 05:19:56 +03:00
struct block_device ;
struct super_block ;
struct extent_buffer ;
struct btrfs_device ;
struct btrfs_fs_devices ;
struct btrfs_fs_info ;
struct btrfs_super_block ;
struct btrfs_trans_handle ;
struct btrfs_tree_parent_check ;
struct btrfs_transaction ;
2008-12-09 00:46:26 +03:00
# define BTRFS_SUPER_MIRROR_MAX 3
# define BTRFS_SUPER_MIRROR_SHIFT 12
2017-06-16 02:48:05 +03:00
/*
 * Fixed blocksize for all devices, applies to specific ways of reading
 * metadata like superblock. Must meet the set_blocksize requirements.
 *
 * Do not change.
 */
# define BTRFS_BDEV_BLOCKSIZE (4096)
2008-12-09 00:46:26 +03:00
static inline u64 btrfs_sb_offset ( int mirror )
{
2015-12-14 19:42:10 +03:00
u64 start = SZ_16K ;
2008-12-09 00:46:26 +03:00
if ( mirror )
return start < < ( BTRFS_SUPER_MIRROR_SHIFT * mirror ) ;
return BTRFS_SUPER_INFO_OFFSET ;
}
2020-01-24 17:33:00 +03:00
void btrfs_check_leaked_roots ( struct btrfs_fs_info * fs_info ) ;
2020-01-24 17:32:59 +03:00
void btrfs_init_fs_info ( struct btrfs_fs_info * fs_info ) ;
2018-03-29 04:08:11 +03:00
struct extent_buffer * read_tree_block ( struct btrfs_fs_info * fs_info , u64 bytenr ,
2022-09-14 08:32:50 +03:00
struct btrfs_tree_parent_check * check ) ;
2016-06-23 01:54:24 +03:00
struct extent_buffer * btrfs_find_create_tree_block (
struct btrfs_fs_info * fs_info ,
2020-11-05 18:45:20 +03:00
u64 bytenr , u64 owner_root ,
int level ) ;
btrfs: lift read-write mount setup from mount and remount
Mounting rw and remounting from ro to rw naturally share invariants and
functionality which result in a correctly setup rw filesystem. Luckily,
there is even a strong unity in the code which implements them. In
mount's open_ctree, these operations mostly happen after an early return
for ro file systems, and in remount, they happen in a section devoted to
remounting ro->rw, after some remount specific validation passes.
However, there are unfortunately a few differences. There are small
deviations in the order of some of the operations, remount does not
start orphan cleanup in root_tree or fs_tree, remount does not create
the free space tree, and remount does not handle "one-shot" mount
options like clear_cache and uuid tree rescan.
Since we want to add building the free space tree to remount, and also
to start the same orphan cleanup process on a filesystem mounted as ro
then remounted rw, we would benefit from unifying the logic between the
two code paths.
This patch only lifts the existing common functionality, and leaves a
natural path for fixing the discrepancies.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Boris Burkov <boris@bur.io>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-19 02:06:16 +03:00
/*
 * Setup steps shared by mounting read-write and by remounting from
 * read-only to read-write, lifted out of both paths so they run the same
 * logic.
 *
 * NOTE(review): return value is presumably 0 on success and a negative
 * errno on failure - confirm against the definition.
 */
int btrfs_start_pre_rw_mount ( struct btrfs_fs_info * fs_info ) ;
2022-10-18 04:56:38 +03:00
int btrfs_check_super_csum ( struct btrfs_fs_info * fs_info ,
const struct btrfs_super_block * disk_sb ) ;
2019-10-01 20:57:35 +03:00
int __cold open_ctree ( struct super_block * sb ,
2011-11-17 10:10:02 +04:00
struct btrfs_fs_devices * fs_devices ,
char * options ) ;
2019-10-01 20:57:35 +03:00
void __cold close_ctree ( struct btrfs_fs_info * fs_info ) ;
btrfs: check superblock to ensure the fs was not modified at thaw time
[BACKGROUND]
There is an incident report that one user hibernated the system with
one btrfs on a removable device still mounted.
Then by some incident, the btrfs got mounted and modified by another
system/OS, then back to the hibernated system.
After resuming from the hibernation, new write happened into the victim btrfs.
Now the fs is completely broken, since the underlying btrfs is no longer
the same one before the hibernation, and the user lost their data due to
various transid mismatch.
[REPRODUCER]
We can emulate the situation using the following small script:
truncate -s 1G $dev
mkfs.btrfs -f $dev
mount $dev $mnt
fsstress -w -d $mnt -n 500
sync
xfs_freeze -f $mnt
cp $dev $dev.backup
# There is no way to mount the same cloned fs on the same system,
# as the conflicting fsid will be rejected by btrfs.
# Thus here we have to wipe the fs using a different btrfs.
mkfs.btrfs -f $dev.backup
dd if=$dev.backup of=$dev bs=1M
xfs_freeze -u $mnt
fsstress -w -d $mnt -n 20
umount $mnt
btrfs check $dev
The final fsck will fail due to some tree blocks having an incorrect fsid.
This is enough to emulate the problem hit by the unfortunate user.
[ENHANCEMENT]
Although such case should not be that common, it can still happen from
time to time.
From the view of btrfs, we can detect any unexpected super block change,
and if there is any unexpected change, we just mark the fs read-only,
and thaw the fs.
By this we can limit the damage to minimal, and I hope no one would lose
their data by this anymore.
Suggested-by: Goffredo Baroncelli <kreijack@libero.it>
Link: https://lore.kernel.org/linux-btrfs/83bf3b4b-7f4c-387a-b286-9251e3991e34@bluemole.com/
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-08-24 15:16:22 +03:00
int btrfs_validate_super ( struct btrfs_fs_info * fs_info ,
struct btrfs_super_block * sb , int mirror_num ) ;
btrfs: fix compat_ro checks against remount
[BUG]
Even with commit 81d5d61454c3 ("btrfs: enhance unsupported compat RO
flags handling"), btrfs can still mount a fs with unsupported compat_ro
flags read-only, then remount it RW:
# btrfs ins dump-super /dev/loop0 | grep compat_ro_flags -A 3
compat_ro_flags 0x403
( FREE_SPACE_TREE |
FREE_SPACE_TREE_VALID |
unknown flag: 0x400 )
# mount /dev/loop0 /mnt/btrfs
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/loop0, missing codepage or helper program, or other error.
dmesg(1) may have more information after failed mount system call.
^^^ RW mount failed as expected ^^^
# dmesg -t | tail -n5
loop0: detected capacity change from 0 to 1048576
BTRFS: device fsid cb5b82f5-0fdd-4d81-9b4b-78533c324afa devid 1 transid 7 /dev/loop0 scanned by mount (1146)
BTRFS info (device loop0): using crc32c (crc32c-intel) checksum algorithm
BTRFS info (device loop0): using free space tree
BTRFS error (device loop0): cannot mount read-write because of unknown compat_ro features (0x403)
BTRFS error (device loop0): open_ctree failed
# mount /dev/loop0 -o ro /mnt/btrfs
# mount -o remount,rw /mnt/btrfs
^^^ RW remount succeeded unexpectedly ^^^
[CAUSE]
Currently we use btrfs_check_features() to check compat_ro flags against
our current mount flags.
That function gets reused between open_ctree() and btrfs_remount().
But for btrfs_remount(), the super block we passed in still has the old
mount flags, thus btrfs_check_features() still believes we're mounting
read-only.
[FIX]
Replace the existing @sb argument with @is_rw_mount.
As originally we only use @sb to determine if the mount is RW.
Now it's callers' responsibility to determine if the mount is RW, and
since there are only two callers, the check is pretty simple:
- caller in open_ctree()
Just pass !sb_rdonly().
- caller in btrfs_remount()
Pass !(*flags & SB_RDONLY), as our check should be against the new
flags.
Now we can correctly reject the RW remount:
# mount /dev/loop0 -o ro /mnt/btrfs
# mount -o remount,rw /mnt/btrfs
mount: /mnt/btrfs: mount point not mounted or bad option.
dmesg(1) may have more information after failed mount system call.
# dmesg -t | tail -n 1
BTRFS error (device loop0: state M): cannot mount read-write because of unknown compat_ro features (0x403)
Reported-by: Chung-Chiang Cheng <shepjeng@gmail.com>
Fixes: 81d5d61454c3 ("btrfs: enhance unsupported compat RO flags handling")
CC: stable@vger.kernel.org # 5.15+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-12-22 02:59:17 +03:00
/*
 * Check the filesystem's feature flags (including compat_ro) against the
 * kind of mount being requested.
 *
 * @is_rw_mount: true when the mount being validated is read-write.  The
 *               caller decides this: open_ctree() passes !sb_rdonly(), and
 *               btrfs_remount() passes !(*flags & SB_RDONLY) so that the
 *               check is made against the new flags rather than the old
 *               mount state.
 *
 * NOTE(review): presumably returns 0 when the features are acceptable and
 * a negative errno otherwise - confirm against the definition.
 */
int btrfs_check_features ( struct btrfs_fs_info * fs_info , bool is_rw_mount ) ;
2017-02-10 21:04:32 +03:00
int write_all_supers ( struct btrfs_fs_info * fs_info , int max_mirrors ) ;
2020-02-13 18:24:32 +03:00
struct btrfs_super_block * btrfs_read_dev_super ( struct block_device * bdev ) ;
struct btrfs_super_block * btrfs_read_dev_one_super ( struct block_device * bdev ,
btrfs: check superblock to ensure the fs was not modified at thaw time
[BACKGROUND]
There is an incident report that one user hibernated the system with
one btrfs on a removable device still mounted.
Then by some incident, the btrfs got mounted and modified by another
system/OS, then back to the hibernated system.
After resuming from the hibernation, new write happened into the victim btrfs.
Now the fs is completely broken, since the underlying btrfs is no longer
the same one before the hibernation, and the user lost their data due to
various transid mismatch.
[REPRODUCER]
We can emulate the situation using the following small script:
truncate -s 1G $dev
mkfs.btrfs -f $dev
mount $dev $mnt
fsstress -w -d $mnt -n 500
sync
xfs_freeze -f $mnt
cp $dev $dev.backup
# There is no way to mount the same cloned fs on the same system,
# as the conflicting fsid will be rejected by btrfs.
# Thus here we have to wipe the fs using a different btrfs.
mkfs.btrfs -f $dev.backup
dd if=$dev.backup of=$dev bs=1M
xfs_freeze -u $mnt
fsstress -w -d $mnt -n 20
umount $mnt
btrfs check $dev
The final fsck will fail due to some tree blocks having an incorrect fsid.
This is enough to emulate the problem hit by the unfortunate user.
[ENHANCEMENT]
Although such case should not be that common, it can still happen from
time to time.
From the view of btrfs, we can detect any unexpected super block change,
and if there is any unexpected change, we just mark the fs read-only,
and thaw the fs.
By this we can limit the damage to minimal, and I hope no one would lose
their data by this anymore.
Suggested-by: Goffredo Baroncelli <kreijack@libero.it>
Link: https://lore.kernel.org/linux-btrfs/83bf3b4b-7f4c-387a-b286-9251e3991e34@bluemole.com/
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-08-24 15:16:22 +03:00
int copy_num , bool drop_cache ) ;
2016-06-22 04:16:51 +03:00
int btrfs_commit_super ( struct btrfs_fs_info * fs_info ) ;
2020-01-24 17:32:21 +03:00
struct btrfs_root * btrfs_read_tree_root ( struct btrfs_root * tree_root ,
struct btrfs_key * key ) ;
2013-05-15 11:48:19 +04:00
int btrfs_insert_fs_root ( struct btrfs_fs_info * fs_info ,
struct btrfs_root * root ) ;
2014-05-08 01:06:09 +04:00
void btrfs_free_fs_roots ( struct btrfs_fs_info * fs_info ) ;
2013-09-25 17:47:44 +04:00
struct btrfs_root * btrfs_get_fs_root ( struct btrfs_fs_info * fs_info ,
2020-05-15 20:35:55 +03:00
u64 objectid , bool check_ref ) ;
2020-06-16 05:17:36 +03:00
struct btrfs_root * btrfs_get_new_fs_root ( struct btrfs_fs_info * fs_info ,
btrfs: fix double free of anonymous device after snapshot creation failure
When creating a snapshot we may do a double free of an anonymous device
in case there's an error committing the transaction. The second free may
result in freeing an anonymous device number that was allocated by some
other subsystem in the kernel or another btrfs filesystem.
The steps that lead to this:
1) At ioctl.c:create_snapshot() we allocate an anonymous device number
and assign it to pending_snapshot->anon_dev;
2) Then we call btrfs_commit_transaction() and end up at
transaction.c:create_pending_snapshot();
3) There we call btrfs_get_new_fs_root() and pass it the anonymous device
number stored in pending_snapshot->anon_dev;
4) btrfs_get_new_fs_root() frees that anonymous device number because
btrfs_lookup_fs_root() returned a root - someone else did a lookup
of the new root already, which could be some task doing backref walking;
5) After that some error happens in the transaction commit path, and at
ioctl.c:create_snapshot() we jump to the 'fail' label, and after
that we free again the same anonymous device number, which in the
meanwhile may have been reallocated somewhere else, because
pending_snapshot->anon_dev still has the same value as in step 1.
Recently syzbot ran into this and reported the following trace:
------------[ cut here ]------------
ida_free called for id=51 which is not allocated.
WARNING: CPU: 1 PID: 31038 at lib/idr.c:525 ida_free+0x370/0x420 lib/idr.c:525
Modules linked in:
CPU: 1 PID: 31038 Comm: syz-executor.2 Not tainted 6.8.0-rc4-syzkaller-00410-gc02197fc9076 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024
RIP: 0010:ida_free+0x370/0x420 lib/idr.c:525
Code: 10 42 80 3c 28 (...)
RSP: 0018:ffffc90015a67300 EFLAGS: 00010246
RAX: be5130472f5dd000 RBX: 0000000000000033 RCX: 0000000000040000
RDX: ffffc90009a7a000 RSI: 000000000003ffff RDI: 0000000000040000
RBP: ffffc90015a673f0 R08: ffffffff81577992 R09: 1ffff92002b4cdb4
R10: dffffc0000000000 R11: fffff52002b4cdb5 R12: 0000000000000246
R13: dffffc0000000000 R14: ffffffff8e256b80 R15: 0000000000000246
FS: 00007fca3f4b46c0(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f167a17b978 CR3: 000000001ed26000 CR4: 0000000000350ef0
Call Trace:
<TASK>
btrfs_get_root_ref+0xa48/0xaf0 fs/btrfs/disk-io.c:1346
create_pending_snapshot+0xff2/0x2bc0 fs/btrfs/transaction.c:1837
create_pending_snapshots+0x195/0x1d0 fs/btrfs/transaction.c:1931
btrfs_commit_transaction+0xf1c/0x3740 fs/btrfs/transaction.c:2404
create_snapshot+0x507/0x880 fs/btrfs/ioctl.c:848
btrfs_mksubvol+0x5d0/0x750 fs/btrfs/ioctl.c:998
btrfs_mksnapshot+0xb5/0xf0 fs/btrfs/ioctl.c:1044
__btrfs_ioctl_snap_create+0x387/0x4b0 fs/btrfs/ioctl.c:1306
btrfs_ioctl_snap_create_v2+0x1ca/0x400 fs/btrfs/ioctl.c:1393
btrfs_ioctl+0xa74/0xd40
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:871 [inline]
__se_sys_ioctl+0xfe/0x170 fs/ioctl.c:857
do_syscall_64+0xfb/0x240
entry_SYSCALL_64_after_hwframe+0x6f/0x77
RIP: 0033:0x7fca3e67dda9
Code: 28 00 00 00 (...)
RSP: 002b:00007fca3f4b40c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00007fca3e7abf80 RCX: 00007fca3e67dda9
RDX: 00000000200005c0 RSI: 0000000050009417 RDI: 0000000000000003
RBP: 00007fca3e6ca47a R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 000000000000000b R14: 00007fca3e7abf80 R15: 00007fff6bf95658
</TASK>
Where we get an explicit message where we attempt to free an anonymous
device number that is not currently allocated. It happens in a different
code path from the example below, at btrfs_get_root_ref(), so this change
may not fix the case triggered by syzbot.
To fix at least the code path from the example above, change
btrfs_get_root_ref() and its callers to receive a dev_t pointer argument
for the anonymous device number, so that in case it frees the number, it
also resets it to 0, so that up in the call chain we don't attempt to do
the double free.
CC: stable@vger.kernel.org # 5.10+
Link: https://lore.kernel.org/linux-btrfs/000000000000f673a1061202f630@google.com/
Fixes: e03ee2fe873e ("btrfs: do not ASSERT() if the newly created subvolume already got read")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2024-02-23 19:38:43 +03:00
u64 objectid , dev_t * anon_dev ) ;
2020-10-19 23:02:31 +03:00
struct btrfs_root * btrfs_get_fs_root_commit_root ( struct btrfs_fs_info * fs_info ,
struct btrfs_path * path ,
u64 objectid ) ;
2021-11-05 23:45:51 +03:00
int btrfs_global_root_insert ( struct btrfs_root * root ) ;
void btrfs_global_root_delete ( struct btrfs_root * root ) ;
struct btrfs_root * btrfs_global_root ( struct btrfs_fs_info * fs_info ,
struct btrfs_key * key ) ;
struct btrfs_root * btrfs_csum_root ( struct btrfs_fs_info * fs_info , u64 bytenr ) ;
struct btrfs_root * btrfs_extent_root ( struct btrfs_fs_info * fs_info , u64 bytenr ) ;
2022-09-14 18:06:29 +03:00
struct btrfs_root * btrfs_block_group_root ( struct btrfs_fs_info * fs_info ) ;
2013-09-25 17:47:44 +04:00
2020-01-24 17:32:53 +03:00
void btrfs_free_fs_info ( struct btrfs_fs_info * fs_info ) ;
2016-06-23 01:54:24 +03:00
void btrfs_btree_balance_dirty ( struct btrfs_fs_info * fs_info ) ;
void btrfs_btree_balance_dirty_nodelay ( struct btrfs_fs_info * fs_info ) ;
2013-05-15 11:48:19 +04:00
void btrfs_drop_and_free_fs_root ( struct btrfs_fs_info * fs_info ,
struct btrfs_root * root ) ;
2023-05-03 18:24:28 +03:00
int btrfs_validate_extent_buffer ( struct extent_buffer * eb ,
struct btrfs_tree_parent_check * check ) ;
2013-09-20 00:07:01 +04:00
# ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
2016-06-15 16:22:56 +03:00
struct btrfs_root * btrfs_alloc_dummy_root ( struct btrfs_fs_info * fs_info ) ;
2013-09-20 00:07:01 +04:00
# endif
2013-05-15 11:48:20 +04:00
/*
 * This function is used to grab the root, and to prevent it from being freed
 * while we access it. But it doesn't ensure that the tree is not dropped.
 *
 * If you want to ensure the whole tree is safe, you should use
 * fs_info->subvol_srcu.
 */
2020-01-24 17:33:01 +03:00
static inline struct btrfs_root * btrfs_grab_root ( struct btrfs_root * root )
2013-05-15 11:48:20 +04:00
{
2020-01-24 17:32:26 +03:00
if ( ! root )
return NULL ;
2017-03-03 11:55:18 +03:00
if ( refcount_inc_not_zero ( & root - > refs ) )
2013-05-15 11:48:20 +04:00
return root ;
return NULL ;
}
2020-02-15 00:11:42 +03:00
void btrfs_put_root ( struct btrfs_root * root ) ;
2023-09-12 15:04:29 +03:00
void btrfs_mark_buffer_dirty ( struct btrfs_trans_handle * trans ,
struct extent_buffer * buf ) ;
2012-05-06 15:23:47 +04:00
int btrfs_buffer_uptodate ( struct extent_buffer * buf , u64 parent_transid ,
int atomic ) ;
2022-09-14 08:32:50 +03:00
int btrfs_read_extent_buffer ( struct extent_buffer * buf ,
struct btrfs_tree_parent_check * check ) ;
2022-10-27 03:22:19 +03:00
2023-01-21 09:50:19 +03:00
blk_status_t btree_csum_one_bio ( struct btrfs_bio * bbio ) ;
2021-02-04 13:22:17 +03:00
int btrfs_alloc_log_tree_node ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ) ;
2008-09-06 00:13:11 +04:00
int btrfs_init_log_root_tree ( struct btrfs_trans_handle * trans ,
struct btrfs_fs_info * fs_info ) ;
2009-01-21 20:54:03 +03:00
int btrfs_add_log_tree ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ) ;
2016-07-21 03:44:12 +03:00
void btrfs_cleanup_dirty_bgs ( struct btrfs_transaction * trans ,
2016-06-23 01:54:24 +03:00
struct btrfs_fs_info * fs_info ) ;
2012-03-01 20:24:58 +04:00
void btrfs_cleanup_one_transaction ( struct btrfs_transaction * trans ,
2016-06-23 01:54:24 +03:00
struct btrfs_fs_info * fs_info ) ;
2011-09-13 14:44:20 +04:00
struct btrfs_root * btrfs_create_tree ( struct btrfs_trans_handle * trans ,
u64 objectid ) ;
2015-08-19 10:54:15 +03:00
int btrfs_get_num_tolerated_disk_barrier_failures ( u64 flags ) ;
2020-12-07 18:32:33 +03:00
int btrfs_get_free_objectid ( struct btrfs_root * root , u64 * objectid ) ;
2020-12-07 18:32:32 +03:00
int btrfs_init_root_free_objectid ( struct btrfs_root * root ) ;
2009-02-12 22:09:45 +03:00
2007-02-02 17:18:22 +03:00
# endif