2018-04-03 20:23:33 +03:00
// SPDX-License-Identifier: GPL-2.0
2008-06-12 05:53:53 +04:00
/*
* Copyright ( C ) 2007 Oracle . All rights reserved .
*/
# include <linux/kernel.h>
# include <linux/bio.h>
# include <linux/file.h>
# include <linux/fs.h>
2008-10-09 21:39:39 +04:00
# include <linux/fsnotify.h>
2008-06-12 05:53:53 +04:00
# include <linux/pagemap.h>
# include <linux/highmem.h>
# include <linux/time.h>
# include <linux/string.h>
# include <linux/backing-dev.h>
2008-10-09 21:39:39 +04:00
# include <linux/mount.h>
# include <linux/namei.h>
2008-06-12 05:53:53 +04:00
# include <linux/writeback.h>
# include <linux/compat.h>
2008-10-09 21:39:39 +04:00
# include <linux/security.h>
2008-06-12 05:53:53 +04:00
# include <linux/xattr.h>
2017-05-31 20:32:09 +03:00
# include <linux/mm.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities to include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 11:04:11 +03:00
# include <linux/slab.h>
2011-03-24 13:24:28 +03:00
# include <linux/blkdev.h>
2012-07-25 19:35:53 +04:00
# include <linux/uuid.h>
2013-01-29 10:04:50 +04:00
# include <linux/btrfs.h>
2013-08-06 22:42:51 +04:00
# include <linux/uaccess.h>
2018-01-29 14:41:30 +03:00
# include <linux/iversion.h>
2008-06-12 05:53:53 +04:00
# include "ctree.h"
# include "disk-io.h"
# include "transaction.h"
# include "btrfs_inode.h"
# include "print-tree.h"
# include "volumes.h"
2008-06-26 00:01:30 +04:00
# include "locking.h"
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep creating/deleting files on 32-bit machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 06:06:11 +04:00
# include "inode-map.h"
2011-07-07 18:48:38 +04:00
# include "backref.h"
2012-06-04 22:03:51 +04:00
# include "rcu-string.h"
2012-07-26 01:19:24 +04:00
# include "send.h"
2012-11-06 18:08:53 +04:00
# include "dev-replace.h"
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 15:47:46 +04:00
# include "props.h"
2013-11-01 21:07:02 +04:00
# include "sysfs.h"
2014-05-14 04:30:47 +04:00
# include "qgroup.h"
Btrfs: fix unreplayable log after snapshot delete + parent dir fsync
If we delete a snapshot, fsync its parent directory and crash/power fail
before the next transaction commit, on the next mount when we attempt to
replay the log tree of the root containing the parent directory we will
fail and prevent the filesystem from mounting, which is solvable by wiping
out the log trees with the btrfs-zero-log tool but very inconvenient as
we will lose any data and metadata fsynced before the parent directory
was fsynced.
For example:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt
$ mkdir /mnt/testdir
$ btrfs subvolume snapshot /mnt /mnt/testdir/snap
$ btrfs subvolume delete /mnt/testdir/snap
$ xfs_io -c "fsync" /mnt/testdir
< crash / power failure and reboot >
$ mount /dev/sdc /mnt
mount: mount(2) failed: No such file or directory
And in dmesg/syslog we get the following message and trace:
[192066.361162] BTRFS info (device dm-0): failed to delete reference to snap, inode 257 parent 257
[192066.363010] ------------[ cut here ]------------
[192066.365268] WARNING: CPU: 4 PID: 5130 at fs/btrfs/inode.c:3986 __btrfs_unlink_inode+0x17a/0x354 [btrfs]()
[192066.367250] BTRFS: Transaction aborted (error -2)
[192066.368401] Modules linked in: btrfs dm_flakey dm_mod ppdev sha256_generic xor raid6_pq hmac drbg ansi_cprng aesni_intel acpi_cpufreq tpm_tis aes_x86_64 tpm ablk_helper evdev cryptd sg parport_pc i2c_piix4 psmouse lrw parport i2c_core pcspkr gf128mul processor serio_raw glue_helper button loop autofs4 ext4 crc16 mbcache jbd2 sd_mod sr_mod cdrom ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring crc32c_intel scsi_mod e1000 virtio floppy [last unloaded: btrfs]
[192066.377154] CPU: 4 PID: 5130 Comm: mount Tainted: G W 4.4.0-rc6-btrfs-next-20+ #1
[192066.378875] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[192066.380889] 0000000000000000 ffff880143923670 ffffffff81257570 ffff8801439236b8
[192066.382561] ffff8801439236a8 ffffffff8104ec07 ffffffffa039dc2c 00000000fffffffe
[192066.384191] ffff8801ed31d000 ffff8801b9fc9c88 ffff8801086875e0 ffff880143923710
[192066.385827] Call Trace:
[192066.386373] [<ffffffff81257570>] dump_stack+0x4e/0x79
[192066.387387] [<ffffffff8104ec07>] warn_slowpath_common+0x99/0xb2
[192066.388429] [<ffffffffa039dc2c>] ? __btrfs_unlink_inode+0x17a/0x354 [btrfs]
[192066.389236] [<ffffffff8104ec68>] warn_slowpath_fmt+0x48/0x50
[192066.389884] [<ffffffffa039dc2c>] __btrfs_unlink_inode+0x17a/0x354 [btrfs]
[192066.390621] [<ffffffff81184b55>] ? iput+0xb0/0x266
[192066.391200] [<ffffffffa039ea25>] btrfs_unlink_inode+0x1c/0x3d [btrfs]
[192066.391930] [<ffffffffa03ca623>] check_item_in_log+0x1fe/0x29b [btrfs]
[192066.392715] [<ffffffffa03ca827>] replay_dir_deletes+0x167/0x1cf [btrfs]
[192066.393510] [<ffffffffa03cccc7>] replay_one_buffer+0x417/0x570 [btrfs]
[192066.394241] [<ffffffffa03ca164>] walk_up_log_tree+0x10e/0x1dc [btrfs]
[192066.394958] [<ffffffffa03cac72>] walk_log_tree+0xa5/0x190 [btrfs]
[192066.395628] [<ffffffffa03ce8b8>] btrfs_recover_log_trees+0x239/0x32c [btrfs]
[192066.396790] [<ffffffffa03cc8b0>] ? replay_one_extent+0x50a/0x50a [btrfs]
[192066.397891] [<ffffffffa0394041>] open_ctree+0x1d8b/0x2167 [btrfs]
[192066.398897] [<ffffffffa03706e1>] btrfs_mount+0x5ef/0x729 [btrfs]
[192066.399823] [<ffffffff8108ad98>] ? trace_hardirqs_on+0xd/0xf
[192066.400739] [<ffffffff8108959b>] ? lockdep_init_map+0xb9/0x1b3
[192066.401700] [<ffffffff811714b9>] mount_fs+0x67/0x131
[192066.402482] [<ffffffff81188560>] vfs_kern_mount+0x6c/0xde
[192066.403930] [<ffffffffa03702bd>] btrfs_mount+0x1cb/0x729 [btrfs]
[192066.404831] [<ffffffff8108ad98>] ? trace_hardirqs_on+0xd/0xf
[192066.405726] [<ffffffff8108959b>] ? lockdep_init_map+0xb9/0x1b3
[192066.406621] [<ffffffff811714b9>] mount_fs+0x67/0x131
[192066.407401] [<ffffffff81188560>] vfs_kern_mount+0x6c/0xde
[192066.408247] [<ffffffff8118ae36>] do_mount+0x893/0x9d2
[192066.409047] [<ffffffff8113009b>] ? strndup_user+0x3f/0x8c
[192066.409842] [<ffffffff8118b187>] SyS_mount+0x75/0xa1
[192066.410621] [<ffffffff8147e517>] entry_SYSCALL_64_fastpath+0x12/0x6b
[192066.411572] ---[ end trace 2de42126c1e0a0f0 ]---
[192066.412344] BTRFS: error (device dm-0) in __btrfs_unlink_inode:3986: errno=-2 No such entry
[192066.413748] BTRFS: error (device dm-0) in btrfs_replay_log:2464: errno=-2 No such entry (Failed to recover log tree)
[192066.415458] BTRFS error (device dm-0): cleaner transaction attach returned -30
[192066.444613] BTRFS: open_ctree failed
This happens because when we are replaying the log and processing the
directory entry pointing to the snapshot in the subvolume tree, we treat
its btrfs_dir_item item as having a location with a key type matching
BTRFS_INODE_ITEM_KEY, which is wrong because the type matches
BTRFS_ROOT_ITEM_KEY and therefore must be processed differently, as the
object id refers to a root number and not to an inode in the root
containing the parent directory.
So fix this by triggering a transaction commit if an fsync against the
parent directory is requested after deleting a snapshot. This is the
simplest approach for a rare use case. Some alternative that avoids the
transaction commit would require more code to explicitly delete the
snapshot at log replay time (factoring out common code from ioctl.c:
btrfs_ioctl_snap_destroy()), special care at fsync time to remove the
log tree of the snapshot's root from the log root of the root of tree
roots, amongst other steps.
A test case for xfstests that triggers the issue follows.
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
_cleanup_flakey
cd /
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
. ./common/dmflakey
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_dm_target flakey
_require_metadata_journaling $SCRATCH_DEV
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_init_flakey
_mount_flakey
# Create a snapshot at the root of our filesystem (mount point path), delete it,
# fsync the mount point path, crash and mount to replay the log. This should
# succeed and after the filesystem is mounted the snapshot should not be visible
# anymore.
_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT $SCRATCH_MNT/snap1
_run_btrfs_util_prog subvolume delete $SCRATCH_MNT/snap1
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT
_flakey_drop_and_remount
[ -e $SCRATCH_MNT/snap1 ] && \
echo "Snapshot snap1 still exists after log replay"
# Similar scenario as above, but this time the snapshot is created inside a
# directory and not directly under the root (mount point path).
mkdir $SCRATCH_MNT/testdir
_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT $SCRATCH_MNT/testdir/snap2
_run_btrfs_util_prog subvolume delete $SCRATCH_MNT/testdir/snap2
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir
_flakey_drop_and_remount
[ -e $SCRATCH_MNT/testdir/snap2 ] && \
echo "Snapshot snap2 still exists after log replay"
_unmount_flakey
echo "Silence is golden"
status=0
exit
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-02-10 13:42:25 +03:00
# include "tree-log.h"
2016-03-10 12:26:59 +03:00
# include "compression.h"
2019-06-18 23:09:16 +03:00
# include "space-info.h"
2019-06-19 22:12:00 +03:00
# include "delalloc-space.h"
2019-06-20 22:37:44 +03:00
# include "block-group.h"
2008-06-12 05:53:53 +04:00
2014-01-31 00:17:00 +04:00
# ifdef CONFIG_64BIT
/*
 * If we have a 32-bit userspace and 64-bit kernel, then the UAPI
 * structures are incorrect, as the timespec structure from userspace
 * is 4 bytes too small. We define these alternatives here to teach
 * the kernel about the 32-bit struct packing.
 */

/*
 * Packed timespec layout: a __u64 followed by a __u32. With the packed
 * attribute there is no tail padding, so the struct is 8 + 4 = 12 bytes.
 */
struct btrfs_ioctl_timespec_32 {
__u64 sec ;
__u32 nsec ;
} __attribute__ ( ( __packed__ ) ) ;
/*
 * Packed argument layout for BTRFS_IOC_SET_RECEIVED_SUBVOL_32. Both
 * timestamps use the packed struct btrfs_ioctl_timespec_32, and the
 * per-field in/out markers say which direction each member travels.
 *
 * NOTE(review): presumably mirrors the native received-subvol argument
 * struct field for field apart from the timespec type - confirm against
 * the UAPI header.
 */
struct btrfs_ioctl_received_subvol_args_32 {
char uuid [ BTRFS_UUID_SIZE ] ; /* in */
__u64 stransid ; /* in */
__u64 rtransid ; /* out */
struct btrfs_ioctl_timespec_32 stime ; /* in */
struct btrfs_ioctl_timespec_32 rtime ; /* out */
__u64 flags ; /* in */
__u64 reserved [ 16 ] ; /* in */
} __attribute__ ( ( __packed__ ) ) ;
/*
 * Read/write ioctl request (_IOWR) on BTRFS_IOCTL_MAGIC, number 37, whose
 * argument is the packed struct defined above.
 */
# define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
struct btrfs_ioctl_received_subvol_args_32 )
# endif
2017-09-27 17:43:13 +03:00
# if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
/*
 * Packed argument layout for BTRFS_IOC_SEND_32. The clone_sources member
 * is a compat_uptr_t rather than a native pointer; every member is marked
 * as input.
 *
 * NOTE(review): presumably mirrors the native send argument struct apart
 * from the pointer width - confirm against the UAPI header.
 */
struct btrfs_ioctl_send_args_32 {
__s64 send_fd ; /* in */
__u64 clone_sources_count ; /* in */
compat_uptr_t clone_sources ; /* in */
__u64 parent_root ; /* in */
__u64 flags ; /* in */
__u64 reserved [ 4 ] ; /* in */
} __attribute__ ( ( __packed__ ) ) ;
/*
 * Write ioctl request (_IOW) on BTRFS_IOCTL_MAGIC, number 38, whose
 * argument is the packed struct defined above.
 */
# define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
struct btrfs_ioctl_send_args_32 )
# endif
2014-01-31 00:17:00 +04:00
2013-08-06 22:42:51 +04:00
static int btrfs_clone ( struct inode * src , struct inode * inode ,
2015-07-01 00:42:08 +03:00
u64 off , u64 olen , u64 olen_aligned , u64 destoff ,
int no_time_update ) ;
2013-08-06 22:42:51 +04:00
2009-04-17 12:37:41 +04:00
/* Mask out flags that are inappropriate for the given type of inode. */
2018-03-26 19:52:15 +03:00
static unsigned int btrfs_mask_fsflags_for_type ( struct inode * inode ,
unsigned int flags )
2009-04-17 12:37:41 +04:00
{
2018-03-26 19:52:15 +03:00
if ( S_ISDIR ( inode - > i_mode ) )
2009-04-17 12:37:41 +04:00
return flags ;
2018-03-26 19:52:15 +03:00
else if ( S_ISREG ( inode - > i_mode ) )
2009-04-17 12:37:41 +04:00
return flags & ~ FS_DIRSYNC_FL ;
else
return flags & ( FS_NODUMP_FL | FS_NOATIME_FL ) ;
}
/*
2018-03-26 20:12:25 +03:00
* Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
* ioctl .
2009-04-17 12:37:41 +04:00
*/
2018-03-26 20:12:25 +03:00
static unsigned int btrfs_inode_flags_to_fsflags ( unsigned int flags )
2009-04-17 12:37:41 +04:00
{
unsigned int iflags = 0 ;
if ( flags & BTRFS_INODE_SYNC )
iflags | = FS_SYNC_FL ;
if ( flags & BTRFS_INODE_IMMUTABLE )
iflags | = FS_IMMUTABLE_FL ;
if ( flags & BTRFS_INODE_APPEND )
iflags | = FS_APPEND_FL ;
if ( flags & BTRFS_INODE_NODUMP )
iflags | = FS_NODUMP_FL ;
if ( flags & BTRFS_INODE_NOATIME )
iflags | = FS_NOATIME_FL ;
if ( flags & BTRFS_INODE_DIRSYNC )
iflags | = FS_DIRSYNC_FL ;
2011-04-15 07:03:06 +04:00
if ( flags & BTRFS_INODE_NODATACOW )
iflags | = FS_NOCOW_FL ;
2016-03-15 03:09:59 +03:00
if ( flags & BTRFS_INODE_NOCOMPRESS )
2011-04-15 07:03:06 +04:00
iflags | = FS_NOCOMP_FL ;
2016-03-15 03:09:59 +03:00
else if ( flags & BTRFS_INODE_COMPRESS )
iflags | = FS_COMPR_FL ;
2009-04-17 12:37:41 +04:00
return iflags ;
}
/*
* Update inode - > i_flags based on the btrfs internal flags .
*/
2018-03-26 19:40:21 +03:00
void btrfs_sync_inode_flags_to_i_flags ( struct inode * inode )
2009-04-17 12:37:41 +04:00
{
2018-04-23 16:45:18 +03:00
struct btrfs_inode * binode = BTRFS_I ( inode ) ;
2014-06-26 01:36:02 +04:00
unsigned int new_fl = 0 ;
2009-04-17 12:37:41 +04:00
2018-04-23 16:45:18 +03:00
if ( binode - > flags & BTRFS_INODE_SYNC )
2014-06-26 01:36:02 +04:00
new_fl | = S_SYNC ;
2018-04-23 16:45:18 +03:00
if ( binode - > flags & BTRFS_INODE_IMMUTABLE )
2014-06-26 01:36:02 +04:00
new_fl | = S_IMMUTABLE ;
2018-04-23 16:45:18 +03:00
if ( binode - > flags & BTRFS_INODE_APPEND )
2014-06-26 01:36:02 +04:00
new_fl | = S_APPEND ;
2018-04-23 16:45:18 +03:00
if ( binode - > flags & BTRFS_INODE_NOATIME )
2014-06-26 01:36:02 +04:00
new_fl | = S_NOATIME ;
2018-04-23 16:45:18 +03:00
if ( binode - > flags & BTRFS_INODE_DIRSYNC )
2014-06-26 01:36:02 +04:00
new_fl | = S_DIRSYNC ;
set_mask_bits ( & inode - > i_flags ,
S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC ,
new_fl ) ;
2009-04-17 12:37:41 +04:00
}
/*
 * Copy the FS_*_FL representation of the file's btrfs inode flags to the
 * user buffer @arg.
 *
 * Returns 0 on success, or -EFAULT if copy_to_user() reports a failure.
 */
static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
{
	unsigned int fsflags;

	fsflags = btrfs_inode_flags_to_fsflags(BTRFS_I(file_inode(file))->flags);

	return copy_to_user(arg, &fsflags, sizeof(fsflags)) ? -EFAULT : 0;
}
2018-03-26 19:52:15 +03:00
/* Check if @flags are a supported and valid set of FS_*_FL flags */
static int check_fsflags ( unsigned int flags )
Btrfs: Per file/directory controls for COW and compression
Data compression and data cow are controlled across the entire FS by mount
options right now. ioctls are needed to set this on a per file or per
directory basis. This has been proposed previously, but VFS developers
wanted us to use generic ioctls rather than btrfs-specific ones.
According to Chris's comment, there should be just one true compression
method(probably LZO) stored in the super. However, before this, we would
wait for that one method is stable enough to be adopted into the super.
So I list it as a long term goal, and just store it in ram today.
After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to
control file and directory's datacow and compression attribute.
NOTE:
- The compression type is selected by such rules:
If we mount btrfs with compress options, ie, zlib/lzo, the type is it.
Otherwise, we'll use the default compress type (zlib today).
v1->v2:
- rebase to the latest btrfs.
v2->v3:
- fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW
will be screwed by inheritance from parent directory.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-22 13:12:20 +03:00
{
if ( flags & ~ ( FS_IMMUTABLE_FL | FS_APPEND_FL | \
FS_NOATIME_FL | FS_NODUMP_FL | \
FS_SYNC_FL | FS_DIRSYNC_FL | \
2011-04-15 07:02:49 +04:00
FS_NOCOMP_FL | FS_COMPR_FL |
FS_NOCOW_FL ) )
Btrfs: Per file/directory controls for COW and compression
Data compression and data cow are controlled across the entire FS by mount
options right now. ioctls are needed to set this on a per file or per
directory basis. This has been proposed previously, but VFS developers
wanted us to use generic ioctls rather than btrfs-specific ones.
According to Chris's comment, there should be just one true compression
method(probably LZO) stored in the super. However, before this, we would
wait for that one method is stable enough to be adopted into the super.
So I list it as a long term goal, and just store it in ram today.
After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to
control file and directory's datacow and compression attribute.
NOTE:
- The compression type is selected by such rules:
If we mount btrfs with compress options, ie, zlib/lzo, the type is it.
Otherwise, we'll use the default compress type (zlib today).
v1->v2:
- rebase to the latest btrfs.
v2->v3:
- fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW
will be screwed by inheritance from parent directory.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-22 13:12:20 +03:00
return - EOPNOTSUPP ;
if ( ( flags & FS_NOCOMP_FL ) & & ( flags & FS_COMPR_FL ) )
return - EINVAL ;
return 0 ;
}
2009-04-17 12:37:41 +04:00
static int btrfs_ioctl_setflags ( struct file * file , void __user * arg )
{
2013-01-24 02:07:38 +04:00
struct inode * inode = file_inode ( file ) ;
2016-06-23 01:54:23 +03:00
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2018-04-23 16:45:18 +03:00
struct btrfs_inode * binode = BTRFS_I ( inode ) ;
struct btrfs_root * root = binode - > root ;
2009-04-17 12:37:41 +04:00
struct btrfs_trans_handle * trans ;
2019-07-01 18:25:34 +03:00
unsigned int fsflags , old_fsflags ;
2009-04-17 12:37:41 +04:00
int ret ;
2019-04-20 14:48:53 +03:00
const char * comp = NULL ;
2019-04-20 14:48:55 +03:00
u32 binode_flags = binode - > flags ;
2009-04-17 12:37:41 +04:00
2014-01-16 18:50:22 +04:00
if ( ! inode_owner_or_capable ( inode ) )
return - EPERM ;
2010-12-20 11:04:08 +03:00
if ( btrfs_root_readonly ( root ) )
return - EROFS ;
2018-04-23 16:45:18 +03:00
if ( copy_from_user ( & fsflags , arg , sizeof ( fsflags ) ) )
2009-04-17 12:37:41 +04:00
return - EFAULT ;
2018-04-23 16:45:18 +03:00
ret = check_fsflags ( fsflags ) ;
Btrfs: Per file/directory controls for COW and compression
Data compression and data cow are controlled across the entire FS by mount
options right now. ioctls are needed to set this on a per file or per
directory basis. This has been proposed previously, but VFS developers
wanted us to use generic ioctls rather than btrfs-specific ones.
According to Chris's comment, there should be just one true compression
method (probably LZO) stored in the super. However, before this, we would
wait until that one method is stable enough to be adopted into the super.
So I list it as a long term goal, and just store it in ram today.
After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to
control file and directory's datacow and compression attribute.
NOTE:
- The compression type is selected by such rules:
If we mount btrfs with a compress option, i.e. zlib/lzo, that type is used.
Otherwise, we'll use the default compress type (zlib today).
v1->v2:
- rebase to the latest btrfs.
v2->v3:
- fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW
will be screwed by inheritance from parent directory.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-22 13:12:20 +03:00
if ( ret )
return ret ;
2008-06-12 05:53:53 +04:00
2012-06-12 18:20:32 +04:00
ret = mnt_want_write_file ( file ) ;
if ( ret )
return ret ;
2016-01-22 23:40:57 +03:00
inode_lock ( inode ) ;
2009-04-17 12:37:41 +04:00
2018-04-23 16:45:18 +03:00
fsflags = btrfs_mask_fsflags_for_type ( inode , fsflags ) ;
2019-07-01 18:25:34 +03:00
old_fsflags = btrfs_inode_flags_to_fsflags ( binode - > flags ) ;
ret = vfs_ioc_setflags_prepare ( inode , old_fsflags , fsflags ) ;
if ( ret )
goto out_unlock ;
2009-04-17 12:37:41 +04:00
2018-04-23 16:45:18 +03:00
if ( fsflags & FS_SYNC_FL )
2019-04-20 14:48:55 +03:00
binode_flags | = BTRFS_INODE_SYNC ;
2009-04-17 12:37:41 +04:00
else
2019-04-20 14:48:55 +03:00
binode_flags & = ~ BTRFS_INODE_SYNC ;
2018-04-23 16:45:18 +03:00
if ( fsflags & FS_IMMUTABLE_FL )
2019-04-20 14:48:55 +03:00
binode_flags | = BTRFS_INODE_IMMUTABLE ;
2009-04-17 12:37:41 +04:00
else
2019-04-20 14:48:55 +03:00
binode_flags & = ~ BTRFS_INODE_IMMUTABLE ;
2018-04-23 16:45:18 +03:00
if ( fsflags & FS_APPEND_FL )
2019-04-20 14:48:55 +03:00
binode_flags | = BTRFS_INODE_APPEND ;
2009-04-17 12:37:41 +04:00
else
2019-04-20 14:48:55 +03:00
binode_flags & = ~ BTRFS_INODE_APPEND ;
2018-04-23 16:45:18 +03:00
if ( fsflags & FS_NODUMP_FL )
2019-04-20 14:48:55 +03:00
binode_flags | = BTRFS_INODE_NODUMP ;
2009-04-17 12:37:41 +04:00
else
2019-04-20 14:48:55 +03:00
binode_flags & = ~ BTRFS_INODE_NODUMP ;
2018-04-23 16:45:18 +03:00
if ( fsflags & FS_NOATIME_FL )
2019-04-20 14:48:55 +03:00
binode_flags | = BTRFS_INODE_NOATIME ;
2009-04-17 12:37:41 +04:00
else
2019-04-20 14:48:55 +03:00
binode_flags & = ~ BTRFS_INODE_NOATIME ;
2018-04-23 16:45:18 +03:00
if ( fsflags & FS_DIRSYNC_FL )
2019-04-20 14:48:55 +03:00
binode_flags | = BTRFS_INODE_DIRSYNC ;
2009-04-17 12:37:41 +04:00
else
2019-04-20 14:48:55 +03:00
binode_flags & = ~ BTRFS_INODE_DIRSYNC ;
2018-04-23 16:45:18 +03:00
if ( fsflags & FS_NOCOW_FL ) {
2019-04-20 14:48:57 +03:00
if ( S_ISREG ( inode - > i_mode ) ) {
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch is simple, but it has user-visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that actually allow changing the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) ditto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's opinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 15:56:55 +04:00
/*
* It ' s safe to turn csums off here , no extents exist .
* Otherwise we want the flag to reflect the real COW
* status of the file and will not set it .
*/
if ( inode - > i_size = = 0 )
2019-04-20 14:48:55 +03:00
binode_flags | = BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM ;
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch is simple, but it has user-visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that actually allow changing the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) ditto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's opinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 15:56:55 +04:00
} else {
2019-04-20 14:48:55 +03:00
binode_flags | = BTRFS_INODE_NODATACOW ;
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch is simple, but it has user-visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that actually allow changing the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) ditto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's opinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 15:56:55 +04:00
}
} else {
/*
2016-05-20 04:18:45 +03:00
* Revert back under same assumptions as above
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch is simple, but it has user-visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that actually allow changing the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) ditto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's opinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 15:56:55 +04:00
*/
2019-04-20 14:48:57 +03:00
if ( S_ISREG ( inode - > i_mode ) ) {
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch is simple, but it has user-visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that actually allow changing the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) ditto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's opinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 15:56:55 +04:00
if ( inode - > i_size = = 0 )
2019-04-20 14:48:55 +03:00
binode_flags & = ~ ( BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM ) ;
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch is simple, but it has user-visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that actually allow changing the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) ditto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's opinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 15:56:55 +04:00
} else {
2019-04-20 14:48:55 +03:00
binode_flags & = ~ BTRFS_INODE_NODATACOW ;
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch is simple, but it has user-visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that actually allow changing the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) ditto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's opinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 15:56:55 +04:00
}
}
2009-04-17 12:37:41 +04:00
Btrfs: Per file/directory controls for COW and compression
Data compression and data cow are controlled across the entire FS by mount
options right now. ioctls are needed to set this on a per file or per
directory basis. This has been proposed previously, but VFS developers
wanted us to use generic ioctls rather than btrfs-specific ones.
According to Chris's comment, there should be just one true compression
method (probably LZO) stored in the super. However, before this, we would
wait until that one method is stable enough to be adopted into the super.
So I list it as a long term goal, and just store it in ram today.
After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to
control file and directory's datacow and compression attribute.
NOTE:
- The compression type is selected by such rules:
If we mount btrfs with a compress option, i.e. zlib/lzo, that type is used.
Otherwise, we'll use the default compress type (zlib today).
v1->v2:
- rebase to the latest btrfs.
v2->v3:
- fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW
will be screwed by inheritance from parent directory.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-22 13:12:20 +03:00
/*
* The COMPRESS flag can only be changed by users , while the NOCOMPRESS
* flag may be changed automatically if compression code won ' t make
* things smaller .
*/
2018-04-23 16:45:18 +03:00
if ( fsflags & FS_NOCOMP_FL ) {
2019-04-20 14:48:55 +03:00
binode_flags & = ~ BTRFS_INODE_COMPRESS ;
binode_flags | = BTRFS_INODE_NOCOMPRESS ;
2018-04-23 16:45:18 +03:00
} else if ( fsflags & FS_COMPR_FL ) {
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 15:47:46 +04:00
Btrfs: prevent ioctls from interfering with a swap file
A later patch will implement swap file support for Btrfs, but before we
do that, we need to make sure that the various Btrfs ioctls cannot
change a swap file.
When a swap file is active, we must make sure that the extents of the
file are not moved and that they don't become shared. That means that
the following are not safe:
- chattr +c (enable compression)
- reflink
- dedupe
- snapshot
- defrag
Don't allow those to happen on an active swap file.
Additionally, balance, resize, device remove, and device replace are
also unsafe if they affect an active swapfile. Add a red-black tree of
block groups and devices which contain an active swapfile. Relocation
checks each block group against this tree and skips it or errors out for
balance or resize, respectively. Device remove and device replace check
the tree for the device they will operate on.
Note that we don't have to worry about chattr -C (disable nocow), which
we ignore for non-empty files, because an active swapfile must be
non-empty and can't be truncated. We also don't have to worry about
autodefrag because it's only done on COW files. Truncate and fallocate
are already taken care of by the generic code. Device add doesn't do
relocation so it's not an issue, either.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2016-11-03 20:28:12 +03:00
if ( IS_SWAPFILE ( inode ) ) {
ret = - ETXTBSY ;
goto out_unlock ;
}
2019-04-20 14:48:55 +03:00
binode_flags | = BTRFS_INODE_COMPRESS ;
binode_flags & = ~ BTRFS_INODE_NOCOMPRESS ;
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 15:47:46 +04:00
2017-10-31 19:32:41 +03:00
comp = btrfs_compress_type2str ( fs_info - > compress_type ) ;
if ( ! comp | | comp [ 0 ] = = 0 )
comp = btrfs_compress_type2str ( BTRFS_COMPRESS_ZLIB ) ;
2011-04-15 07:03:17 +04:00
} else {
2019-04-20 14:48:55 +03:00
binode_flags & = ~ ( BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS ) ;
Btrfs: Per file/directory controls for COW and compression
Data compression and data cow are controlled across the entire FS by mount
options right now. ioctls are needed to set this on a per file or per
directory basis. This has been proposed previously, but VFS developers
wanted us to use generic ioctls rather than btrfs-specific ones.
According to Chris's comment, there should be just one true compression
method(probably LZO) stored in the super. However, before this, we would
wait until that one method is stable enough to be adopted into the super.
So I list it as a long term goal, and just store it in ram today.
After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to
control file and directory's datacow and compression attribute.
NOTE:
- The compression type is selected by such rules:
If we mount btrfs with compress options, ie, zlib/lzo, the type is it.
Otherwise, we'll use the default compress type (zlib today).
v1->v2:
- rebase to the latest btrfs.
v2->v3:
- fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW
will be screwed by inheritance from parent directory.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-22 13:12:20 +03:00
}
2009-04-17 12:37:41 +04:00
2019-04-20 14:48:53 +03:00
/*
* 1 for inode item
* 2 for properties
*/
trans = btrfs_start_transaction ( root , 3 ) ;
2011-12-29 09:36:45 +04:00
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
2019-04-20 14:48:55 +03:00
goto out_unlock ;
2011-12-29 09:36:45 +04:00
}
2009-04-17 12:37:41 +04:00
2019-04-20 14:48:53 +03:00
if ( comp ) {
ret = btrfs_set_prop ( trans , inode , " btrfs.compression " , comp ,
strlen ( comp ) , 0 ) ;
if ( ret ) {
btrfs_abort_transaction ( trans , ret ) ;
goto out_end_trans ;
}
} else {
ret = btrfs_set_prop ( trans , inode , " btrfs.compression " , NULL ,
0 , 0 ) ;
if ( ret & & ret ! = - ENODATA ) {
btrfs_abort_transaction ( trans , ret ) ;
goto out_end_trans ;
}
}
2019-04-20 14:48:55 +03:00
binode - > flags = binode_flags ;
2018-03-26 19:40:21 +03:00
btrfs_sync_inode_flags_to_i_flags ( inode ) ;
2012-04-05 23:03:02 +04:00
inode_inc_iversion ( inode ) ;
2016-09-14 17:48:06 +03:00
inode - > i_ctime = current_time ( inode ) ;
2009-04-17 12:37:41 +04:00
ret = btrfs_update_inode ( trans , root , inode ) ;
2019-04-20 14:48:53 +03:00
out_end_trans :
2016-09-10 04:39:03 +03:00
btrfs_end_transaction ( trans ) ;
2009-04-17 12:37:41 +04:00
out_unlock :
2016-01-22 23:40:57 +03:00
inode_unlock ( inode ) ;
2012-06-12 18:20:32 +04:00
mnt_drop_write_file ( file ) ;
2011-02-24 12:38:16 +03:00
return ret ;
2009-04-17 12:37:41 +04:00
}
2018-03-26 20:42:05 +03:00
/*
 * Convert btrfs internal inode flags into the xflags reported through the
 * FS_IOC_FSGETXATTR ioctl.  Only the flags handled below are translated;
 * any other bit set in @flags is silently ignored.
 */
static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags)
{
	unsigned int result = 0;

	result |= (flags & BTRFS_INODE_APPEND) ? FS_XFLAG_APPEND : 0;
	result |= (flags & BTRFS_INODE_IMMUTABLE) ? FS_XFLAG_IMMUTABLE : 0;
	result |= (flags & BTRFS_INODE_NOATIME) ? FS_XFLAG_NOATIME : 0;
	result |= (flags & BTRFS_INODE_NODUMP) ? FS_XFLAG_NODUMP : 0;
	result |= (flags & BTRFS_INODE_SYNC) ? FS_XFLAG_SYNC : 0;

	return result;
}
/*
 * Validate a set of FS_XFLAG_* flags.
 *
 * Returns 0 when every bit set in @flags is one of the xflags handled here,
 * -EOPNOTSUPP if any other bit is set.
 */
static int check_xflags(unsigned int flags)
{
	const unsigned int supported = FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE |
				       FS_XFLAG_NOATIME | FS_XFLAG_NODUMP |
				       FS_XFLAG_SYNC;

	return (flags & ~supported) ? -EOPNOTSUPP : 0;
}
2018-03-26 20:51:16 +03:00
/*
 * Report the inode's attributes to user space as a struct fsxattr.  The
 * xflags are derived from the btrfs internal inode flags; the remaining
 * members of the structure are zeroed.
 *
 * Returns 0 on success or -EFAULT if the result cannot be copied to @arg.
 */
static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct fsxattr fa;
	unsigned int xflags;

	xflags = btrfs_inode_flags_to_xflags(BTRFS_I(inode)->flags);
	simple_fill_fsxattr(&fa, xflags);

	return copy_to_user(arg, &fa, sizeof(fa)) ? -EFAULT : 0;
}
2018-03-26 20:51:16 +03:00
/*
 * Apply the xflags supplied by user space in a struct fsxattr to the inode.
 *
 * Only the xflags accepted by check_xflags() may be set, and the extsize,
 * projid and cowextsize members must all be zero (-EOPNOTSUPP otherwise).
 * The caller must own the inode or be suitably capable (-EPERM), and the
 * root must not be read-only (-EROFS).
 *
 * On any failure after the flags have been modified, both the btrfs inode
 * flags and the VFS i_flags are restored to their previous values.
 */
static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_inode *binode = BTRFS_I(inode);
	struct btrfs_root *root = binode->root;
	struct btrfs_trans_handle *trans;
	struct fsxattr fa, old_fa;
	unsigned int saved_flags;
	unsigned int saved_i_flags;
	unsigned int new_flags;
	int ret = 0;

	if (!inode_owner_or_capable(inode))
		return -EPERM;

	if (btrfs_root_readonly(root))
		return -EROFS;

	if (copy_from_user(&fa, arg, sizeof(fa)))
		return -EFAULT;

	ret = check_xflags(fa.fsx_xflags);
	if (ret)
		return ret;

	if (fa.fsx_extsize || fa.fsx_projid || fa.fsx_cowextsize)
		return -EOPNOTSUPP;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	inode_lock(inode);

	/* Remember the current state so it can be rolled back on error. */
	saved_flags = binode->flags;
	saved_i_flags = inode->i_flags;

	/* Let the VFS check the transition from the old to the new attrs. */
	simple_fill_fsxattr(&old_fa,
			    btrfs_inode_flags_to_xflags(binode->flags));
	ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
	if (ret)
		goto out_unlock;

	/*
	 * Each of these five flags is fully determined by the request:
	 * drop them all, then set the ones user space asked for.
	 */
	new_flags = binode->flags;
	new_flags &= ~(BTRFS_INODE_SYNC | BTRFS_INODE_IMMUTABLE |
		       BTRFS_INODE_APPEND | BTRFS_INODE_NODUMP |
		       BTRFS_INODE_NOATIME);
	if (fa.fsx_xflags & FS_XFLAG_SYNC)
		new_flags |= BTRFS_INODE_SYNC;
	if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE)
		new_flags |= BTRFS_INODE_IMMUTABLE;
	if (fa.fsx_xflags & FS_XFLAG_APPEND)
		new_flags |= BTRFS_INODE_APPEND;
	if (fa.fsx_xflags & FS_XFLAG_NODUMP)
		new_flags |= BTRFS_INODE_NODUMP;
	if (fa.fsx_xflags & FS_XFLAG_NOATIME)
		new_flags |= BTRFS_INODE_NOATIME;
	binode->flags = new_flags;

	/* 1 item for the inode */
	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_unlock;
	}

	btrfs_sync_inode_flags_to_i_flags(inode);
	inode_inc_iversion(inode);
	inode->i_ctime = current_time(inode);
	ret = btrfs_update_inode(trans, root, inode);

	btrfs_end_transaction(trans);

out_unlock:
	if (ret) {
		binode->flags = saved_flags;
		inode->i_flags = saved_i_flags;
	}

	inode_unlock(inode);
	mnt_drop_write_file(file);

	return ret;
}
2009-04-17 12:37:41 +04:00
/*
 * Store the i_generation of the inode behind @file into the user-space
 * integer at @arg.  Returns whatever put_user() returns.
 */
static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
{
	return put_user(file_inode(file)->i_generation, arg);
}
2008-06-12 05:53:53 +04:00
2011-03-24 13:24:28 +03:00
/*
 * Trim (discard) free space of the filesystem according to the
 * struct fstrim_range supplied by user space in @arg.
 *
 * Requires CAP_SYS_ADMIN (-EPERM).  Fails with -EROFS on a nologreplay
 * mount, -EOPNOTSUPP if no device with a bdev supports discard, -EFAULT if
 * @arg cannot be read or written, and -EINVAL if the requested length is
 * smaller than the filesystem block size.  On success the range, as left
 * by btrfs_trim_fs(), is copied back to @arg.
 */
static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
	struct btrfs_device *device;
	struct fstrim_range range;
	u64 min_granularity = ULLONG_MAX;
	u64 nr_discard_devs = 0;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * A nologreplay mount (which also has to be read-only) must not
	 * discard free space inside block groups: log trees reference
	 * extents that are not pinned in the block group free space cache,
	 * since pinning them is precisely the first phase of log replay.
	 */
	if (btrfs_test_opt(fs_info, NOLOGREPLAY))
		return -EROFS;

	/*
	 * Count the devices that support discard and find the smallest
	 * discard granularity among them.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
				dev_list) {
		struct request_queue *q;

		if (!device->bdev)
			continue;

		q = bdev_get_queue(device->bdev);
		if (!blk_queue_discard(q))
			continue;

		nr_discard_devs++;
		min_granularity = min_t(u64, q->limits.discard_granularity,
					min_granularity);
	}
	rcu_read_unlock();

	if (!nr_discard_devs)
		return -EOPNOTSUPP;

	if (copy_from_user(&range, arg, sizeof(range)))
		return -EFAULT;

	/*
	 * NOTE: Do not truncate the range using super->total_bytes.  The
	 * bytenr of a block group lives in the logical address space and
	 * can be any sectorsize aligned value in [0, U64_MAX].  After
	 * frequent balancing all block groups may start beyond
	 * super->total_bytes, so truncating the range would leave existing
	 * block groups untrimmed and only trim unallocated space.
	 */
	if (range.len < fs_info->sb->s_blocksize)
		return -EINVAL;

	range.minlen = max(range.minlen, min_granularity);
	ret = btrfs_trim_fs(fs_info, &range);
	if (ret < 0)
		return ret;

	if (copy_to_user(arg, &range, sizeof(range)))
		return -EFAULT;

	return 0;
}
2013-08-15 19:11:20 +04:00
/*
 * Check whether @uuid consists solely of zero bytes.
 *
 * Returns 1 if all BTRFS_UUID_SIZE bytes are zero, 0 as soon as a non-zero
 * byte is found.
 */
int btrfs_is_empty_uuid(u8 *uuid)
{
	const u8 *cur = uuid;
	const u8 *end = uuid + BTRFS_UUID_SIZE;

	while (cur < end) {
		if (*cur++)
			return 0;
	}

	return 1;
}
2013-02-28 14:04:33 +04:00
static noinline int create_subvol ( struct inode * dir ,
2008-10-09 21:39:39 +04:00
struct dentry * dentry ,
2017-02-14 20:33:53 +03:00
const char * name , int namelen ,
2011-09-14 17:58:21 +04:00
u64 * async_transid ,
2013-02-07 10:02:44 +04:00
struct btrfs_qgroup_inherit * inherit )
2008-06-12 05:53:53 +04:00
{
2016-06-23 01:54:23 +03:00
struct btrfs_fs_info * fs_info = btrfs_sb ( dir - > i_sb ) ;
2008-06-12 05:53:53 +04:00
struct btrfs_trans_handle * trans ;
struct btrfs_key key ;
2016-03-24 19:49:22 +03:00
struct btrfs_root_item * root_item ;
2008-06-12 05:53:53 +04:00
struct btrfs_inode_item * inode_item ;
struct extent_buffer * leaf ;
2013-02-28 14:04:33 +04:00
struct btrfs_root * root = BTRFS_I ( dir ) - > root ;
2009-09-22 00:00:26 +04:00
struct btrfs_root * new_root ;
2013-02-28 14:04:33 +04:00
struct btrfs_block_rsv block_rsv ;
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following Coccinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-09 05:36:02 +03:00
struct timespec64 cur_time = current_time ( dir ) ;
2013-12-13 04:51:42 +04:00
struct inode * inode ;
2008-06-12 05:53:53 +04:00
int ret ;
int err ;
u64 objectid ;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID ;
2008-11-18 05:02:50 +03:00
u64 index = 0 ;
2012-07-25 19:35:53 +04:00
uuid_le new_uuid ;
2008-06-12 05:53:53 +04:00
2016-03-24 19:49:22 +03:00
root_item = kzalloc ( sizeof ( * root_item ) , GFP_KERNEL ) ;
if ( ! root_item )
return - ENOMEM ;
2016-06-23 01:54:23 +03:00
ret = btrfs_find_free_objectid ( fs_info - > tree_root , & objectid ) ;
2011-07-17 05:38:06 +04:00
if ( ret )
2016-03-24 19:49:22 +03:00
goto fail_free ;
2010-11-20 12:48:00 +03:00
2015-02-27 11:24:23 +03:00
/*
* Don ' t create subvolume whose level is not zero . Or qgroup will be
2016-05-20 04:18:45 +03:00
* screwed up since it assumes subvolume qgroup ' s level to be 0.
2015-02-27 11:24:23 +03:00
*/
2016-03-24 19:49:22 +03:00
if ( btrfs_qgroup_level ( objectid ) ) {
ret = - ENOSPC ;
goto fail_free ;
}
2015-02-27 11:24:23 +03:00
2013-02-28 14:04:33 +04:00
btrfs_init_block_rsv ( & block_rsv , BTRFS_BLOCK_RSV_TEMP ) ;
2009-09-12 00:12:44 +04:00
/*
2013-02-28 14:04:33 +04:00
* The same as the snapshot creation , please see the comment
* of create_snapshot ( ) .
2009-09-12 00:12:44 +04:00
*/
2018-05-30 06:00:38 +03:00
ret = btrfs_subvolume_reserve_metadata ( root , & block_rsv , 8 , false ) ;
2013-02-28 14:04:33 +04:00
if ( ret )
2016-03-24 19:49:22 +03:00
goto fail_free ;
2013-02-28 14:04:33 +04:00
trans = btrfs_start_transaction ( root , 0 ) ;
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
2017-02-10 21:18:18 +03:00
btrfs_subvolume_release_metadata ( fs_info , & block_rsv ) ;
2016-03-24 19:49:22 +03:00
goto fail_free ;
2013-02-28 14:04:33 +04:00
}
trans - > block_rsv = & block_rsv ;
trans - > bytes_reserved = block_rsv . size ;
2008-06-12 05:53:53 +04:00
2018-07-18 09:45:41 +03:00
ret = btrfs_qgroup_inherit ( trans , 0 , objectid , inherit ) ;
2011-09-14 17:58:21 +04:00
if ( ret )
goto fail ;
2014-06-15 03:54:12 +04:00
leaf = btrfs_alloc_tree_block ( trans , root , 0 , objectid , NULL , 0 , 0 , 0 ) ;
2008-07-24 20:17:14 +04:00
if ( IS_ERR ( leaf ) ) {
ret = PTR_ERR ( leaf ) ;
goto fail ;
}
2008-06-12 05:53:53 +04:00
btrfs_mark_buffer_dirty ( leaf ) ;
2016-03-24 19:49:22 +03:00
inode_item = & root_item - > inode ;
2013-07-16 07:19:18 +04:00
btrfs_set_stack_inode_generation ( inode_item , 1 ) ;
btrfs_set_stack_inode_size ( inode_item , 3 ) ;
btrfs_set_stack_inode_nlink ( inode_item , 1 ) ;
2016-06-15 16:22:56 +03:00
btrfs_set_stack_inode_nbytes ( inode_item ,
2016-06-23 01:54:23 +03:00
fs_info - > nodesize ) ;
2013-07-16 07:19:18 +04:00
btrfs_set_stack_inode_mode ( inode_item , S_IFDIR | 0755 ) ;
2008-06-12 05:53:53 +04:00
2016-03-24 19:49:22 +03:00
btrfs_set_root_flags ( root_item , 0 ) ;
btrfs_set_root_limit ( root_item , 0 ) ;
2013-07-16 07:19:18 +04:00
btrfs_set_stack_inode_flags ( inode_item , BTRFS_INODE_ROOT_ITEM_INIT ) ;
2011-03-28 06:01:25 +04:00
2016-03-24 19:49:22 +03:00
btrfs_set_root_bytenr ( root_item , leaf - > start ) ;
btrfs_set_root_generation ( root_item , trans - > transid ) ;
btrfs_set_root_level ( root_item , 0 ) ;
btrfs_set_root_refs ( root_item , 1 ) ;
btrfs_set_root_used ( root_item , leaf - > len ) ;
btrfs_set_root_last_snapshot ( root_item , 0 ) ;
2008-06-12 05:53:53 +04:00
2016-03-24 19:49:22 +03:00
btrfs_set_root_generation_v2 ( root_item ,
btrfs_root_generation ( root_item ) ) ;
2012-07-25 19:35:53 +04:00
uuid_le_gen ( & new_uuid ) ;
2016-03-24 19:49:22 +03:00
memcpy ( root_item - > uuid , new_uuid . b , BTRFS_UUID_SIZE ) ;
btrfs_set_stack_timespec_sec ( & root_item - > otime , cur_time . tv_sec ) ;
btrfs_set_stack_timespec_nsec ( & root_item - > otime , cur_time . tv_nsec ) ;
root_item - > ctime = root_item - > otime ;
btrfs_set_root_ctransid ( root_item , trans - > transid ) ;
btrfs_set_root_otransid ( root_item , trans - > transid ) ;
2008-06-12 05:53:53 +04:00
2008-06-26 00:01:30 +04:00
btrfs_tree_unlock ( leaf ) ;
2008-06-12 05:53:53 +04:00
free_extent_buffer ( leaf ) ;
leaf = NULL ;
2016-03-24 19:49:22 +03:00
btrfs_set_root_dirid ( root_item , new_dirid ) ;
2008-06-12 05:53:53 +04:00
key . objectid = objectid ;
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 18:45:14 +04:00
key . offset = 0 ;
2014-06-04 20:41:45 +04:00
key . type = BTRFS_ROOT_ITEM_KEY ;
2016-06-23 01:54:23 +03:00
ret = btrfs_insert_root ( trans , fs_info - > tree_root , & key ,
2016-03-24 19:49:22 +03:00
root_item ) ;
2008-06-12 05:53:53 +04:00
if ( ret )
goto fail ;
2009-09-22 00:00:26 +04:00
key . offset = ( u64 ) - 1 ;
2016-06-23 01:54:23 +03:00
new_root = btrfs_read_fs_root_no_name ( fs_info , & key ) ;
2012-03-12 19:03:00 +04:00
if ( IS_ERR ( new_root ) ) {
ret = PTR_ERR ( new_root ) ;
2016-06-11 01:19:25 +03:00
btrfs_abort_transaction ( trans , ret ) ;
2012-03-12 19:03:00 +04:00
goto fail ;
}
2009-09-22 00:00:26 +04:00
btrfs_record_root_in_trans ( trans , new_root ) ;
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 15:47:46 +04:00
ret = btrfs_create_subvol_root ( trans , new_root , root , new_dirid ) ;
2011-07-26 22:32:23 +04:00
if ( ret ) {
/* We potentially lose an unused inode item here */
2016-06-11 01:19:25 +03:00
btrfs_abort_transaction ( trans , ret ) ;
2011-07-26 22:32:23 +04:00
goto fail ;
}
2016-01-07 16:26:59 +03:00
mutex_lock ( & new_root - > objectid_mutex ) ;
new_root - > highest_objectid = new_dirid ;
mutex_unlock ( & new_root - > objectid_mutex ) ;
2008-06-12 05:53:53 +04:00
/*
* insert the directory item
*/
2017-02-20 14:50:33 +03:00
ret = btrfs_set_inode_index ( BTRFS_I ( dir ) , & index ) ;
2012-03-12 19:03:00 +04:00
if ( ret ) {
2016-06-11 01:19:25 +03:00
btrfs_abort_transaction ( trans , ret ) ;
2012-03-12 19:03:00 +04:00
goto fail ;
}
2008-11-18 05:02:50 +03:00
2018-08-04 16:10:57 +03:00
ret = btrfs_insert_dir_item ( trans , name , namelen , BTRFS_I ( dir ) , & key ,
2008-11-18 05:02:50 +03:00
BTRFS_FT_DIR , index ) ;
2012-03-12 19:03:00 +04:00
if ( ret ) {
2016-06-11 01:19:25 +03:00
btrfs_abort_transaction ( trans , ret ) ;
2008-06-12 05:53:53 +04:00
goto fail ;
2012-03-12 19:03:00 +04:00
}
2008-11-18 04:37:39 +03:00
2017-02-20 14:50:34 +03:00
btrfs_i_size_write ( BTRFS_I ( dir ) , dir - > i_size + namelen * 2 ) ;
2009-01-05 23:43:43 +03:00
ret = btrfs_update_inode ( trans , root , dir ) ;
BUG_ON ( ret ) ;
2018-08-01 06:32:29 +03:00
ret = btrfs_add_root_ref ( trans , objectid , root - > root_key . objectid ,
2017-01-10 21:35:31 +03:00
btrfs_ino ( BTRFS_I ( dir ) ) , index , name , namelen ) ;
2009-09-22 00:00:26 +04:00
BUG_ON ( ret ) ;
2008-06-12 05:53:53 +04:00
2018-05-29 10:01:53 +03:00
ret = btrfs_uuid_tree_add ( trans , root_item - > uuid ,
2016-06-22 04:16:51 +03:00
BTRFS_UUID_KEY_SUBVOL , objectid ) ;
2013-08-15 19:11:20 +04:00
if ( ret )
2016-06-11 01:19:25 +03:00
btrfs_abort_transaction ( trans , ret ) ;
2013-08-15 19:11:20 +04:00
2008-06-12 05:53:53 +04:00
fail :
2016-03-24 19:49:22 +03:00
kfree ( root_item ) ;
2013-02-28 14:04:33 +04:00
trans - > block_rsv = NULL ;
trans - > bytes_reserved = 0 ;
2017-02-10 21:18:18 +03:00
btrfs_subvolume_release_metadata ( fs_info , & block_rsv ) ;
2014-01-09 10:57:06 +04:00
2010-10-29 23:41:32 +04:00
if ( async_transid ) {
* async_transid = trans - > transid ;
2016-09-10 04:39:03 +03:00
err = btrfs_commit_transaction_async ( trans , 1 ) ;
2013-03-04 13:45:06 +04:00
if ( err )
2016-09-10 04:39:03 +03:00
err = btrfs_commit_transaction ( trans ) ;
2010-10-29 23:41:32 +04:00
} else {
2016-09-10 04:39:03 +03:00
err = btrfs_commit_transaction ( trans ) ;
2010-10-29 23:41:32 +04:00
}
2008-06-12 05:53:53 +04:00
if ( err & & ! ret )
ret = err ;
2013-02-06 21:06:02 +04:00
2013-12-13 04:51:42 +04:00
if ( ! ret ) {
inode = btrfs_lookup_dentry ( dir , dentry ) ;
2014-01-09 10:57:06 +04:00
if ( IS_ERR ( inode ) )
return PTR_ERR ( inode ) ;
2013-12-13 04:51:42 +04:00
d_instantiate ( dentry , inode ) ;
}
2008-06-12 05:53:53 +04:00
return ret ;
2016-03-24 19:49:22 +03:00
fail_free :
kfree ( root_item ) ;
return ret ;
2008-06-12 05:53:53 +04:00
}
2013-02-28 14:01:15 +04:00
/*
 * Create a snapshot of @root named by @dentry inside directory @dir.
 *
 * The function allocates a pending-snapshot record, starts delalloc
 * writeback for @root and waits for its ordered extents, reserves metadata
 * space, queues the record on the pending_snapshots list of the transaction
 * it starts and then commits that transaction.  After a successful commit
 * the result stored in pending_snapshot->error is checked, orphan cleanup is
 * run on the new snapshot root and the looked-up inode is attached to
 * @dentry.
 *
 * @root:          subvolume root to snapshot; must have BTRFS_ROOT_REF_COWS
 *                 set and must not have active swapfiles
 * @dir:           directory inode recorded in the pending snapshot
 * @dentry:        dentry that is instantiated with the snapshot inode
 * @async_transid: if non-NULL, receives trans->transid and an asynchronous
 *                 commit is attempted first
 * @readonly:      stored in the pending snapshot record
 * @inherit:       qgroup inheritance data stored in the pending snapshot
 *                 record
 *
 * Returns 0 on success, -EINVAL if @root lacks BTRFS_ROOT_REF_COWS,
 * -ETXTBSY if @root has active swapfiles, -ENOMEM on allocation failure, or
 * the error returned by one of the helpers called below.
 */
static int create_snapshot ( struct btrfs_root * root , struct inode * dir ,
2017-02-10 21:54:06 +03:00
struct dentry * dentry ,
2013-02-28 14:01:15 +04:00
u64 * async_transid , bool readonly ,
struct btrfs_qgroup_inherit * inherit )
2008-06-12 05:53:53 +04:00
{
2016-06-23 01:54:23 +03:00
struct btrfs_fs_info * fs_info = btrfs_sb ( dir - > i_sb ) ;
2009-11-12 12:37:02 +03:00
struct inode * inode ;
2008-06-12 05:53:53 +04:00
struct btrfs_pending_snapshot * pending_snapshot ;
struct btrfs_trans_handle * trans ;
2009-11-12 12:37:02 +03:00
int ret ;
Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space
Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") forced
nocow writes to fallback to COW, during writeback, when a snapshot is
created. This resulted in writes made before creating the snapshot to
unexpectedly fail with ENOSPC during writeback when success (0) was
returned to user space through the write system call.
The steps leading to this problem are:
1. When it's not possible to allocate data space for a write, the
buffered write path checks if a NOCOW write is possible. If it is,
it will not reserve space and success (0) is returned to user space.
2. Then when a snapshot is created, the root's will_be_snapshotted
atomic is incremented and writeback is triggered for all inode's that
belong to the root being snapshotted. Incrementing that atomic forces
all previous writes to fallback to COW during writeback (running
delalloc).
3. This results in the writeback for the inodes to fail and therefore
setting the ENOSPC error in their mappings, so that a subsequent
fsync on them will report the error to user space. So it's not a
completely silent data loss (since fsync will report ENOSPC) but it's
a very unexpected and undesirable behaviour, because if a clean
shutdown/unmount of the filesystem happens without previous calls to
fsync, it is expected to have the data present in the files after
mounting the filesystem again.
So fix this by adding a new atomic named snapshot_force_cow to the
root structure which prevents this behaviour and works the following way:
1. It is incremented when we start to create a snapshot after triggering
writeback and before waiting for writeback to finish.
2. This new atomic is now what is used by writeback (running delalloc)
to decide whether we need to fallback to COW or not. Because we
incremented this new atomic after triggering writeback in the
snapshot creation ioctl, we ensure that all buffered writes that
happened before snapshot creation will succeed and not fallback to
COW (which would make them fail with ENOSPC).
3. The existing atomic, will_be_snapshotted, is kept because it is used
to force new buffered writes, that start after we started
snapshotting, to reserve data space even when NOCOW is possible.
This makes these writes fail early with ENOSPC when there's no
available space to allocate, preventing the unexpected behaviour of
writeback later failing with ENOSPC due to a fallback to COW mode.
Fixes: e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting")
Signed-off-by: Robbie Ko <robbieko@synology.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-08-06 05:30:30 +03:00
bool snapshot_force_cow = false ;
2008-06-12 05:53:53 +04:00
2014-04-02 15:51:05 +04:00
if ( ! test_bit ( BTRFS_ROOT_REF_COWS , & root - > state ) )
2008-06-12 05:53:53 +04:00
return - EINVAL ;
Btrfs: prevent ioctls from interfering with a swap file
A later patch will implement swap file support for Btrfs, but before we
do that, we need to make sure that the various Btrfs ioctls cannot
change a swap file.
When a swap file is active, we must make sure that the extents of the
file are not moved and that they don't become shared. That means that
the following are not safe:
- chattr +c (enable compression)
- reflink
- dedupe
- snapshot
- defrag
Don't allow those to happen on an active swap file.
Additionally, balance, resize, device remove, and device replace are
also unsafe if they affect an active swapfile. Add a red-black tree of
block groups and devices which contain an active swapfile. Relocation
checks each block group against this tree and skips it or errors out for
balance or resize, respectively. Device remove and device replace check
the tree for the device they will operate on.
Note that we don't have to worry about chattr -C (disable nocow), which
we ignore for non-empty files, because an active swapfile must be
non-empty and can't be truncated. We also don't have to worry about
autodefrag because it's only done on COW files. Truncate and fallocate
are already taken care of by the generic code. Device add doesn't do
relocation so it's not an issue, either.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2016-11-03 20:28:12 +03:00
if ( atomic_read ( & root - > nr_swapfiles ) ) {
btrfs_warn ( fs_info ,
" cannot snapshot subvolume with active swapfile " ) ;
return - ETXTBSY ;
}
2017-02-13 13:03:44 +03:00
pending_snapshot = kzalloc ( sizeof ( * pending_snapshot ) , GFP_KERNEL ) ;
2015-11-10 20:53:56 +03:00
if ( ! pending_snapshot )
return - ENOMEM ;
2015-11-10 20:54:00 +03:00
pending_snapshot - > root_item = kzalloc ( sizeof ( struct btrfs_root_item ) ,
2017-02-13 13:03:44 +03:00
GFP_KERNEL ) ;
2015-11-10 20:54:03 +03:00
pending_snapshot - > path = btrfs_alloc_path ( ) ;
if ( ! pending_snapshot - > root_item | | ! pending_snapshot - > path ) {
2015-11-10 20:54:00 +03:00
ret = - ENOMEM ;
goto free_pending ;
}
Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space
Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") forced
nocow writes to fallback to COW, during writeback, when a snapshot is
created. This resulted in writes made before creating the snapshot to
unexpectedly fail with ENOSPC during writeback when success (0) was
returned to user space through the write system call.
The steps leading to this problem are:
1. When it's not possible to allocate data space for a write, the
buffered write path checks if a NOCOW write is possible. If it is,
it will not reserve space and success (0) is returned to user space.
2. Then when a snapshot is created, the root's will_be_snapshotted
atomic is incremented and writeback is triggered for all inode's that
belong to the root being snapshotted. Incrementing that atomic forces
all previous writes to fallback to COW during writeback (running
delalloc).
3. This results in the writeback for the inodes to fail and therefore
setting the ENOSPC error in their mappings, so that a subsequent
fsync on them will report the error to user space. So it's not a
completely silent data loss (since fsync will report ENOSPC) but it's
a very unexpected and undesirable behaviour, because if a clean
shutdown/unmount of the filesystem happens without previous calls to
fsync, it is expected to have the data present in the files after
mounting the filesystem again.
So fix this by adding a new atomic named snapshot_force_cow to the
root structure which prevents this behaviour and works the following way:
1. It is incremented when we start to create a snapshot after triggering
writeback and before waiting for writeback to finish.
2. This new atomic is now what is used by writeback (running delalloc)
to decide whether we need to fallback to COW or not. Because we
incremented this new atomic after triggering writeback in the
snapshot creation ioctl, we ensure that all buffered writes that
happened before snapshot creation will succeed and not fallback to
COW (which would make them fail with ENOSPC).
3. The existing atomic, will_be_snapshotted, is kept because it is used
to force new buffered writes, that start after we started
snapshotting, to reserve data space even when NOCOW is possible.
This makes these writes fail early with ENOSPC when there's no
available space to allocate, preventing the unexpected behaviour of
writeback later failing with ENOSPC due to a fallback to COW mode.
Fixes: e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting")
Signed-off-by: Robbie Ko <robbieko@synology.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-08-06 05:30:30 +03:00
/*
* Force new buffered writes to reserve space even when NOCOW is
* possible . This is to avoid later writeback ( running delalloc ) to
* fallback to COW mode and unexpectedly fail with ENOSPC .
*/
2017-06-22 03:19:11 +03:00
atomic_inc ( & root - > will_be_snapshotted ) ;
2014-03-17 21:06:10 +04:00
smp_mb__after_atomic ( ) ;
2017-09-02 01:14:29 +03:00
/* wait for no snapshot writes */
wait_event ( root - > subv_writers - > wait ,
percpu_counter_sum ( & root - > subv_writers - > counter ) = = 0 ) ;
2014-03-06 09:38:19 +04:00
2018-11-01 09:49:03 +03:00
ret = btrfs_start_delalloc_snapshot ( root ) ;
2013-05-15 11:48:24 +04:00
if ( ret )
2015-11-10 20:53:56 +03:00
goto dec_and_free ;
2013-05-15 11:48:24 +04:00
Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space
Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") forced
nocow writes to fallback to COW, during writeback, when a snapshot is
created. This resulted in writes made before creating the snapshot to
unexpectedly fail with ENOSPC during writeback when success (0) was
returned to user space through the write system call.
The steps leading to this problem are:
1. When it's not possible to allocate data space for a write, the
buffered write path checks if a NOCOW write is possible. If it is,
it will not reserve space and success (0) is returned to user space.
2. Then when a snapshot is created, the root's will_be_snapshotted
atomic is incremented and writeback is triggered for all inode's that
belong to the root being snapshotted. Incrementing that atomic forces
all previous writes to fallback to COW during writeback (running
delalloc).
3. This results in the writeback for the inodes to fail and therefore
setting the ENOSPC error in their mappings, so that a subsequent
fsync on them will report the error to user space. So it's not a
completely silent data loss (since fsync will report ENOSPC) but it's
a very unexpected and undesirable behaviour, because if a clean
shutdown/unmount of the filesystem happens without previous calls to
fsync, it is expected to have the data present in the files after
mounting the filesystem again.
So fix this by adding a new atomic named snapshot_force_cow to the
root structure which prevents this behaviour and works the following way:
1. It is incremented when we start to create a snapshot after triggering
writeback and before waiting for writeback to finish.
2. This new atomic is now what is used by writeback (running delalloc)
to decide whether we need to fallback to COW or not. Because we
incremented this new atomic after triggering writeback in the
snapshot creation ioctl, we ensure that all buffered writes that
happened before snapshot creation will succeed and not fallback to
COW (which would make them fail with ENOSPC).
3. The existing atomic, will_be_snapshotted, is kept because it is used
to force new buffered writes, that start after we started
snapshotting, to reserve data space even when NOCOW is possible.
This makes these writes fail early with ENOSPC when there's no
available space to allocate, preventing the unexpected behaviour of
writeback later failing with ENOSPC due to a fallback to COW mode.
Fixes: e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting")
Signed-off-by: Robbie Ko <robbieko@synology.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-08-06 05:30:30 +03:00
/*
* All previous writes have started writeback in NOCOW mode , so now
* we force future writes to fallback to COW mode during snapshot
* creation .
*/
atomic_inc ( & root - > snapshot_force_cow ) ;
snapshot_force_cow = true ;
2017-06-23 19:48:21 +03:00
btrfs_wait_ordered_extents ( root , U64_MAX , 0 , ( u64 ) - 1 ) ;
2013-05-15 11:48:24 +04:00
2012-09-06 14:02:28 +04:00
btrfs_init_block_rsv ( & pending_snapshot - > block_rsv ,
BTRFS_BLOCK_RSV_TEMP ) ;
2013-02-28 14:04:33 +04:00
/*
* 1 - parent dir inode
* 2 - dir entries
* 1 - root item
* 2 - root ref / backref
* 1 - root of snapshot
2013-08-15 19:11:20 +04:00
* 1 - UUID item
2013-02-28 14:04:33 +04:00
*/
ret = btrfs_subvolume_reserve_metadata ( BTRFS_I ( dir ) - > root ,
2013-08-15 19:11:20 +04:00
& pending_snapshot - > block_rsv , 8 ,
2013-07-10 00:37:21 +04:00
false ) ;
2013-02-28 14:04:33 +04:00
if ( ret )
2015-11-10 20:53:56 +03:00
goto dec_and_free ;
2013-02-28 14:04:33 +04:00
2008-11-18 05:02:50 +03:00
pending_snapshot - > dentry = dentry ;
2008-06-12 05:53:53 +04:00
pending_snapshot - > root = root ;
2010-12-20 11:04:08 +03:00
pending_snapshot - > readonly = readonly ;
2013-02-28 14:01:15 +04:00
pending_snapshot - > dir = dir ;
2013-02-07 10:02:44 +04:00
pending_snapshot - > inherit = inherit ;
2010-05-16 18:48:46 +04:00
2013-02-28 14:04:33 +04:00
trans = btrfs_start_transaction ( root , 0 ) ;
2010-05-16 18:48:46 +04:00
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
goto fail ;
}
2016-06-23 01:54:23 +03:00
/* Queue the record on the transaction , under fs_info - > trans_lock . */
spin_lock ( & fs_info - > trans_lock ) ;
2008-06-12 05:53:53 +04:00
list_add ( & pending_snapshot - > list ,
& trans - > transaction - > pending_snapshots ) ;
2016-06-23 01:54:23 +03:00
spin_unlock ( & fs_info - > trans_lock ) ;
2010-10-29 23:41:32 +04:00
/*
 * Report the transaction id to the caller when requested and try an
 * asynchronous commit first ; if that fails , fall back to a synchronous
 * commit .
 */
if ( async_transid ) {
* async_transid = trans - > transid ;
2016-09-10 04:39:03 +03:00
ret = btrfs_commit_transaction_async ( trans , 1 ) ;
2013-03-04 13:45:06 +04:00
if ( ret )
2016-09-10 04:39:03 +03:00
ret = btrfs_commit_transaction ( trans ) ;
2010-10-29 23:41:32 +04:00
} else {
2016-09-10 04:39:03 +03:00
ret = btrfs_commit_transaction ( trans ) ;
2010-10-29 23:41:32 +04:00
}
2013-03-04 13:44:29 +04:00
if ( ret )
2012-10-22 23:51:44 +04:00
goto fail ;
2010-05-16 18:48:46 +04:00
/*
 * The commit itself succeeded ; now check the snapshot - specific result
 * stored in the pending record ( presumably filled in while the
 * transaction was committed - TODO confirm ) .
 */
ret = pending_snapshot - > error ;
if ( ret )
goto fail ;
2014-10-16 00:50:56 +04:00
ret = btrfs_orphan_cleanup ( pending_snapshot - > snap ) ;
if ( ret )
goto fail ;
2015-03-18 01:25:59 +03:00
inode = btrfs_lookup_dentry ( d_inode ( dentry - > d_parent ) , dentry ) ;
2009-11-12 12:37:02 +03:00
if ( IS_ERR ( inode ) ) {
ret = PTR_ERR ( inode ) ;
goto fail ;
}
2013-12-13 04:51:42 +04:00
2009-11-12 12:37:02 +03:00
d_instantiate ( dentry , inode ) ;
ret = 0 ;
/* Reached on success and on failure : drop the metadata reservation . */
fail :
2017-02-10 21:18:18 +03:00
btrfs_subvolume_release_metadata ( fs_info , & pending_snapshot - > block_rsv ) ;
2015-11-10 20:53:56 +03:00
/*
 * Undo the counters bumped above ; snapshot_force_cow is only dropped
 * when the local flag says it was actually incremented .
 */
dec_and_free :
Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space
Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") forced
nocow writes to fallback to COW, during writeback, when a snapshot is
created. This resulted in writes made before creating the snapshot to
unexpectedly fail with ENOSPC during writeback when success (0) was
returned to user space through the write system call.
The steps leading to this problem are:
1. When it's not possible to allocate data space for a write, the
buffered write path checks if a NOCOW write is possible. If it is,
it will not reserve space and success (0) is returned to user space.
2. Then when a snapshot is created, the root's will_be_snapshotted
atomic is incremented and writeback is triggered for all inode's that
belong to the root being snapshotted. Incrementing that atomic forces
all previous writes to fallback to COW during writeback (running
delalloc).
3. This results in the writeback for the inodes to fail and therefore
setting the ENOSPC error in their mappings, so that a subsequent
fsync on them will report the error to user space. So it's not a
completely silent data loss (since fsync will report ENOSPC) but it's
a very unexpected and undesirable behaviour, because if a clean
shutdown/unmount of the filesystem happens without previous calls to
fsync, it is expected to have the data present in the files after
mounting the filesystem again.
So fix this by adding a new atomic named snapshot_force_cow to the
root structure which prevents this behaviour and works the following way:
1. It is incremented when we start to create a snapshot after triggering
writeback and before waiting for writeback to finish.
2. This new atomic is now what is used by writeback (running delalloc)
to decide whether we need to fallback to COW or not. Because we
incremented this new atomic after triggering writeback in the
snapshot creation ioctl, we ensure that all buffered writes that
happened before snapshot creation will succeed and not fallback to
COW (which would make them fail with ENOSPC).
3. The existing atomic, will_be_snapshotted, is kept because it is used
to force new buffered writes, that start after we started
snapshotting, to reserve data space even when NOCOW is possible.
This makes these writes fail early with ENOSPC when there's no
available space to allocate, preventing the unexpected behaviour of
writeback later failing with ENOSPC due to a fallback to COW mode.
Fixes: e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting")
Signed-off-by: Robbie Ko <robbieko@synology.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-08-06 05:30:30 +03:00
if ( snapshot_force_cow )
atomic_dec ( & root - > snapshot_force_cow ) ;
2017-06-22 03:19:11 +03:00
if ( atomic_dec_and_test ( & root - > will_be_snapshotted ) )
2018-03-15 13:43:08 +03:00
wake_up_var ( & root - > will_be_snapshotted ) ;
2015-11-10 20:54:00 +03:00
/*
 * Either pointer may still be NULL when an allocation above failed ; this
 * relies on kfree ( ) and btrfs_free_path ( ) accepting NULL .
 */
free_pending :
kfree ( pending_snapshot - > root_item ) ;
2015-11-10 20:54:03 +03:00
btrfs_free_path ( pending_snapshot - > path ) ;
2015-11-10 20:53:56 +03:00
kfree ( pending_snapshot ) ;
2008-06-12 05:53:53 +04:00
return ret ;
}
2010-10-29 23:46:43 +04:00
/* Copy of may_delete() from fs/namei.c.
* Check whether we can remove a link victim from directory dir , check
* whether the type of victim is right .
* 1. We can ' t do it if dir is read - only ( done in permission ( ) )
* 2. We should have write and exec permissions on dir
* 3. We can ' t remove anything from append - only dir
* 4. We can ' t do anything with immutable dir ( done in permission ( ) )
* 5. If the sticky bit on dir is set we should either
* a . be owner of dir , or
* b . be owner of victim , or
* c . have CAP_FOWNER capability
2016-05-20 04:18:45 +03:00
* 6. If the victim is append-only, immutable or a swapfile (IS_SWAPFILE)
*    we cannot do anything with
2010-10-29 23:46:43 +04:00
* links pointing to it .
* 7. If we were asked to remove a directory and victim isn ' t one - ENOTDIR .
* 8. If we were asked to remove a non - directory and victim isn ' t one - EISDIR .
* 9. We can ' t remove a root or mountpoint .
* 10. We don ' t allow removal of NFS sillyrenamed files ; it ' s handled by
* nfs_async_unlink ( ) .
*
* @dir:    parent directory inode; must be the inode of victim->d_parent
*          (enforced with BUG_ON)
* @victim: dentry of the entry to remove
* @isdir:  non-zero when the caller is removing a directory
*
* Returns 0 when the removal is allowed.  Otherwise returns -ENOENT (victim
* is negative or dir is dead), the error from inode_permission(), -EPERM
* (rules 3, 5 and 6), -ENOTDIR, -EISDIR or -EBUSY (victim is a root dentry
* or has DCACHE_NFSFS_RENAMED set).
*
* NOTE(review): for rule 9 only the IS_ROOT() test is visible in this
* function; no mountpoint check is performed here.
*/
2013-10-31 09:03:04 +04:00
static int btrfs_may_delete ( struct inode * dir , struct dentry * victim , int isdir )
2010-10-29 23:46:43 +04:00
{
int error ;
2015-03-18 01:25:59 +03:00
if ( d_really_is_negative ( victim ) )
2010-10-29 23:46:43 +04:00
return - ENOENT ;
2015-03-18 01:25:59 +03:00
BUG_ON ( d_inode ( victim - > d_parent ) ! = dir ) ;
2012-10-10 23:25:25 +04:00
audit_inode_child ( dir , victim , AUDIT_TYPE_CHILD_DELETE ) ;
2010-10-29 23:46:43 +04:00
error = inode_permission ( dir , MAY_WRITE | MAY_EXEC ) ;
if ( error )
return error ;
if ( IS_APPEND ( dir ) )
return - EPERM ;
2015-03-18 01:25:59 +03:00
if ( check_sticky ( dir , d_inode ( victim ) ) | | IS_APPEND ( d_inode ( victim ) ) | |
IS_IMMUTABLE ( d_inode ( victim ) ) | | IS_SWAPFILE ( d_inode ( victim ) ) )
2010-10-29 23:46:43 +04:00
return - EPERM ;
if ( isdir ) {
VFS: (Scripted) Convert S_ISLNK/DIR/REG(dentry->d_inode) to d_is_*(dentry)
Convert the following where appropriate:
(1) S_ISLNK(dentry->d_inode) to d_is_symlink(dentry).
(2) S_ISREG(dentry->d_inode) to d_is_reg(dentry).
(3) S_ISDIR(dentry->d_inode) to d_is_dir(dentry). This is actually more
complicated than it appears as some calls should be converted to
d_can_lookup() instead. The difference is whether the directory in
question is a real dir with a ->lookup op or whether it's a fake dir with
a ->d_automount op.
In some circumstances, we can subsume checks for dentry->d_inode not being
NULL into this, provided we the code isn't in a filesystem that expects
d_inode to be NULL if the dirent really *is* negative (ie. if we're going to
use d_inode() rather than d_backing_inode() to get the inode pointer).
Note that the dentry type field may be set to something other than
DCACHE_MISS_TYPE when d_inode is NULL in the case of unionmount, where the VFS
manages the fall-through from a negative dentry to a lower layer. In such a
case, the dentry type of the negative union dentry is set to the same as the
type of the lower dentry.
However, if you know d_inode is not NULL at the call site, then you can use
the d_is_xxx() functions even in a filesystem.
There is one further complication: a 0,0 chardev dentry may be labelled
DCACHE_WHITEOUT_TYPE rather than DCACHE_SPECIAL_TYPE. Strictly, this was
intended for special directory entry types that don't have attached inodes.
The following perl+coccinelle script was used:
use strict;
my @callers;
open($fd, 'git grep -l \'S_IS[A-Z].*->d_inode\' |') ||
die "Can't grep for S_ISDIR and co. callers";
@callers = <$fd>;
close($fd);
unless (@callers) {
print "No matches\n";
exit(0);
}
my @cocci = (
'@@',
'expression E;',
'@@',
'',
'- S_ISLNK(E->d_inode->i_mode)',
'+ d_is_symlink(E)',
'',
'@@',
'expression E;',
'@@',
'',
'- S_ISDIR(E->d_inode->i_mode)',
'+ d_is_dir(E)',
'',
'@@',
'expression E;',
'@@',
'',
'- S_ISREG(E->d_inode->i_mode)',
'+ d_is_reg(E)' );
my $coccifile = "tmp.sp.cocci";
open($fd, ">$coccifile") || die $coccifile;
print($fd "$_\n") || die $coccifile foreach (@cocci);
close($fd);
foreach my $file (@callers) {
chomp $file;
print "Processing ", $file, "\n";
system("spatch", "--sp-file", $coccifile, $file, "--in-place", "--no-show-diff") == 0 ||
die "spatch failed";
}
[AV: overlayfs parts skipped]
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-01-29 15:02:35 +03:00
if ( ! d_is_dir ( victim ) )
2010-10-29 23:46:43 +04:00
return - ENOTDIR ;
if ( IS_ROOT ( victim ) )
return - EBUSY ;
VFS: (Scripted) Convert S_ISLNK/DIR/REG(dentry->d_inode) to d_is_*(dentry)
Convert the following where appropriate:
(1) S_ISLNK(dentry->d_inode) to d_is_symlink(dentry).
(2) S_ISREG(dentry->d_inode) to d_is_reg(dentry).
(3) S_ISDIR(dentry->d_inode) to d_is_dir(dentry). This is actually more
complicated than it appears as some calls should be converted to
d_can_lookup() instead. The difference is whether the directory in
question is a real dir with a ->lookup op or whether it's a fake dir with
a ->d_automount op.
In some circumstances, we can subsume checks for dentry->d_inode not being
NULL into this, provided we the code isn't in a filesystem that expects
d_inode to be NULL if the dirent really *is* negative (ie. if we're going to
use d_inode() rather than d_backing_inode() to get the inode pointer).
Note that the dentry type field may be set to something other than
DCACHE_MISS_TYPE when d_inode is NULL in the case of unionmount, where the VFS
manages the fall-through from a negative dentry to a lower layer. In such a
case, the dentry type of the negative union dentry is set to the same as the
type of the lower dentry.
However, if you know d_inode is not NULL at the call site, then you can use
the d_is_xxx() functions even in a filesystem.
There is one further complication: a 0,0 chardev dentry may be labelled
DCACHE_WHITEOUT_TYPE rather than DCACHE_SPECIAL_TYPE. Strictly, this was
intended for special directory entry types that don't have attached inodes.
The following perl+coccinelle script was used:
use strict;
my @callers;
open($fd, 'git grep -l \'S_IS[A-Z].*->d_inode\' |') ||
die "Can't grep for S_ISDIR and co. callers";
@callers = <$fd>;
close($fd);
unless (@callers) {
print "No matches\n";
exit(0);
}
my @cocci = (
'@@',
'expression E;',
'@@',
'',
'- S_ISLNK(E->d_inode->i_mode)',
'+ d_is_symlink(E)',
'',
'@@',
'expression E;',
'@@',
'',
'- S_ISDIR(E->d_inode->i_mode)',
'+ d_is_dir(E)',
'',
'@@',
'expression E;',
'@@',
'',
'- S_ISREG(E->d_inode->i_mode)',
'+ d_is_reg(E)' );
my $coccifile = "tmp.sp.cocci";
open($fd, ">$coccifile") || die $coccifile;
print($fd "$_\n") || die $coccifile foreach (@cocci);
close($fd);
foreach my $file (@callers) {
chomp $file;
print "Processing ", $file, "\n";
system("spatch", "--sp-file", $coccifile, $file, "--in-place", "--no-show-diff") == 0 ||
die "spatch failed";
}
[AV: overlayfs parts skipped]
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-01-29 15:02:35 +03:00
} else if ( d_is_dir ( victim ) )
2010-10-29 23:46:43 +04:00
return - EISDIR ;
if ( IS_DEADDIR ( dir ) )
return - ENOENT ;
if ( victim - > d_flags & DCACHE_NFSFS_RENAMED )
return - EBUSY ;
return 0 ;
}
2008-10-09 21:39:39 +04:00
/*
 * Copy of may_create() from fs/namei.c.
 *
 * Check whether @child may be created inside directory @dir.
 *
 * Returns -EEXIST if @child already has an inode attached, -ENOENT if @dir
 * is marked dead (IS_DEADDIR), otherwise the result of
 * inode_permission(dir, MAY_WRITE | MAY_EXEC).
 */
static inline int btrfs_may_create ( struct inode * dir , struct dentry * child )
{
2015-03-18 01:25:59 +03:00
if ( d_really_is_positive ( child ) )
2008-10-09 21:39:39 +04:00
return - EEXIST ;
if ( IS_DEADDIR ( dir ) )
return - ENOENT ;
return inode_permission ( dir , MAY_WRITE | MAY_EXEC ) ;
}
/*
* Create a new subvolume below @ parent . This is largely modeled after
* sys_mkdirat and vfs_mkdir , but we only do a single component lookup
* inside this filesystem so it ' s quite a bit simpler .
*
* @parent:        path whose dentry is the directory to create the entry in
* @name:          name of the new entry
* @namelen:       length of @name
* @snap_src:      if non-NULL, create a snapshot of this root via
*                 create_snapshot(); otherwise create a new subvolume via
*                 create_subvol()
* @async_transid: passed through to create_snapshot() / create_subvol()
* @readonly:      passed to create_snapshot() only
* @inherit:       passed through to create_snapshot() / create_subvol()
*
* Returns 0 on success or a negative errno from the lock, lookup,
* permission, collision check or creation step.
*
* NOTE(review): when btrfs_root_refs() of the parent root is 0 the function
* jumps to out_up_read with error still 0 from the preceding successful
* collision check, so the caller sees success although nothing was created.
* Confirm whether an error code is intended there.
*/
2016-11-21 03:34:31 +03:00
static noinline int btrfs_mksubvol ( const struct path * parent ,
2017-02-14 20:33:53 +03:00
const char * name , int namelen ,
2010-10-29 23:41:32 +04:00
struct btrfs_root * snap_src ,
2011-09-14 17:58:21 +04:00
u64 * async_transid , bool readonly ,
2013-02-07 10:02:44 +04:00
struct btrfs_qgroup_inherit * inherit )
2008-10-09 21:39:39 +04:00
{
2016-06-23 01:54:23 +03:00
struct inode * dir = d_inode ( parent - > dentry ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( dir - > i_sb ) ;
2008-10-09 21:39:39 +04:00
struct dentry * dentry ;
int error ;
2016-05-26 07:05:12 +03:00
/* Take the parent directory lock ; give up if the wait was interrupted . */
error = down_write_killable_nested ( & dir - > i_rwsem , I_MUTEX_PARENT ) ;
if ( error = = - EINTR )
return error ;
2008-10-09 21:39:39 +04:00
dentry = lookup_one_len ( name , parent - > dentry , namelen ) ;
error = PTR_ERR ( dentry ) ;
if ( IS_ERR ( dentry ) )
goto out_unlock ;
2009-09-22 00:00:26 +04:00
error = btrfs_may_create ( dir , dentry ) ;
2008-10-09 21:39:39 +04:00
if ( error )
2012-06-29 13:58:46 +04:00
goto out_dput ;
2008-10-09 21:39:39 +04:00
2012-12-17 23:26:57 +04:00
/*
* even if this name doesn ' t exist , we may get hash collisions .
* check for them now when we can safely fail
*/
error = btrfs_check_dir_item_collision ( BTRFS_I ( dir ) - > root ,
dir - > i_ino , name ,
namelen ) ;
if ( error )
goto out_dput ;
2016-06-23 01:54:23 +03:00
down_read ( & fs_info - > subvol_sem ) ;
2009-09-22 00:00:26 +04:00
/* Parent root has no references left : skip creation ( error is 0 here ) . */
if ( btrfs_root_refs ( & BTRFS_I ( dir ) - > root - > root_item ) = = 0 )
goto out_up_read ;
2008-11-18 05:02:50 +03:00
if ( snap_src ) {
2017-02-10 21:54:06 +03:00
error = create_snapshot ( snap_src , dir , dentry ,
2011-09-14 17:58:21 +04:00
async_transid , readonly , inherit ) ;
2008-11-18 05:02:50 +03:00
} else {
2013-02-28 14:04:33 +04:00
error = create_subvol ( dir , dentry , name , namelen ,
async_transid , inherit ) ;
2008-11-18 05:02:50 +03:00
}
2009-09-22 00:00:26 +04:00
if ( ! error )
fsnotify_mkdir ( dir , dentry ) ;
/* Cleanup labels release resources in reverse order of acquisition . */
out_up_read :
2016-06-23 01:54:23 +03:00
up_read ( & fs_info - > subvol_sem ) ;
2008-10-09 21:39:39 +04:00
out_dput :
dput ( dentry ) ;
out_unlock :
2016-01-22 23:40:57 +03:00
inode_unlock ( dir ) ;
2008-10-09 21:39:39 +04:00
return error ;
}
2011-05-24 23:35:30 +04:00
/*
* When we ' re defragging a range , we don ' t want to kick it off again
* if it is really just waiting for delalloc to send it down .
* If we find a nice big extent or delalloc range for the bytes in the
* file you want to defrag , we return 0 to let you know to skip this
* part of the file
*/
2014-07-29 19:32:10 +04:00
static int check_defrag_in_cache ( struct inode * inode , u64 offset , u32 thresh )
2011-05-24 23:35:30 +04:00
{
struct extent_io_tree * io_tree = & BTRFS_I ( inode ) - > io_tree ;
struct extent_map * em = NULL ;
struct extent_map_tree * em_tree = & BTRFS_I ( inode ) - > extent_tree ;
u64 end ;
read_lock ( & em_tree - > lock ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
em = lookup_extent_mapping ( em_tree , offset , PAGE_SIZE ) ;
2011-05-24 23:35:30 +04:00
read_unlock ( & em_tree - > lock ) ;
if ( em ) {
end = extent_map_end ( em ) ;
free_extent_map ( em ) ;
if ( end - offset > thresh )
return 0 ;
}
/* if we already have a nice delalloc here, just stop */
thresh / = 2 ;
end = count_range_bits ( io_tree , & offset , offset + thresh ,
thresh , EXTENT_DELALLOC , 1 ) ;
if ( end > = thresh )
return 0 ;
return 1 ;
}
/*
 * Walk the file extent items of @inode, starting at file offset *@off, and
 * look for a regular extent that is newer than transid @newer_than and
 * smaller than @thresh.  The defrag code uses this to locate new, small
 * extents.
 *
 * Each btrfs_search_forward() call positions @path on a leaf; every
 * remaining slot of that leaf is then examined before searching again, so
 * one tree search serves a whole leaf.
 *
 * Returns 0 and stores the matching key offset in *@off on success,
 * -ENOMEM if no path could be allocated, and -ENOENT in every other case
 * (including any non-zero return from btrfs_search_forward()).
 */
static int find_new_extents(struct btrfs_root *root,
			    struct inode *inode, u64 newer_than,
			    u64 *off, u32 thresh)
{
	u64 ino = btrfs_ino(BTRFS_I(inode));
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	struct btrfs_key min_key;
	struct btrfs_path *path;
	int result = -ENOENT;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	min_key.objectid = ino;
	min_key.type = BTRFS_EXTENT_DATA_KEY;
	min_key.offset = *off;

	for (;;) {
		if (btrfs_search_forward(root, &min_key, path, newer_than) != 0)
			break;

		/* Scan every slot left in the leaf the search landed on. */
		for (;;) {
			/* Left this inode's extent items: nothing more to find. */
			if (min_key.objectid != ino ||
			    min_key.type != BTRFS_EXTENT_DATA_KEY)
				goto out;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			if (btrfs_file_extent_type(leaf, fi) ==
						BTRFS_FILE_EXTENT_REG &&
			    btrfs_file_extent_num_bytes(leaf, fi) < thresh &&
			    check_defrag_in_cache(inode, min_key.offset,
						  thresh)) {
				*off = min_key.offset;
				result = 0;
				goto out;
			}

			path->slots[0]++;
			if (path->slots[0] >= btrfs_header_nritems(leaf))
				break;
			btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
		}

		/* Leaf exhausted: resume the search just past the last key. */
		if (min_key.offset == (u64)-1)
			break;
		min_key.offset++;
		btrfs_release_path(path);
	}
out:
	btrfs_free_path(path);
	return result;
}
2012-06-11 12:03:35 +04:00
/*
 * Return the extent map covering the page-sized range that begins at
 * @start, or NULL if it cannot be obtained.
 *
 * The inode's extent map tree is consulted first under its read lock.  Only
 * on a miss is the extent range locked and btrfs_get_extent() called to
 * read the mapping in.  A non-NULL result carries a reference that the
 * callers in this file release with free_extent_map().
 */
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
{
	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached = NULL;
	struct extent_map *em;
	u64 len = PAGE_SIZE;
	u64 end = start + len - 1;

	/* Fast path: hopefully the mapping is already in the tree. */
	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, start, len);
	read_unlock(&map_tree->lock);
	if (em)
		return em;

	/* Slow path: take the extent lock and read metadata off disk. */
	lock_extent_bits(tree, start, end, &cached);
	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
	unlock_extent_cached(tree, start, end, &cached);

	return IS_ERR(em) ? NULL : em;
}
2012-03-29 17:57:45 +04:00
2012-06-11 12:03:35 +04:00
static bool defrag_check_next_extent ( struct inode * inode , struct extent_map * em )
{
struct extent_map * next ;
bool ret = true ;
/* this is the last extent */
if ( em - > start + em - > len > = i_size_read ( inode ) )
return false ;
next = defrag_lookup_extent ( inode , em - > start + em - > len ) ;
2014-08-27 00:55:54 +04:00
if ( ! next | | next - > block_start > = EXTENT_MAP_LAST_BYTE )
ret = false ;
else if ( ( em - > block_start + em - > block_len = = next - > block_start ) & &
2015-12-14 19:42:10 +03:00
( em - > block_len > SZ_128K & & next - > block_len > SZ_128K ) )
2012-06-11 12:03:35 +04:00
ret = false ;
free_extent_map ( next ) ;
2012-03-29 17:57:45 +04:00
return ret ;
}
2014-07-29 19:32:10 +04:00
/*
 * Decide whether the extent covering file offset @start should be
 * defragged.
 *
 * @thresh:     extents of at least this length are considered big enough
 * @last_len:   running count of bytes defragged so far; reset to 0 whenever
 *              an extent is skipped so that the next tiny extent forces a
 *              defrag
 * @skip:       set to the end of the extent when it is skipped, 0 otherwise
 * @defrag_end: end of the extent currently being defragged, or 0 when no
 *              run is in progress; a non-zero value also means the previous
 *              extent can be merged with this one
 * @compress:   when non-zero, real extents are always defragged
 *
 * Returns 1 to defrag and 0 to skip.  When no extent map can be found the
 * function returns 0 having only cleared *@skip.
 */
static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
			       u64 *last_len, u64 *skip, u64 *defrag_end,
			       int compress)
{
	struct extent_map *em;
	u64 em_end;
	int defrag;

	/*
	 * make sure that once we start defragging an extent, we keep on
	 * defragging it
	 */
	if (start < *defrag_end)
		return 1;

	*skip = 0;

	em = defrag_lookup_extent(inode, start);
	if (!em)
		return 0;

	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
		/* holes and inline extents are never defragged */
		defrag = 0;
	} else {
		bool prev_mergeable = *defrag_end != 0;
		bool next_mergeable = defrag_check_next_extent(inode, em);
		bool run_reset = *last_len == 0 || *last_len >= thresh;
		bool not_worth_it = em->len >= thresh ||
				    (!next_mergeable && !prev_mergeable);

		/*
		 * A real extent is left alone only if we are not compressing,
		 * no partial run is pending, and it is either big already or
		 * has no mergeable neighbour on either side.
		 */
		defrag = compress || !run_reset || !not_worth_it;
	}

	/*
	 * Skipping an extent resets *last_len, so tiny extents in front of a
	 * single big extent force at least part of that big extent to be
	 * defragged.
	 */
	em_end = extent_map_end(em);
	if (defrag) {
		*defrag_end = em_end;
	} else {
		*last_len = 0;
		*skip = em_end;
		*defrag_end = 0;
	}

	free_extent_map(em);
	return defrag;
}
2011-05-24 23:35:30 +04:00
/*
* it doesn ' t do much good to defrag one or two pages
* at a time . This pulls in a nice chunk of pages
* to COW and defrag .
*
* It also makes sure the delalloc code has enough
* dirty data to avoid making new small extents as part
* of the defrag
*
* It ' s a good idea to start RA on this range
* before calling this .
*/
static int cluster_pages_for_defrag ( struct inode * inode ,
struct page * * pages ,
unsigned long start_index ,
2014-01-21 23:18:29 +04:00
unsigned long num_pages )
2008-06-12 05:53:53 +04:00
{
2011-05-24 23:35:30 +04:00
unsigned long file_end ;
u64 isize = i_size_read ( inode ) ;
u64 page_start ;
u64 page_end ;
2012-03-29 17:57:44 +04:00
u64 page_cnt ;
2011-05-24 23:35:30 +04:00
int ret ;
int i ;
int i_done ;
2008-07-24 19:57:52 +04:00
struct btrfs_ordered_extent * ordered ;
2011-05-24 23:35:30 +04:00
struct extent_state * cached_state = NULL ;
2012-02-16 11:01:24 +04:00
struct extent_io_tree * tree ;
2017-02-27 10:10:38 +03:00
struct extent_changeset * data_reserved = NULL ;
2011-09-21 23:05:58 +04:00
gfp_t mask = btrfs_alloc_write_mask ( inode - > i_mapping ) ;
2011-05-24 23:35:30 +04:00
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
file_end = ( isize - 1 ) > > PAGE_SHIFT ;
2012-03-29 17:57:44 +04:00
if ( ! isize | | start_index > file_end )
return 0 ;
page_cnt = min_t ( u64 , ( u64 ) num_pages , ( u64 ) file_end - start_index + 1 ) ;
2011-05-24 23:35:30 +04:00
2017-02-27 10:10:38 +03:00
ret = btrfs_delalloc_reserve_space ( inode , & data_reserved ,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
start_index < < PAGE_SHIFT ,
page_cnt < < PAGE_SHIFT ) ;
2011-05-24 23:35:30 +04:00
if ( ret )
return ret ;
i_done = 0 ;
2012-02-16 11:01:24 +04:00
tree = & BTRFS_I ( inode ) - > io_tree ;
2011-05-24 23:35:30 +04:00
/* step one, lock all the pages */
2012-03-29 17:57:44 +04:00
for ( i = 0 ; i < page_cnt ; i + + ) {
2011-05-24 23:35:30 +04:00
struct page * page ;
2012-02-16 11:01:24 +04:00
again :
2011-07-11 18:47:06 +04:00
page = find_or_create_page ( inode - > i_mapping ,
2012-02-16 11:01:24 +04:00
start_index + i , mask ) ;
2011-05-24 23:35:30 +04:00
if ( ! page )
break ;
2012-02-16 11:01:24 +04:00
page_start = page_offset ( page ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
page_end = page_start + PAGE_SIZE - 1 ;
2012-02-16 11:01:24 +04:00
while ( 1 ) {
2014-03-11 17:56:15 +04:00
lock_extent_bits ( tree , page_start , page_end ,
2015-12-03 16:30:40 +03:00
& cached_state ) ;
2012-02-16 11:01:24 +04:00
ordered = btrfs_lookup_ordered_extent ( inode ,
page_start ) ;
2014-03-11 17:56:15 +04:00
unlock_extent_cached ( tree , page_start , page_end ,
2017-12-12 23:43:52 +03:00
& cached_state ) ;
2012-02-16 11:01:24 +04:00
if ( ! ordered )
break ;
unlock_page ( page ) ;
btrfs_start_ordered_extent ( inode , ordered , 1 ) ;
btrfs_put_ordered_extent ( ordered ) ;
lock_page ( page ) ;
2012-03-29 17:57:44 +04:00
/*
* we unlocked the page above , so we need check if
* it was released or not .
*/
if ( page - > mapping ! = inode - > i_mapping ) {
unlock_page ( page ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
put_page ( page ) ;
2012-03-29 17:57:44 +04:00
goto again ;
}
2012-02-16 11:01:24 +04:00
}
2011-05-24 23:35:30 +04:00
if ( ! PageUptodate ( page ) ) {
btrfs_readpage ( NULL , page ) ;
lock_page ( page ) ;
if ( ! PageUptodate ( page ) ) {
unlock_page ( page ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
put_page ( page ) ;
2011-05-24 23:35:30 +04:00
ret = - EIO ;
break ;
}
}
2012-02-16 11:01:24 +04:00
if ( page - > mapping ! = inode - > i_mapping ) {
unlock_page ( page ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
put_page ( page ) ;
2012-02-16 11:01:24 +04:00
goto again ;
}
2011-05-24 23:35:30 +04:00
pages [ i ] = page ;
i_done + + ;
}
if ( ! i_done | | ret )
goto out ;
2017-11-28 00:05:09 +03:00
if ( ! ( inode - > i_sb - > s_flags & SB_ACTIVE ) )
2011-05-24 23:35:30 +04:00
goto out ;
/*
* so now we have a nice long stream of locked
* and up to date pages , lets wait on them
*/
for ( i = 0 ; i < i_done ; i + + )
wait_on_page_writeback ( pages [ i ] ) ;
page_start = page_offset ( pages [ 0 ] ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
page_end = page_offset ( pages [ i_done - 1 ] ) + PAGE_SIZE ;
2011-05-24 23:35:30 +04:00
lock_extent_bits ( & BTRFS_I ( inode ) - > io_tree ,
2015-12-03 16:30:40 +03:00
page_start , page_end - 1 , & cached_state ) ;
2011-05-24 23:35:30 +04:00
clear_extent_bit ( & BTRFS_I ( inode ) - > io_tree , page_start ,
2019-08-16 00:04:04 +03:00
page_end - 1 , EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG , 0 , 0 , & cached_state ) ;
2011-05-24 23:35:30 +04:00
2012-03-29 17:57:44 +04:00
if ( i_done ! = page_cnt ) {
2011-07-15 19:16:44 +04:00
spin_lock ( & BTRFS_I ( inode ) - > lock ) ;
2018-09-05 06:07:33 +03:00
btrfs_mod_outstanding_extents ( BTRFS_I ( inode ) , 1 ) ;
2011-07-15 19:16:44 +04:00
spin_unlock ( & BTRFS_I ( inode ) - > lock ) ;
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range aligned to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 10:10:39 +03:00
btrfs_delalloc_release_space ( inode , data_reserved ,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
start_index < < PAGE_SHIFT ,
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transaction meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 10:34:32 +03:00
( page_cnt - i_done ) < < PAGE_SHIFT , true ) ;
2011-05-24 23:35:30 +04:00
}
2012-09-06 05:10:51 +04:00
set_extent_defrag ( & BTRFS_I ( inode ) - > io_tree , page_start , page_end - 1 ,
2016-04-27 00:54:39 +03:00
& cached_state ) ;
2011-05-24 23:35:30 +04:00
unlock_extent_cached ( & BTRFS_I ( inode ) - > io_tree ,
2017-12-12 23:43:52 +03:00
page_start , page_end - 1 , & cached_state ) ;
2011-05-24 23:35:30 +04:00
for ( i = 0 ; i < i_done ; i + + ) {
clear_page_dirty_for_io ( pages [ i ] ) ;
ClearPageChecked ( pages [ i ] ) ;
set_page_extent_mapped ( pages [ i ] ) ;
set_page_dirty ( pages [ i ] ) ;
unlock_page ( pages [ i ] ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
put_page ( pages [ i ] ) ;
2011-05-24 23:35:30 +04:00
}
btrfs: qgroup: Always free PREALLOC META reserve in btrfs_delalloc_release_extents()
[Background]
Btrfs qgroup uses two types of reserved space for METADATA space,
PERTRANS and PREALLOC.
PERTRANS is metadata space reserved for each transaction started by
btrfs_start_transaction().
While PREALLOC is for delalloc, where we reserve space before joining a
transaction, and finally it will be converted to PERTRANS after the
writeback is done.
[Inconsistency]
However there is inconsistency in how we handle PREALLOC metadata space.
The most obvious one is:
In btrfs_buffered_write():
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, true);
We always free qgroup PREALLOC meta space.
While in btrfs_truncate_block():
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
We only free qgroup PREALLOC meta space when something went wrong.
[The Correct Behavior]
The correct behavior should be the one in btrfs_buffered_write(), we
should always free PREALLOC metadata space.
The reason is, the btrfs_delalloc_* mechanism works by:
- Reserve metadata first, even it's not necessary
In btrfs_delalloc_reserve_metadata()
- Free the unused metadata space
Normally in:
btrfs_delalloc_release_extents()
|- btrfs_inode_rsv_release()
Here we do calculation on whether we should release or not.
E.g. for 64K buffered write, the metadata rsv works like:
/* The first page */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=0
total: num_bytes=calc_inode_reservations()
/* The first page caused one outstanding extent, thus needs metadata
rsv */
/* The 2nd page */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=calc_inode_reservations()
total: not changed
/* The 2nd page doesn't cause new outstanding extent, needs no new meta
rsv, so we free what we have reserved */
/* The 3rd~16th pages */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=calc_inode_reservations()
total: not changed (still space for one outstanding extent)
This means, if btrfs_delalloc_release_extents() determines to free some
space, then those space should be freed NOW.
So for qgroup, we should call btrfs_qgroup_free_meta_prealloc() other
than btrfs_qgroup_convert_reserved_meta().
The good news is:
- The callers are not that hot
The hottest caller is in btrfs_buffered_write(), which is already
fixed by commit 336a8bb8e36a ("btrfs: Fix wrong
btrfs_delalloc_release_extents parameter"). Thus it's not that
easy to cause false EDQUOT.
- The trans commit in advance for qgroup would hide the bug
Since commit f5fef4593653 ("btrfs: qgroup: Make qgroup async transaction
commit more aggressive"), when btrfs qgroup metadata free space is slow,
it will try to commit transaction and free the wrongly converted
PERTRANS space, so it's not that easy to hit such bug.
[FIX]
So to fix the problem, remove the @qgroup_free parameter for
btrfs_delalloc_release_extents(), and always pass true to
btrfs_inode_rsv_release().
Reported-by: Filipe Manana <fdmanana@suse.com>
Fixes: 43b18595d660 ("btrfs: qgroup: Use separate meta reservation type for delalloc")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-14 09:34:51 +03:00
btrfs_delalloc_release_extents ( BTRFS_I ( inode ) , page_cnt < < PAGE_SHIFT ) ;
2017-02-27 10:10:38 +03:00
extent_changeset_free ( data_reserved ) ;
2011-05-24 23:35:30 +04:00
return i_done ;
out :
for ( i = 0 ; i < i_done ; i + + ) {
unlock_page ( pages [ i ] ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
put_page ( pages [ i ] ) ;
2011-05-24 23:35:30 +04:00
}
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range aligned to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 10:10:39 +03:00
btrfs_delalloc_release_space ( inode , data_reserved ,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
start_index < < PAGE_SHIFT ,
btrfs: qgroup: Use separate meta reservation type for delalloc
Before this patch, btrfs qgroup is mixing per-transaction meta rsv with
preallocated meta rsv, making it quite easy to underflow qgroup meta
reservation.
Since we have the new qgroup meta rsv types, apply it to delalloc
reservation.
Now for delalloc, most of its reserved space will use META_PREALLOC qgroup
rsv type.
And for callers reducing outstanding extent like btrfs_finish_ordered_io(),
they will convert corresponding META_PREALLOC reservation to
META_PERTRANS.
This is mainly due to the fact that current qgroup numbers will only be
updated in btrfs_commit_transaction(), that's to say if we don't keep
such placeholder reservation, we can exceed qgroup limitation.
And for callers freeing outstanding extent in error handler, we will
just free META_PREALLOC bytes.
This behavior makes callers of btrfs_qgroup_release_meta() or
btrfs_qgroup_convert_meta() to be aware of which type they are.
So in this patch, btrfs_delalloc_release_metadata() and its callers get
an extra parameter to info qgroup to do correct meta convert/release.
The good news is, even we use the wrong type (convert or free), it won't
cause obvious bug, as prealloc type is always in good shape, and the
type only affects how per-trans meta is increased or not.
So the worst case will be at most metadata limitation can be sometimes
exceeded (no convert at all) or metadata limitation is reached too soon
(no free at all).
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 10:34:32 +03:00
page_cnt < < PAGE_SHIFT , true ) ;
btrfs: qgroup: Always free PREALLOC META reserve in btrfs_delalloc_release_extents()
[Background]
Btrfs qgroup uses two types of reserved space for METADATA space,
PERTRANS and PREALLOC.
PERTRANS is metadata space reserved for each transaction started by
btrfs_start_transaction().
While PREALLOC is for delalloc, where we reserve space before joining a
transaction, and finally it will be converted to PERTRANS after the
writeback is done.
[Inconsistency]
However there is inconsistency in how we handle PREALLOC metadata space.
The most obvious one is:
In btrfs_buffered_write():
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, true);
We always free qgroup PREALLOC meta space.
While in btrfs_truncate_block():
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
We only free qgroup PREALLOC meta space when something went wrong.
[The Correct Behavior]
The correct behavior should be the one in btrfs_buffered_write(), we
should always free PREALLOC metadata space.
The reason is, the btrfs_delalloc_* mechanism works by:
- Reserve metadata first, even it's not necessary
In btrfs_delalloc_reserve_metadata()
- Free the unused metadata space
Normally in:
btrfs_delalloc_release_extents()
|- btrfs_inode_rsv_release()
Here we do calculation on whether we should release or not.
E.g. for 64K buffered write, the metadata rsv works like:
/* The first page */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=0
total: num_bytes=calc_inode_reservations()
/* The first page caused one outstanding extent, thus needs metadata
rsv */
/* The 2nd page */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=calc_inode_reservations()
total: not changed
/* The 2nd page doesn't cause new outstanding extent, needs no new meta
rsv, so we free what we have reserved */
/* The 3rd~16th pages */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=calc_inode_reservations()
total: not changed (still space for one outstanding extent)
This means, if btrfs_delalloc_release_extents() determines to free some
space, then those space should be freed NOW.
So for qgroup, we should call btrfs_qgroup_free_meta_prealloc() other
than btrfs_qgroup_convert_reserved_meta().
The good news is:
- The callers are not that hot
The hottest caller is in btrfs_buffered_write(), which is already
fixed by commit 336a8bb8e36a ("btrfs: Fix wrong
btrfs_delalloc_release_extents parameter"). Thus it's not that
easy to cause false EDQUOT.
- The trans commit in advance for qgroup would hide the bug
Since commit f5fef4593653 ("btrfs: qgroup: Make qgroup async transaction
commit more aggressive"), when btrfs qgroup metadata free space is slow,
it will try to commit transaction and free the wrongly converted
PERTRANS space, so it's not that easy to hit such bug.
[FIX]
So to fix the problem, remove the @qgroup_free parameter for
btrfs_delalloc_release_extents(), and always pass true to
btrfs_inode_rsv_release().
Reported-by: Filipe Manana <fdmanana@suse.com>
Fixes: 43b18595d660 ("btrfs: qgroup: Use separate meta reservation type for delalloc")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-14 09:34:51 +03:00
btrfs_delalloc_release_extents ( BTRFS_I ( inode ) , page_cnt < < PAGE_SHIFT ) ;
2017-02-27 10:10:38 +03:00
extent_changeset_free ( data_reserved ) ;
2011-05-24 23:35:30 +04:00
return ret ;
}
int btrfs_defrag_file ( struct inode * inode , struct file * file ,
struct btrfs_ioctl_defrag_range_args * range ,
u64 newer_than , unsigned long max_to_defrag )
{
2016-06-23 01:54:23 +03:00
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2011-05-24 23:35:30 +04:00
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct file_ra_state * ra = NULL ;
2008-06-12 05:53:53 +04:00
unsigned long last_index ;
2011-09-02 11:56:39 +04:00
u64 isize = i_size_read ( inode ) ;
2010-03-10 18:52:59 +03:00
u64 last_len = 0 ;
u64 skip = 0 ;
u64 defrag_end = 0 ;
2011-05-24 23:35:30 +04:00
u64 newer_off = range - > start ;
2008-06-12 05:53:53 +04:00
unsigned long i ;
2011-09-02 11:57:07 +04:00
unsigned long ra_index = 0 ;
2008-06-12 05:53:53 +04:00
int ret ;
2011-05-24 23:35:30 +04:00
int defrag_count = 0 ;
2010-10-25 11:12:50 +04:00
int compress_type = BTRFS_COMPRESS_ZLIB ;
2014-07-29 19:32:10 +04:00
u32 extent_thresh = range - > extent_thresh ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
unsigned long max_cluster = SZ_256K > > PAGE_SHIFT ;
2014-01-21 23:18:29 +04:00
unsigned long cluster = max_cluster ;
2015-12-14 19:42:10 +03:00
u64 new_align = ~ ( ( u64 ) SZ_128K - 1 ) ;
2011-05-24 23:35:30 +04:00
struct page * * pages = NULL ;
2017-07-17 21:01:59 +03:00
bool do_compress = range - > flags & BTRFS_DEFRAG_RANGE_COMPRESS ;
2011-05-24 23:35:30 +04:00
2013-04-16 13:20:28 +04:00
if ( isize = = 0 )
return 0 ;
if ( range - > start > = isize )
return - EINVAL ;
2010-10-25 11:12:50 +04:00
2017-07-17 21:01:59 +03:00
if ( do_compress ) {
2010-10-25 11:12:50 +04:00
if ( range - > compress_type > BTRFS_COMPRESS_TYPES )
return - EINVAL ;
if ( range - > compress_type )
compress_type = range - > compress_type ;
}
2008-06-12 05:53:53 +04:00
2013-04-16 13:20:28 +04:00
if ( extent_thresh = = 0 )
2015-12-14 19:42:10 +03:00
extent_thresh = SZ_256K ;
2010-03-10 18:52:59 +03:00
2011-05-24 23:35:30 +04:00
/*
2017-06-22 04:22:58 +03:00
* If we were not given a file , allocate a readahead context . As
* readahead is just an optimization , defrag will work without it so
* we don ' t error out .
2011-05-24 23:35:30 +04:00
*/
if ( ! file ) {
2017-06-22 04:13:02 +03:00
ra = kzalloc ( sizeof ( * ra ) , GFP_KERNEL ) ;
2017-06-22 04:22:58 +03:00
if ( ra )
file_ra_state_init ( ra , inode - > i_mapping ) ;
2011-05-24 23:35:30 +04:00
} else {
ra = & file - > f_ra ;
}
2017-06-22 04:13:02 +03:00
pages = kmalloc_array ( max_cluster , sizeof ( struct page * ) , GFP_KERNEL ) ;
2011-05-24 23:35:30 +04:00
if ( ! pages ) {
ret = - ENOMEM ;
goto out_ra ;
}
/* find the last page to defrag */
2010-03-11 17:42:04 +03:00
if ( range - > start + range - > len > range - > start ) {
2011-09-02 11:56:39 +04:00
last_index = min_t ( u64 , isize - 1 ,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
range - > start + range - > len - 1 ) > > PAGE_SHIFT ;
2010-03-11 17:42:04 +03:00
} else {
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
last_index = ( isize - 1 ) > > PAGE_SHIFT ;
2010-03-11 17:42:04 +03:00
}
2011-05-24 23:35:30 +04:00
if ( newer_than ) {
ret = find_new_extents ( root , inode , newer_than ,
2015-12-14 19:42:10 +03:00
& newer_off , SZ_64K ) ;
2011-05-24 23:35:30 +04:00
if ( ! ret ) {
range - > start = newer_off ;
/*
* we always align our defrag to help keep
* the extents in the file evenly spaced
*/
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
i = ( newer_off & new_align ) > > PAGE_SHIFT ;
2011-05-24 23:35:30 +04:00
} else
goto out_ra ;
} else {
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
i = range - > start > > PAGE_SHIFT ;
2011-05-24 23:35:30 +04:00
}
if ( ! max_to_defrag )
2015-06-09 08:05:11 +03:00
max_to_defrag = last_index - i + 1 ;
2011-05-24 23:35:30 +04:00
2011-10-10 23:43:34 +04:00
/*
* make writeback starts from i , so the defrag range can be
* written sequentially .
*/
if ( i < inode - > i_mapping - > writeback_index )
inode - > i_mapping - > writeback_index = i ;
2011-10-11 19:41:40 +04:00
while ( i < = last_index & & defrag_count < max_to_defrag & &
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
( i < DIV_ROUND_UP ( i_size_read ( inode ) , PAGE_SIZE ) ) ) {
2011-05-24 23:35:30 +04:00
/*
* make sure we stop running if someone unmounts
* the FS
*/
2017-11-28 00:05:09 +03:00
if ( ! ( inode - > i_sb - > s_flags & SB_ACTIVE ) )
2011-05-24 23:35:30 +04:00
break ;
2016-06-23 01:54:23 +03:00
if ( btrfs_defrag_cancelled ( fs_info ) ) {
btrfs_debug ( fs_info , " defrag_file cancelled " ) ;
2013-02-10 03:38:06 +04:00
ret = - EAGAIN ;
break ;
}
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
if ( ! should_defrag_range ( inode , ( u64 ) i < < PAGE_SHIFT ,
2012-06-11 12:03:35 +04:00
extent_thresh , & last_len , & skip ,
2017-07-17 21:01:59 +03:00
& defrag_end , do_compress ) ) {
2010-03-10 18:52:59 +03:00
unsigned long next ;
/*
* the should_defrag function tells us how much to skip
* bump our counter by the suggested amount
*/
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
next = DIV_ROUND_UP ( skip , PAGE_SIZE ) ;
2010-03-10 18:52:59 +03:00
i = max ( i + 1 , next ) ;
continue ;
}
2011-09-02 11:57:07 +04:00
if ( ! newer_than ) {
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
cluster = ( PAGE_ALIGN ( defrag_end ) > >
PAGE_SHIFT ) - i ;
2011-09-02 11:57:07 +04:00
cluster = min ( cluster , max_cluster ) ;
} else {
cluster = max_cluster ;
}
if ( i + cluster > ra_index ) {
ra_index = max ( i , ra_index ) ;
2017-06-22 04:22:58 +03:00
if ( ra )
2017-06-22 04:35:28 +03:00
page_cache_sync_readahead ( inode - > i_mapping , ra ,
file , ra_index , cluster ) ;
2015-06-09 15:08:32 +03:00
ra_index + = cluster ;
2011-09-02 11:57:07 +04:00
}
2010-03-10 18:52:59 +03:00
2016-01-22 23:40:57 +03:00
inode_lock ( inode ) ;
Btrfs: prevent ioctls from interfering with a swap file
A later patch will implement swap file support for Btrfs, but before we
do that, we need to make sure that the various Btrfs ioctls cannot
change a swap file.
When a swap file is active, we must make sure that the extents of the
file are not moved and that they don't become shared. That means that
the following are not safe:
- chattr +c (enable compression)
- reflink
- dedupe
- snapshot
- defrag
Don't allow those to happen on an active swap file.
Additionally, balance, resize, device remove, and device replace are
also unsafe if they affect an active swapfile. Add a red-black tree of
block groups and devices which contain an active swapfile. Relocation
checks each block group against this tree and skips it or errors out for
balance or resize, respectively. Device remove and device replace check
the tree for the device they will operate on.
Note that we don't have to worry about chattr -C (disable nocow), which
we ignore for non-empty files, because an active swapfile must be
non-empty and can't be truncated. We also don't have to worry about
autodefrag because it's only done on COW files. Truncate and fallocate
are already taken care of by the generic code. Device add doesn't do
relocation so it's not an issue, either.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2016-11-03 20:28:12 +03:00
if ( IS_SWAPFILE ( inode ) ) {
ret = - ETXTBSY ;
} else {
if ( do_compress )
BTRFS_I ( inode ) - > defrag_compress = compress_type ;
ret = cluster_pages_for_defrag ( inode , pages , i , cluster ) ;
}
2012-03-29 17:57:44 +04:00
if ( ret < 0 ) {
2016-01-22 23:40:57 +03:00
inode_unlock ( inode ) ;
2011-05-24 23:35:30 +04:00
goto out_ra ;
2012-03-29 17:57:44 +04:00
}
2011-05-24 23:35:30 +04:00
defrag_count + = ret ;
2012-12-12 04:00:21 +04:00
balance_dirty_pages_ratelimited ( inode - > i_mapping ) ;
2016-01-22 23:40:57 +03:00
inode_unlock ( inode ) ;
2011-05-24 23:35:30 +04:00
if ( newer_than ) {
if ( newer_off = = ( u64 ) - 1 )
break ;
2012-03-29 17:57:45 +04:00
if ( ret > 0 )
i + = ret ;
2011-05-24 23:35:30 +04:00
newer_off = max ( newer_off + 1 ,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
( u64 ) i < < PAGE_SHIFT ) ;
2011-05-24 23:35:30 +04:00
2015-12-14 19:42:10 +03:00
ret = find_new_extents ( root , inode , newer_than ,
& newer_off , SZ_64K ) ;
2011-05-24 23:35:30 +04:00
if ( ! ret ) {
range - > start = newer_off ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
i = ( newer_off & new_align ) > > PAGE_SHIFT ;
2011-05-24 23:35:30 +04:00
} else {
break ;
2008-06-12 05:53:53 +04:00
}
2011-05-24 23:35:30 +04:00
} else {
2011-09-02 11:57:07 +04:00
if ( ret > 0 ) {
2011-09-02 11:56:25 +04:00
i + = ret ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
last_len + = ret < < PAGE_SHIFT ;
2011-09-02 11:57:07 +04:00
} else {
2011-09-02 11:56:25 +04:00
i + + ;
2011-09-02 11:57:07 +04:00
last_len = 0 ;
}
2008-06-12 05:53:53 +04:00
}
}
2014-03-01 14:55:54 +04:00
if ( ( range - > flags & BTRFS_DEFRAG_RANGE_START_IO ) ) {
2010-03-11 17:42:04 +03:00
filemap_flush ( inode - > i_mapping ) ;
2014-03-01 14:55:54 +04:00
if ( test_bit ( BTRFS_INODE_HAS_ASYNC_EXTENT ,
& BTRFS_I ( inode ) - > runtime_flags ) )
filemap_flush ( inode - > i_mapping ) ;
}
2010-03-11 17:42:04 +03:00
2010-10-25 11:12:50 +04:00
if ( range - > compress_type = = BTRFS_COMPRESS_LZO ) {
2016-06-23 01:54:23 +03:00
btrfs_set_fs_incompat ( fs_info , COMPRESS_LZO ) ;
btrfs: Add zstd support
Add zstd compression and decompression support to BtrFS. zstd at its
fastest level compresses almost as well as zlib, while offering much
faster compression and decompression, approaching lzo speeds.
I benchmarked btrfs with zstd compression against no compression, lzo
compression, and zlib compression. I benchmarked two scenarios. Copying
a set of files to btrfs, and then reading the files. Copying a tarball
to btrfs, extracting it to btrfs, and then reading the extracted files.
After every operation, I call `sync` and include the sync time.
Between every pair of operations I unmount and remount the filesystem
to avoid caching. The benchmark files can be found in the upstream
zstd source repository under
`contrib/linux-kernel/{btrfs-benchmark.sh,btrfs-extract-benchmark.sh}`
[1] [2].
I ran the benchmarks on an Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM.
The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor,
16 GB of RAM, and an SSD.
The first compression benchmark is copying 10 copies of the unzipped
Silesia corpus [3] into a BtrFS filesystem mounted with
`-o compress-force=Method`. The decompression benchmark times how long
it takes to `tar` all 10 copies into `/dev/null`. The compression ratio is
measured by comparing the output of `df` and `du`. See the benchmark file
[1] for details. I benchmarked multiple zstd compression levels, although
the patch uses zstd level 1.
| Method | Ratio | Compression MB/s | Decompression speed |
|---------|-------|------------------|---------------------|
| None | 0.99 | 504 | 686 |
| lzo | 1.66 | 398 | 442 |
| zlib | 2.58 | 65 | 241 |
| zstd 1 | 2.57 | 260 | 383 |
| zstd 3 | 2.71 | 174 | 408 |
| zstd 6 | 2.87 | 70 | 398 |
| zstd 9 | 2.92 | 43 | 406 |
| zstd 12 | 2.93 | 21 | 408 |
| zstd 15 | 3.01 | 11 | 354 |
The next benchmark first copies `linux-4.11.6.tar` [4] to btrfs. Then it
measures the compression ratio, extracts the tar, and deletes the tar.
Then it measures the compression ratio again, and `tar`s the extracted
files into `/dev/null`. See the benchmark file [2] for details.
| Method | Tar Ratio | Extract Ratio | Copy (s) | Extract (s)| Read (s) |
|--------|-----------|---------------|----------|------------|----------|
| None | 0.97 | 0.78 | 0.981 | 5.501 | 8.807 |
| lzo | 2.06 | 1.38 | 1.631 | 8.458 | 8.585 |
| zlib | 3.40 | 1.86 | 7.750 | 21.544 | 11.744 |
| zstd 1 | 3.57 | 1.85 | 2.579 | 11.479 | 9.389 |
[1] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-benchmark.sh
[2] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-extract-benchmark.sh
[3] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
[4] https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.11.6.tar.xz
zstd source repository: https://github.com/facebook/zstd
Signed-off-by: Nick Terrell <terrelln@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2017-08-10 05:39:02 +03:00
} else if ( range - > compress_type = = BTRFS_COMPRESS_ZSTD ) {
btrfs_set_fs_incompat ( fs_info , COMPRESS_ZSTD ) ;
2010-10-25 11:12:50 +04:00
}
2011-09-01 18:33:57 +04:00
ret = defrag_count ;
2010-03-10 18:52:59 +03:00
2011-05-24 23:35:30 +04:00
out_ra :
2017-07-17 21:01:59 +03:00
if ( do_compress ) {
2016-01-22 23:40:57 +03:00
inode_lock ( inode ) ;
2017-07-17 20:41:31 +03:00
BTRFS_I ( inode ) - > defrag_compress = BTRFS_COMPRESS_NONE ;
2016-01-22 23:40:57 +03:00
inode_unlock ( inode ) ;
2013-08-16 18:23:33 +04:00
}
2011-05-24 23:35:30 +04:00
if ( ! file )
kfree ( ra ) ;
kfree ( pages ) ;
2010-03-10 18:52:59 +03:00
return ret ;
2008-06-12 05:53:53 +04:00
}
2012-11-26 12:43:45 +04:00
static noinline int btrfs_ioctl_resize ( struct file * file ,
2009-09-22 00:00:26 +04:00
void __user * arg )
2008-06-12 05:53:53 +04:00
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2008-06-12 05:53:53 +04:00
u64 new_size ;
u64 old_size ;
u64 devid = 1 ;
2016-06-23 01:54:23 +03:00
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2008-06-12 05:53:53 +04:00
struct btrfs_ioctl_vol_args * vol_args ;
struct btrfs_trans_handle * trans ;
struct btrfs_device * device = NULL ;
char * sizestr ;
2014-03-31 14:03:25 +04:00
char * retptr ;
2008-06-12 05:53:53 +04:00
char * devstr = NULL ;
int ret = 0 ;
int mod = 0 ;
2009-01-06 00:57:23 +03:00
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2012-11-26 12:43:45 +04:00
ret = mnt_want_write_file ( file ) ;
if ( ret )
return ret ;
2017-03-28 15:44:21 +03:00
if ( test_and_set_bit ( BTRFS_FS_EXCL_OP , & fs_info - > flags ) ) {
2012-12-21 14:38:50 +04:00
mnt_drop_write_file ( file ) ;
2013-08-21 07:44:48 +04:00
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS ;
2012-01-17 00:04:47 +04:00
}
2009-04-08 11:06:54 +04:00
vol_args = memdup_user ( arg , sizeof ( * vol_args ) ) ;
2012-01-17 00:04:47 +04:00
if ( IS_ERR ( vol_args ) ) {
ret = PTR_ERR ( vol_args ) ;
goto out ;
}
2008-07-24 20:20:14 +04:00
vol_args - > name [ BTRFS_PATH_NAME_MAX ] = ' \0 ' ;
2008-06-12 05:53:53 +04:00
sizestr = vol_args - > name ;
devstr = strchr ( sizestr , ' : ' ) ;
if ( devstr ) {
sizestr = devstr + 1 ;
* devstr = ' \0 ' ;
devstr = vol_args - > name ;
2014-05-13 12:36:08 +04:00
ret = kstrtoull ( devstr , 10 , & devid ) ;
if ( ret )
goto out_free ;
2012-12-21 13:21:30 +04:00
if ( ! devid ) {
ret = - EINVAL ;
goto out_free ;
}
2016-06-23 01:54:23 +03:00
btrfs_info ( fs_info , " resizing devid %llu " , devid ) ;
2008-06-12 05:53:53 +04:00
}
2012-12-21 13:19:51 +04:00
2019-01-19 09:48:55 +03:00
device = btrfs_find_device ( fs_info - > fs_devices , devid , NULL , NULL , true ) ;
2008-06-12 05:53:53 +04:00
if ( ! device ) {
2016-06-23 01:54:23 +03:00
btrfs_info ( fs_info , " resizer unable to find device %llu " ,
devid ) ;
2012-12-21 13:21:30 +04:00
ret = - ENODEV ;
2012-01-17 00:04:47 +04:00
goto out_free ;
2008-06-12 05:53:53 +04:00
}
2012-12-21 13:19:51 +04:00
2017-12-04 07:54:52 +03:00
if ( ! test_bit ( BTRFS_DEV_STATE_WRITEABLE , & device - > dev_state ) ) {
2016-06-23 01:54:23 +03:00
btrfs_info ( fs_info ,
2013-12-20 20:37:06 +04:00
" resizer unable to apply on readonly device %llu " ,
2013-08-20 15:20:07 +04:00
devid ) ;
2012-12-21 13:21:30 +04:00
ret = - EPERM ;
2012-06-14 12:23:19 +04:00
goto out_free ;
}
2008-06-12 05:53:53 +04:00
if ( ! strcmp ( sizestr , " max " ) )
new_size = device - > bdev - > bd_inode - > i_size ;
else {
if ( sizestr [ 0 ] = = ' - ' ) {
mod = - 1 ;
sizestr + + ;
} else if ( sizestr [ 0 ] = = ' + ' ) {
mod = 1 ;
sizestr + + ;
}
2014-03-31 14:03:25 +04:00
new_size = memparse ( sizestr , & retptr ) ;
if ( * retptr ! = ' \0 ' | | new_size = = 0 ) {
2008-06-12 05:53:53 +04:00
ret = - EINVAL ;
2012-01-17 00:04:47 +04:00
goto out_free ;
2008-06-12 05:53:53 +04:00
}
}
2017-12-04 07:54:55 +03:00
if ( test_bit ( BTRFS_DEV_STATE_REPLACE_TGT , & device - > dev_state ) ) {
2012-12-21 13:21:30 +04:00
ret = - EPERM ;
2012-11-05 21:29:28 +04:00
goto out_free ;
}
2014-09-03 17:35:38 +04:00
old_size = btrfs_device_get_total_bytes ( device ) ;
2008-06-12 05:53:53 +04:00
if ( mod < 0 ) {
if ( new_size > old_size ) {
ret = - EINVAL ;
2012-01-17 00:04:47 +04:00
goto out_free ;
2008-06-12 05:53:53 +04:00
}
new_size = old_size - new_size ;
} else if ( mod > 0 ) {
2013-12-20 11:28:56 +04:00
if ( new_size > ULLONG_MAX - old_size ) {
2014-05-29 05:19:58 +04:00
ret = - ERANGE ;
2013-12-20 11:28:56 +04:00
goto out_free ;
}
2008-06-12 05:53:53 +04:00
new_size = old_size + new_size ;
}
2015-12-14 19:42:10 +03:00
if ( new_size < SZ_256M ) {
2008-06-12 05:53:53 +04:00
ret = - EINVAL ;
2012-01-17 00:04:47 +04:00
goto out_free ;
2008-06-12 05:53:53 +04:00
}
if ( new_size > device - > bdev - > bd_inode - > i_size ) {
ret = - EFBIG ;
2012-01-17 00:04:47 +04:00
goto out_free ;
2008-06-12 05:53:53 +04:00
}
2017-07-18 15:39:08 +03:00
new_size = round_down ( new_size , fs_info - > sectorsize ) ;
2008-06-12 05:53:53 +04:00
2016-06-23 01:54:23 +03:00
btrfs_info_in_rcu ( fs_info , " new size for %s is %llu " ,
rcu_str_deref ( device - > name ) , new_size ) ;
2008-06-12 05:53:53 +04:00
if ( new_size > old_size ) {
2010-05-16 18:48:46 +04:00
trans = btrfs_start_transaction ( root , 0 ) ;
2011-01-20 09:19:37 +03:00
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
2012-01-17 00:04:47 +04:00
goto out_free ;
2011-01-20 09:19:37 +03:00
}
2008-06-12 05:53:53 +04:00
ret = btrfs_grow_device ( trans , device , new_size ) ;
2016-09-10 04:39:03 +03:00
btrfs_commit_transaction ( trans ) ;
2011-11-18 22:55:01 +04:00
} else if ( new_size < old_size ) {
2008-06-12 05:53:53 +04:00
ret = btrfs_shrink_device ( device , new_size ) ;
2012-10-27 16:06:39 +04:00
} /* equal, nothing need to do */
2008-06-12 05:53:53 +04:00
2012-01-17 00:04:47 +04:00
out_free :
2008-06-12 05:53:53 +04:00
kfree ( vol_args ) ;
2012-01-17 00:04:47 +04:00
out :
2017-03-28 15:44:21 +03:00
clear_bit ( BTRFS_FS_EXCL_OP , & fs_info - > flags ) ;
2013-01-20 17:57:57 +04:00
mnt_drop_write_file ( file ) ;
2008-06-12 05:53:53 +04:00
return ret ;
}
2010-10-29 23:41:32 +04:00
static noinline int btrfs_ioctl_snap_create_transid ( struct file * file ,
2017-02-14 20:33:53 +03:00
const char * name , unsigned long fd , int subvol ,
2011-09-14 17:58:21 +04:00
u64 * transid , bool readonly ,
2013-02-07 10:02:44 +04:00
struct btrfs_qgroup_inherit * inherit )
2008-06-12 05:53:53 +04:00
{
int namelen ;
2008-11-18 05:02:50 +03:00
int ret = 0 ;
2008-06-12 05:53:53 +04:00
2016-09-21 15:31:29 +03:00
if ( ! S_ISDIR ( file_inode ( file ) - > i_mode ) )
return - ENOTDIR ;
2012-06-29 13:58:46 +04:00
ret = mnt_want_write_file ( file ) ;
if ( ret )
goto out ;
2010-10-29 23:41:32 +04:00
namelen = strlen ( name ) ;
if ( strchr ( name , ' / ' ) ) {
2008-06-12 05:53:53 +04:00
ret = - EINVAL ;
2012-06-29 13:58:46 +04:00
goto out_drop_write ;
2008-06-12 05:53:53 +04:00
}
2012-02-21 07:14:55 +04:00
if ( name [ 0 ] = = ' . ' & &
( namelen = = 1 | | ( name [ 1 ] = = ' . ' & & namelen = = 2 ) ) ) {
ret = - EEXIST ;
2012-06-29 13:58:46 +04:00
goto out_drop_write ;
2012-02-21 07:14:55 +04:00
}
2008-11-18 05:02:50 +03:00
if ( subvol ) {
2010-10-29 23:41:32 +04:00
ret = btrfs_mksubvol ( & file - > f_path , name , namelen ,
2011-09-14 17:58:21 +04:00
NULL , transid , readonly , inherit ) ;
2008-10-09 21:39:39 +04:00
} else {
2012-08-28 20:52:22 +04:00
struct fd src = fdget ( fd ) ;
2008-11-18 05:02:50 +03:00
struct inode * src_inode ;
2012-08-28 20:52:22 +04:00
if ( ! src . file ) {
2008-11-18 05:02:50 +03:00
ret = - EINVAL ;
2012-06-29 13:58:46 +04:00
goto out_drop_write ;
2008-11-18 05:02:50 +03:00
}
2013-01-24 02:07:38 +04:00
src_inode = file_inode ( src . file ) ;
if ( src_inode - > i_sb ! = file_inode ( file ) - > i_sb ) {
2016-03-25 17:02:41 +03:00
btrfs_info ( BTRFS_I ( file_inode ( file ) ) - > root - > fs_info ,
2013-12-20 20:37:06 +04:00
" Snapshot src from another FS " ) ;
2014-01-30 11:32:02 +04:00
ret = - EXDEV ;
2014-01-15 21:15:52 +04:00
} else if ( ! inode_owner_or_capable ( src_inode ) ) {
/*
* Subvolume creation is not restricted , but snapshots
* are limited to own subvolumes only
*/
ret = - EPERM ;
2012-08-27 05:20:24 +04:00
} else {
ret = btrfs_mksubvol ( & file - > f_path , name , namelen ,
BTRFS_I ( src_inode ) - > root ,
transid , readonly , inherit ) ;
2008-11-18 05:02:50 +03:00
}
2012-08-28 20:52:22 +04:00
fdput ( src ) ;
2008-10-09 21:39:39 +04:00
}
2012-06-29 13:58:46 +04:00
out_drop_write :
mnt_drop_write_file ( file ) ;
2008-06-12 05:53:53 +04:00
out :
2010-10-29 23:41:32 +04:00
return ret ;
}
static noinline int btrfs_ioctl_snap_create ( struct file * file ,
2010-12-20 10:53:28 +03:00
void __user * arg , int subvol )
2010-10-29 23:41:32 +04:00
{
2010-12-20 10:53:28 +03:00
struct btrfs_ioctl_vol_args * vol_args ;
2010-10-29 23:41:32 +04:00
int ret ;
2016-09-21 15:31:29 +03:00
if ( ! S_ISDIR ( file_inode ( file ) - > i_mode ) )
return - ENOTDIR ;
2010-12-20 10:53:28 +03:00
vol_args = memdup_user ( arg , sizeof ( * vol_args ) ) ;
if ( IS_ERR ( vol_args ) )
return PTR_ERR ( vol_args ) ;
vol_args - > name [ BTRFS_PATH_NAME_MAX ] = ' \0 ' ;
2010-10-29 23:41:32 +04:00
2010-12-20 10:53:28 +03:00
ret = btrfs_ioctl_snap_create_transid ( file , vol_args - > name ,
2010-12-20 11:04:08 +03:00
vol_args - > fd , subvol ,
2011-09-14 17:58:21 +04:00
NULL , false , NULL ) ;
2010-12-10 09:41:56 +03:00
2010-12-20 10:53:28 +03:00
kfree ( vol_args ) ;
return ret ;
}
2010-12-10 09:41:56 +03:00
2010-12-20 10:53:28 +03:00
/*
 * v2 subvolume / snapshot creation ioctl.
 *
 * Copies a struct btrfs_ioctl_vol_args_v2 from @arg and honours these flags:
 *   BTRFS_SUBVOL_CREATE_ASYNC   - deprecated; a warning is logged and the
 *                                 transid is written back to the transid
 *                                 field of the user structure on success
 *   BTRFS_SUBVOL_RDONLY         - create the new root read-only
 *   BTRFS_SUBVOL_QGROUP_INHERIT - copy a struct btrfs_qgroup_inherit of
 *                                 vol_args->size bytes from userspace
 *
 * Returns 0 on success or a negative errno: -ENOTDIR if @file is not a
 * directory, -EOPNOTSUPP for unknown flags, -EINVAL for an out-of-range
 * qgroup_inherit size, -EFAULT if the transid cannot be copied back, or the
 * error from memdup_user() / btrfs_ioctl_snap_create_transid().
 */
static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
					       void __user *arg, int subvol)
{
	struct btrfs_ioctl_vol_args_v2 *vol_args;
	int ret;
	u64 transid = 0;
	u64 *ptr = NULL;
	bool readonly = false;
	struct btrfs_qgroup_inherit *inherit = NULL;

	if (!S_ISDIR(file_inode(file)->i_mode))
		return -ENOTDIR;

	vol_args = memdup_user(arg, sizeof(*vol_args));
	if (IS_ERR(vol_args))
		return PTR_ERR(vol_args);
	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';

	if (vol_args->flags &
	    ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
	      BTRFS_SUBVOL_QGROUP_INHERIT)) {
		ret = -EOPNOTSUPP;
		goto free_args;
	}

	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) {
		struct inode *inode = file_inode(file);
		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

		btrfs_warn(fs_info,
"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7");

		ptr = &transid;
	}
	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
		readonly = true;
	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
		/*
		 * The buffer must at least hold the fixed part of the
		 * structure, otherwise its header fields would be read
		 * past the end of the duplicated allocation.  The upper
		 * bound keeps the allocation to a single page.
		 */
		if (vol_args->size < sizeof(*inherit) ||
		    vol_args->size > PAGE_SIZE) {
			ret = -EINVAL;
			goto free_args;
		}
		inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
		if (IS_ERR(inherit)) {
			ret = PTR_ERR(inherit);
			goto free_args;
		}
	}

	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
					      vol_args->fd, subvol, ptr,
					      readonly, inherit);
	if (ret)
		goto free_inherit;

	/* Only the async variant reports the transid back to userspace. */
	if (ptr && copy_to_user(arg +
				offsetof(struct btrfs_ioctl_vol_args_v2,
					transid),
				ptr, sizeof(*ptr)))
		ret = -EFAULT;

free_inherit:
	kfree(inherit);
free_args:
	kfree(vol_args);
	return ret;
}
2010-12-20 11:30:25 +03:00
/*
 * Report the flags of the subvolume whose top directory @file refers to.
 *
 * Only BTRFS_SUBVOL_RDONLY is reported; it is sampled under the read side
 * of fs_info->subvol_sem and written to @arg as a u64.
 *
 * Returns 0 on success, -EINVAL if @file is not the inode with objectid
 * BTRFS_FIRST_FREE_OBJECTID, or -EFAULT if the copy to userspace fails.
 */
static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
						void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 flags = 0;

	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
		return -EINVAL;

	down_read(&fs_info->subvol_sem);
	if (btrfs_root_readonly(root))
		flags |= BTRFS_SUBVOL_RDONLY;
	up_read(&fs_info->subvol_sem);

	if (copy_to_user(arg, &flags, sizeof(flags)))
		return -EFAULT;

	return 0;
}
/*
 * Set or clear the read-only flag of the subvolume whose top directory
 * @file refers to.
 *
 * @arg points to a u64 flag word.  BTRFS_SUBVOL_CREATE_ASYNC is rejected
 * with -EINVAL and any bit other than BTRFS_SUBVOL_RDONLY with -EOPNOTSUPP.
 * Clearing the read-only flag is refused with -EPERM while
 * root->send_in_progress is non-zero.
 *
 * The in-memory root item is modified first and then written out in a
 * transaction; if that fails the previous flags are restored.
 *
 * Returns 0 on success (also when the flag already has the requested
 * value) or a negative errno.
 */
static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
						void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 saved_flags;
	u64 flags;
	bool want_ro;
	int ret;

	if (!inode_owner_or_capable(inode))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	if (copy_from_user(&flags, arg, sizeof(flags))) {
		ret = -EFAULT;
		goto out_drop_write;
	}

	if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	if (flags & ~BTRFS_SUBVOL_RDONLY) {
		ret = -EOPNOTSUPP;
		goto out_drop_write;
	}

	want_ro = !!(flags & BTRFS_SUBVOL_RDONLY);

	down_write(&fs_info->subvol_sem);

	/* Already in the requested state: nothing to do, ret is 0. */
	if (want_ro == btrfs_root_readonly(root))
		goto out_drop_sem;

	saved_flags = btrfs_root_flags(&root->root_item);
	if (want_ro) {
		btrfs_set_root_flags(&root->root_item,
				     saved_flags | BTRFS_ROOT_SUBVOL_RDONLY);
	} else {
		/*
		 * Block RO -> RW transition if this subvolume is involved in
		 * send
		 */
		spin_lock(&root->root_item_lock);
		if (root->send_in_progress != 0) {
			spin_unlock(&root->root_item_lock);
			btrfs_warn(fs_info,
				   "Attempt to set subvolume %llu read-write during send",
				   root->root_key.objectid);
			ret = -EPERM;
			goto out_drop_sem;
		}
		btrfs_set_root_flags(&root->root_item,
				     saved_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
		spin_unlock(&root->root_item_lock);
	}

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_reset;
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);
	if (ret < 0) {
		btrfs_end_transaction(trans);
		goto out_reset;
	}

	ret = btrfs_commit_transaction(trans);

out_reset:
	/* On any failure put the in-memory flags back as they were. */
	if (ret)
		btrfs_set_root_flags(&root->root_item, saved_flags);
out_drop_sem:
	up_write(&fs_info->subvol_sem);
out_drop_write:
	mnt_drop_write_file(file);
	return ret;
}
2010-02-28 23:39:26 +03:00
static noinline int key_in_sk ( struct btrfs_key * key ,
struct btrfs_ioctl_search_key * sk )
{
2010-03-18 19:10:08 +03:00
struct btrfs_key test ;
int ret ;
test . objectid = sk - > min_objectid ;
test . type = sk - > min_type ;
test . offset = sk - > min_offset ;
ret = btrfs_comp_cpu_keys ( key , & test ) ;
if ( ret < 0 )
2010-02-28 23:39:26 +03:00
return 0 ;
2010-03-18 19:10:08 +03:00
test . objectid = sk - > max_objectid ;
test . type = sk - > max_type ;
test . offset = sk - > max_offset ;
ret = btrfs_comp_cpu_keys ( key , & test ) ;
if ( ret > 0 )
2010-02-28 23:39:26 +03:00
return 0 ;
return 1 ;
}
2016-06-22 03:18:21 +03:00
/*
 * Copy the matching items of one leaf to the user buffer of a tree search.
 *
 * Walks the leaf at path->nodes[0] starting at path->slots[0].  Every item
 * whose key passes key_in_sk() is written to ubuf + *sk_offset as a
 * struct btrfs_ioctl_search_header followed by the raw item data, and
 * *sk_offset / *num_found are advanced accordingly.  A leaf whose header
 * generation is above sk->max_transid is skipped entirely.
 *
 * If a single item plus its header does not fit into *buf_size and nothing
 * has been copied yet, *buf_size is set to the size that would be needed,
 * a header with len == 0 is emitted and -EOVERFLOW is returned.
 *
 * When the whole leaf has been processed, *key is advanced to the next
 * possible key so that the caller can continue the search from there.
 *
 * See the comment above the final return for the meaning of the return
 * value.
 */
static noinline int copy_to_sk ( struct btrfs_path * path ,
2010-02-28 23:39:26 +03:00
struct btrfs_key * key ,
struct btrfs_ioctl_search_key * sk ,
2014-01-30 19:24:00 +04:00
size_t * buf_size ,
2014-01-30 19:24:02 +04:00
char __user * ubuf ,
2010-02-28 23:39:26 +03:00
unsigned long * sk_offset ,
int * num_found )
{
u64 found_transid ;
struct extent_buffer * leaf ;
struct btrfs_ioctl_search_header sh ;
2015-06-30 05:25:43 +03:00
struct btrfs_key test ;
2010-02-28 23:39:26 +03:00
unsigned long item_off ;
unsigned long item_len ;
int nritems ;
int i ;
int slot ;
int ret = 0 ;
leaf = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
nritems = btrfs_header_nritems ( leaf ) ;
/* leaf is newer than the requested transid window: skip all its items */
if ( btrfs_header_generation ( leaf ) > sk - > max_transid ) {
i = nritems ;
goto advance_key ;
}
found_transid = btrfs_header_generation ( leaf ) ;
for ( i = slot ; i < nritems ; i + + ) {
item_off = btrfs_item_ptr_offset ( leaf , i ) ;
item_len = btrfs_item_size_nr ( leaf , i ) ;
2013-05-06 21:40:18 +04:00
btrfs_item_key_to_cpu ( leaf , key , i ) ;
if ( ! key_in_sk ( key , sk ) )
continue ;
2014-01-30 19:24:00 +04:00
/* this item alone (header + data) is larger than the whole buffer */
if ( sizeof ( sh ) + item_len > * buf_size ) {
2014-01-30 19:23:59 +04:00
if ( * num_found ) {
ret = 1 ;
goto out ;
}
/*
 * return one empty item back for v1, which does not
 * handle -EOVERFLOW
 */
2014-01-30 19:24:00 +04:00
* buf_size = sizeof ( sh ) + item_len ;
2010-02-28 23:39:26 +03:00
item_len = 0 ;
2014-01-30 19:23:59 +04:00
ret = - EOVERFLOW ;
}
2010-02-28 23:39:26 +03:00
2014-01-30 19:24:00 +04:00
/* not enough room left after what has already been copied */
if ( sizeof ( sh ) + item_len + * sk_offset > * buf_size ) {
2010-02-28 23:39:26 +03:00
ret = 1 ;
2014-01-30 19:23:57 +04:00
goto out ;
2010-02-28 23:39:26 +03:00
}
sh . objectid = key - > objectid ;
sh . offset = key - > offset ;
sh . type = key - > type ;
sh . len = item_len ;
sh . transid = found_transid ;
/* copy search result header */
2014-01-30 19:24:02 +04:00
if ( copy_to_user ( ubuf + * sk_offset , & sh , sizeof ( sh ) ) ) {
ret = - EFAULT ;
goto out ;
}
2010-02-28 23:39:26 +03:00
* sk_offset + = sizeof ( sh ) ;
if ( item_len ) {
2014-01-30 19:24:02 +04:00
char __user * up = ubuf + * sk_offset ;
2010-02-28 23:39:26 +03:00
/* copy the item */
2014-01-30 19:24:02 +04:00
if ( read_extent_buffer_to_user ( leaf , up ,
item_off , item_len ) ) {
ret = - EFAULT ;
goto out ;
}
2010-02-28 23:39:26 +03:00
* sk_offset + = item_len ;
}
2011-05-14 21:43:41 +04:00
( * num_found ) + + ;
2010-02-28 23:39:26 +03:00
2014-01-30 19:23:59 +04:00
if ( ret ) /* -EOVERFLOW from above */
goto out ;
2014-01-30 19:23:57 +04:00
/* caller asked for at most sk->nr_items results */
if ( * num_found > = sk - > nr_items ) {
ret = 1 ;
goto out ;
}
2010-02-28 23:39:26 +03:00
}
advance_key :
2010-03-18 19:10:08 +03:00
ret = 0 ;
2015-06-30 05:25:43 +03:00
test . objectid = sk - > max_objectid ;
test . type = sk - > max_type ;
test . offset = sk - > max_offset ;
/*
 * Stop once the upper search bound has been reached; otherwise bump the
 * key to the next possible value, carrying from offset into type and from
 * type into objectid when a field is already at its maximum.
 */
if ( btrfs_comp_cpu_keys ( key , & test ) > = 0 )
ret = 1 ;
else if ( key - > offset < ( u64 ) - 1 )
2010-02-28 23:39:26 +03:00
key - > offset + + ;
2015-06-30 05:25:43 +03:00
else if ( key - > type < ( u8 ) - 1 ) {
2010-03-18 19:10:08 +03:00
key - > offset = 0 ;
2010-02-28 23:39:26 +03:00
key - > type + + ;
2015-06-30 05:25:43 +03:00
} else if ( key - > objectid < ( u64 ) - 1 ) {
2010-03-18 19:10:08 +03:00
key - > offset = 0 ;
key - > type = 0 ;
2010-02-28 23:39:26 +03:00
key - > objectid + + ;
2010-03-18 19:10:08 +03:00
} else
ret = 1 ;
2014-01-30 19:23:57 +04:00
out :
2014-01-30 19:24:02 +04:00
/*
 *  0: all items from this leaf copied, continue with next
 *  1: * more items can be copied, but unused buffer is too small
 *     * all items were found
 *     Either way, it stops the loop which iterates to the next
 *     leaf
 *  -EOVERFLOW: item was too large for buffer
 *  -EFAULT: could not copy extent buffer back to userspace
 */
2010-02-28 23:39:26 +03:00
return ret ;
}
/*
 * Run the tree search described by @sk and copy the results to @ubuf.
 *
 * @inode:    inode the ioctl was issued on; its root is searched when
 *            sk->tree_id is 0, otherwise the root with that id is looked up
 * @sk:       search key; on return through the err label sk->nr_items
 *            holds the number of items that were copied
 * @buf_size: in: size of @ubuf in bytes; out: may be raised to the size
 *            needed when -EOVERFLOW is returned
 * @ubuf:     user buffer receiving search headers and item data
 *
 * Repeats btrfs_search_forward() + copy_to_sk() until one of them returns
 * non-zero.  Positive results from either are normalised to 0.
 *
 * Returns 0 on success, -EOVERFLOW if the buffer cannot hold even one
 * search header or one item, -ENOMEM if no path could be allocated, or
 * another negative errno from the root lookup, the search or the copy.
 */
static noinline int search_ioctl ( struct inode * inode ,
2014-01-30 19:23:58 +04:00
struct btrfs_ioctl_search_key * sk ,
2014-01-30 19:24:00 +04:00
size_t * buf_size ,
2014-01-30 19:24:02 +04:00
char __user * ubuf )
2010-02-28 23:39:26 +03:00
{
2016-06-23 01:54:23 +03:00
struct btrfs_fs_info * info = btrfs_sb ( inode - > i_sb ) ;
2010-02-28 23:39:26 +03:00
struct btrfs_root * root ;
struct btrfs_key key ;
struct btrfs_path * path ;
int ret ;
int num_found = 0 ;
unsigned long sk_offset = 0 ;
2014-01-30 19:24:00 +04:00
/* the buffer must be able to hold at least one search header */
if ( * buf_size < sizeof ( struct btrfs_ioctl_search_header ) ) {
* buf_size = sizeof ( struct btrfs_ioctl_search_header ) ;
2014-01-30 19:23:58 +04:00
return - EOVERFLOW ;
2014-01-30 19:24:00 +04:00
}
2014-01-30 19:23:58 +04:00
2010-02-28 23:39:26 +03:00
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
if ( sk - > tree_id = = 0 ) {
/* search the root of the inode that was passed */
root = BTRFS_I ( inode ) - > root ;
} else {
key . objectid = sk - > tree_id ;
key . type = BTRFS_ROOT_ITEM_KEY ;
key . offset = ( u64 ) - 1 ;
root = btrfs_read_fs_root_no_name ( info , & key ) ;
if ( IS_ERR ( root ) ) {
btrfs_free_path ( path ) ;
2018-05-21 07:57:27 +03:00
return PTR_ERR ( root ) ;
2010-02-28 23:39:26 +03:00
}
}
/* start at the lower bound of the requested key range */
key . objectid = sk - > min_objectid ;
key . type = sk - > min_type ;
key . offset = sk - > min_offset ;
2013-10-31 09:03:04 +04:00
while ( 1 ) {
2013-10-01 19:13:42 +04:00
ret = btrfs_search_forward ( root , & key , path , sk - > min_transid ) ;
2010-02-28 23:39:26 +03:00
if ( ret ! = 0 ) {
/* a positive result is not treated as an error */
if ( ret > 0 )
ret = 0 ;
goto err ;
}
2016-06-22 03:18:21 +03:00
/*
 * NOTE(review): copy_to_sk() writes to user memory before the path is
 * released below - confirm that a page fault on ubuf at this point
 * cannot deadlock against the locks held through the path.
 */
ret = copy_to_sk ( path , & key , sk , buf_size , ubuf ,
2010-02-28 23:39:26 +03:00
& sk_offset , & num_found ) ;
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2014-01-30 19:23:57 +04:00
/* non-zero: buffer full, search finished, or an error - stop iterating */
if ( ret )
2010-02-28 23:39:26 +03:00
break ;
}
2014-01-30 19:23:59 +04:00
/* a positive return from copy_to_sk() only means "stop", not failure */
if ( ret > 0 )
ret = 0 ;
2010-02-28 23:39:26 +03:00
err :
sk - > nr_items = num_found ;
btrfs_free_path ( path ) ;
return ret ;
}
/*
 * v1 tree search ioctl.
 *
 * Reads the search key from the user struct btrfs_ioctl_search_args at
 * @argp, runs search_ioctl() with the fixed-size buffer embedded in that
 * structure and, on success, writes the updated key back.
 *
 * Requires CAP_SYS_ADMIN.  Returns 0 on success, -EPERM, -EFAULT, or the
 * error from search_ioctl() (-EOVERFLOW is reported as success, see below).
 */
static noinline int btrfs_ioctl_tree_search(struct file *file,
					   void __user *argp)
{
	struct btrfs_ioctl_search_args __user *uargs;
	struct btrfs_ioctl_search_key sk;
	size_t buf_size;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	uargs = (struct btrfs_ioctl_search_args __user *)argp;

	if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
		return -EFAULT;

	buf_size = sizeof(uargs->buf);

	ret = search_ioctl(file_inode(file), &sk, &buf_size, uargs->buf);

	/*
	 * In the original implementation an overflow is handled by returning
	 * a search header with a len of zero, so reset ret.
	 */
	if (ret == -EOVERFLOW)
		ret = 0;
	if (ret)
		return ret;

	if (copy_to_user(&uargs->key, &sk, sizeof(sk)))
		return -EFAULT;

	return 0;
}
2014-01-30 19:24:03 +04:00
/*
 * v2 tree search ioctl with a caller-sized result buffer.
 *
 * Reads a struct btrfs_ioctl_search_args_v2 header from @argp, runs
 * search_ioctl() on the buffer that follows it (capped at 16MB) and then
 * either writes the updated search key back (success) or reports the
 * buffer size that would have been needed (-EOVERFLOW).
 *
 * Requires CAP_SYS_ADMIN.  Returns 0 on success, -EPERM, -EFAULT,
 * -EOVERFLOW, or another error from search_ioctl().
 */
static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
					       void __user *argp)
{
	struct btrfs_ioctl_search_args_v2 __user *uarg;
	struct btrfs_ioctl_search_args_v2 args;
	struct inode *inode;
	int ret;
	size_t buf_size;
	u64 needed_size;
	const size_t buf_limit = SZ_16M;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* copy search header and buffer size */
	uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
	if (copy_from_user(&args, uarg, sizeof(args)))
		return -EFAULT;

	/*
	 * Limit result size to 16MB.  Compare the 64-bit user value before
	 * narrowing it, so that a size above 4GB is clamped rather than
	 * truncated where size_t is 32 bits wide.
	 */
	if (args.buf_size > buf_limit)
		buf_size = buf_limit;
	else
		buf_size = args.buf_size;

	inode = file_inode(file);
	ret = search_ioctl(inode, &args.key, &buf_size,
			   (char __user *)(&uarg->buf[0]));
	if (ret == 0) {
		if (copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
			ret = -EFAULT;
	} else if (ret == -EOVERFLOW) {
		/*
		 * The user-visible buf_size field is 64 bits wide; copy a
		 * full u64 instead of sizeof(size_t) bytes, which would
		 * leave half of the field untouched on 32-bit kernels.
		 */
		needed_size = buf_size;
		if (copy_to_user(&uarg->buf_size, &needed_size,
				 sizeof(needed_size)))
			ret = -EFAULT;
	}

	return ret;
}
2009-11-18 08:42:14 +03:00
/*
2010-02-28 23:39:26 +03:00
* Search INODE_REFs to identify path name of ' dirid ' directory
* in a ' tree_id ' tree . and sets path name to ' name ' .
*/
2009-11-18 08:42:14 +03:00
static noinline int btrfs_search_path_in_tree ( struct btrfs_fs_info * info ,
u64 tree_id , u64 dirid , char * name )
{
struct btrfs_root * root ;
struct btrfs_key key ;
2010-02-28 23:39:26 +03:00
char * ptr ;
2009-11-18 08:42:14 +03:00
int ret = - 1 ;
int slot ;
int len ;
int total_len = 0 ;
struct btrfs_inode_ref * iref ;
struct extent_buffer * l ;
struct btrfs_path * path ;
if ( dirid = = BTRFS_FIRST_FREE_OBJECTID ) {
name [ 0 ] = ' \0 ' ;
return 0 ;
}
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
2017-12-01 12:19:42 +03:00
ptr = & name [ BTRFS_INO_LOOKUP_PATH_MAX - 1 ] ;
2009-11-18 08:42:14 +03:00
key . objectid = tree_id ;
key . type = BTRFS_ROOT_ITEM_KEY ;
key . offset = ( u64 ) - 1 ;
root = btrfs_read_fs_root_no_name ( info , & key ) ;
if ( IS_ERR ( root ) ) {
2018-05-21 07:57:27 +03:00
ret = PTR_ERR ( root ) ;
2010-03-18 19:23:10 +03:00
goto out ;
2009-11-18 08:42:14 +03:00
}
key . objectid = dirid ;
key . type = BTRFS_INODE_REF_KEY ;
2010-03-18 19:23:10 +03:00
key . offset = ( u64 ) - 1 ;
2009-11-18 08:42:14 +03:00
2013-10-31 09:03:04 +04:00
while ( 1 ) {
2009-11-18 08:42:14 +03:00
ret = btrfs_search_slot ( NULL , root , & key , path , 0 , 0 ) ;
if ( ret < 0 )
goto out ;
2013-08-14 06:00:21 +04:00
else if ( ret > 0 ) {
ret = btrfs_previous_item ( root , path , dirid ,
BTRFS_INODE_REF_KEY ) ;
if ( ret < 0 )
goto out ;
else if ( ret > 0 ) {
ret = - ENOENT ;
goto out ;
}
}
2009-11-18 08:42:14 +03:00
l = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
btrfs_item_key_to_cpu ( l , & key , slot ) ;
iref = btrfs_item_ptr ( l , slot , struct btrfs_inode_ref ) ;
len = btrfs_inode_ref_name_len ( l , iref ) ;
ptr - = len + 1 ;
total_len + = len + 1 ;
2013-08-14 06:00:20 +04:00
if ( ptr < name ) {
ret = - ENAMETOOLONG ;
2009-11-18 08:42:14 +03:00
goto out ;
2013-08-14 06:00:20 +04:00
}
2009-11-18 08:42:14 +03:00
* ( ptr + len ) = ' / ' ;
2013-10-31 09:03:04 +04:00
read_extent_buffer ( l , ptr , ( unsigned long ) ( iref + 1 ) , len ) ;
2009-11-18 08:42:14 +03:00
if ( key . offset = = BTRFS_FIRST_FREE_OBJECTID )
break ;
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2009-11-18 08:42:14 +03:00
key . objectid = key . offset ;
2010-03-18 19:23:10 +03:00
key . offset = ( u64 ) - 1 ;
2009-11-18 08:42:14 +03:00
dirid = key . objectid ;
}
2011-07-14 07:16:00 +04:00
memmove ( name , ptr , total_len ) ;
2013-10-31 09:03:04 +04:00
name [ total_len ] = ' \0 ' ;
2009-11-18 08:42:14 +03:00
ret = 0 ;
out :
btrfs_free_path ( path ) ;
2010-02-28 23:39:26 +03:00
return ret ;
}
2018-05-21 04:09:44 +03:00
static int btrfs_search_path_in_tree_user ( struct inode * inode ,
struct btrfs_ioctl_ino_lookup_user_args * args )
{
struct btrfs_fs_info * fs_info = BTRFS_I ( inode ) - > root - > fs_info ;
struct super_block * sb = inode - > i_sb ;
struct btrfs_key upper_limit = BTRFS_I ( inode ) - > location ;
u64 treeid = BTRFS_I ( inode ) - > root - > root_key . objectid ;
u64 dirid = args - > dirid ;
unsigned long item_off ;
unsigned long item_len ;
struct btrfs_inode_ref * iref ;
struct btrfs_root_ref * rref ;
struct btrfs_root * root ;
struct btrfs_path * path ;
struct btrfs_key key , key2 ;
struct extent_buffer * leaf ;
struct inode * temp_inode ;
char * ptr ;
int slot ;
int len ;
int total_len = 0 ;
int ret ;
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
/*
* If the bottom subvolume does not exist directly under upper_limit ,
* construct the path in from the bottom up .
*/
if ( dirid ! = upper_limit . objectid ) {
ptr = & args - > path [ BTRFS_INO_LOOKUP_USER_PATH_MAX - 1 ] ;
key . objectid = treeid ;
key . type = BTRFS_ROOT_ITEM_KEY ;
key . offset = ( u64 ) - 1 ;
root = btrfs_read_fs_root_no_name ( fs_info , & key ) ;
if ( IS_ERR ( root ) ) {
ret = PTR_ERR ( root ) ;
goto out ;
}
key . objectid = dirid ;
key . type = BTRFS_INODE_REF_KEY ;
key . offset = ( u64 ) - 1 ;
while ( 1 ) {
ret = btrfs_search_slot ( NULL , root , & key , path , 0 , 0 ) ;
if ( ret < 0 ) {
goto out ;
} else if ( ret > 0 ) {
ret = btrfs_previous_item ( root , path , dirid ,
BTRFS_INODE_REF_KEY ) ;
if ( ret < 0 ) {
goto out ;
} else if ( ret > 0 ) {
ret = - ENOENT ;
goto out ;
}
}
leaf = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & key , slot ) ;
iref = btrfs_item_ptr ( leaf , slot , struct btrfs_inode_ref ) ;
len = btrfs_inode_ref_name_len ( leaf , iref ) ;
ptr - = len + 1 ;
total_len + = len + 1 ;
if ( ptr < args - > path ) {
ret = - ENAMETOOLONG ;
goto out ;
}
* ( ptr + len ) = ' / ' ;
read_extent_buffer ( leaf , ptr ,
( unsigned long ) ( iref + 1 ) , len ) ;
/* Check the read+exec permission of this directory */
ret = btrfs_previous_item ( root , path , dirid ,
BTRFS_INODE_ITEM_KEY ) ;
if ( ret < 0 ) {
goto out ;
} else if ( ret > 0 ) {
ret = - ENOENT ;
goto out ;
}
leaf = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & key2 , slot ) ;
if ( key2 . objectid ! = dirid ) {
ret = - ENOENT ;
goto out ;
}
temp_inode = btrfs_iget ( sb , & key2 , root , NULL ) ;
2018-06-04 10:41:07 +03:00
if ( IS_ERR ( temp_inode ) ) {
ret = PTR_ERR ( temp_inode ) ;
goto out ;
}
2018-05-21 04:09:44 +03:00
ret = inode_permission ( temp_inode , MAY_READ | MAY_EXEC ) ;
iput ( temp_inode ) ;
if ( ret ) {
ret = - EACCES ;
goto out ;
}
if ( key . offset = = upper_limit . objectid )
break ;
if ( key . objectid = = BTRFS_FIRST_FREE_OBJECTID ) {
ret = - EACCES ;
goto out ;
}
btrfs_release_path ( path ) ;
key . objectid = key . offset ;
key . offset = ( u64 ) - 1 ;
dirid = key . objectid ;
}
memmove ( args - > path , ptr , total_len ) ;
args - > path [ total_len ] = ' \0 ' ;
btrfs_release_path ( path ) ;
}
/* Get the bottom subvolume's name from ROOT_REF */
root = fs_info - > tree_root ;
key . objectid = treeid ;
key . type = BTRFS_ROOT_REF_KEY ;
key . offset = args - > treeid ;
ret = btrfs_search_slot ( NULL , root , & key , path , 0 , 0 ) ;
if ( ret < 0 ) {
goto out ;
} else if ( ret > 0 ) {
ret = - ENOENT ;
goto out ;
}
leaf = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & key , slot ) ;
item_off = btrfs_item_ptr_offset ( leaf , slot ) ;
item_len = btrfs_item_size_nr ( leaf , slot ) ;
/* Check if dirid in ROOT_REF corresponds to passed dirid */
rref = btrfs_item_ptr ( leaf , slot , struct btrfs_root_ref ) ;
if ( args - > dirid ! = btrfs_root_ref_dirid ( leaf , rref ) ) {
ret = - EINVAL ;
goto out ;
}
/* Copy subvolume's name */
item_off + = sizeof ( struct btrfs_root_ref ) ;
item_len - = sizeof ( struct btrfs_root_ref ) ;
read_extent_buffer ( leaf , args - > name , item_off , item_len ) ;
args - > name [ item_len ] = 0 ;
out :
btrfs_free_path ( path ) ;
return ret ;
}
2010-02-28 23:39:26 +03:00
/*
 * Handler for the ino_lookup ioctl.
 *
 * @file: open file the ioctl was issued on
 * @argp: user pointer to a struct btrfs_ioctl_ino_lookup_args, used for both
 *        input and output
 *
 * A treeid of 0 is replaced by the id of the root that @file's inode belongs
 * to.  Asking about BTRFS_FIRST_FREE_OBJECTID needs no privilege and yields
 * an empty name; any other objectid requires CAP_SYS_ADMIN and is resolved
 * through btrfs_search_path_in_tree().
 *
 * Returns 0 on success, -EPERM without the capability, -EFAULT if the result
 * cannot be copied back, or the error from memdup_user() or the tree search.
 */
static noinline int btrfs_ioctl_ino_lookup(struct file *file,
					   void __user *argp)
{
	struct btrfs_root *fd_root = BTRFS_I(file_inode(file))->root;
	struct btrfs_ioctl_ino_lookup_args *args;
	int ret;

	args = memdup_user(argp, sizeof(*args));
	if (IS_ERR(args))
		return PTR_ERR(args);

	/*
	 * Unprivileged query to obtain the containing subvolume root id. The
	 * path is reset so it's consistent with btrfs_search_path_in_tree.
	 */
	if (!args->treeid)
		args->treeid = fd_root->root_key.objectid;

	if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
		args->name[0] = 0;
		ret = 0;
	} else if (capable(CAP_SYS_ADMIN)) {
		ret = btrfs_search_path_in_tree(fd_root->fs_info,
						args->treeid, args->objectid,
						args->name);
	} else {
		ret = -EPERM;
	}

	/* Only hand the structure back when the lookup itself succeeded. */
	if (!ret && copy_to_user(argp, args, sizeof(*args)))
		ret = -EFAULT;

	kfree(args);
	return ret;
}
2018-05-21 04:09:44 +03:00
/*
 * Version of ino_lookup ioctl (unprivileged)
 *
 * Compared with the ino_lookup ioctl:
 *
 * 1. Read + Exec permission is checked with inode_permission() while the
 *    path is being constructed; -EACCES is returned when a check fails.
 * 2. Path construction stops at the inode number corresponding to the fd
 *    the ioctl is called with.  If the constructed path does not exist under
 *    the fd's inode, -EACCES is returned.
 * 3. The name of the bottom subvolume is looked up and filled in as well.
 *
 * @file: open file the ioctl was issued on
 * @argp: user pointer to a struct btrfs_ioctl_ino_lookup_user_args (in/out)
 */
static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
{
	struct inode *inode = file_inode(file);
	struct btrfs_ioctl_ino_lookup_user_args *args;
	int ret;

	args = memdup_user(argp, sizeof(*args));
	if (IS_ERR(args))
		return PTR_ERR(args);

	if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
	    BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
		/*
		 * The subvolume does not exist under the fd this ioctl was
		 * called with.
		 */
		ret = -EACCES;
		goto free_args;
	}

	ret = btrfs_search_path_in_tree_user(inode, args);
	if (!ret && copy_to_user(argp, args, sizeof(*args)))
		ret = -EFAULT;

free_args:
	kfree(args);
	return ret;
}
2018-05-21 04:09:42 +03:00
/*
 * Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF
 *
 * @file: open file inside the subvolume being queried
 * @argp: user pointer that receives a struct
 *        btrfs_ioctl_get_subvol_info_args
 *
 * The ids, flags, uuids, transaction ids and timestamps are taken from the
 * root item of the subvolume that @file's inode belongs to.  Unless that
 * subvolume is BTRFS_FS_TREE_OBJECTID, its ROOT_BACKREF item in the tree
 * root is also looked up to fill in parent_id, dirid and name.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -ENOENT if no
 * ROOT_BACKREF exists for the subvolume, -EUCLEAN if the tree ends
 * unexpectedly, -EFAULT if the result cannot be copied to user space, or a
 * negative error from the root lookup or tree search.
 */
static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
{
	struct btrfs_ioctl_get_subvol_info_args *subvol_info;
	struct btrfs_fs_info *fs_info;
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root_item *root_item;
	struct btrfs_root_ref *rref;
	struct extent_buffer *leaf;
	unsigned long item_off;
	unsigned long item_len;
	struct inode *inode;
	int slot;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Zeroed, so fields that are never filled in read back as 0. */
	subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
	if (!subvol_info) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	inode = file_inode(file);
	fs_info = BTRFS_I(inode)->root->fs_info;

	/* Get root_item of inode's subvolume */
	key.objectid = BTRFS_I(inode)->root->root_key.objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);
		goto out;
	}
	root_item = &root->root_item;

	subvol_info->treeid = key.objectid;

	subvol_info->generation = btrfs_root_generation(root_item);
	subvol_info->flags = btrfs_root_flags(root_item);

	memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
	memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
	       BTRFS_UUID_SIZE);
	memcpy(subvol_info->received_uuid, root_item->received_uuid,
	       BTRFS_UUID_SIZE);

	subvol_info->ctransid = btrfs_root_ctransid(root_item);
	subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
	subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);

	subvol_info->otransid = btrfs_root_otransid(root_item);
	subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
	subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);

	subvol_info->stransid = btrfs_root_stransid(root_item);
	subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
	subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);

	subvol_info->rtransid = btrfs_root_rtransid(root_item);
	subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
	subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);

	if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
		/* Search root tree for ROOT_BACKREF of this subvolume */
		root = fs_info->tree_root;

		key.type = BTRFS_ROOT_BACKREF_KEY;
		key.offset = 0;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			goto out;
		} else if (path->slots[0] >=
			   btrfs_header_nritems(path->nodes[0])) {
			/* Landed past the last item of the leaf. */
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				goto out;
			} else if (ret > 0) {
				ret = -EUCLEAN;
				goto out;
			}
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid != subvol_info->treeid ||
		    key.type != BTRFS_ROOT_BACKREF_KEY) {
			ret = -ENOENT;
			goto out;
		}

		subvol_info->parent_id = key.offset;

		rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
		subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);

		/* The name is stored right behind the fixed-size root ref. */
		item_off = btrfs_item_ptr_offset(leaf, slot)
				+ sizeof(struct btrfs_root_ref);
		item_len = btrfs_item_size_nr(leaf, slot)
				- sizeof(struct btrfs_root_ref);
		/*
		 * NOTE(review): item_len comes from the on-disk item and is
		 * assumed to fit subvol_info->name - confirm this is validated
		 * elsewhere.
		 */
		read_extent_buffer(leaf, subvol_info->name,
				   item_off, item_len);

		/*
		 * The search key used offset 0, so btrfs_search_slot() may
		 * have returned a positive "no exact match" value.  The
		 * backref was found, so do not let that value escape as the
		 * ioctl result.
		 */
		ret = 0;
	}

	/*
	 * Drop the tree locks and references held by the path before touching
	 * user memory: copy_to_user() may fault and the fault handler must
	 * not run with them held.
	 */
	btrfs_release_path(path);

	if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
		ret = -EFAULT;

out:
	btrfs_free_path(path);
	/* Only holds data destined for the caller, no need to zero it. */
	kfree(subvol_info);
	return ret;
}
2018-05-21 04:09:43 +03:00
/*
 * Return ROOT_REF information of the subvolume containing this inode
 * except the subvolume name.
 *
 * @file: open file inside the subvolume being queried
 * @argp: user pointer to a struct btrfs_ioctl_get_subvol_rootref_args; its
 *        min_treeid is the search start on input, and the structure is
 *        rewritten with the collected entries on output
 *
 * ROOT_REF items of the subvolume are walked in the tree root starting at
 * key offset min_treeid.  At most BTRFS_MAX_ROOTREF_BUFFER_NUM entries are
 * returned per call; when more exist the call fails with -EOVERFLOW but
 * still copies out what was collected, with min_treeid advanced so the
 * caller can continue from there.
 *
 * Returns 0 on success, -EOVERFLOW as described above, -ENOMEM, -EUCLEAN if
 * the tree ends unexpectedly, -EFAULT on a failed user copy, or a negative
 * error from the tree search.
 */
static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
{
	struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
	struct btrfs_root_ref *rref;
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct inode *inode;
	u64 objectid;
	int slot;
	int ret;
	u8 found;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	rootrefs = memdup_user(argp, sizeof(*rootrefs));
	if (IS_ERR(rootrefs)) {
		btrfs_free_path(path);
		return PTR_ERR(rootrefs);
	}

	inode = file_inode(file);
	root = BTRFS_I(inode)->root->fs_info->tree_root;
	objectid = BTRFS_I(inode)->root->root_key.objectid;

	key.objectid = objectid;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = rootrefs->min_treeid;
	found = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (path->slots[0] >=
		   btrfs_header_nritems(path->nodes[0])) {
		/* Landed past the last item of the leaf. */
		ret = btrfs_next_leaf(root, path);
		if (ret < 0) {
			goto out;
		} else if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* First item that is not one of our ROOT_REFs ends the walk. */
		if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
			ret = 0;
			goto out;
		}

		/* Another ref exists but the output array is already full. */
		if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
			ret = -EOVERFLOW;
			goto out;
		}

		rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
		rootrefs->rootref[found].treeid = key.offset;
		rootrefs->rootref[found].dirid =
				  btrfs_root_ref_dirid(leaf, rref);
		found++;

		ret = btrfs_next_item(root, path);
		if (ret < 0) {
			goto out;
		} else if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	/*
	 * Release the path (and the tree locks it holds) before copying to
	 * user space: copy_to_user() may fault and the fault handler must not
	 * run with them held.
	 */
	btrfs_free_path(path);

	if (!ret || ret == -EOVERFLOW) {
		rootrefs->num_items = found;
		/* update min_treeid for next search */
		if (found)
			rootrefs->min_treeid =
				rootrefs->rootref[found - 1].treeid + 1;
		if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
			ret = -EFAULT;
	}

	kfree(rootrefs);

	return ret;
}
2009-09-22 00:00:26 +04:00
/*
 * Delete the subvolume or snapshot named in the ioctl argument.
 *
 * @file: open directory that contains the entry to delete
 * @arg:  user pointer to a struct btrfs_ioctl_vol_args carrying the name
 *
 * The name is looked up as a single component under @file's dentry; names
 * containing '/' and names for which strncmp() against ".." over the name
 * length reports a match are rejected with -EINVAL.
 *
 * Returns 0 on success, -ENOTDIR if @file is not a directory, -ENOENT if
 * the name does not exist, -EPERM/-EINVAL/-EACCES style errors from the
 * permission checks below, or the error from any helper that failed.
 */
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
					     void __user *arg)
{
	struct dentry *parent = file->f_path.dentry;
	struct inode *dir = d_inode(parent);
	struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
	struct btrfs_root *dir_root = BTRFS_I(dir)->root;
	struct btrfs_root *victim_root;
	struct btrfs_ioctl_vol_args *args;
	struct dentry *victim;
	struct inode *victim_inode;
	int name_len;
	int ret;

	if (!S_ISDIR(dir->i_mode))
		return -ENOTDIR;

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args))
		return PTR_ERR(args);

	/* The buffer came from user space: force termination. */
	args->name[BTRFS_PATH_NAME_MAX] = '\0';
	name_len = strlen(args->name);
	if (strchr(args->name, '/') ||
	    !strncmp(args->name, "..", name_len)) {
		ret = -EINVAL;
		goto out;
	}

	ret = mnt_want_write_file(file);
	if (ret)
		goto out;

	ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
	if (ret == -EINTR)
		goto out_drop_write;

	victim = lookup_one_len(args->name, parent, name_len);
	if (IS_ERR(victim)) {
		ret = PTR_ERR(victim);
		goto out_unlock_dir;
	}

	if (d_really_is_negative(victim)) {
		ret = -ENOENT;
		goto out_dput;
	}

	victim_inode = d_inode(victim);
	victim_root = BTRFS_I(victim_inode)->root;

	if (!capable(CAP_SYS_ADMIN)) {
		/*
		 * Regular user.  Only allow this with a special mount
		 * option, when the user has write+exec access to the
		 * subvol root, and when rmdir(2) would have been
		 * allowed.
		 *
		 * Note that this is _not_ a check that the subvol is
		 * empty or doesn't contain data that we wouldn't
		 * otherwise be able to delete.
		 *
		 * Users who want to delete empty subvols should try
		 * rmdir(2).
		 */
		if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) {
			ret = -EPERM;
			goto out_dput;
		}

		/*
		 * Do not allow deletion if the parent dir is the same
		 * as the dir to be deleted.  That means the ioctl
		 * must be called on the dentry referencing the root
		 * of the subvol, not a random directory contained
		 * within it.
		 */
		if (dir_root == victim_root) {
			ret = -EINVAL;
			goto out_dput;
		}

		ret = inode_permission(victim_inode, MAY_WRITE | MAY_EXEC);
		if (ret)
			goto out_dput;
	}

	/* check if subvolume may be deleted by a user */
	ret = btrfs_may_delete(dir, victim, 1);
	if (ret)
		goto out_dput;

	/* Only an inode numbered BTRFS_FIRST_FREE_OBJECTID is accepted. */
	if (btrfs_ino(BTRFS_I(victim_inode)) != BTRFS_FIRST_FREE_OBJECTID) {
		ret = -EINVAL;
		goto out_dput;
	}

	inode_lock(victim_inode);
	ret = btrfs_delete_subvolume(dir, victim);
	inode_unlock(victim_inode);
	if (!ret) {
		fsnotify_rmdir(dir, victim);
		d_delete(victim);
	}

out_dput:
	dput(victim);
out_unlock_dir:
	inode_unlock(dir);
out_drop_write:
	mnt_drop_write_file(file);
out:
	kfree(args);
	return ret;
}
2010-03-11 17:42:04 +03:00
/*
 * Handler for the defrag ioctls.
 *
 * @file: file or directory to defragment
 * @argp: optional user pointer to a struct btrfs_ioctl_defrag_range_args;
 *        only consulted for regular files
 *
 * Directories are handed to btrfs_defrag_root() and need CAP_SYS_ADMIN.
 * Regular files are handed to btrfs_defrag_file(); with a NULL @argp the
 * range is all zeroes except len, which is set to (u64)-1.  Any other file
 * type is rejected with -EINVAL.
 *
 * Returns 0 on success (a positive result from btrfs_defrag_file() is
 * folded to 0), -EROFS on a read-only root, -EPERM, -ENOMEM, -EFAULT,
 * -EINVAL, or the error from the helper that failed.
 */
static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ioctl_defrag_range_args *range;
	int ret;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (btrfs_root_readonly(root)) {
		ret = -EROFS;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		if (capable(CAP_SYS_ADMIN))
			ret = btrfs_defrag_root(root);
		else
			ret = -EPERM;
	} else if (S_ISREG(inode->i_mode)) {
		/*
		 * Note that this does not check the file descriptor for write
		 * access. This prevents defragmenting executables that are
		 * running and allows defrag on files open in read-only mode.
		 */
		if (!capable(CAP_SYS_ADMIN) &&
		    inode_permission(inode, MAY_WRITE)) {
			ret = -EPERM;
			goto out;
		}

		range = kzalloc(sizeof(*range), GFP_KERNEL);
		if (!range) {
			ret = -ENOMEM;
			goto out;
		}

		if (!argp) {
			/* the rest are all set to zero by kzalloc */
			range->len = (u64)-1;
		} else if (copy_from_user(range, argp, sizeof(*range))) {
			kfree(range);
			ret = -EFAULT;
			goto out;
		} else if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
			/* compression requires us to start the IO */
			range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
			range->extent_thresh = (u32)-1;
		}

		ret = btrfs_defrag_file(inode, file, range,
					BTRFS_OLDEST_GENERATION, 0);
		kfree(range);
		/* A positive result is still success for the caller. */
		if (ret > 0)
			ret = 0;
	} else {
		ret = -EINVAL;
	}

out:
	mnt_drop_write_file(file);
	return ret;
}
2016-06-23 01:54:24 +03:00
/*
 * Add the device named in the ioctl argument to the filesystem.
 *
 * @fs_info: filesystem to extend
 * @arg:     user pointer to a struct btrfs_ioctl_vol_args holding the path
 *
 * Requires CAP_SYS_ADMIN.  The BTRFS_FS_EXCL_OP bit is taken for the
 * duration of the call; if it is already set,
 * BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS is returned.
 *
 * Returns 0 on success, -EPERM, the value above, or the error from
 * memdup_user() / btrfs_init_new_device().
 */
static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
{
	struct btrfs_ioctl_vol_args *args;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;

	args = memdup_user(arg, sizeof(*args));
	if (!IS_ERR(args)) {
		/* The buffer came from user space: force termination. */
		args->name[BTRFS_PATH_NAME_MAX] = '\0';

		ret = btrfs_init_new_device(fs_info, args->name);
		if (ret == 0)
			btrfs_info(fs_info, "disk added %s", args->name);

		kfree(args);
	} else {
		ret = PTR_ERR(args);
	}

	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
	return ret;
}
2016-02-13 05:01:39 +03:00
/*
 * Remove a device from the filesystem (v2 argument layout).
 *
 * @file: open file on the filesystem
 * @arg:  user pointer to a struct btrfs_ioctl_vol_args_v2
 *
 * The device is selected by devid when BTRFS_DEVICE_SPEC_BY_ID is set in
 * the flags, otherwise by name.  Requires CAP_SYS_ADMIN.  Flags outside
 * BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED yield -EOPNOTSUPP, and a set
 * BTRFS_FS_EXCL_OP bit yields BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS.
 *
 * Returns 0 on success or one of the values above, -EPERM, or the error
 * from mnt_want_write_file() / memdup_user() / btrfs_rm_device().
 */
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
	struct btrfs_ioctl_vol_args_v2 *args;
	bool by_id;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args)) {
		ret = PTR_ERR(args);
		goto err_drop;
	}

	/* Check for compatibility reject unknown flags */
	if (args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
		goto out;
	}

	by_id = (args->flags & BTRFS_DEVICE_SPEC_BY_ID) != 0;
	if (!by_id) {
		/* The buffer came from user space: force termination. */
		args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
		ret = btrfs_rm_device(fs_info, args->name, 0);
	} else {
		ret = btrfs_rm_device(fs_info, NULL, args->devid);
	}
	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);

	if (ret == 0) {
		if (by_id)
			btrfs_info(fs_info, "device deleted: id %llu",
				   args->devid);
		else
			btrfs_info(fs_info, "device deleted: %s",
				   args->name);
	}

out:
	kfree(args);
err_drop:
	mnt_drop_write_file(file);
	return ret;
}
2012-11-26 12:44:50 +04:00
/*
 * Remove the device named in the ioctl argument from the filesystem.
 *
 * @file: open file on the filesystem
 * @arg:  user pointer to a struct btrfs_ioctl_vol_args holding the path
 *
 * Requires CAP_SYS_ADMIN.  The BTRFS_FS_EXCL_OP bit is held while the
 * device is removed; if it is already set,
 * BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS is returned.
 *
 * Returns 0 on success, -EPERM, the value above, or the error from
 * mnt_want_write_file() / memdup_user() / btrfs_rm_device().
 */
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
	struct btrfs_ioctl_vol_args *args;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
		goto out_drop_write;
	}

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args)) {
		ret = PTR_ERR(args);
		goto out_clear_excl;
	}

	/* The buffer came from user space: force termination. */
	args->name[BTRFS_PATH_NAME_MAX] = '\0';

	ret = btrfs_rm_device(fs_info, args->name, 0);
	if (ret == 0)
		btrfs_info(fs_info, "disk deleted %s", args->name);

	kfree(args);
out_clear_excl:
	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out_drop_write:
	mnt_drop_write_file(file);

	return ret;
}
2016-06-23 01:54:24 +03:00
/*
 * Report filesystem-wide information to user space.
 *
 * @fs_info: filesystem being queried
 * @arg:     user pointer that receives a struct btrfs_ioctl_fs_info_args
 *
 * Fills in the device count, the highest devid found on the device list,
 * the fsid, nodesize, sectorsize and clone_alignment (set to the sector
 * size).
 *
 * Returns 0 on success, -ENOMEM, or -EFAULT if the copy to @arg fails.
 */
static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
				void __user *arg)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_ioctl_fs_info_args *info;
	struct btrfs_device *dev;
	int ret;

	/* Zeroed, so max_id starts out at 0. */
	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	/* The device list is walked under RCU protection. */
	rcu_read_lock();
	info->num_devices = fs_devices->num_devices;
	list_for_each_entry_rcu(dev, &fs_devices->devices, dev_list) {
		if (info->max_id < dev->devid)
			info->max_id = dev->devid;
	}
	rcu_read_unlock();

	memcpy(&info->fsid, fs_devices->fsid, sizeof(info->fsid));
	info->nodesize = fs_info->nodesize;
	info->sectorsize = fs_info->sectorsize;
	info->clone_alignment = fs_info->sectorsize;

	ret = copy_to_user(arg, info, sizeof(*info)) ? -EFAULT : 0;

	kfree(info);
	return ret;
}
2016-06-23 01:54:24 +03:00
/*
 * Report information about one device of the filesystem.
 *
 * @fs_info: filesystem being queried
 * @arg:     user pointer to a struct btrfs_ioctl_dev_info_args (in/out)
 *
 * The device is looked up by the devid supplied by the caller; the supplied
 * uuid is passed to the lookup as well unless it is the empty uuid.  On
 * success devid, bytes_used, total_bytes, uuid and path are filled in; path
 * is the empty string when the device has no name.
 *
 * Returns 0 on success, -ENODEV if no device matches, -EFAULT if the copy
 * back fails, or the error from memdup_user().
 */
static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
				 void __user *arg)
{
	struct btrfs_ioctl_dev_info_args *info;
	struct btrfs_device *dev;
	char *s_uuid = NULL;
	int ret = -ENODEV;

	info = memdup_user(arg, sizeof(*info));
	if (IS_ERR(info))
		return PTR_ERR(info);

	if (!btrfs_is_empty_uuid(info->uuid))
		s_uuid = info->uuid;

	/* The lookup and every access to the device happen under RCU. */
	rcu_read_lock();
	dev = btrfs_find_device(fs_info->fs_devices, info->devid, s_uuid,
				NULL, true);
	if (dev) {
		info->devid = dev->devid;
		info->bytes_used = btrfs_device_get_bytes_used(dev);
		info->total_bytes = btrfs_device_get_total_bytes(dev);
		memcpy(info->uuid, dev->uuid, sizeof(info->uuid));

		if (!dev->name) {
			info->path[0] = '\0';
		} else {
			/* Bounded copy, then terminate explicitly. */
			strncpy(info->path, rcu_str_deref(dev->name),
				sizeof(info->path) - 1);
			info->path[sizeof(info->path) - 1] = 0;
		}
		ret = 0;
	}
	rcu_read_unlock();

	/* User memory is only touched after leaving the RCU section. */
	if (!ret && copy_to_user(arg, info, sizeof(*info)))
		ret = -EFAULT;

	kfree(info);
	return ret;
}
Btrfs: fix race between reflink/dedupe and relocation
The recent rework that makes btrfs' remap_file_range operation use the
generic helper generic_remap_file_range_prep() introduced a race between
relocation and reflinking (for both cloning and deduplication) the file
extents between the source and destination inodes.
This happens because we no longer lock the source range. We stopped locking
it because we wait for direct IO writes and writeback to complete early in
the code path, right after locking the inodes, which guarantees that no other
file operations interfere with the reflinking. However
there is one exception which is relocation, since it replaces the byte
number of file extents items in the fs tree after locking the range the
file extent items represent. This is a problem because after finding each
file extent to clone in the fs tree, the reflink process copies the file
extent item into a local buffer, releases the search path, inserts new
file extent items in the destination range and then increments the
reference count for the extent mentioned in the file extent item that it
previously copied to the buffer. If right after copying the file extent
item into the buffer and releasing the path the relocation process
updates the file extent item to point to the new extent, the reflink
process ends up creating a delayed reference to increment the reference
count of the old extent, for which the relocation process already created
a delayed reference to drop it. This results in failure to run delayed
references because we will attempt to increment the count of a reference
that was already dropped. This is illustrated by the following diagram:
           CPU 1                                       CPU 2

                                                relocation is running

 btrfs_clone_files()

   btrfs_clone()
     --> finds extent item
         in source range
         point to extent
         at bytenr X

     --> copies it into a
         local buffer

     --> releases path

                                                replace_file_extents()
                                                  --> successfully locks the
                                                      range represented by
                                                      the file extent item

                                                  --> replaces disk_bytenr
                                                      field in the file
                                                      extent item with some
                                                      other value Y

                                                  --> creates delayed reference
                                                      to increment reference
                                                      count for extent at
                                                      bytenr Y

                                                  --> creates delayed reference
                                                      to drop the extent at
                                                      bytenr X

     --> starts transaction

     --> creates delayed
         reference to
         increment extent
         at bytenr X

            <delayed references are run, due to a transaction
             commit for example, and the transaction is aborted
             with -EIO because we attempt to increment reference
             count for the extent at bytenr X after we freed it>
When this race is hit the running transaction ends up getting aborted with
an -EIO error and a trace like the following is produced:
[ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1
[ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202
[ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001
[ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000
[ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
[ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000
[ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548
[ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000
[ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0
[ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 4382.564887] Call Trace:
[ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs]
[ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs]
[ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs]
[ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs]
[ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs]
[ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs]
[ 4382.568112] ? _raw_spin_unlock+0x24/0x30
[ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs]
[ 4382.569006] create_subvol+0x3c8/0x830 [btrfs]
[ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60
[ 4382.570822] ? __sb_start_write+0xd4/0x1c0
[ 4382.571262] ? mnt_want_write_file+0x24/0x50
[ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs]
[ 4382.572155] ? _copy_from_user+0x66/0x90
[ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs]
[ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs]
[ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570
[ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0
[ 4382.574379] ? _raw_spin_unlock+0x24/0x30
[ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0
[ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0
[ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[ 4382.576020] do_vfs_ioctl+0xa2/0x6f0
[ 4382.576405] ksys_ioctl+0x70/0x80
[ 4382.576776] __x64_sys_ioctl+0x16/0x20
[ 4382.577137] do_syscall_64+0x60/0x1b0
[ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe
(...)
[ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7
[ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003
[ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044
[ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010
[ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050
[ 4382.580792] irq event stamp: 0
[ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null)
[ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null)
[ 4382.582413] ---[ end trace d3c188e3e9367382 ]---
[ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure
[ 4382.624295] BTRFS info (device sdc): forced readonly
Fix this by locking the source range before searching for the file extent
items in the fs tree, since the relocation process will try to lock the
range a file extent item represents before updating it with the new extent
location.
Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 14:43:07 +03:00
/*
 * Unlock the byte range [loff, loff + len - 1] in the io tree of each of
 * the two inodes: @loff1 in @inode1 first, then @loff2 in @inode2.
 */
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
				       struct inode *inode2, u64 loff2, u64 len)
{
	const u64 end1 = loff1 + len - 1;
	const u64 end2 = loff2 + len - 1;

	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, end1);
	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, end2);
}
/*
 * Lock the byte range [loff, loff + len - 1] in the io tree of each of the
 * two inodes.
 *
 * The pairs are put into a fixed order before locking: the inode with the
 * higher pointer value goes first, and when both arguments are the same
 * inode the lower offset goes first.
 */
static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
				     struct inode *inode2, u64 loff2, u64 len)
{
	const bool same_inode = (inode1 == inode2);

	if (inode1 < inode2) {
		swap(inode1, inode2);
		swap(loff1, loff2);
	} else if (same_inode && loff1 > loff2) {
		swap(loff1, loff2);
	}

	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
2018-12-12 21:05:59 +03:00
static int btrfs_extent_same_range ( struct inode * src , u64 loff , u64 len ,
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then if we found any delalloc for the range, or
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
struct inode * dst , u64 dst_loff )
2013-08-06 22:42:51 +04:00
{
int ret ;
2015-07-01 00:42:05 +03:00
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then if we found any delalloc for the range, or
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
/*
Btrfs: fix race between reflink/dedupe and relocation
The recent rework that makes btrfs' remap_file_range operation use the
generic helper generic_remap_file_range_prep() introduced a race between
relocation and reflinking (for both cloning and deduplication) the file
extents between the source and destination inodes.
This happens because we no longer lock the source range, and we do
not lock it because we wait for direct IO writes and writeback to
complete early in the code path right after locking the inodes, which
guarantees no other file operations interfere with the reflinking. However,
there is one exception, which is relocation, since it replaces the byte
number of file extent items in the fs tree after locking the range the
file extent items represent. This is a problem because after finding each
file extent to clone in the fs tree, the reflink process copies the file
extent item into a local buffer, releases the search path, inserts new
file extent items in the destination range and then increments the
reference count for the extent mentioned in the file extent item that it
previously copied to the buffer. If right after copying the file extent
item into the buffer and releasing the path the relocation process
updates the file extent item to point to the new extent, the reflink
process ends up creating a delayed reference to increment the reference
count of the old extent, for which the relocation process already created
a delayed reference to drop it. This results in failure to run delayed
references because we will attempt to increment the count of a reference
that was already dropped. This is illustrated by the following diagram:
CPU 1 CPU 2
relocation is running
btrfs_clone_files()
btrfs_clone()
--> finds extent item
in source range
point to extent
at bytenr X
--> copies it into a
local buffer
--> releases path
replace_file_extents()
--> successfully locks the
range represented by
the file extent item
--> replaces disk_bytenr
field in the file
extent item with some
other value Y
--> creates delayed reference
to increment reference
count for extent at
bytenr Y
--> creates delayed reference
to drop the extent at
bytenr X
--> starts transaction
--> creates delayed
reference to
increment extent
at bytenr X
<delayed references are run, due to a transaction
commit for example, and the transaction is aborted
with -EIO because we attempt to increment reference
count for the extent at bytenr X after we freed it>
When this race is hit the running transaction ends up getting aborted with
an -EIO error and a trace like the following is produced:
[ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1
[ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202
[ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001
[ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000
[ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
[ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000
[ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548
[ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000
[ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0
[ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 4382.564887] Call Trace:
[ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs]
[ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs]
[ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs]
[ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs]
[ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs]
[ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs]
[ 4382.568112] ? _raw_spin_unlock+0x24/0x30
[ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs]
[ 4382.569006] create_subvol+0x3c8/0x830 [btrfs]
[ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60
[ 4382.570822] ? __sb_start_write+0xd4/0x1c0
[ 4382.571262] ? mnt_want_write_file+0x24/0x50
[ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs]
[ 4382.572155] ? _copy_from_user+0x66/0x90
[ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs]
[ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs]
[ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570
[ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0
[ 4382.574379] ? _raw_spin_unlock+0x24/0x30
[ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0
[ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0
[ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[ 4382.576020] do_vfs_ioctl+0xa2/0x6f0
[ 4382.576405] ksys_ioctl+0x70/0x80
[ 4382.576776] __x64_sys_ioctl+0x16/0x20
[ 4382.577137] do_syscall_64+0x60/0x1b0
[ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe
(...)
[ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7
[ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003
[ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044
[ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010
[ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050
[ 4382.580792] irq event stamp: 0
[ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null)
[ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null)
[ 4382.582413] ---[ end trace d3c188e3e9367382 ]---
[ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure
[ 4382.624295] BTRFS info (device sdc): forced readonly
Fix this by locking the source range before searching for the file extent
items in the fs tree, since the relocation process will try to lock the
range a file extent item represents before updating it with the new extent
location.
Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 14:43:07 +03:00
* Lock destination range to serialize with concurrent readpages ( ) and
* source range to serialize with relocation .
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then if we found any delalloc for the range, or
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
*/
Btrfs: fix race between reflink/dedupe and relocation
The recent rework that makes btrfs' remap_file_range operation use the
generic helper generic_remap_file_range_prep() introduced a race between
relocation and reflinking (for both cloning and deduplication) the file
extents between the source and destination inodes.
This happens because we no longer lock the source range, and we do
not lock it because we wait for direct IO writes and writeback to
complete early in the code path right after locking the inodes, which
guarantees no other file operations interfere with the reflinking. However,
there is one exception, which is relocation, since it replaces the byte
number of file extent items in the fs tree after locking the range the
file extent items represent. This is a problem because after finding each
file extent to clone in the fs tree, the reflink process copies the file
extent item into a local buffer, releases the search path, inserts new
file extent items in the destination range and then increments the
reference count for the extent mentioned in the file extent item that it
previously copied to the buffer. If right after copying the file extent
item into the buffer and releasing the path the relocation process
updates the file extent item to point to the new extent, the reflink
process ends up creating a delayed reference to increment the reference
count of the old extent, for which the relocation process already created
a delayed reference to drop it. This results in failure to run delayed
references because we will attempt to increment the count of a reference
that was already dropped. This is illustrated by the following diagram:
CPU 1 CPU 2
relocation is running
btrfs_clone_files()
btrfs_clone()
--> finds extent item
in source range
point to extent
at bytenr X
--> copies it into a
local buffer
--> releases path
replace_file_extents()
--> successfully locks the
range represented by
the file extent item
--> replaces disk_bytenr
field in the file
extent item with some
other value Y
--> creates delayed reference
to increment reference
count for extent at
bytenr Y
--> creates delayed reference
to drop the extent at
bytenr X
--> starts transaction
--> creates delayed
reference to
increment extent
at bytenr X
<delayed references are run, due to a transaction
commit for example, and the transaction is aborted
with -EIO because we attempt to increment reference
count for the extent at bytenr X after we freed it>
When this race is hit the running transaction ends up getting aborted with
an -EIO error and a trace like the following is produced:
[ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1
[ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202
[ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001
[ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000
[ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
[ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000
[ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548
[ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000
[ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0
[ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 4382.564887] Call Trace:
[ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs]
[ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs]
[ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs]
[ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs]
[ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs]
[ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs]
[ 4382.568112] ? _raw_spin_unlock+0x24/0x30
[ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs]
[ 4382.569006] create_subvol+0x3c8/0x830 [btrfs]
[ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60
[ 4382.570822] ? __sb_start_write+0xd4/0x1c0
[ 4382.571262] ? mnt_want_write_file+0x24/0x50
[ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs]
[ 4382.572155] ? _copy_from_user+0x66/0x90
[ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs]
[ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs]
[ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570
[ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0
[ 4382.574379] ? _raw_spin_unlock+0x24/0x30
[ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0
[ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0
[ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[ 4382.576020] do_vfs_ioctl+0xa2/0x6f0
[ 4382.576405] ksys_ioctl+0x70/0x80
[ 4382.576776] __x64_sys_ioctl+0x16/0x20
[ 4382.577137] do_syscall_64+0x60/0x1b0
[ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe
(...)
[ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7
[ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003
[ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044
[ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010
[ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050
[ 4382.580792] irq event stamp: 0
[ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null)
[ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null)
[ 4382.582413] ---[ end trace d3c188e3e9367382 ]---
[ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure
[ 4382.624295] BTRFS info (device sdc): forced readonly
Fix this by locking the source range before searching for the file extent
items in the fs tree, since the relocation process will try to lock the
range a file extent item represents before updating it with the new extent
location.
Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 14:43:07 +03:00
btrfs_double_extent_lock ( src , loff , dst , dst_loff , len ) ;
2018-12-12 21:05:59 +03:00
ret = btrfs_clone ( src , dst , loff , len , len , dst_loff , 1 ) ;
Btrfs: fix race between reflink/dedupe and relocation
The recent rework that makes btrfs' remap_file_range operation use the
generic helper generic_remap_file_range_prep() introduced a race between
relocation and reflinking (for both cloning and deduplication) the file
extents between the source and destination inodes.
This happens because we no longer lock the source range, and we do
not lock it because we wait for direct IO writes and writeback to
complete early in the code path right after locking the inodes, which
guarantees no other file operations interfere with the reflinking. However,
there is one exception, which is relocation, since it replaces the byte
number of file extent items in the fs tree after locking the range the
file extent items represent. This is a problem because after finding each
file extent to clone in the fs tree, the reflink process copies the file
extent item into a local buffer, releases the search path, inserts new
file extent items in the destination range and then increments the
reference count for the extent mentioned in the file extent item that it
previously copied to the buffer. If right after copying the file extent
item into the buffer and releasing the path the relocation process
updates the file extent item to point to the new extent, the reflink
process ends up creating a delayed reference to increment the reference
count of the old extent, for which the relocation process already created
a delayed reference to drop it. This results in failure to run delayed
references because we will attempt to increment the count of a reference
that was already dropped. This is illustrated by the following diagram:
CPU 1 CPU 2
relocation is running
btrfs_clone_files()
btrfs_clone()
--> finds extent item
in source range
point to extent
at bytenr X
--> copies it into a
local buffer
--> releases path
replace_file_extents()
--> successfully locks the
range represented by
the file extent item
--> replaces disk_bytenr
field in the file
extent item with some
other value Y
--> creates delayed reference
to increment reference
count for extent at
bytenr Y
--> creates delayed reference
to drop the extent at
bytenr X
--> starts transaction
--> creates delayed
reference to
increment extent
at bytenr X
<delayed references are run, due to a transaction
commit for example, and the transaction is aborted
with -EIO because we attempt to increment reference
count for the extent at bytenr X after we freed it>
When this race is hit the running transaction ends up getting aborted with
an -EIO error and a trace like the following is produced:
[ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1
[ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202
[ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001
[ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000
[ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
[ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000
[ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548
[ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000
[ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0
[ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 4382.564887] Call Trace:
[ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs]
[ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs]
[ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs]
[ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs]
[ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs]
[ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs]
[ 4382.568112] ? _raw_spin_unlock+0x24/0x30
[ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs]
[ 4382.569006] create_subvol+0x3c8/0x830 [btrfs]
[ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60
[ 4382.570822] ? __sb_start_write+0xd4/0x1c0
[ 4382.571262] ? mnt_want_write_file+0x24/0x50
[ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs]
[ 4382.572155] ? _copy_from_user+0x66/0x90
[ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs]
[ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs]
[ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570
[ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0
[ 4382.574379] ? _raw_spin_unlock+0x24/0x30
[ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0
[ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0
[ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[ 4382.576020] do_vfs_ioctl+0xa2/0x6f0
[ 4382.576405] ksys_ioctl+0x70/0x80
[ 4382.576776] __x64_sys_ioctl+0x16/0x20
[ 4382.577137] do_syscall_64+0x60/0x1b0
[ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe
(...)
[ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7
[ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003
[ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044
[ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010
[ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050
[ 4382.580792] irq event stamp: 0
[ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null)
[ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null)
[ 4382.582413] ---[ end trace d3c188e3e9367382 ]---
[ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure
[ 4382.624295] BTRFS info (device sdc): forced readonly
Fix this by locking the source range before searching for the file extent
items in the fs tree, since the relocation process will try to lock the
range a file extent item represents before updating it with the new extent
location.
Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 14:43:07 +03:00
btrfs_double_extent_unlock ( src , loff , dst , dst_loff , len ) ;
2018-05-02 08:15:36 +03:00
return ret ;
}
2018-05-02 08:15:37 +03:00
/*
 * Length limit for dedupe, defined as SZ_16M.
 * NOTE(review): presumably the largest range deduplicated in one step by
 * btrfs_extent_same(), which would split longer requests into chunks of
 * this size - confirm against that function.
 */
# define BTRFS_MAX_DEDUPE_LEN SZ_16M
2018-05-02 08:15:36 +03:00
static int btrfs_extent_same ( struct inode * src , u64 loff , u64 olen ,
struct inode * dst , u64 dst_loff )
{
int ret ;
2018-05-02 08:15:37 +03:00
u64 i , tail_len , chunk_count ;
Btrfs: fix race between send and deduplication that lead to failures and crashes
Send operates on read only trees and expects them to never change while it
is using them. This is part of its initial design, and this expection is
due to two different reasons:
1) When it was introduced, no operations were allowed to modifiy read-only
subvolumes/snapshots (including defrag for example).
2) It keeps send from having an impact on other filesystem operations.
Namely send does not need to keep locks on the trees nor needs to hold on
to transaction handles and delay transaction commits. This ends up being
a consequence of the former reason.
However the deduplication feature was introduced later (on September 2013,
while send was introduced in July 2012) and it allowed for deduplication
with destination files that belong to read-only trees (subvolumes and
snapshots).
That means that having a send operation (either full or incremental) running
in parallel with a deduplication that has the destination inode in one of
the trees used by the send operation, can result in tree nodes and leaves
getting freed and reused while send is using them. This problem is similar
to the problem solved for the root nodes getting freed and reused when a
snapshot is made against one tree that is currenly being used by a send
operation, fixed in commits [1] and [2]. These commits explain in detail
how the problem happens and the explanation is valid for any node or leaf
that is not the root of a tree as well. This problem was also discussed
and explained recently in a thread [3].
The problem is very easy to reproduce when using send with large trees
(snapshots) and just a few concurrent deduplication operations that target
files in the trees used by send. A stress test case is being sent for
fstests that triggers the issue easily. The most common error to hit is
the send ioctl return -EIO with the following messages in dmesg/syslog:
[1631617.204075] BTRFS error (device sdc): did not find backref in send_root. inode=63292, offset=0, disk_byte=5228134400 found extent=5228134400
[1631633.251754] BTRFS error (device sdc): parent transid verify failed on 32243712 wanted 24 found 27
The first one is very easy to hit while the second one happens much less
frequently, except for very large trees (in that test case, snapshots
with 100000 files having large xattrs to get deep and wide trees).
Less frequently, at least one BUG_ON can be hit:
[1631742.130080] ------------[ cut here ]------------
[1631742.130625] kernel BUG at fs/btrfs/ctree.c:1806!
[1631742.131188] invalid opcode: 0000 [#6] SMP DEBUG_PAGEALLOC PTI
[1631742.131726] CPU: 1 PID: 13394 Comm: btrfs Tainted: G B D W 5.0.0-rc8-btrfs-next-45 #1
[1631742.132265] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[1631742.133399] RIP: 0010:read_node_slot+0x122/0x130 [btrfs]
(...)
[1631742.135061] RSP: 0018:ffffb530021ebaa0 EFLAGS: 00010246
[1631742.135615] RAX: ffff93ac8912e000 RBX: 000000000000009d RCX: 0000000000000002
[1631742.136173] RDX: 000000000000009d RSI: ffff93ac564b0d08 RDI: ffff93ad5b48c000
[1631742.136759] RBP: ffffb530021ebb7d R08: 0000000000000001 R09: ffffb530021ebb7d
[1631742.137324] R10: ffffb530021eba70 R11: 0000000000000000 R12: ffff93ac87d0a708
[1631742.137900] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001
[1631742.138455] FS: 00007f4cdb1528c0(0000) GS:ffff93ad76a80000(0000) knlGS:0000000000000000
[1631742.139010] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[1631742.139568] CR2: 00007f5acb3d0420 CR3: 000000012be3e006 CR4: 00000000003606e0
[1631742.140131] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[1631742.140719] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[1631742.141272] Call Trace:
[1631742.141826] ? do_raw_spin_unlock+0x49/0xc0
[1631742.142390] tree_advance+0x173/0x1d0 [btrfs]
[1631742.142948] btrfs_compare_trees+0x268/0x690 [btrfs]
[1631742.143533] ? process_extent+0x1070/0x1070 [btrfs]
[1631742.144088] btrfs_ioctl_send+0x1037/0x1270 [btrfs]
[1631742.144645] _btrfs_ioctl_send+0x80/0x110 [btrfs]
[1631742.145161] ? trace_sched_stick_numa+0xe0/0xe0
[1631742.145685] btrfs_ioctl+0x13fe/0x3120 [btrfs]
[1631742.146179] ? account_entity_enqueue+0xd3/0x100
[1631742.146662] ? reweight_entity+0x154/0x1a0
[1631742.147135] ? update_curr+0x20/0x2a0
[1631742.147593] ? check_preempt_wakeup+0x103/0x250
[1631742.148053] ? do_vfs_ioctl+0xa2/0x6f0
[1631742.148510] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[1631742.148942] do_vfs_ioctl+0xa2/0x6f0
[1631742.149361] ? __fget+0x113/0x200
[1631742.149767] ksys_ioctl+0x70/0x80
[1631742.150159] __x64_sys_ioctl+0x16/0x20
[1631742.150543] do_syscall_64+0x60/0x1b0
[1631742.150931] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[1631742.151326] RIP: 0033:0x7f4cd9f5add7
(...)
[1631742.152509] RSP: 002b:00007ffe91017708 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[1631742.152892] RAX: ffffffffffffffda RBX: 0000000000000105 RCX: 00007f4cd9f5add7
[1631742.153268] RDX: 00007ffe91017790 RSI: 0000000040489426 RDI: 0000000000000007
[1631742.153633] RBP: 0000000000000007 R08: 00007f4cd9e79700 R09: 00007f4cd9e79700
[1631742.153999] R10: 00007f4cd9e799d0 R11: 0000000000000202 R12: 0000000000000003
[1631742.154365] R13: 0000555dfae53020 R14: 0000000000000000 R15: 0000000000000001
(...)
[1631742.156696] ---[ end trace 5dac9f96dcc3fd6b ]---
That BUG_ON happens because while send is using a node, that node is COWed
by a concurrent deduplication, gets freed and gets reused as a leaf (because
a transaction commit happened in between), so when it attempts to read a
slot from the extent buffer, at ctree.c:read_node_slot(), the extent buffer
contents were wiped out and it now matches a leaf (which can even belong to
some other tree now), hitting the BUG_ON(level == 0).
Fix this concurrency issue by not allowing send and deduplication to run
in parallel if both operate on the same readonly trees, returning EAGAIN
to user space and logging an exlicit warning in dmesg/syslog.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be6821f82c3cc36e026f5afd10249988852b35ea
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6f2f0b394b54e2b159ef969a0b5274e9bbf82ff2
[3] https://lore.kernel.org/linux-btrfs/CAL3q7H7iqSEEyFaEtpRZw3cp613y+4k2Q8b4W7mweR3tZA05bQ@mail.gmail.com/
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-22 18:43:42 +03:00
struct btrfs_root * root_dst = BTRFS_I ( dst ) - > root ;
spin_lock ( & root_dst - > root_item_lock ) ;
if ( root_dst - > send_in_progress ) {
btrfs_warn_rl ( root_dst - > fs_info ,
" cannot deduplicate to root %llu while send operations are using it (%d in progress) " ,
root_dst - > root_key . objectid ,
root_dst - > send_in_progress ) ;
spin_unlock ( & root_dst - > root_item_lock ) ;
return - EAGAIN ;
}
root_dst - > dedupe_in_progress + + ;
spin_unlock ( & root_dst - > root_item_lock ) ;
2018-05-02 08:15:36 +03:00
2018-05-02 08:15:37 +03:00
tail_len = olen % BTRFS_MAX_DEDUPE_LEN ;
chunk_count = div_u64 ( olen , BTRFS_MAX_DEDUPE_LEN ) ;
2018-05-02 08:15:38 +03:00
2018-05-02 08:15:37 +03:00
for ( i = 0 ; i < chunk_count ; i + + ) {
ret = btrfs_extent_same_range ( src , loff , BTRFS_MAX_DEDUPE_LEN ,
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, specially for the deduplication case where we used to lock all
the pages first and then if we found any dealloc for the range, or
ordered extent, we would unlock the pages trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
dst , dst_loff ) ;
2018-05-02 08:15:37 +03:00
if ( ret )
Btrfs: fix race between send and deduplication that lead to failures and crashes
Send operates on read only trees and expects them to never change while it
is using them. This is part of its initial design, and this expection is
due to two different reasons:
1) When it was introduced, no operations were allowed to modifiy read-only
subvolumes/snapshots (including defrag for example).
2) It keeps send from having an impact on other filesystem operations.
Namely send does not need to keep locks on the trees nor needs to hold on
to transaction handles and delay transaction commits. This ends up being
a consequence of the former reason.
However the deduplication feature was introduced later (on September 2013,
while send was introduced in July 2012) and it allowed for deduplication
with destination files that belong to read-only trees (subvolumes and
snapshots).
That means that having a send operation (either full or incremental) running
in parallel with a deduplication that has the destination inode in one of
the trees used by the send operation, can result in tree nodes and leaves
getting freed and reused while send is using them. This problem is similar
to the problem solved for the root nodes getting freed and reused when a
snapshot is made against one tree that is currenly being used by a send
operation, fixed in commits [1] and [2]. These commits explain in detail
how the problem happens and the explanation is valid for any node or leaf
that is not the root of a tree as well. This problem was also discussed
and explained recently in a thread [3].
The problem is very easy to reproduce when using send with large trees
(snapshots) and just a few concurrent deduplication operations that target
files in the trees used by send. A stress test case is being sent for
fstests that triggers the issue easily. The most common error to hit is
the send ioctl return -EIO with the following messages in dmesg/syslog:
[1631617.204075] BTRFS error (device sdc): did not find backref in send_root. inode=63292, offset=0, disk_byte=5228134400 found extent=5228134400
[1631633.251754] BTRFS error (device sdc): parent transid verify failed on 32243712 wanted 24 found 27
The first one is very easy to hit while the second one happens much less
frequently, except for very large trees (in that test case, snapshots
with 100000 files having large xattrs to get deep and wide trees).
Less frequently, at least one BUG_ON can be hit:
[1631742.130080] ------------[ cut here ]------------
[1631742.130625] kernel BUG at fs/btrfs/ctree.c:1806!
[1631742.131188] invalid opcode: 0000 [#6] SMP DEBUG_PAGEALLOC PTI
[1631742.131726] CPU: 1 PID: 13394 Comm: btrfs Tainted: G B D W 5.0.0-rc8-btrfs-next-45 #1
[1631742.132265] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[1631742.133399] RIP: 0010:read_node_slot+0x122/0x130 [btrfs]
(...)
[1631742.135061] RSP: 0018:ffffb530021ebaa0 EFLAGS: 00010246
[1631742.135615] RAX: ffff93ac8912e000 RBX: 000000000000009d RCX: 0000000000000002
[1631742.136173] RDX: 000000000000009d RSI: ffff93ac564b0d08 RDI: ffff93ad5b48c000
[1631742.136759] RBP: ffffb530021ebb7d R08: 0000000000000001 R09: ffffb530021ebb7d
[1631742.137324] R10: ffffb530021eba70 R11: 0000000000000000 R12: ffff93ac87d0a708
[1631742.137900] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001
[1631742.138455] FS: 00007f4cdb1528c0(0000) GS:ffff93ad76a80000(0000) knlGS:0000000000000000
[1631742.139010] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[1631742.139568] CR2: 00007f5acb3d0420 CR3: 000000012be3e006 CR4: 00000000003606e0
[1631742.140131] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[1631742.140719] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[1631742.141272] Call Trace:
[1631742.141826] ? do_raw_spin_unlock+0x49/0xc0
[1631742.142390] tree_advance+0x173/0x1d0 [btrfs]
[1631742.142948] btrfs_compare_trees+0x268/0x690 [btrfs]
[1631742.143533] ? process_extent+0x1070/0x1070 [btrfs]
[1631742.144088] btrfs_ioctl_send+0x1037/0x1270 [btrfs]
[1631742.144645] _btrfs_ioctl_send+0x80/0x110 [btrfs]
[1631742.145161] ? trace_sched_stick_numa+0xe0/0xe0
[1631742.145685] btrfs_ioctl+0x13fe/0x3120 [btrfs]
[1631742.146179] ? account_entity_enqueue+0xd3/0x100
[1631742.146662] ? reweight_entity+0x154/0x1a0
[1631742.147135] ? update_curr+0x20/0x2a0
[1631742.147593] ? check_preempt_wakeup+0x103/0x250
[1631742.148053] ? do_vfs_ioctl+0xa2/0x6f0
[1631742.148510] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[1631742.148942] do_vfs_ioctl+0xa2/0x6f0
[1631742.149361] ? __fget+0x113/0x200
[1631742.149767] ksys_ioctl+0x70/0x80
[1631742.150159] __x64_sys_ioctl+0x16/0x20
[1631742.150543] do_syscall_64+0x60/0x1b0
[1631742.150931] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[1631742.151326] RIP: 0033:0x7f4cd9f5add7
(...)
[1631742.152509] RSP: 002b:00007ffe91017708 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[1631742.152892] RAX: ffffffffffffffda RBX: 0000000000000105 RCX: 00007f4cd9f5add7
[1631742.153268] RDX: 00007ffe91017790 RSI: 0000000040489426 RDI: 0000000000000007
[1631742.153633] RBP: 0000000000000007 R08: 00007f4cd9e79700 R09: 00007f4cd9e79700
[1631742.153999] R10: 00007f4cd9e799d0 R11: 0000000000000202 R12: 0000000000000003
[1631742.154365] R13: 0000555dfae53020 R14: 0000000000000000 R15: 0000000000000001
(...)
[1631742.156696] ---[ end trace 5dac9f96dcc3fd6b ]---
That BUG_ON happens because while send is using a node, that node is COWed
by a concurrent deduplication, gets freed and gets reused as a leaf (because
a transaction commit happened in between), so when it attempts to read a
slot from the extent buffer, at ctree.c:read_node_slot(), the extent buffer
contents were wiped out and it now matches a leaf (which can even belong to
some other tree now), hitting the BUG_ON(level == 0).
Fix this concurrency issue by not allowing send and deduplication to run
in parallel if both operate on the same readonly trees, returning EAGAIN
to user space and logging an exlicit warning in dmesg/syslog.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be6821f82c3cc36e026f5afd10249988852b35ea
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6f2f0b394b54e2b159ef969a0b5274e9bbf82ff2
[3] https://lore.kernel.org/linux-btrfs/CAL3q7H7iqSEEyFaEtpRZw3cp613y+4k2Q8b4W7mweR3tZA05bQ@mail.gmail.com/
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-22 18:43:42 +03:00
goto out ;
2018-05-02 08:15:37 +03:00
loff + = BTRFS_MAX_DEDUPE_LEN ;
dst_loff + = BTRFS_MAX_DEDUPE_LEN ;
}
if ( tail_len > 0 )
2018-05-02 08:15:38 +03:00
ret = btrfs_extent_same_range ( src , loff , tail_len , dst ,
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, specially for the deduplication case where we used to lock all
the pages first and then if we found any dealloc for the range, or
ordered extent, we would unlock the pages trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
dst_loff ) ;
Btrfs: fix race between send and deduplication that lead to failures and crashes
Send operates on read only trees and expects them to never change while it
is using them. This is part of its initial design, and this expection is
due to two different reasons:
1) When it was introduced, no operations were allowed to modifiy read-only
subvolumes/snapshots (including defrag for example).
2) It keeps send from having an impact on other filesystem operations.
Namely send does not need to keep locks on the trees nor needs to hold on
to transaction handles and delay transaction commits. This ends up being
a consequence of the former reason.
However the deduplication feature was introduced later (on September 2013,
while send was introduced in July 2012) and it allowed for deduplication
with destination files that belong to read-only trees (subvolumes and
snapshots).
That means that having a send operation (either full or incremental) running
in parallel with a deduplication that has the destination inode in one of
the trees used by the send operation, can result in tree nodes and leaves
getting freed and reused while send is using them. This problem is similar
to the problem solved for the root nodes getting freed and reused when a
snapshot is made against one tree that is currenly being used by a send
operation, fixed in commits [1] and [2]. These commits explain in detail
how the problem happens and the explanation is valid for any node or leaf
that is not the root of a tree as well. This problem was also discussed
and explained recently in a thread [3].
The problem is very easy to reproduce when using send with large trees
(snapshots) and just a few concurrent deduplication operations that target
files in the trees used by send. A stress test case is being sent for
fstests that triggers the issue easily. The most common error to hit is
the send ioctl return -EIO with the following messages in dmesg/syslog:
[1631617.204075] BTRFS error (device sdc): did not find backref in send_root. inode=63292, offset=0, disk_byte=5228134400 found extent=5228134400
[1631633.251754] BTRFS error (device sdc): parent transid verify failed on 32243712 wanted 24 found 27
The first one is very easy to hit while the second one happens much less
frequently, except for very large trees (in that test case, snapshots
with 100000 files having large xattrs to get deep and wide trees).
Less frequently, at least one BUG_ON can be hit:
[1631742.130080] ------------[ cut here ]------------
[1631742.130625] kernel BUG at fs/btrfs/ctree.c:1806!
[1631742.131188] invalid opcode: 0000 [#6] SMP DEBUG_PAGEALLOC PTI
[1631742.131726] CPU: 1 PID: 13394 Comm: btrfs Tainted: G B D W 5.0.0-rc8-btrfs-next-45 #1
[1631742.132265] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[1631742.133399] RIP: 0010:read_node_slot+0x122/0x130 [btrfs]
(...)
[1631742.135061] RSP: 0018:ffffb530021ebaa0 EFLAGS: 00010246
[1631742.135615] RAX: ffff93ac8912e000 RBX: 000000000000009d RCX: 0000000000000002
[1631742.136173] RDX: 000000000000009d RSI: ffff93ac564b0d08 RDI: ffff93ad5b48c000
[1631742.136759] RBP: ffffb530021ebb7d R08: 0000000000000001 R09: ffffb530021ebb7d
[1631742.137324] R10: ffffb530021eba70 R11: 0000000000000000 R12: ffff93ac87d0a708
[1631742.137900] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001
[1631742.138455] FS: 00007f4cdb1528c0(0000) GS:ffff93ad76a80000(0000) knlGS:0000000000000000
[1631742.139010] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[1631742.139568] CR2: 00007f5acb3d0420 CR3: 000000012be3e006 CR4: 00000000003606e0
[1631742.140131] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[1631742.140719] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[1631742.141272] Call Trace:
[1631742.141826] ? do_raw_spin_unlock+0x49/0xc0
[1631742.142390] tree_advance+0x173/0x1d0 [btrfs]
[1631742.142948] btrfs_compare_trees+0x268/0x690 [btrfs]
[1631742.143533] ? process_extent+0x1070/0x1070 [btrfs]
[1631742.144088] btrfs_ioctl_send+0x1037/0x1270 [btrfs]
[1631742.144645] _btrfs_ioctl_send+0x80/0x110 [btrfs]
[1631742.145161] ? trace_sched_stick_numa+0xe0/0xe0
[1631742.145685] btrfs_ioctl+0x13fe/0x3120 [btrfs]
[1631742.146179] ? account_entity_enqueue+0xd3/0x100
[1631742.146662] ? reweight_entity+0x154/0x1a0
[1631742.147135] ? update_curr+0x20/0x2a0
[1631742.147593] ? check_preempt_wakeup+0x103/0x250
[1631742.148053] ? do_vfs_ioctl+0xa2/0x6f0
[1631742.148510] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[1631742.148942] do_vfs_ioctl+0xa2/0x6f0
[1631742.149361] ? __fget+0x113/0x200
[1631742.149767] ksys_ioctl+0x70/0x80
[1631742.150159] __x64_sys_ioctl+0x16/0x20
[1631742.150543] do_syscall_64+0x60/0x1b0
[1631742.150931] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[1631742.151326] RIP: 0033:0x7f4cd9f5add7
(...)
[1631742.152509] RSP: 002b:00007ffe91017708 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[1631742.152892] RAX: ffffffffffffffda RBX: 0000000000000105 RCX: 00007f4cd9f5add7
[1631742.153268] RDX: 00007ffe91017790 RSI: 0000000040489426 RDI: 0000000000000007
[1631742.153633] RBP: 0000000000000007 R08: 00007f4cd9e79700 R09: 00007f4cd9e79700
[1631742.153999] R10: 00007f4cd9e799d0 R11: 0000000000000202 R12: 0000000000000003
[1631742.154365] R13: 0000555dfae53020 R14: 0000000000000000 R15: 0000000000000001
(...)
[1631742.156696] ---[ end trace 5dac9f96dcc3fd6b ]---
That BUG_ON happens because while send is using a node, that node is COWed
by a concurrent deduplication, gets freed and gets reused as a leaf (because
a transaction commit happened in between), so when it attempts to read a
slot from the extent buffer, at ctree.c:read_node_slot(), the extent buffer
contents were wiped out and it now matches a leaf (which can even belong to
some other tree now), hitting the BUG_ON(level == 0).
Fix this concurrency issue by not allowing send and deduplication to run
in parallel if both operate on the same readonly trees, returning EAGAIN
to user space and logging an exlicit warning in dmesg/syslog.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be6821f82c3cc36e026f5afd10249988852b35ea
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6f2f0b394b54e2b159ef969a0b5274e9bbf82ff2
[3] https://lore.kernel.org/linux-btrfs/CAL3q7H7iqSEEyFaEtpRZw3cp613y+4k2Q8b4W7mweR3tZA05bQ@mail.gmail.com/
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-22 18:43:42 +03:00
out :
spin_lock ( & root_dst - > root_item_lock ) ;
root_dst - > dedupe_in_progress - - ;
spin_unlock ( & root_dst - > root_item_lock ) ;
2013-08-06 22:42:51 +04:00
return ret ;
}
2014-06-01 04:50:28 +04:00
/*
 * Final inode bookkeeping for a clone into @inode, followed by ending the
 * transaction @trans.
 *
 * Increments the inode version, refreshes mtime/ctime unless
 * @no_time_update is set, and grows i_size if the cloned range ends past
 * the current size.  @endoff is first capped at @destoff + @olen, since the
 * caller may pass a value beyond the requested range and the file size must
 * not grow past it.
 *
 * The transaction is always ended here.  If updating the inode item fails,
 * the transaction is aborted first and that error is returned; otherwise
 * the result of btrfs_end_transaction() is returned.
 */
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
				     struct inode *inode,
				     u64 endoff,
				     const u64 destoff,
				     const u64 olen,
				     int no_time_update)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	const u64 clone_end = destoff + olen;
	int ret;

	inode_inc_iversion(inode);
	if (!no_time_update) {
		inode->i_ctime = current_time(inode);
		inode->i_mtime = inode->i_ctime;
	}

	/* Never let the new size exceed the end of the requested range. */
	if (endoff > clone_end)
		endoff = clone_end;
	if (endoff > inode->i_size)
		btrfs_i_size_write(BTRFS_I(inode), endoff);

	ret = btrfs_update_inode(trans, root, inode);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	return btrfs_end_transaction(trans);
}
Btrfs: fix file corruption and data loss after cloning inline extents
Currently the clone ioctl allows cloning an inline extent from one file
to another that already has other (non-inlined) extents. This is a problem
because btrfs is not designed to deal with files having inline and regular
extents, if a file has an inline extent then it must be the only extent
in the file and must start at file offset 0. Having a file with an inline
extent followed by regular extents results in EIO errors when doing reads
or writes against the first 4K of the file.
Also, the clone ioctl allows one to lose data if the source file consists
of a single inline extent, with a size of N bytes, and the destination
file consists of a single inline extent with a size of M bytes, where we
have M > N. In this case the clone operation removes the inline extent
from the destination file and then copies the inline extent from the
source file into the destination file - we lose the M - N bytes from the
destination file, a read operation will get the value 0x00 for any bytes
in the range [N, M] (the destination inode's i_size remained as M,
that's why we can read past N bytes).
So fix this by not allowing such destructive operations to happen and
return errno EOPNOTSUPP to user space.
Currently the fstest btrfs/035 tests the data loss case but it totally
ignores this - i.e. expects the operation to succeed and does not check
that we got data loss.
The following test case for fstests exercises all these cases that result
in file corruption and data loss:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
_require_btrfs_fs_feature "no_holes"
_require_btrfs_mkfs_feature "no-holes"
rm -f $seqres.full
test_cloning_inline_extents()
{
local mkfs_opts=$1
local mount_opts=$2
_scratch_mkfs $mkfs_opts >>$seqres.full 2>&1
_scratch_mount $mount_opts
# File bar, the source for all the following clone operations, consists
# of a single inline extent (50 bytes).
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Test cloning into a file with an extent (non-inlined) where the
# destination offset overlaps that extent. It should not be possible to
# clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo data after clone operation:"
# All bytes should have the value 0xaa (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo
$XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io
# Test cloning the inline extent against a file which has a hole in its
# first 4K followed by a non-inlined extent. It should not be possible
# as well to clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo2 data after clone operation:"
# All bytes should have the value 0x00 (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo2
$XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io
# Test cloning the inline extent against a file which has a size of zero
# but has a prealloc extent. It should not be possible as well to clone
# the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "First 50 bytes of foo3 after clone operation:"
# Should not be able to read any bytes, file has 0 bytes i_size (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo3
$XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size not greater than the size of
# bar's inline extent (40 < 50).
# It should be possible to do the extent cloning from bar to this file.
$XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4
# Doing IO against any range in the first 4K of the file should work.
echo "File foo4 data after clone operation:"
# Must match file bar's content.
od -t x1 $SCRATCH_MNT/foo4
$XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size greater than the size of bar's
# inline extent (60 > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5
# Reading the file should not fail.
echo "File foo5 data after clone operation:"
# Must have a size of 60 bytes, with all bytes having a value of 0x03
# (the clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo5
# Test cloning the inline extent against a file which has no extents but
# has a size greater than bar's inline extent (16K > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6
# Reading the file should not fail.
echo "File foo6 data after clone operation:"
# Must have a size of 16K, with all bytes having a value of 0x00 (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo6
# Test cloning the inline extent against a file which has no extents but
# has a size not greater than bar's inline extent (30 < 50).
# It should be possible to clone the inline extent from file bar into
# this file.
$XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7
# Reading the file should not fail.
echo "File foo7 data after clone operation:"
# Must have a size of 50 bytes, with all bytes having a value of 0xbb.
od -t x1 $SCRATCH_MNT/foo7
# Test cloning the inline extent against a file which has a size not
# greater than the size of bar's inline extent (20 < 50) but has
# a prealloc extent that goes beyond the file's size. It should not be
# possible to clone the inline extent from bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" \
-c "pwrite -S 0x88 0 20" \
$SCRATCH_MNT/foo8 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8
echo "File foo8 data after clone operation:"
# Must have a size of 20 bytes, with all bytes having a value of 0x88
# (the clone operation did not modify our file).
od -t x1 $SCRATCH_MNT/foo8
_scratch_unmount
}
echo -e "\nTesting without compression and without the no-holes feature...\n"
test_cloning_inline_extents
echo -e "\nTesting with compression and without the no-holes feature...\n"
test_cloning_inline_extents "" "-o compress"
echo -e "\nTesting without compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" ""
echo -e "\nTesting with compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" "-o compress"
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 17:15:00 +03:00
/*
* Make sure we do not end up inserting an inline extent into a file that has
* already other ( non - inline ) extents . If a file has an inline extent it can
* not have any other extents and the ( single ) inline extent must start at the
* file offset 0. Failing to respect these rules will lead to file corruption ,
* resulting in EIO errors on read / write operations , hitting BUG_ON ' s in mm , etc
*
* We can have extents that have been already written to disk or we can have
* dirty ranges still in delalloc , in which case the extent maps and items are
* created only when we run delalloc , and the delalloc ranges might fall outside
* the range we are currently locking in the inode ' s io tree . So we check the
* inode ' s i_size because of that ( i_size updates are done while holding the
* i_mutex , which we are holding here ) .
* We also check to see if the inode has a size not greater than " datal " but has
* extents beyond it , due to a fallocate with FALLOC_FL_KEEP_SIZE ( and we are
* protected against such concurrent fallocate calls by the i_mutex ) .
*
* If the file has no extents but a size greater than datal , do not allow the
* copy because we would need to turn the inline extent into a non - inline one ( even
* with NO_HOLES enabled ) . If we find our destination inode only has one inline
* extent , just overwrite it with the source inline extent if its size is less
* than the source extent ' s size , or we could copy the source inline extent ' s
* data into the destination inode ' s inline extent if the latter is greater than
* the former .
*/
2017-02-10 22:18:49 +03:00
static int clone_copy_inline_extent ( struct inode * dst ,
Btrfs: fix file corruption and data loss after cloning inline extents
Currently the clone ioctl allows to clone an inline extent from one file
to another that already has other (non-inlined) extents. This is a problem
because btrfs is not designed to deal with files having inline and regular
extents, if a file has an inline extent then it must be the only extent
in the file and must start at file offset 0. Having a file with an inline
extent followed by regular extents results in EIO errors when doing reads
or writes against the first 4K of the file.
Also, the clone ioctl allows one to lose data if the source file consists
of a single inline extent, with a size of N bytes, and the destination
file consists of a single inline extent with a size of M bytes, where we
have M > N. In this case the clone operation removes the inline extent
from the destination file and then copies the inline extent from the
source file into the destination file - we lose the M - N bytes from the
destination file, a read operation will get the value 0x00 for any bytes
in the range [N, M] (the destination inode's i_size remained as M,
that's why we can read past N bytes).
So fix this by not allowing such destructive operations to happen and
return errno EOPNOTSUPP to user space.
Currently the fstest btrfs/035 tests the data loss case but it totally
ignores this - i.e. expects the operation to succeed and does not check
that we got data loss.
The following test case for fstests exercises all these cases that result
in file corruption and data loss:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
_require_btrfs_fs_feature "no_holes"
_require_btrfs_mkfs_feature "no-holes"
rm -f $seqres.full
test_cloning_inline_extents()
{
local mkfs_opts=$1
local mount_opts=$2
_scratch_mkfs $mkfs_opts >>$seqres.full 2>&1
_scratch_mount $mount_opts
# File bar, the source for all the following clone operations, consists
# of a single inline extent (50 bytes).
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Test cloning into a file with an extent (non-inlined) where the
# destination offset overlaps that extent. It should not be possible to
# clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo data after clone operation:"
# All bytes should have the value 0xaa (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo
$XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io
# Test cloning the inline extent against a file which has a hole in its
# first 4K followed by a non-inlined extent. It should not be possible
# as well to clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo2 data after clone operation:"
# All bytes should have the value 0x00 (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo2
$XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io
# Test cloning the inline extent against a file which has a size of zero
# but has a prealloc extent. It should not be possible as well to clone
# the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "First 50 bytes of foo3 after clone operation:"
# Should not be able to read any bytes, file has 0 bytes i_size (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo3
$XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size not greater than the size of
# bar's inline extent (40 < 50).
# It should be possible to do the extent cloning from bar to this file.
$XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4
# Doing IO against any range in the first 4K of the file should work.
echo "File foo4 data after clone operation:"
# Must match file bar's content.
od -t x1 $SCRATCH_MNT/foo4
$XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size greater than the size of bar's
# inline extent (60 > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5
# Reading the file should not fail.
echo "File foo5 data after clone operation:"
# Must have a size of 60 bytes, with all bytes having a value of 0x03
# (the clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo5
# Test cloning the inline extent against a file which has no extents but
# has a size greater than bar's inline extent (16K > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6
# Reading the file should not fail.
echo "File foo6 data after clone operation:"
# Must have a size of 16K, with all bytes having a value of 0x00 (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo6
# Test cloning the inline extent against a file which has no extents but
# has a size not greater than bar's inline extent (30 < 50).
# It should be possible to clone the inline extent from file bar into
# this file.
$XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7
# Reading the file should not fail.
echo "File foo7 data after clone operation:"
# Must have a size of 50 bytes, with all bytes having a value of 0xbb.
od -t x1 $SCRATCH_MNT/foo7
# Test cloning the inline extent against a file which has a size not
# greater than the size of bar's inline extent (20 < 50) but has
# a prealloc extent that goes beyond the file's size. It should not be
# possible to clone the inline extent from bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" \
-c "pwrite -S 0x88 0 20" \
$SCRATCH_MNT/foo8 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8
echo "File foo8 data after clone operation:"
# Must have a size of 20 bytes, with all bytes having a value of 0x88
# (the clone operation did not modify our file).
od -t x1 $SCRATCH_MNT/foo8
_scratch_unmount
}
echo -e "\nTesting without compression and without the no-holes feature...\n"
test_cloning_inline_extents
echo -e "\nTesting with compression and without the no-holes feature...\n"
test_cloning_inline_extents "" "-o compress"
echo -e "\nTesting without compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" ""
echo -e "\nTesting with compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" "-o compress"
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 17:15:00 +03:00
struct btrfs_trans_handle * trans ,
struct btrfs_path * path ,
struct btrfs_key * new_key ,
const u64 drop_start ,
const u64 datal ,
const u64 skip ,
const u64 size ,
char * inline_data )
{
2016-06-23 01:54:23 +03:00
struct btrfs_fs_info * fs_info = btrfs_sb ( dst - > i_sb ) ;
Btrfs: fix file corruption and data loss after cloning inline extents
Currently the clone ioctl allows to clone an inline extent from one file
to another that already has other (non-inlined) extents. This is a problem
because btrfs is not designed to deal with files having inline and regular
extents, if a file has an inline extent then it must be the only extent
in the file and must start at file offset 0. Having a file with an inline
extent followed by regular extents results in EIO errors when doing reads
or writes against the first 4K of the file.
Also, the clone ioctl allows one to lose data if the source file consists
of a single inline extent, with a size of N bytes, and the destination
file consists of a single inline extent with a size of M bytes, where we
have M > N. In this case the clone operation removes the inline extent
from the destination file and then copies the inline extent from the
source file into the destination file - we lose the M - N bytes from the
destination file, a read operation will get the value 0x00 for any bytes
in the range [N, M] (the destination inode's i_size remained as M,
that's why we can read past N bytes).
So fix this by not allowing such destructive operations to happen and
return errno EOPNOTSUPP to user space.
Currently the fstest btrfs/035 tests the data loss case but it totally
ignores this - i.e. expects the operation to succeed and does not check
that we got data loss.
The following test case for fstests exercises all these cases that result
in file corruption and data loss:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
_require_btrfs_fs_feature "no_holes"
_require_btrfs_mkfs_feature "no-holes"
rm -f $seqres.full
test_cloning_inline_extents()
{
local mkfs_opts=$1
local mount_opts=$2
_scratch_mkfs $mkfs_opts >>$seqres.full 2>&1
_scratch_mount $mount_opts
# File bar, the source for all the following clone operations, consists
# of a single inline extent (50 bytes).
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Test cloning into a file with an extent (non-inlined) where the
# destination offset overlaps that extent. It should not be possible to
# clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo data after clone operation:"
# All bytes should have the value 0xaa (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo
$XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io
# Test cloning the inline extent against a file which has a hole in its
# first 4K followed by a non-inlined extent. It should not be possible
# as well to clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo2 data after clone operation:"
# All bytes should have the value 0x00 (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo2
$XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io
# Test cloning the inline extent against a file which has a size of zero
# but has a prealloc extent. It should not be possible as well to clone
# the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "First 50 bytes of foo3 after clone operation:"
# Should not be able to read any bytes, file has 0 bytes i_size (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo3
$XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size not greater than the size of
# bar's inline extent (40 < 50).
# It should be possible to do the extent cloning from bar to this file.
$XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4
# Doing IO against any range in the first 4K of the file should work.
echo "File foo4 data after clone operation:"
# Must match file bar's content.
od -t x1 $SCRATCH_MNT/foo4
$XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size greater than the size of bar's
# inline extent (60 > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5
# Reading the file should not fail.
echo "File foo5 data after clone operation:"
# Must have a size of 60 bytes, with all bytes having a value of 0x03
# (the clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo5
# Test cloning the inline extent against a file which has no extents but
# has a size greater than bar's inline extent (16K > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6
# Reading the file should not fail.
echo "File foo6 data after clone operation:"
# Must have a size of 16K, with all bytes having a value of 0x00 (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo6
# Test cloning the inline extent against a file which has no extents but
# has a size not greater than bar's inline extent (30 < 50).
# It should be possible to clone the inline extent from file bar into
# this file.
$XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7
# Reading the file should not fail.
echo "File foo7 data after clone operation:"
# Must have a size of 50 bytes, with all bytes having a value of 0xbb.
od -t x1 $SCRATCH_MNT/foo7
# Test cloning the inline extent against a file which has a size not
# greater than the size of bar's inline extent (20 < 50) but has
# a prealloc extent that goes beyond the file's size. It should not be
# possible to clone the inline extent from bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" \
-c "pwrite -S 0x88 0 20" \
$SCRATCH_MNT/foo8 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8
echo "File foo8 data after clone operation:"
# Must have a size of 20 bytes, with all bytes having a value of 0x88
# (the clone operation did not modify our file).
od -t x1 $SCRATCH_MNT/foo8
_scratch_unmount
}
echo -e "\nTesting without compression and without the no-holes feature...\n"
test_cloning_inline_extents
echo -e "\nTesting with compression and without the no-holes feature...\n"
test_cloning_inline_extents "" "-o compress"
echo -e "\nTesting without compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" ""
echo -e "\nTesting with compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" "-o compress"
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 17:15:00 +03:00
struct btrfs_root * root = BTRFS_I ( dst ) - > root ;
const u64 aligned_end = ALIGN ( new_key - > offset + datal ,
2016-06-23 01:54:23 +03:00
fs_info - > sectorsize ) ;
Btrfs: fix file corruption and data loss after cloning inline extents
Currently the clone ioctl allows to clone an inline extent from one file
to another that already has other (non-inlined) extents. This is a problem
because btrfs is not designed to deal with files having inline and regular
extents, if a file has an inline extent then it must be the only extent
in the file and must start at file offset 0. Having a file with an inline
extent followed by regular extents results in EIO errors when doing reads
or writes against the first 4K of the file.
Also, the clone ioctl allows one to lose data if the source file consists
of a single inline extent, with a size of N bytes, and the destination
file consists of a single inline extent with a size of M bytes, where we
have M > N. In this case the clone operation removes the inline extent
from the destination file and then copies the inline extent from the
source file into the destination file - we lose the M - N bytes from the
destination file, a read operation will get the value 0x00 for any bytes
in the range [N, M] (the destination inode's i_size remained as M,
that's why we can read past N bytes).
So fix this by not allowing such destructive operations to happen and
return errno EOPNOTSUPP to user space.
Currently the fstest btrfs/035 tests the data loss case but it totally
ignores this - i.e. expects the operation to succeed and does not check
that we got data loss.
The following test case for fstests exercises all these cases that result
in file corruption and data loss:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
_require_btrfs_fs_feature "no_holes"
_require_btrfs_mkfs_feature "no-holes"
rm -f $seqres.full
test_cloning_inline_extents()
{
local mkfs_opts=$1
local mount_opts=$2
_scratch_mkfs $mkfs_opts >>$seqres.full 2>&1
_scratch_mount $mount_opts
# File bar, the source for all the following clone operations, consists
# of a single inline extent (50 bytes).
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Test cloning into a file with an extent (non-inlined) where the
# destination offset overlaps that extent. It should not be possible to
# clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo data after clone operation:"
# All bytes should have the value 0xaa (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo
$XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io
# Test cloning the inline extent against a file which has a hole in its
# first 4K followed by a non-inlined extent. It should not be possible
# as well to clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo2 data after clone operation:"
# All bytes should have the value 0x00 (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo2
$XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io
# Test cloning the inline extent against a file which has a size of zero
# but has a prealloc extent. It should not be possible as well to clone
# the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "First 50 bytes of foo3 after clone operation:"
# Should not be able to read any bytes, file has 0 bytes i_size (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo3
$XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size not greater than the size of
# bar's inline extent (40 < 50).
# It should be possible to do the extent cloning from bar to this file.
$XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4
# Doing IO against any range in the first 4K of the file should work.
echo "File foo4 data after clone operation:"
# Must match file bar's content.
od -t x1 $SCRATCH_MNT/foo4
$XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size greater than the size of bar's
# inline extent (60 > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5
# Reading the file should not fail.
echo "File foo5 data after clone operation:"
# Must have a size of 60 bytes, with all bytes having a value of 0x03
# (the clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo5
# Test cloning the inline extent against a file which has no extents but
# has a size greater than bar's inline extent (16K > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6
# Reading the file should not fail.
echo "File foo6 data after clone operation:"
# Must have a size of 16K, with all bytes having a value of 0x00 (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo6
# Test cloning the inline extent against a file which has no extents but
# has a size not greater than bar's inline extent (30 < 50).
# It should be possible to clone the inline extent from file bar into
# this file.
$XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7
# Reading the file should not fail.
echo "File foo7 data after clone operation:"
# Must have a size of 50 bytes, with all bytes having a value of 0xbb.
od -t x1 $SCRATCH_MNT/foo7
# Test cloning the inline extent against a file which has a size not
# greater than the size of bar's inline extent (20 < 50) but has
# a prealloc extent that goes beyond the file's size. It should not be
# possible to clone the inline extent from bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" \
-c "pwrite -S 0x88 0 20" \
$SCRATCH_MNT/foo8 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8
echo "File foo8 data after clone operation:"
# Must have a size of 20 bytes, with all bytes having a value of 0x88
# (the clone operation did not modify our file).
od -t x1 $SCRATCH_MNT/foo8
_scratch_unmount
}
echo -e "\nTesting without compression and without the no-holes feature...\n"
test_cloning_inline_extents
echo -e "\nTesting with compression and without the no-holes feature...\n"
test_cloning_inline_extents "" "-o compress"
echo -e "\nTesting without compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" ""
echo -e "\nTesting with compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" "-o compress"
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 17:15:00 +03:00
int ret ;
struct btrfs_key key ;
if ( new_key - > offset > 0 )
return - EOPNOTSUPP ;
2017-01-10 21:35:31 +03:00
key . objectid = btrfs_ino ( BTRFS_I ( dst ) ) ;
Btrfs: fix file corruption and data loss after cloning inline extents
Currently the clone ioctl allows cloning an inline extent from one file
to another that already has other (non-inlined) extents. This is a problem
because btrfs is not designed to deal with files having inline and regular
extents, if a file has an inline extent then it must be the only extent
in the file and must start at file offset 0. Having a file with an inline
extent followed by regular extents results in EIO errors when doing reads
or writes against the first 4K of the file.
Also, the clone ioctl allows one to lose data if the source file consists
of a single inline extent, with a size of N bytes, and the destination
file consists of a single inline extent with a size of M bytes, where we
have M > N. In this case the clone operation removes the inline extent
from the destination file and then copies the inline extent from the
source file into the destination file - we lose the M - N bytes from the
destination file, a read operation will get the value 0x00 for any bytes
in the range [N, M] (the destination inode's i_size remained as M,
that's why we can read past N bytes).
So fix this by not allowing such destructive operations to happen and
return errno EOPNOTSUPP to user space.
Currently the fstest btrfs/035 tests the data loss case but it totally
ignores this - i.e. expects the operation to succeed and does not check
that we got data loss.
The following test case for fstests exercises all these cases that result
in file corruption and data loss:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
_require_btrfs_fs_feature "no_holes"
_require_btrfs_mkfs_feature "no-holes"
rm -f $seqres.full
test_cloning_inline_extents()
{
local mkfs_opts=$1
local mount_opts=$2
_scratch_mkfs $mkfs_opts >>$seqres.full 2>&1
_scratch_mount $mount_opts
# File bar, the source for all the following clone operations, consists
# of a single inline extent (50 bytes).
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Test cloning into a file with an extent (non-inlined) where the
# destination offset overlaps that extent. It should not be possible to
# clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo data after clone operation:"
# All bytes should have the value 0xaa (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo
$XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io
# Test cloning the inline extent against a file which has a hole in its
# first 4K followed by a non-inlined extent. It should not be possible
# as well to clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo2 data after clone operation:"
# All bytes should have the value 0x00 (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo2
$XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io
# Test cloning the inline extent against a file which has a size of zero
# but has a prealloc extent. It should not be possible as well to clone
# the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "First 50 bytes of foo3 after clone operation:"
# Should not be able to read any bytes, file has 0 bytes i_size (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo3
$XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size not greater than the size of
# bar's inline extent (40 < 50).
# It should be possible to do the extent cloning from bar to this file.
$XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4
# Doing IO against any range in the first 4K of the file should work.
echo "File foo4 data after clone operation:"
# Must match file bar's content.
od -t x1 $SCRATCH_MNT/foo4
$XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size greater than the size of bar's
# inline extent (60 > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5
# Reading the file should not fail.
echo "File foo5 data after clone operation:"
# Must have a size of 60 bytes, with all bytes having a value of 0x03
# (the clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo5
# Test cloning the inline extent against a file which has no extents but
# has a size greater than bar's inline extent (16K > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6
# Reading the file should not fail.
echo "File foo6 data after clone operation:"
# Must have a size of 16K, with all bytes having a value of 0x00 (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo6
# Test cloning the inline extent against a file which has no extents but
# has a size not greater than bar's inline extent (30 < 50).
# It should be possible to clone the inline extent from file bar into
# this file.
$XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7
# Reading the file should not fail.
echo "File foo7 data after clone operation:"
# Must have a size of 50 bytes, with all bytes having a value of 0xbb.
od -t x1 $SCRATCH_MNT/foo7
# Test cloning the inline extent against a file which has a size not
# greater than the size of bar's inline extent (20 < 50) but has
# a prealloc extent that goes beyond the file's size. It should not be
# possible to clone the inline extent from bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" \
-c "pwrite -S 0x88 0 20" \
$SCRATCH_MNT/foo8 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8
echo "File foo8 data after clone operation:"
# Must have a size of 20 bytes, with all bytes having a value of 0x88
# (the clone operation did not modify our file).
od -t x1 $SCRATCH_MNT/foo8
_scratch_unmount
}
echo -e "\nTesting without compression and without the no-holes feature...\n"
test_cloning_inline_extents
echo -e "\nTesting with compression and without the no-holes feature...\n"
test_cloning_inline_extents "" "-o compress"
echo -e "\nTesting without compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" ""
echo -e "\nTesting with compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" "-o compress"
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 17:15:00 +03:00
key . type = BTRFS_EXTENT_DATA_KEY ;
key . offset = 0 ;
ret = btrfs_search_slot ( NULL , root , & key , path , 0 , 0 ) ;
if ( ret < 0 ) {
return ret ;
} else if ( ret > 0 ) {
if ( path - > slots [ 0 ] > = btrfs_header_nritems ( path - > nodes [ 0 ] ) ) {
ret = btrfs_next_leaf ( root , path ) ;
if ( ret < 0 )
return ret ;
else if ( ret > 0 )
goto copy_inline_extent ;
}
btrfs_item_key_to_cpu ( path - > nodes [ 0 ] , & key , path - > slots [ 0 ] ) ;
2017-01-10 21:35:31 +03:00
if ( key . objectid = = btrfs_ino ( BTRFS_I ( dst ) ) & &
Btrfs: fix file corruption and data loss after cloning inline extents
Currently the clone ioctl allows cloning an inline extent from one file
to another that already has other (non-inlined) extents. This is a problem
because btrfs is not designed to deal with files having inline and regular
extents, if a file has an inline extent then it must be the only extent
in the file and must start at file offset 0. Having a file with an inline
extent followed by regular extents results in EIO errors when doing reads
or writes against the first 4K of the file.
Also, the clone ioctl allows one to lose data if the source file consists
of a single inline extent, with a size of N bytes, and the destination
file consists of a single inline extent with a size of M bytes, where we
have M > N. In this case the clone operation removes the inline extent
from the destination file and then copies the inline extent from the
source file into the destination file - we lose the M - N bytes from the
destination file, a read operation will get the value 0x00 for any bytes
in the range [N, M] (the destination inode's i_size remained as M,
that's why we can read past N bytes).
So fix this by not allowing such destructive operations to happen and
return errno EOPNOTSUPP to user space.
Currently the fstest btrfs/035 tests the data loss case but it totally
ignores this - i.e. expects the operation to succeed and does not check
that we got data loss.
The following test case for fstests exercises all these cases that result
in file corruption and data loss:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
_require_btrfs_fs_feature "no_holes"
_require_btrfs_mkfs_feature "no-holes"
rm -f $seqres.full
test_cloning_inline_extents()
{
local mkfs_opts=$1
local mount_opts=$2
_scratch_mkfs $mkfs_opts >>$seqres.full 2>&1
_scratch_mount $mount_opts
# File bar, the source for all the following clone operations, consists
# of a single inline extent (50 bytes).
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Test cloning into a file with an extent (non-inlined) where the
# destination offset overlaps that extent. It should not be possible to
# clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo data after clone operation:"
# All bytes should have the value 0xaa (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo
$XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io
# Test cloning the inline extent against a file which has a hole in its
# first 4K followed by a non-inlined extent. It should not be possible
# as well to clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo2 data after clone operation:"
# All bytes should have the value 0x00 (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo2
$XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io
# Test cloning the inline extent against a file which has a size of zero
# but has a prealloc extent. It should not be possible as well to clone
# the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "First 50 bytes of foo3 after clone operation:"
# Should not be able to read any bytes, file has 0 bytes i_size (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo3
$XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size not greater than the size of
# bar's inline extent (40 < 50).
# It should be possible to do the extent cloning from bar to this file.
$XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4
# Doing IO against any range in the first 4K of the file should work.
echo "File foo4 data after clone operation:"
# Must match file bar's content.
od -t x1 $SCRATCH_MNT/foo4
$XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size greater than the size of bar's
# inline extent (60 > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5
# Reading the file should not fail.
echo "File foo5 data after clone operation:"
# Must have a size of 60 bytes, with all bytes having a value of 0x03
# (the clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo5
# Test cloning the inline extent against a file which has no extents but
# has a size greater than bar's inline extent (16K > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6
# Reading the file should not fail.
echo "File foo6 data after clone operation:"
# Must have a size of 16K, with all bytes having a value of 0x00 (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo6
# Test cloning the inline extent against a file which has no extents but
# has a size not greater than bar's inline extent (30 < 50).
# It should be possible to clone the inline extent from file bar into
# this file.
$XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7
# Reading the file should not fail.
echo "File foo7 data after clone operation:"
# Must have a size of 50 bytes, with all bytes having a value of 0xbb.
od -t x1 $SCRATCH_MNT/foo7
# Test cloning the inline extent against a file which has a size not
# greater than the size of bar's inline extent (20 < 50) but has
# a prealloc extent that goes beyond the file's size. It should not be
# possible to clone the inline extent from bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" \
-c "pwrite -S 0x88 0 20" \
$SCRATCH_MNT/foo8 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8
echo "File foo8 data after clone operation:"
# Must have a size of 20 bytes, with all bytes having a value of 0x88
# (the clone operation did not modify our file).
od -t x1 $SCRATCH_MNT/foo8
_scratch_unmount
}
echo -e "\nTesting without compression and without the no-holes feature...\n"
test_cloning_inline_extents
echo -e "\nTesting with compression and without the no-holes feature...\n"
test_cloning_inline_extents "" "-o compress"
echo -e "\nTesting without compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" ""
echo -e "\nTesting with compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" "-o compress"
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 17:15:00 +03:00
key . type = = BTRFS_EXTENT_DATA_KEY ) {
ASSERT ( key . offset > 0 ) ;
return - EOPNOTSUPP ;
}
} else if ( i_size_read ( dst ) < = datal ) {
struct btrfs_file_extent_item * ei ;
u64 ext_len ;
/*
* If the file size is < = datal , make sure there are no other
* extents following ( can happen due to a fallocate call with
* the flag FALLOC_FL_KEEP_SIZE ) .
*/
ei = btrfs_item_ptr ( path - > nodes [ 0 ] , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
/*
* If it ' s an inline extent , it can not have other extents
* following it .
*/
if ( btrfs_file_extent_type ( path - > nodes [ 0 ] , ei ) = =
BTRFS_FILE_EXTENT_INLINE )
goto copy_inline_extent ;
ext_len = btrfs_file_extent_num_bytes ( path - > nodes [ 0 ] , ei ) ;
if ( ext_len > aligned_end )
return - EOPNOTSUPP ;
ret = btrfs_next_item ( root , path ) ;
if ( ret < 0 ) {
return ret ;
} else if ( ret = = 0 ) {
btrfs_item_key_to_cpu ( path - > nodes [ 0 ] , & key ,
path - > slots [ 0 ] ) ;
2017-01-10 21:35:31 +03:00
if ( key . objectid = = btrfs_ino ( BTRFS_I ( dst ) ) & &
Btrfs: fix file corruption and data loss after cloning inline extents
Currently the clone ioctl allows cloning an inline extent from one file
to another that already has other (non-inlined) extents. This is a problem
because btrfs is not designed to deal with files having inline and regular
extents, if a file has an inline extent then it must be the only extent
in the file and must start at file offset 0. Having a file with an inline
extent followed by regular extents results in EIO errors when doing reads
or writes against the first 4K of the file.
Also, the clone ioctl allows one to lose data if the source file consists
of a single inline extent, with a size of N bytes, and the destination
file consists of a single inline extent with a size of M bytes, where we
have M > N. In this case the clone operation removes the inline extent
from the destination file and then copies the inline extent from the
source file into the destination file - we lose the M - N bytes from the
destination file, a read operation will get the value 0x00 for any bytes
in the range [N, M] (the destination inode's i_size remained as M,
that's why we can read past N bytes).
So fix this by not allowing such destructive operations to happen and
return errno EOPNOTSUPP to user space.
Currently the fstest btrfs/035 tests the data loss case but it totally
ignores this - i.e. expects the operation to succeed and does not check
that we got data loss.
The following test case for fstests exercises all these cases that result
in file corruption and data loss:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
_require_btrfs_fs_feature "no_holes"
_require_btrfs_mkfs_feature "no-holes"
rm -f $seqres.full
test_cloning_inline_extents()
{
local mkfs_opts=$1
local mount_opts=$2
_scratch_mkfs $mkfs_opts >>$seqres.full 2>&1
_scratch_mount $mount_opts
# File bar, the source for all the following clone operations, consists
# of a single inline extent (50 bytes).
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Test cloning into a file with an extent (non-inlined) where the
# destination offset overlaps that extent. It should not be possible to
# clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo data after clone operation:"
# All bytes should have the value 0xaa (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo
$XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io
# Test cloning the inline extent against a file which has a hole in its
# first 4K followed by a non-inlined extent. It should not be possible
# as well to clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo2 data after clone operation:"
# All bytes should have the value 0x00 (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo2
$XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io
# Test cloning the inline extent against a file which has a size of zero
# but has a prealloc extent. It should not be possible as well to clone
# the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "First 50 bytes of foo3 after clone operation:"
# Should not be able to read any bytes, file has 0 bytes i_size (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo3
$XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size not greater than the size of
# bar's inline extent (40 < 50).
# It should be possible to do the extent cloning from bar to this file.
$XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4
# Doing IO against any range in the first 4K of the file should work.
echo "File foo4 data after clone operation:"
# Must match file bar's content.
od -t x1 $SCRATCH_MNT/foo4
$XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size greater than the size of bar's
# inline extent (60 > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5
# Reading the file should not fail.
echo "File foo5 data after clone operation:"
# Must have a size of 60 bytes, with all bytes having a value of 0x03
# (the clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo5
# Test cloning the inline extent against a file which has no extents but
# has a size greater than bar's inline extent (16K > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6
# Reading the file should not fail.
echo "File foo6 data after clone operation:"
# Must have a size of 16K, with all bytes having a value of 0x00 (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo6
# Test cloning the inline extent against a file which has no extents but
# has a size not greater than bar's inline extent (30 < 50).
# It should be possible to clone the inline extent from file bar into
# this file.
$XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7
# Reading the file should not fail.
echo "File foo7 data after clone operation:"
# Must have a size of 50 bytes, with all bytes having a value of 0xbb.
od -t x1 $SCRATCH_MNT/foo7
# Test cloning the inline extent against a file which has a size not
# greater than the size of bar's inline extent (20 < 50) but has
# a prealloc extent that goes beyond the file's size. It should not be
# possible to clone the inline extent from bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" \
-c "pwrite -S 0x88 0 20" \
$SCRATCH_MNT/foo8 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8
echo "File foo8 data after clone operation:"
# Must have a size of 20 bytes, with all bytes having a value of 0x88
# (the clone operation did not modify our file).
od -t x1 $SCRATCH_MNT/foo8
_scratch_unmount
}
echo -e "\nTesting without compression and without the no-holes feature...\n"
test_cloning_inline_extents
echo -e "\nTesting with compression and without the no-holes feature...\n"
test_cloning_inline_extents "" "-o compress"
echo -e "\nTesting without compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" ""
echo -e "\nTesting with compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" "-o compress"
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 17:15:00 +03:00
key . type = = BTRFS_EXTENT_DATA_KEY )
return - EOPNOTSUPP ;
}
}
copy_inline_extent :
/*
* We have no extent items , or we have an extent at offset 0 which may
* or may not be inlined . All these cases are dealt the same way .
*/
if ( i_size_read ( dst ) > datal ) {
/*
* If the destination inode has an inline extent . . .
* This would require copying the data from the source inline
* extent into the beginning of the destination ' s inline extent .
* But this is really complex , both extents can be compressed
* or just one of them , which would require decompressing and
* re - compressing data ( which could increase the new compressed
* size , not allowing the compressed data to fit anymore in an
* inline extent ) .
* So just don ' t support this case for now ( it should be rare ,
* we are not really saving space when cloning inline extents ) .
*/
return - EOPNOTSUPP ;
}
btrfs_release_path ( path ) ;
ret = btrfs_drop_extents ( trans , root , dst , drop_start , aligned_end , 1 ) ;
if ( ret )
return ret ;
ret = btrfs_insert_empty_item ( trans , root , path , new_key , size ) ;
if ( ret )
return ret ;
if ( skip ) {
const u32 start = btrfs_file_extent_calc_inline_size ( 0 ) ;
memmove ( inline_data + start , inline_data + start + skip , datal ) ;
}
write_extent_buffer ( path - > nodes [ 0 ] , inline_data ,
btrfs_item_ptr_offset ( path - > nodes [ 0 ] ,
path - > slots [ 0 ] ) ,
size ) ;
inode_add_bytes ( dst , datal ) ;
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents
When cloning extents (or deduplicating) we create a transaction with a
space reservation that considers we will drop or update a single file
extent item of the destination inode (that we modify a single leaf). That
is fine for the vast majority of scenarios, however it might happen that
we need to drop many file extent items, and adjust at most two file extent
items, in the destination root, which can span multiple leafs. This will
lead to either the call to btrfs_drop_extents() to fail with ENOSPC or
the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode()
(called through clone_finish_inode_update()) to fail with ENOSPC. Such
failure results in a transaction abort, leaving the filesystem in a
read-only mode.
In order to fix this we need to follow the same approach as the hole
punching code, where we create a local reservation with 1 unit and keep
ending and starting transactions, after balancing the btree inode,
when __btrfs_drop_extents() returns ENOSPC. So fix this by making the
extent cloning code call the recently added btrfs_punch_hole_range()
helper, which is what does the mentioned work for hole punching, and
make sure whenever we drop extent items in a transaction, we also add a
replacing file extent item, to avoid corruption (a hole) if after ending
a transaction and before starting a new one, the old transaction gets
committed and a power failure happens before we finish cloning.
A test case for fstests follows soon.
Reported-by: David Goodwin <david@codepoets.co.uk>
Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/
Reported-by: Sam Tygier <sam@tygier.co.uk>
Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/
Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 13:09:50 +03:00
set_bit ( BTRFS_INODE_NEEDS_FULL_SYNC , & BTRFS_I ( dst ) - > runtime_flags ) ;
Btrfs: fix file corruption and data loss after cloning inline extents
Currently the clone ioctl allows cloning an inline extent from one file
to another that already has other (non-inlined) extents. This is a problem
because btrfs is not designed to deal with files having inline and regular
extents, if a file has an inline extent then it must be the only extent
in the file and must start at file offset 0. Having a file with an inline
extent followed by regular extents results in EIO errors when doing reads
or writes against the first 4K of the file.
Also, the clone ioctl allows one to lose data if the source file consists
of a single inline extent, with a size of N bytes, and the destination
file consists of a single inline extent with a size of M bytes, where we
have M > N. In this case the clone operation removes the inline extent
from the destination file and then copies the inline extent from the
source file into the destination file - we lose the M - N bytes from the
destination file, a read operation will get the value 0x00 for any bytes
in the range [N, M) (the destination inode's i_size remained as M,
that's why we can read past N bytes).
So fix this by not allowing such destructive operations to happen and
return errno EOPNOTSUPP to user space.
Currently the fstest btrfs/035 tests the data loss case but it totally
ignores this - i.e. expects the operation to succeed and does not check
that we got data loss.
The following test case for fstests exercises all these cases that result
in file corruption and data loss:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
_require_btrfs_fs_feature "no_holes"
_require_btrfs_mkfs_feature "no-holes"
rm -f $seqres.full
test_cloning_inline_extents()
{
local mkfs_opts=$1
local mount_opts=$2
_scratch_mkfs $mkfs_opts >>$seqres.full 2>&1
_scratch_mount $mount_opts
# File bar, the source for all the following clone operations, consists
# of a single inline extent (50 bytes).
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Test cloning into a file with an extent (non-inlined) where the
# destination offset overlaps that extent. It should not be possible to
# clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo data after clone operation:"
# All bytes should have the value 0xaa (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo
$XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io
# Test cloning the inline extent against a file which has a hole in its
# first 4K followed by a non-inlined extent. It should not be possible
# as well to clone the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "File foo2 data after clone operation:"
# All bytes should have the value 0x00 (clone operation failed and did
# not modify our file).
od -t x1 $SCRATCH_MNT/foo2
$XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io
# Test cloning the inline extent against a file which has a size of zero
# but has a prealloc extent. It should not be possible as well to clone
# the inline extent from file bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3
# Doing IO against any range in the first 4K of the file should work.
# Due to a past clone ioctl bug which allowed cloning the inline extent,
# these operations resulted in EIO errors.
echo "First 50 bytes of foo3 after clone operation:"
# Should not be able to read any bytes, file has 0 bytes i_size (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo3
$XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size not greater than the size of
# bar's inline extent (40 < 50).
# It should be possible to do the extent cloning from bar to this file.
$XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4
# Doing IO against any range in the first 4K of the file should work.
echo "File foo4 data after clone operation:"
# Must match file bar's content.
od -t x1 $SCRATCH_MNT/foo4
$XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io
# Test cloning the inline extent against a file which consists of a
# single inline extent that has a size greater than the size of bar's
# inline extent (60 > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \
| _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5
# Reading the file should not fail.
echo "File foo5 data after clone operation:"
# Must have a size of 60 bytes, with all bytes having a value of 0x03
# (the clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo5
# Test cloning the inline extent against a file which has no extents but
# has a size greater than bar's inline extent (16K > 50).
# It should not be possible to clone the inline extent from file bar
# into this file.
$XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6
# Reading the file should not fail.
echo "File foo6 data after clone operation:"
# Must have a size of 16K, with all bytes having a value of 0x00 (the
# clone operation failed and did not modify our file).
od -t x1 $SCRATCH_MNT/foo6
# Test cloning the inline extent against a file which has no extents but
# has a size not greater than bar's inline extent (30 < 50).
# It should be possible to clone the inline extent from file bar into
# this file.
$XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7
# Reading the file should not fail.
echo "File foo7 data after clone operation:"
# Must have a size of 50 bytes, with all bytes having a value of 0xbb.
od -t x1 $SCRATCH_MNT/foo7
# Test cloning the inline extent against a file which has a size not
# greater than the size of bar's inline extent (20 < 50) but has
# a prealloc extent that goes beyond the file's size. It should not be
# possible to clone the inline extent from bar into this file.
$XFS_IO_PROG -f -c "falloc -k 0 1M" \
-c "pwrite -S 0x88 0 20" \
$SCRATCH_MNT/foo8 | _filter_xfs_io
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8
echo "File foo8 data after clone operation:"
# Must have a size of 20 bytes, with all bytes having a value of 0x88
# (the clone operation did not modify our file).
od -t x1 $SCRATCH_MNT/foo8
_scratch_unmount
}
echo -e "\nTesting without compression and without the no-holes feature...\n"
test_cloning_inline_extents
echo -e "\nTesting with compression and without the no-holes feature...\n"
test_cloning_inline_extents "" "-o compress"
echo -e "\nTesting without compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" ""
echo -e "\nTesting with compression and with the no-holes feature...\n"
test_cloning_inline_extents "-O no-holes" "-o compress"
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 17:15:00 +03:00
return 0 ;
}
2013-08-06 22:42:49 +04:00
/**
* btrfs_clone ( ) - clone a range from inode file to another
*
* @ src : Inode to clone from
* @ inode : Inode to clone to
* @ off : Offset within source to start clone from
* @ olen : Original length , passed by user , of range to clone
2015-07-01 00:42:08 +03:00
* @ olen_aligned : Block - aligned value of olen
2013-08-06 22:42:49 +04:00
* @ destoff : Offset within @ inode to start clone
2015-07-01 00:42:08 +03:00
* @ no_time_update : Whether to update mtime / ctime on the target inode
*
* Walks the file extent items of @ src that overlap the range starting at
* @ off with length @ olen_aligned and recreates each of them in @ inode at
* the matching destination offset .
*
* Return : 0 on success , otherwise a negative errno ( - ENOMEM , - EINVAL ,
* - EINTR , or an error handed back by one of the helpers called below ) .
2013-08-06 22:42:49 +04:00
*/
static int btrfs_clone ( struct inode * src , struct inode * inode ,
2014-06-01 04:50:28 +04:00
const u64 off , const u64 olen , const u64 olen_aligned ,
2015-07-01 00:42:08 +03:00
const u64 destoff , int no_time_update )
2008-06-12 05:53:53 +04:00
{
2016-06-23 01:54:23 +03:00
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2008-06-12 05:53:53 +04:00
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2013-08-06 22:42:49 +04:00
struct btrfs_path * path = NULL ;
2008-06-12 05:53:53 +04:00
struct extent_buffer * leaf ;
2013-08-06 22:42:49 +04:00
struct btrfs_trans_handle * trans ;
char * buf = NULL ;
2008-08-05 07:23:47 +04:00
struct btrfs_key key ;
2008-06-12 05:53:53 +04:00
u32 nritems ;
int slot ;
2008-08-05 07:23:47 +04:00
int ret ;
2014-06-01 04:50:28 +04:00
const u64 len = olen_aligned ;
u64 last_dest_end = destoff ;
2008-08-05 07:23:47 +04:00
ret = - ENOMEM ;
2017-05-09 01:57:27 +03:00
/* Scratch buffer of one node in size ; each source extent item is copied into it below . */
buf = kvmalloc ( fs_info - > nodesize , GFP_KERNEL ) ;
if ( ! buf )
return ret ;
2008-08-05 07:23:47 +04:00
path = btrfs_alloc_path ( ) ;
if ( ! path ) {
2016-04-11 19:40:08 +03:00
kvfree ( buf ) ;
2013-08-06 22:42:49 +04:00
return ret ;
2011-09-11 18:52:25 +04:00
}
2015-11-27 18:31:35 +03:00
path - > reada = READA_FORWARD ;
2008-11-12 22:32:25 +03:00
/* clone data */
2017-01-10 21:35:31 +03:00
key . objectid = btrfs_ino ( BTRFS_I ( src ) ) ;
2008-08-05 07:23:47 +04:00
key . type = BTRFS_EXTENT_DATA_KEY ;
2014-05-31 05:31:05 +04:00
key . offset = off ;
2008-06-12 05:53:53 +04:00
while ( 1 ) {
2015-04-11 15:09:06 +03:00
/* Offset the next search restarts from ; refined once the extent length is known . */
u64 next_key_min_offset = key . offset + 1 ;
2019-07-02 17:23:07 +03:00
struct btrfs_file_extent_item * extent ;
int type ;
u32 size ;
struct btrfs_key new_key ;
u64 disko = 0 , diskl = 0 ;
u64 datao = 0 , datal = 0 ;
u8 comp ;
u64 drop_start ;
2015-03-31 16:56:46 +03:00
2008-06-12 05:53:53 +04:00
/*
* note the key will change type as we walk through the
* tree .
*/
2014-01-13 23:35:01 +04:00
path - > leave_spinning = 1 ;
2011-08-01 20:11:57 +04:00
ret = btrfs_search_slot ( NULL , BTRFS_I ( src ) - > root , & key , path ,
0 , 0 ) ;
2008-06-12 05:53:53 +04:00
if ( ret < 0 )
goto out ;
2014-05-31 05:31:05 +04:00
/*
* First search , if no extent item that starts at offset off was
* found but the previous item is an extent item , it ' s possible
* it might overlap our target range , therefore process it .
*/
if ( key . offset = = off & & ret > 0 & & path - > slots [ 0 ] > 0 ) {
btrfs_item_key_to_cpu ( path - > nodes [ 0 ] , & key ,
path - > slots [ 0 ] - 1 ) ;
if ( key . type = = BTRFS_EXTENT_DATA_KEY )
path - > slots [ 0 ] - - ;
}
2008-06-12 05:53:53 +04:00
2008-08-05 07:23:47 +04:00
nritems = btrfs_header_nritems ( path - > nodes [ 0 ] ) ;
2014-01-13 23:35:01 +04:00
process_slot :
2008-08-05 07:23:47 +04:00
if ( path - > slots [ 0 ] > = nritems ) {
2011-08-01 20:11:57 +04:00
ret = btrfs_next_leaf ( BTRFS_I ( src ) - > root , path ) ;
2008-06-12 05:53:53 +04:00
if ( ret < 0 )
goto out ;
if ( ret > 0 )
break ;
2008-08-05 07:23:47 +04:00
nritems = btrfs_header_nritems ( path - > nodes [ 0 ] ) ;
2008-06-12 05:53:53 +04:00
}
leaf = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
2008-08-05 07:23:47 +04:00
btrfs_item_key_to_cpu ( leaf , & key , slot ) ;
2014-06-04 20:41:45 +04:00
if ( key . type > BTRFS_EXTENT_DATA_KEY | |
2017-01-10 21:35:31 +03:00
key . objectid ! = btrfs_ino ( BTRFS_I ( src ) ) )
2008-06-12 05:53:53 +04:00
break ;
2019-07-02 17:23:07 +03:00
ASSERT ( key . type = = BTRFS_EXTENT_DATA_KEY ) ;
2008-09-23 21:14:14 +04:00
2019-07-02 17:23:07 +03:00
extent = btrfs_item_ptr ( leaf , slot ,
struct btrfs_file_extent_item ) ;
comp = btrfs_file_extent_compression ( leaf , extent ) ;
type = btrfs_file_extent_type ( leaf , extent ) ;
if ( type = = BTRFS_FILE_EXTENT_REG | |
type = = BTRFS_FILE_EXTENT_PREALLOC ) {
disko = btrfs_file_extent_disk_bytenr ( leaf , extent ) ;
diskl = btrfs_file_extent_disk_num_bytes ( leaf , extent ) ;
datao = btrfs_file_extent_offset ( leaf , extent ) ;
datal = btrfs_file_extent_num_bytes ( leaf , extent ) ;
} else if ( type = = BTRFS_FILE_EXTENT_INLINE ) {
/* Take upper bound, may be compressed */
datal = btrfs_file_extent_ram_bytes ( leaf , extent ) ;
}
2014-01-13 23:35:01 +04:00
2019-07-02 17:23:07 +03:00
/*
* The first search might have left us at an extent item that
* ends before our target range ' s start , can happen if we have
* holes and NO_HOLES feature enabled .
*/
if ( key . offset + datal < = off ) {
path - > slots [ 0 ] + + ;
goto process_slot ;
} else if ( key . offset > = off + len ) {
break ;
}
next_key_min_offset = key . offset + datal ;
size = btrfs_item_size_nr ( leaf , slot ) ;
read_extent_buffer ( leaf , buf , btrfs_item_ptr_offset ( leaf , slot ) ,
size ) ;
2008-11-12 22:32:25 +03:00
2019-07-02 17:23:07 +03:00
/* The item now lives in buf , so the source leaf can be released . */
btrfs_release_path ( path ) ;
path - > leave_spinning = 0 ;
memcpy ( & new_key , & key , sizeof ( new_key ) ) ;
new_key . objectid = btrfs_ino ( BTRFS_I ( inode ) ) ;
/* Map the source file offset to the matching offset in the destination . */
if ( off < = key . offset )
new_key . offset = key . offset + destoff - off ;
else
new_key . offset = destoff ;
/*
* Deal with a hole that doesn ' t have an extent item that
* represents it ( NO_HOLES feature enabled ) .
* This hole is either in the middle of the cloning range or at
* the beginning ( fully overlaps it or partially overlaps it ) .
*/
if ( new_key . offset ! = last_dest_end )
drop_start = last_dest_end ;
else
drop_start = new_key . offset ;
if ( type = = BTRFS_FILE_EXTENT_REG | |
type = = BTRFS_FILE_EXTENT_PREALLOC ) {
struct btrfs_clone_extent_info clone_info ;
2008-09-23 21:14:14 +04:00
2014-06-01 04:50:28 +04:00
/*
2019-07-02 17:23:07 +03:00
* a | - - - range to clone - - - | b
* | - - - - - - - - - - - - - extent - - - - - - - - - - - - - |
2014-06-01 04:50:28 +04:00
*/
2010-05-16 18:48:46 +04:00
2019-07-02 17:23:07 +03:00
/* Subtract range b */
if ( key . offset + datal > off + len )
datal = off + len - key . offset ;
/* Subtract range a */
if ( off > key . offset ) {
datao + = off - key . offset ;
datal - = off - key . offset ;
}
clone_info . disk_offset = disko ;
clone_info . disk_len = diskl ;
clone_info . data_offset = datao ;
clone_info . data_len = datal ;
clone_info . file_offset = new_key . offset ;
clone_info . extent_buf = buf ;
clone_info . item_size = size ;
ret = btrfs_punch_hole_range ( inode , path ,
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents
When cloning extents (or deduplicating) we create a transaction with a
space reservation that considers we will drop or update a single file
extent item of the destination inode (that we modify a single leaf). That
is fine for the vast majority of scenarios, however it might happen that
we need to drop many file extent items, and adjust at most two file extent
items, in the destination root, which can span multiple leafs. This will
lead to either the call to btrfs_drop_extents() to fail with ENOSPC or
the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode()
(called through clone_finish_inode_update()) to fail with ENOSPC. Such
failure results in a transaction abort, leaving the filesystem in a
read-only mode.
In order to fix this we need to follow the same approach as the hole
punching code, where we create a local reservation with 1 unit and keep
ending and starting transactions, after balancing the btree inode,
when __btrfs_drop_extents() returns ENOSPC. So fix this by making the
extent cloning call calls the recently added btrfs_punch_hole_range()
helper, which is what does the mentioned work for hole punching, and
make sure whenever we drop extent items in a transaction, we also add a
replacing file extent item, to avoid corruption (a hole) if after ending
a transaction and before starting a new one, the old transaction gets
committed and a power failure happens before we finish cloning.
A test case for fstests follows soon.
Reported-by: David Goodwin <david@codepoets.co.uk>
Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/
Reported-by: Sam Tygier <sam@tygier.co.uk>
Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/
Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 13:09:50 +03:00
drop_start ,
new_key . offset + datal - 1 ,
& clone_info , & trans ) ;
2019-07-02 17:23:07 +03:00
if ( ret )
goto out ;
} else if ( type = = BTRFS_FILE_EXTENT_INLINE ) {
u64 skip = 0 ;
u64 trim = 0 ;
2009-01-06 05:25:51 +03:00
2019-07-02 17:23:07 +03:00
if ( off > key . offset ) {
skip = off - key . offset ;
new_key . offset + = skip ;
}
2009-01-06 05:25:51 +03:00
2019-07-02 17:23:07 +03:00
if ( key . offset + datal > off + len )
trim = key . offset + datal - ( off + len ) ;
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents
When cloning extents (or deduplicating) we create a transaction with a
space reservation that considers we will drop or update a single file
extent item of the destination inode (that we modify a single leaf). That
is fine for the vast majority of scenarios, however it might happen that
we need to drop many file extent items, and adjust at most two file extent
items, in the destination root, which can span multiple leafs. This will
lead to either the call to btrfs_drop_extents() to fail with ENOSPC or
the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode()
(called through clone_finish_inode_update()) to fail with ENOSPC. Such
failure results in a transaction abort, leaving the filesystem in a
read-only mode.
In order to fix this we need to follow the same approach as the hole
punching code, where we create a local reservation with 1 unit and keep
ending and starting transactions, after balancing the btree inode,
when __btrfs_drop_extents() returns ENOSPC. So fix this by making the
extent cloning call calls the recently added btrfs_punch_hole_range()
helper, which is what does the mentioned work for hole punching, and
make sure whenever we drop extent items in a transaction, we also add a
replacing file extent item, to avoid corruption (a hole) if after ending
a transaction and before starting a new one, the old transaction gets
committed and a power failure happens before we finish cloning.
A test case for fstests follows soon.
Reported-by: David Goodwin <david@codepoets.co.uk>
Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/
Reported-by: Sam Tygier <sam@tygier.co.uk>
Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/
Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 13:09:50 +03:00
2019-07-02 17:23:07 +03:00
/* Only a whole compressed inline extent can be cloned ; a partial one is rejected . */
if ( comp & & ( skip | | trim ) ) {
ret = - EINVAL ;
goto out ;
2008-06-12 05:53:53 +04:00
}
2019-07-02 17:23:07 +03:00
size - = skip + trim ;
datal - = skip + trim ;
2008-11-12 22:32:25 +03:00
2019-07-02 17:23:07 +03:00
/*
* If our extent is inline , we know we will drop or
* adjust at most 1 extent item in the destination root .
*
* 1 - adjusting old extent ( we may have to split it )
* 1 - add new extent
* 1 - inode update
*/
trans = btrfs_start_transaction ( root , 3 ) ;
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
goto out ;
}
2008-11-12 22:32:25 +03:00
2019-07-02 17:23:07 +03:00
ret = clone_copy_inline_extent ( inode , trans , path ,
& new_key , drop_start ,
datal , skip , size , buf ) ;
if ( ret ) {
/* Every failure except - EOPNOTSUPP aborts the transaction . */
if ( ret ! = - EOPNOTSUPP )
btrfs_abort_transaction ( trans , ret ) ;
btrfs_end_transaction ( trans ) ;
2012-03-12 19:03:00 +04:00
goto out ;
2019-07-02 17:23:07 +03:00
}
2010-05-16 18:48:46 +04:00
}
2019-07-02 17:23:07 +03:00
btrfs_release_path ( path ) ;
last_dest_end = ALIGN ( new_key . offset + datal ,
fs_info - > sectorsize ) ;
ret = clone_finish_inode_update ( trans , inode , last_dest_end ,
destoff , olen , no_time_update ) ;
if ( ret )
goto out ;
/* Stop once the destination range has been covered up to destoff + len . */
if ( new_key . offset + datal > = destoff + len )
break ;
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2015-03-31 16:56:46 +03:00
key . offset = next_key_min_offset ;
2016-10-13 04:23:39 +03:00
/* Give up between extents when the task has a fatal signal pending . */
if ( fatal_signal_pending ( current ) ) {
ret = - EINTR ;
goto out ;
}
2008-06-12 05:53:53 +04:00
}
ret = 0 ;
2013-08-06 22:42:49 +04:00
2014-06-01 04:50:28 +04:00
if ( last_dest_end < destoff + len ) {
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents
When cloning extents (or deduplicating) we create a transaction with a
space reservation that considers we will drop or update a single file
extent item of the destination inode (that we modify a single leaf). That
is fine for the vast majority of scenarios, however it might happen that
we need to drop many file extent items, and adjust at most two file extent
items, in the destination root, which can span multiple leafs. This will
lead to either the call to btrfs_drop_extents() to fail with ENOSPC or
the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode()
(called through clone_finish_inode_update()) to fail with ENOSPC. Such
failure results in a transaction abort, leaving the filesystem in a
read-only mode.
In order to fix this we need to follow the same approach as the hole
punching code, where we create a local reservation with 1 unit and keep
ending and starting transactions, after balancing the btree inode,
when __btrfs_drop_extents() returns ENOSPC. So fix this by making the
extent cloning call calls the recently added btrfs_punch_hole_range()
helper, which is what does the mentioned work for hole punching, and
make sure whenever we drop extent items in a transaction, we also add a
replacing file extent item, to avoid corruption (a hole) if after ending
a transaction and before starting a new one, the old transaction gets
committed and a power failure happens before we finish cloning.
A test case for fstests follows soon.
Reported-by: David Goodwin <david@codepoets.co.uk>
Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/
Reported-by: Sam Tygier <sam@tygier.co.uk>
Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/
Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 13:09:50 +03:00
struct btrfs_clone_extent_info clone_info = { 0 } ;
2014-06-01 04:50:28 +04:00
/*
* We have an implicit hole ( NO_HOLES feature is enabled ) that
* fully or partially overlaps our cloning range at its end .
*/
btrfs_release_path ( path ) ;
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents
When cloning extents (or deduplicating) we create a transaction with a
space reservation that considers we will drop or update a single file
extent item of the destination inode (that we modify a single leaf). That
is fine for the vast majority of scenarios, however it might happen that
we need to drop many file extent items, and adjust at most two file extent
items, in the destination root, which can span multiple leafs. This will
lead to either the call to btrfs_drop_extents() to fail with ENOSPC or
the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode()
(called through clone_finish_inode_update()) to fail with ENOSPC. Such
failure results in a transaction abort, leaving the filesystem in a
read-only mode.
In order to fix this we need to follow the same approach as the hole
punching code, where we create a local reservation with 1 unit and keep
ending and starting transactions, after balancing the btree inode,
when __btrfs_drop_extents() returns ENOSPC. So fix this by making the
extent cloning call calls the recently added btrfs_punch_hole_range()
helper, which is what does the mentioned work for hole punching, and
make sure whenever we drop extent items in a transaction, we also add a
replacing file extent item, to avoid corruption (a hole) if after ending
a transaction and before starting a new one, the old transaction gets
committed and a power failure happens before we finish cloning.
A test case for fstests follows soon.
Reported-by: David Goodwin <david@codepoets.co.uk>
Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/
Reported-by: Sam Tygier <sam@tygier.co.uk>
Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/
Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 13:09:50 +03:00
path - > leave_spinning = 0 ;
2014-06-01 04:50:28 +04:00
/*
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents
When cloning extents (or deduplicating) we create a transaction with a
space reservation that considers we will drop or update a single file
extent item of the destination inode (that we modify a single leaf). That
is fine for the vast majority of scenarios, however it might happen that
we need to drop many file extent items, and adjust at most two file extent
items, in the destination root, which can span multiple leafs. This will
lead to either the call to btrfs_drop_extents() to fail with ENOSPC or
the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode()
(called through clone_finish_inode_update()) to fail with ENOSPC. Such
failure results in a transaction abort, leaving the filesystem in a
read-only mode.
In order to fix this we need to follow the same approach as the hole
punching code, where we create a local reservation with 1 unit and keep
ending and starting transactions, after balancing the btree inode,
when __btrfs_drop_extents() returns ENOSPC. So fix this by making the
extent cloning call calls the recently added btrfs_punch_hole_range()
helper, which is what does the mentioned work for hole punching, and
make sure whenever we drop extent items in a transaction, we also add a
replacing file extent item, to avoid corruption (a hole) if after ending
a transaction and before starting a new one, the old transaction gets
committed and a power failure happens before we finish cloning.
A test case for fstests follows soon.
Reported-by: David Goodwin <david@codepoets.co.uk>
Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/
Reported-by: Sam Tygier <sam@tygier.co.uk>
Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/
Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 13:09:50 +03:00
* We are dealing with a hole and our clone_info already has a
* disk_offset of 0 , we only need to fill the data length and
* file offset .
2014-06-01 04:50:28 +04:00
*/
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents
When cloning extents (or deduplicating) we create a transaction with a
space reservation that considers we will drop or update a single file
extent item of the destination inode (that we modify a single leaf). That
is fine for the vast majority of scenarios, however it might happen that
we need to drop many file extent items, and adjust at most two file extent
items, in the destination root, which can span multiple leafs. This will
lead to either the call to btrfs_drop_extents() to fail with ENOSPC or
the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode()
(called through clone_finish_inode_update()) to fail with ENOSPC. Such
failure results in a transaction abort, leaving the filesystem in a
read-only mode.
In order to fix this we need to follow the same approach as the hole
punching code, where we create a local reservation with 1 unit and keep
ending and starting transactions, after balancing the btree inode,
when __btrfs_drop_extents() returns ENOSPC. So fix this by making the
extent cloning call calls the recently added btrfs_punch_hole_range()
helper, which is what does the mentioned work for hole punching, and
make sure whenever we drop extent items in a transaction, we also add a
replacing file extent item, to avoid corruption (a hole) if after ending
a transaction and before starting a new one, the old transaction gets
committed and a power failure happens before we finish cloning.
A test case for fstests follows soon.
Reported-by: David Goodwin <david@codepoets.co.uk>
Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/
Reported-by: Sam Tygier <sam@tygier.co.uk>
Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/
Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 13:09:50 +03:00
clone_info . data_len = destoff + len - last_dest_end ;
clone_info . file_offset = last_dest_end ;
ret = btrfs_punch_hole_range ( inode , path ,
last_dest_end , destoff + len - 1 ,
& clone_info , & trans ) ;
if ( ret )
2014-06-01 04:50:28 +04:00
goto out ;
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents
When cloning extents (or deduplicating) we create a transaction with a
space reservation that considers we will drop or update a single file
extent item of the destination inode (that we modify a single leaf). That
is fine for the vast majority of scenarios, however it might happen that
we need to drop many file extent items, and adjust at most two file extent
items, in the destination root, which can span multiple leafs. This will
lead to either the call to btrfs_drop_extents() to fail with ENOSPC or
the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode()
(called through clone_finish_inode_update()) to fail with ENOSPC. Such
failure results in a transaction abort, leaving the filesystem in a
read-only mode.
In order to fix this we need to follow the same approach as the hole
punching code, where we create a local reservation with 1 unit and keep
ending and starting transactions, after balancing the btree inode,
when __btrfs_drop_extents() returns ENOSPC. So fix this by making the
extent cloning call calls the recently added btrfs_punch_hole_range()
helper, which is what does the mentioned work for hole punching, and
make sure whenever we drop extent items in a transaction, we also add a
replacing file extent item, to avoid corruption (a hole) if after ending
a transaction and before starting a new one, the old transaction gets
committed and a power failure happens before we finish cloning.
A test case for fstests follows soon.
Reported-by: David Goodwin <david@codepoets.co.uk>
Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/
Reported-by: Sam Tygier <sam@tygier.co.uk>
Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/
Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 13:09:50 +03:00
2014-06-01 04:50:28 +04:00
ret = clone_finish_inode_update ( trans , inode , destoff + len ,
2015-07-01 00:42:08 +03:00
destoff , olen , no_time_update ) ;
2014-06-01 04:50:28 +04:00
}
2008-06-12 05:53:53 +04:00
out :
2013-08-06 22:42:49 +04:00
btrfs_free_path ( path ) ;
2016-04-11 19:40:08 +03:00
kvfree ( buf ) ;
2013-08-06 22:42:49 +04:00
return ret ;
}
2015-11-11 00:53:32 +03:00
static noinline int btrfs_clone_files ( struct file * file , struct file * file_src ,
u64 off , u64 olen , u64 destoff )
2013-08-06 22:42:49 +04:00
{
2013-09-01 23:57:51 +04:00
struct inode * inode = file_inode ( file ) ;
2015-11-11 00:53:32 +03:00
struct inode * src = file_inode ( file_src ) ;
2016-06-23 01:54:23 +03:00
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2013-08-06 22:42:49 +04:00
int ret ;
u64 len = olen ;
2016-06-23 01:54:23 +03:00
u64 bs = fs_info - > sb - > s_blocksize ;
2013-08-06 22:42:49 +04:00
/*
* TODO :
* - split compressed inline extents . annoying : we need to
* decompress into destination ' s address_space ( the file offset
* may change , so source mapping won ' t do ) , then recompress ( or
* otherwise reinsert ) a subrange .
2014-03-10 14:56:07 +04:00
*
* - split destination inode ' s inline extents . The inline extents can
* be either compressed or non - compressed .
2013-08-06 22:42:49 +04:00
*/
2018-11-05 14:14:17 +03:00
/*
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and wait for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then if we found any delalloc for the range, or
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
* VFS ' s generic_remap_file_range_prep ( ) protects us from cloning the
* eof block into the middle of a file , which would result in corruption
* if the file size is not blocksize aligned . So we don ' t need to check
* for that case here .
2018-11-05 14:14:17 +03:00
*/
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and wait for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then if we found any delalloc for the range, or
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
if ( off + len = = src - > i_size )
2013-08-06 22:42:49 +04:00
len = ALIGN ( src - > i_size , bs ) - off ;
if ( destoff > inode - > i_size ) {
Btrfs: fix race between cloning range ending at eof and writeback
The recent rework that makes btrfs' remap_file_range operation use the
generic helper generic_remap_file_range_prep() introduced a race between
writeback and cloning a range that covers the eof extent of the source
file into a destination offset that is greater than the same file's size.
This happens because we now wait for writeback to complete before doing
the truncation of the eof block, while previously we did the truncation
and then waited for writeback to complete. This leads to a race between
writeback of the truncated block and cloning the file extents in the
source range, because we copy each file extent item we find in the fs
root into a buffer, then release the path and then increment the reference
count for the extent referred in that file extent item we copied, which
can no longer exist if writeback of the truncated eof block completes
after we copied the file extent item into the buffer and before we
incremented the reference count. This is illustrated by the following
diagram:
CPU 1 CPU 2
btrfs_clone_files()
btrfs_cont_expand()
btrfs_truncate_block()
--> zeroes part of the
page containing eof,
marking it for
delalloc
btrfs_clone()
--> finds extent item
covering eof,
points to extent
at bytenr X
--> copies it into a
local buffer
--> releases path
writeback starts
btrfs_finish_ordered_io()
insert_reserved_file_extent()
__btrfs_drop_extents()
--> creates delayed
reference to drop
the extent at
bytenr X
--> starts transaction
--> creates delayed
reference to
increment extent
at bytenr X
<delayed references are run, due to a transaction
commit for example, and the transaction is aborted
with -EIO because we attempt to increment reference
count for the extent at bytenr X after we freed it>
When this race is hit the running transaction ends up getting aborted with
an -EIO error and a trace like the following is produced:
[ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1
[ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202
[ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001
[ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000
[ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
[ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000
[ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548
[ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000
[ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0
[ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 4382.564887] Call Trace:
[ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs]
[ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs]
[ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs]
[ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs]
[ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs]
[ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs]
[ 4382.568112] ? _raw_spin_unlock+0x24/0x30
[ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs]
[ 4382.569006] create_subvol+0x3c8/0x830 [btrfs]
[ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60
[ 4382.570822] ? __sb_start_write+0xd4/0x1c0
[ 4382.571262] ? mnt_want_write_file+0x24/0x50
[ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs]
[ 4382.572155] ? _copy_from_user+0x66/0x90
[ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs]
[ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs]
[ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570
[ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0
[ 4382.574379] ? _raw_spin_unlock+0x24/0x30
[ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0
[ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0
[ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[ 4382.576020] do_vfs_ioctl+0xa2/0x6f0
[ 4382.576405] ksys_ioctl+0x70/0x80
[ 4382.576776] __x64_sys_ioctl+0x16/0x20
[ 4382.577137] do_syscall_64+0x60/0x1b0
[ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe
(...)
[ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7
[ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003
[ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044
[ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010
[ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050
[ 4382.580792] irq event stamp: 0
[ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null)
[ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null)
[ 4382.582413] ---[ end trace d3c188e3e9367382 ]---
[ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure
[ 4382.624295] BTRFS info (device sdc): forced readonly
Fix this by waiting for writeback to complete after truncating the eof
block.
Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 14:42:54 +03:00
const u64 wb_start = ALIGN_DOWN ( inode - > i_size , bs ) ;
2013-08-06 22:42:49 +04:00
ret = btrfs_cont_expand ( inode , inode - > i_size , destoff ) ;
if ( ret )
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and wait for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then if we found any delalloc for the range, or
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
return ret ;
Btrfs: fix race between cloning range ending at eof and writeback
The recent rework that makes btrfs' remap_file_range operation use the
generic helper generic_remap_file_range_prep() introduced a race between
writeback and cloning a range that covers the eof extent of the source
file into a destination offset that is greater than the same file's size.
This happens because we now wait for writeback to complete before doing
the truncation of the eof block, while previously we did the truncation
and then waited for writeback to complete. This leads to a race between
writeback of the truncated block and cloning the file extents in the
source range, because we copy each file extent item we find in the fs
root into a buffer, then release the path and then increment the reference
count for the extent referred in that file extent item we copied, which
can no longer exist if writeback of the truncated eof block completes
after we copied the file extent item into the buffer and before we
incremented the reference count. This is illustrated by the following
diagram:
CPU 1 CPU 2
btrfs_clone_files()
btrfs_cont_expand()
btrfs_truncate_block()
--> zeroes part of the
page containing eof,
marking it for
delalloc
btrfs_clone()
--> finds extent item
covering eof,
points to extent
at bytenr X
--> copies it into a
local buffer
--> releases path
writeback starts
btrfs_finish_ordered_io()
insert_reserved_file_extent()
__btrfs_drop_extents()
--> creates delayed
reference to drop
the extent at
bytenr X
--> starts transaction
--> creates delayed
reference to
increment extent
at bytenr X
<delayed references are run, due to a transaction
commit for example, and the transaction is aborted
with -EIO because we attempt to increment reference
count for the extent at bytenr X after we freed it>
When this race is hit the running transaction ends up getting aborted with
an -EIO error and a trace like the following is produced:
[ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1
[ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202
[ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001
[ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000
[ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
[ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000
[ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548
[ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000
[ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0
[ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 4382.564887] Call Trace:
[ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs]
[ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs]
[ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs]
[ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs]
[ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs]
[ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs]
[ 4382.568112] ? _raw_spin_unlock+0x24/0x30
[ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs]
[ 4382.569006] create_subvol+0x3c8/0x830 [btrfs]
[ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60
[ 4382.570822] ? __sb_start_write+0xd4/0x1c0
[ 4382.571262] ? mnt_want_write_file+0x24/0x50
[ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs]
[ 4382.572155] ? _copy_from_user+0x66/0x90
[ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs]
[ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs]
[ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570
[ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0
[ 4382.574379] ? _raw_spin_unlock+0x24/0x30
[ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0
[ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0
[ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[ 4382.576020] do_vfs_ioctl+0xa2/0x6f0
[ 4382.576405] ksys_ioctl+0x70/0x80
[ 4382.576776] __x64_sys_ioctl+0x16/0x20
[ 4382.577137] do_syscall_64+0x60/0x1b0
[ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe
(...)
[ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7
[ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003
[ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044
[ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010
[ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050
[ 4382.580792] irq event stamp: 0
[ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null)
[ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null)
[ 4382.582413] ---[ end trace d3c188e3e9367382 ]---
[ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure
[ 4382.624295] BTRFS info (device sdc): forced readonly
Fix this by waiting for writeback to complete after truncating the eof
block.
Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 14:42:54 +03:00
/*
* We may have truncated the last block if the inode ' s size is
* not sector size aligned , so we need to wait for writeback to
* complete before proceeding further , otherwise we can race
* with cloning and attempt to increment a reference to an
* extent that no longer exists ( writeback completed right after
* we found the previous extent covering eof and before we
* attempted to increment its reference count ) .
*/
ret = btrfs_wait_ordered_range ( inode , wb_start ,
destoff - wb_start ) ;
if ( ret )
return ret ;
2013-08-06 22:42:49 +04:00
}
Btrfs: ensure readers see new data after a clone operation
We were cleaning the clone target file range from the page cache before
we did replace the file extent items in the fs tree. This was racy,
as right after cleaning the relevant range from the page cache and before
replacing the file extent items, a read against that range could be
performed by another task and populate again the page cache with stale
data (stale after the cloning finishes). This would result in reads after
the clone operation successfully finishes to get old data (and potentially
for a very long time). Therefore evict the pages after replacing the file
extent items, so that subsequent reads will always get the new data.
Similarly, we were prone to races while cloning the file extent items
because we weren't locking the target range and waiting for any existing
ordered extents against that range to complete. It was possible that
after cloning the extent items, a write operation that was performed
before the clone operation and overlaps the same range, would end up
undoing all or part of the work the clone operation did (a worker task
running inode.c:btrfs_finish_ordered_io). Therefore lock the target
range in the io tree, wait for all pending ordered extents against that
range to finish and then safely perform the cloning.
The issue of reading stale data after the clone operation is easy to
reproduce by running the following C program in a loop until it exits
with return value 1.
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <pthread.h>
#include <fcntl.h>
#include <assert.h>
#include <asm/types.h>
#include <linux/ioctl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#define SRC_FILE "/mnt/sdd/foo"
#define DST_FILE "/mnt/sdd/bar"
#define FILE_SIZE (16 * 1024)
#define PATTERN_SRC 'X'
#define PATTERN_DST 'Y'
struct btrfs_ioctl_clone_range_args {
__s64 src_fd;
__u64 src_offset, src_length;
__u64 dest_offset;
};
#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
struct btrfs_ioctl_clone_range_args)
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static int clone_done = 0;
static int reader_ready = 0;
static int stale_data = 0;
static void *reader_loop(void *arg)
{
char buf[4096], want_buf[4096];
memset(want_buf, PATTERN_SRC, 4096);
pthread_mutex_lock(&mutex);
reader_ready = 1;
pthread_mutex_unlock(&mutex);
while (1) {
int done, fd, ret;
fd = open(DST_FILE, O_RDONLY);
assert(fd != -1);
pthread_mutex_lock(&mutex);
done = clone_done;
pthread_mutex_unlock(&mutex);
ret = read(fd, buf, 4096);
assert(ret == 4096);
close(fd);
if (done) {
ret = memcmp(buf, want_buf, 4096);
if (ret == 0) {
printf("Found new content\n");
} else {
printf("Found old content\n");
pthread_mutex_lock(&mutex);
stale_data = 1;
pthread_mutex_unlock(&mutex);
}
break;
}
}
return NULL;
}
int main(int argc, char *argv[])
{
pthread_t reader;
int ret, i, fd;
struct btrfs_ioctl_clone_range_args clone_args;
int fd1, fd2;
ret = remove(SRC_FILE);
if (ret == -1 && errno != ENOENT) {
fprintf(stderr, "Error deleting src file: %s\n", strerror(errno));
return 1;
}
ret = remove(DST_FILE);
if (ret == -1 && errno != ENOENT) {
fprintf(stderr, "Error deleting dst file: %s\n", strerror(errno));
return 1;
}
fd = open(SRC_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU);
assert(fd != -1);
for (i = 0; i < FILE_SIZE; i++) {
char c = PATTERN_SRC;
ret = write(fd, &c, 1);
assert(ret == 1);
}
close(fd);
fd = open(DST_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU);
assert(fd != -1);
for (i = 0; i < FILE_SIZE; i++) {
char c = PATTERN_DST;
ret = write(fd, &c, 1);
assert(ret == 1);
}
close(fd);
sync();
ret = pthread_create(&reader, NULL, reader_loop, NULL);
assert(ret == 0);
while (1) {
int r;
pthread_mutex_lock(&mutex);
r = reader_ready;
pthread_mutex_unlock(&mutex);
if (r) break;
}
fd1 = open(SRC_FILE, O_RDONLY);
if (fd1 < 0) {
fprintf(stderr, "Error open src file: %s\n", strerror(errno));
return 1;
}
fd2 = open(DST_FILE, O_RDWR);
if (fd2 < 0) {
fprintf(stderr, "Error open dst file: %s\n", strerror(errno));
return 1;
}
clone_args.src_fd = fd1;
clone_args.src_offset = 0;
clone_args.src_length = 4096;
clone_args.dest_offset = 0;
ret = ioctl(fd2, BTRFS_IOC_CLONE_RANGE, &clone_args);
assert(ret == 0);
close(fd1);
close(fd2);
pthread_mutex_lock(&mutex);
clone_done = 1;
pthread_mutex_unlock(&mutex);
ret = pthread_join(reader, NULL);
assert(ret == 0);
pthread_mutex_lock(&mutex);
ret = stale_data ? 1 : 0;
pthread_mutex_unlock(&mutex);
return ret;
}
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-05-23 08:03:34 +04:00
/*
Btrfs: fix race between reflink/dedupe and relocation
The recent rework that makes btrfs' remap_file_range operation use the
generic helper generic_remap_file_range_prep() introduced a race between
relocation and reflinking (for both cloning and deduplication) the file
extents between the source and destination inodes.
This happens because we no longer lock the source range, and we do
not lock it anymore because we wait for direct IO writes and writeback to
complete early on the code path right after locking the inodes, which
guarantees no other file operations interfere with the reflinking. However
there is one exception which is relocation, since it replaces the byte
number of file extents items in the fs tree after locking the range the
file extent items represent. This is a problem because after finding each
file extent to clone in the fs tree, the reflink process copies the file
extent item into a local buffer, releases the search path, inserts new
file extent items in the destination range and then increments the
reference count for the extent mentioned in the file extent item that it
previously copied to the buffer. If right after copying the file extent
item into the buffer and releasing the path the relocation process
updates the file extent item to point to the new extent, the reflink
process ends up creating a delayed reference to increment the reference
count of the old extent, for which the relocation process already created
a delayed reference to drop it. This results in failure to run delayed
references because we will attempt to increment the count of a reference
that was already dropped. This is illustrated by the following diagram:
CPU 1 CPU 2
relocation is running
btrfs_clone_files()
btrfs_clone()
--> finds extent item
in source range
pointing to extent
at bytenr X
--> copies it into a
local buffer
--> releases path
replace_file_extents()
--> successfully locks the
range represented by
the file extent item
--> replaces disk_bytenr
field in the file
extent item with some
other value Y
--> creates delayed reference
to increment reference
count for extent at
bytenr Y
--> creates delayed reference
to drop the extent at
bytenr X
--> starts transaction
--> creates delayed
reference to
increment extent
at bytenr X
<delayed references are run, due to a transaction
commit for example, and the transaction is aborted
with -EIO because we attempt to increment reference
count for the extent at bytenr X after we freed it>
When this race is hit the running transaction ends up getting aborted with
an -EIO error and a trace like the following is produced:
[ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1
[ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202
[ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001
[ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000
[ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
[ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000
[ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548
[ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000
[ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0
[ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 4382.564887] Call Trace:
[ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs]
[ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs]
[ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs]
[ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs]
[ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs]
[ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs]
[ 4382.568112] ? _raw_spin_unlock+0x24/0x30
[ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs]
[ 4382.569006] create_subvol+0x3c8/0x830 [btrfs]
[ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60
[ 4382.570822] ? __sb_start_write+0xd4/0x1c0
[ 4382.571262] ? mnt_want_write_file+0x24/0x50
[ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs]
[ 4382.572155] ? _copy_from_user+0x66/0x90
[ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs]
[ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs]
[ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570
[ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0
[ 4382.574379] ? _raw_spin_unlock+0x24/0x30
[ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0
[ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0
[ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[ 4382.576020] do_vfs_ioctl+0xa2/0x6f0
[ 4382.576405] ksys_ioctl+0x70/0x80
[ 4382.576776] __x64_sys_ioctl+0x16/0x20
[ 4382.577137] do_syscall_64+0x60/0x1b0
[ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe
(...)
[ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7
[ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003
[ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044
[ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010
[ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050
[ 4382.580792] irq event stamp: 0
[ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null)
[ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null)
[ 4382.582413] ---[ end trace d3c188e3e9367382 ]---
[ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure
[ 4382.624295] BTRFS info (device sdc): forced readonly
Fix this by locking the source range before searching for the file extent
items in the fs tree, since the relocation process will try to lock the
range a file extent item represents before updating it with the new extent
location.
Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 14:43:07 +03:00
* Lock destination range to serialize with concurrent readpages ( ) and
* source range to serialize with relocation .
Btrfs: ensure readers see new data after a clone operation
We were cleaning the clone target file range from the page cache before
we did replace the file extent items in the fs tree. This was racy,
as right after cleaning the relevant range from the page cache and before
replacing the file extent items, a read against that range could be
performed by another task and populate again the page cache with stale
data (stale after the cloning finishes). This would result in reads done after
the clone operation successfully finishes getting old data (and potentially
for a very long time). Therefore evict the pages after replacing the file
extent items, so that subsequent reads will always get the new data.
Similarly, we were prone to races while cloning the file extent items
because we weren't locking the target range and waiting for any existing
ordered extents against that range to complete. It was possible that
after cloning the extent items, a write operation that was performed
before the clone operation and overlaps the same range, would end up
undoing all or part of the work the clone operation did (a worker task
running inode.c:btrfs_finish_ordered_io). Therefore lock the target
range in the io tree, wait for all pending ordered extents against that
range to finish and then safely perform the cloning.
The issue of reading stale data after the clone operation is easy to
reproduce by running the following C program in a loop until it exits
with return value 1.
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <pthread.h>
#include <fcntl.h>
#include <assert.h>
#include <asm/types.h>
#include <linux/ioctl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#define SRC_FILE "/mnt/sdd/foo"
#define DST_FILE "/mnt/sdd/bar"
#define FILE_SIZE (16 * 1024)
#define PATTERN_SRC 'X'
#define PATTERN_DST 'Y'
struct btrfs_ioctl_clone_range_args {
__s64 src_fd;
__u64 src_offset, src_length;
__u64 dest_offset;
};
#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
struct btrfs_ioctl_clone_range_args)
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static int clone_done = 0;
static int reader_ready = 0;
static int stale_data = 0;
static void *reader_loop(void *arg)
{
char buf[4096], want_buf[4096];
memset(want_buf, PATTERN_SRC, 4096);
pthread_mutex_lock(&mutex);
reader_ready = 1;
pthread_mutex_unlock(&mutex);
while (1) {
int done, fd, ret;
fd = open(DST_FILE, O_RDONLY);
assert(fd != -1);
pthread_mutex_lock(&mutex);
done = clone_done;
pthread_mutex_unlock(&mutex);
ret = read(fd, buf, 4096);
assert(ret == 4096);
close(fd);
if (done) {
ret = memcmp(buf, want_buf, 4096);
if (ret == 0) {
printf("Found new content\n");
} else {
printf("Found old content\n");
pthread_mutex_lock(&mutex);
stale_data = 1;
pthread_mutex_unlock(&mutex);
}
break;
}
}
return NULL;
}
int main(int argc, char *argv[])
{
pthread_t reader;
int ret, i, fd;
struct btrfs_ioctl_clone_range_args clone_args;
int fd1, fd2;
ret = remove(SRC_FILE);
if (ret == -1 && errno != ENOENT) {
fprintf(stderr, "Error deleting src file: %s\n", strerror(errno));
return 1;
}
ret = remove(DST_FILE);
if (ret == -1 && errno != ENOENT) {
fprintf(stderr, "Error deleting dst file: %s\n", strerror(errno));
return 1;
}
fd = open(SRC_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU);
assert(fd != -1);
for (i = 0; i < FILE_SIZE; i++) {
char c = PATTERN_SRC;
ret = write(fd, &c, 1);
assert(ret == 1);
}
close(fd);
fd = open(DST_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU);
assert(fd != -1);
for (i = 0; i < FILE_SIZE; i++) {
char c = PATTERN_DST;
ret = write(fd, &c, 1);
assert(ret == 1);
}
close(fd);
sync();
ret = pthread_create(&reader, NULL, reader_loop, NULL);
assert(ret == 0);
while (1) {
int r;
pthread_mutex_lock(&mutex);
r = reader_ready;
pthread_mutex_unlock(&mutex);
if (r) break;
}
fd1 = open(SRC_FILE, O_RDONLY);
if (fd1 < 0) {
fprintf(stderr, "Error open src file: %s\n", strerror(errno));
return 1;
}
fd2 = open(DST_FILE, O_RDWR);
if (fd2 < 0) {
fprintf(stderr, "Error open dst file: %s\n", strerror(errno));
return 1;
}
clone_args.src_fd = fd1;
clone_args.src_offset = 0;
clone_args.src_length = 4096;
clone_args.dest_offset = 0;
ret = ioctl(fd2, BTRFS_IOC_CLONE_RANGE, &clone_args);
assert(ret == 0);
close(fd1);
close(fd2);
pthread_mutex_lock(&mutex);
clone_done = 1;
pthread_mutex_unlock(&mutex);
ret = pthread_join(reader, NULL);
assert(ret == 0);
pthread_mutex_lock(&mutex);
ret = stale_data ? 1 : 0;
pthread_mutex_unlock(&mutex);
return ret;
}
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-05-23 08:03:34 +04:00
*/
Btrfs: fix race between reflink/dedupe and relocation
The recent rework that makes btrfs' remap_file_range operation use the
generic helper generic_remap_file_range_prep() introduced a race between
relocation and reflinking (for both cloning and deduplication) the file
extents between the source and destination inodes.
This happens because we no longer lock the source range, and we do
not lock it because we wait for direct IO writes and writeback to
complete early on the code path right after locking the inodes, which
guarantees no other file operations interfere with the reflinking. However
there is one exception which is relocation, since it replaces the byte
number of file extent items in the fs tree after locking the range the
file extent items represent. This is a problem because after finding each
file extent to clone in the fs tree, the reflink process copies the file
extent item into a local buffer, releases the search path, inserts new
file extent items in the destination range and then increments the
reference count for the extent mentioned in the file extent item that it
previously copied to the buffer. If right after copying the file extent
item into the buffer and releasing the path the relocation process
updates the file extent item to point to the new extent, the reflink
process ends up creating a delayed reference to increment the reference
count of the old extent, for which the relocation process already created
a delayed reference to drop it. This results in failure to run delayed
references because we will attempt to increment the count of a reference
that was already dropped. This is illustrated by the following diagram:
CPU 1 CPU 2
relocation is running
btrfs_clone_files()
btrfs_clone()
--> finds extent item
in source range
point to extent
at bytenr X
--> copies it into a
local buffer
--> releases path
replace_file_extents()
--> successfully locks the
range represented by
the file extent item
--> replaces disk_bytenr
field in the file
extent item with some
other value Y
--> creates delayed reference
to increment reference
count for extent at
bytenr Y
--> creates delayed reference
to drop the extent at
bytenr X
--> starts transaction
--> creates delayed
reference to
increment extent
at bytenr X
<delayed references are run, due to a transaction
commit for example, and the transaction is aborted
with -EIO because we attempt to increment reference
count for the extent at bytenr X after we freed it>
When this race is hit the running transaction ends up getting aborted with
an -EIO error and a trace like the following is produced:
[ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1
[ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202
[ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001
[ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000
[ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
[ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000
[ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548
[ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000
[ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0
[ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 4382.564887] Call Trace:
[ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs]
[ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs]
[ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs]
[ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs]
[ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs]
[ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs]
[ 4382.568112] ? _raw_spin_unlock+0x24/0x30
[ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs]
[ 4382.569006] create_subvol+0x3c8/0x830 [btrfs]
[ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60
[ 4382.570822] ? __sb_start_write+0xd4/0x1c0
[ 4382.571262] ? mnt_want_write_file+0x24/0x50
[ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs]
[ 4382.572155] ? _copy_from_user+0x66/0x90
[ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs]
[ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs]
[ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570
[ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0
[ 4382.574379] ? _raw_spin_unlock+0x24/0x30
[ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0
[ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0
[ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[ 4382.576020] do_vfs_ioctl+0xa2/0x6f0
[ 4382.576405] ksys_ioctl+0x70/0x80
[ 4382.576776] __x64_sys_ioctl+0x16/0x20
[ 4382.577137] do_syscall_64+0x60/0x1b0
[ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe
(...)
[ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7
[ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003
[ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044
[ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010
[ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050
[ 4382.580792] irq event stamp: 0
[ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null)
[ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null)
[ 4382.582413] ---[ end trace d3c188e3e9367382 ]---
[ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure
[ 4382.624295] BTRFS info (device sdc): forced readonly
Fix this by locking the source range before searching for the file extent
items in the fs tree, since the relocation process will try to lock the
range a file extent item represents before updating it with the new extent
location.
Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 14:43:07 +03:00
btrfs_double_extent_lock ( src , off , inode , destoff , len ) ;
2015-07-01 00:42:08 +03:00
ret = btrfs_clone ( src , inode , off , olen , len , destoff , 0 ) ;
Btrfs: fix race between reflink/dedupe and relocation
The recent rework that makes btrfs' remap_file_range operation use the
generic helper generic_remap_file_range_prep() introduced a race between
relocation and reflinking (for both cloning and deduplication) the file
extents between the source and destination inodes.
This happens because we no longer lock the source range, and we do
not lock it because we wait for direct IO writes and writeback to
complete early on the code path right after locking the inodes, which
guarantees no other file operations interfere with the reflinking. However
there is one exception which is relocation, since it replaces the byte
number of file extent items in the fs tree after locking the range the
file extent items represent. This is a problem because after finding each
file extent to clone in the fs tree, the reflink process copies the file
extent item into a local buffer, releases the search path, inserts new
file extent items in the destination range and then increments the
reference count for the extent mentioned in the file extent item that it
previously copied to the buffer. If right after copying the file extent
item into the buffer and releasing the path the relocation process
updates the file extent item to point to the new extent, the reflink
process ends up creating a delayed reference to increment the reference
count of the old extent, for which the relocation process already created
a delayed reference to drop it. This results in failure to run delayed
references because we will attempt to increment the count of a reference
that was already dropped. This is illustrated by the following diagram:
CPU 1 CPU 2
relocation is running
btrfs_clone_files()
btrfs_clone()
--> finds extent item
in source range
point to extent
at bytenr X
--> copies it into a
local buffer
--> releases path
replace_file_extents()
--> successfully locks the
range represented by
the file extent item
--> replaces disk_bytenr
field in the file
extent item with some
other value Y
--> creates delayed reference
to increment reference
count for extent at
bytenr Y
--> creates delayed reference
to drop the extent at
bytenr X
--> starts transaction
--> creates delayed
reference to
increment extent
at bytenr X
<delayed references are run, due to a transaction
commit for example, and the transaction is aborted
with -EIO because we attempt to increment reference
count for the extent at bytenr X after we freed it>
When this race is hit the running transaction ends up getting aborted with
an -EIO error and a trace like the following is produced:
[ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1
[ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs]
(...)
[ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202
[ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001
[ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000
[ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
[ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000
[ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548
[ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000
[ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0
[ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 4382.564887] Call Trace:
[ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs]
[ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs]
[ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs]
[ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs]
[ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs]
[ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs]
[ 4382.568112] ? _raw_spin_unlock+0x24/0x30
[ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs]
[ 4382.569006] create_subvol+0x3c8/0x830 [btrfs]
[ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs]
[ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60
[ 4382.570822] ? __sb_start_write+0xd4/0x1c0
[ 4382.571262] ? mnt_want_write_file+0x24/0x50
[ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs]
[ 4382.572155] ? _copy_from_user+0x66/0x90
[ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs]
[ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs]
[ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570
[ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0
[ 4382.574379] ? _raw_spin_unlock+0x24/0x30
[ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0
[ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0
[ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[ 4382.576020] do_vfs_ioctl+0xa2/0x6f0
[ 4382.576405] ksys_ioctl+0x70/0x80
[ 4382.576776] __x64_sys_ioctl+0x16/0x20
[ 4382.577137] do_syscall_64+0x60/0x1b0
[ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe
(...)
[ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7
[ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003
[ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044
[ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010
[ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050
[ 4382.580792] irq event stamp: 0
[ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null)
[ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320
[ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null)
[ 4382.582413] ---[ end trace d3c188e3e9367382 ]---
[ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure
[ 4382.624295] BTRFS info (device sdc): forced readonly
Fix this by locking the source range before searching for the file extent
items in the fs tree, since the relocation process will try to lock the
range a file extent item represents before updating it with the new extent
location.
Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 14:43:07 +03:00
btrfs_double_extent_unlock ( src , off , inode , destoff , len ) ;
Btrfs: ensure readers see new data after a clone operation
We were cleaning the clone target file range from the page cache before
we did replace the file extent items in the fs tree. This was racy,
as right after cleaning the relevant range from the page cache and before
replacing the file extent items, a read against that range could be
performed by another task and populate again the page cache with stale
data (stale after the cloning finishes). This would result in reads done after
the clone operation successfully finishes getting old data (and potentially
for a very long time). Therefore evict the pages after replacing the file
extent items, so that subsequent reads will always get the new data.
Similarly, we were prone to races while cloning the file extent items
because we weren't locking the target range and waiting for any existing
ordered extents against that range to complete. It was possible that
after cloning the extent items, a write operation that was performed
before the clone operation and overlaps the same range, would end up
undoing all or part of the work the clone operation did (a worker task
running inode.c:btrfs_finish_ordered_io). Therefore lock the target
range in the io tree, wait for all pending ordered extents against that
range to finish and then safely perform the cloning.
The issue of reading stale data after the clone operation is easy to
reproduce by running the following C program in a loop until it exits
with return value 1.
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <pthread.h>
#include <fcntl.h>
#include <assert.h>
#include <asm/types.h>
#include <linux/ioctl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#define SRC_FILE "/mnt/sdd/foo"
#define DST_FILE "/mnt/sdd/bar"
#define FILE_SIZE (16 * 1024)
#define PATTERN_SRC 'X'
#define PATTERN_DST 'Y'
struct btrfs_ioctl_clone_range_args {
__s64 src_fd;
__u64 src_offset, src_length;
__u64 dest_offset;
};
#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
struct btrfs_ioctl_clone_range_args)
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static int clone_done = 0;
static int reader_ready = 0;
static int stale_data = 0;
static void *reader_loop(void *arg)
{
char buf[4096], want_buf[4096];
memset(want_buf, PATTERN_SRC, 4096);
pthread_mutex_lock(&mutex);
reader_ready = 1;
pthread_mutex_unlock(&mutex);
while (1) {
int done, fd, ret;
fd = open(DST_FILE, O_RDONLY);
assert(fd != -1);
pthread_mutex_lock(&mutex);
done = clone_done;
pthread_mutex_unlock(&mutex);
ret = read(fd, buf, 4096);
assert(ret == 4096);
close(fd);
if (done) {
ret = memcmp(buf, want_buf, 4096);
if (ret == 0) {
printf("Found new content\n");
} else {
printf("Found old content\n");
pthread_mutex_lock(&mutex);
stale_data = 1;
pthread_mutex_unlock(&mutex);
}
break;
}
}
return NULL;
}
int main(int argc, char *argv[])
{
pthread_t reader;
int ret, i, fd;
struct btrfs_ioctl_clone_range_args clone_args;
int fd1, fd2;
ret = remove(SRC_FILE);
if (ret == -1 && errno != ENOENT) {
fprintf(stderr, "Error deleting src file: %s\n", strerror(errno));
return 1;
}
ret = remove(DST_FILE);
if (ret == -1 && errno != ENOENT) {
fprintf(stderr, "Error deleting dst file: %s\n", strerror(errno));
return 1;
}
fd = open(SRC_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU);
assert(fd != -1);
for (i = 0; i < FILE_SIZE; i++) {
char c = PATTERN_SRC;
ret = write(fd, &c, 1);
assert(ret == 1);
}
close(fd);
fd = open(DST_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU);
assert(fd != -1);
for (i = 0; i < FILE_SIZE; i++) {
char c = PATTERN_DST;
ret = write(fd, &c, 1);
assert(ret == 1);
}
close(fd);
sync();
ret = pthread_create(&reader, NULL, reader_loop, NULL);
assert(ret == 0);
while (1) {
int r;
pthread_mutex_lock(&mutex);
r = reader_ready;
pthread_mutex_unlock(&mutex);
if (r) break;
}
fd1 = open(SRC_FILE, O_RDONLY);
if (fd1 < 0) {
fprintf(stderr, "Error open src file: %s\n", strerror(errno));
return 1;
}
fd2 = open(DST_FILE, O_RDWR);
if (fd2 < 0) {
fprintf(stderr, "Error open dst file: %s\n", strerror(errno));
return 1;
}
clone_args.src_fd = fd1;
clone_args.src_offset = 0;
clone_args.src_length = 4096;
clone_args.dest_offset = 0;
ret = ioctl(fd2, BTRFS_IOC_CLONE_RANGE, &clone_args);
assert(ret == 0);
close(fd1);
close(fd2);
pthread_mutex_lock(&mutex);
clone_done = 1;
pthread_mutex_unlock(&mutex);
ret = pthread_join(reader, NULL);
assert(ret == 0);
pthread_mutex_lock(&mutex);
ret = stale_data ? 1 : 0;
pthread_mutex_unlock(&mutex);
return ret;
}
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-05-23 08:03:34 +04:00
/*
* Truncate page cache pages so that future reads will see the cloned
* data immediately and not the previous data .
*/
2016-01-21 13:26:04 +03:00
truncate_inode_pages_range ( & inode - > i_data ,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion on whether a
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using the
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
round_down ( destoff , PAGE_SIZE ) ,
round_up ( destoff + len , PAGE_SIZE ) - 1 ) ;
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and wait for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then if we found any delalloc for the range, or
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
return ret ;
}
/*
 * Btrfs-specific preparation for a remap (clone or dedupe) request, done
 * before delegating to generic_remap_file_range_prep().
 *
 * For non-dedupe requests: fails with -EROFS if the destination root is
 * read-only and with -EXDEV if the two files are on different mounts or
 * superblocks. For every request: fails with -EINVAL if only one of the two
 * inodes has BTRFS_INODE_NODATASUM set, then waits for direct IO on both
 * inodes, starts a flush of the source mapping and waits for ordered extents
 * in the block-aligned source and destination ranges.
 *
 * @len is passed by pointer to the generic helper. Returns a negative errno
 * from any of the steps above, otherwise whatever
 * generic_remap_file_range_prep() returns.
 *
 * NOTE(review): the comments below say the inodes are already locked at this
 * point, so the caller is presumably expected to hold the inode locks -
 * confirm against the callers.
 */
static int btrfs_remap_file_range_prep ( struct file * file_in , loff_t pos_in ,
struct file * file_out , loff_t pos_out ,
loff_t * len , unsigned int remap_flags )
{
struct inode * inode_in = file_inode ( file_in ) ;
struct inode * inode_out = file_inode ( file_out ) ;
u64 bs = BTRFS_I ( inode_out ) - > root - > fs_info - > sb - > s_blocksize ;
bool same_inode = inode_out = = inode_in ;
u64 wb_len ;
int ret ;
/* These two checks are skipped when REMAP_FILE_DEDUP is set. */
if ( ! ( remap_flags & REMAP_FILE_DEDUP ) ) {
struct btrfs_root * root_out = BTRFS_I ( inode_out ) - > root ;
if ( btrfs_root_readonly ( root_out ) )
return - EROFS ;
if ( file_in - > f_path . mnt ! = file_out - > f_path . mnt | |
inode_in - > i_sb ! = inode_out - > i_sb )
return - EXDEV ;
}
2018-12-12 21:05:56 +03:00
/* don't make the dst file partly checksummed */
if ( ( BTRFS_I ( inode_in ) - > flags & BTRFS_INODE_NODATASUM ) ! =
( BTRFS_I ( inode_out ) - > flags & BTRFS_INODE_NODATASUM ) ) {
2019-02-25 22:07:44 +03:00
return - EINVAL ;
2018-12-12 21:05:56 +03:00
}
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, specially for the deduplication case where we used to lock all
the pages first and then if we found any dealloc for the range, or
ordered extent, we would unlock the pages trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
/*
 * Now that the inodes are locked, we need to start writeback ourselves
 * and can not rely on the writeback from the VFS's generic helper
 * generic_remap_file_range_prep() because:
 *
 * 1) For compression we must call filemap_fdatawrite_range() twice
 *    (btrfs_fdatawrite_range() does it for us), and the generic
 *    helper only calls it once;
 *
 * 2) filemap_write_and_wait_range(), called by the generic helper, only
 *    waits for the writeback to complete, i.e. for IO to be done, and
 *    not for the ordered extents to complete. We need to wait for them
 *    to complete so that new file extent items are in the fs tree.
 */
/*
 * Length of the range to wait on, in whole blocks. For a non-dedupe request
 * with *len == 0 it runs from the block containing pos_in up to i_size of the
 * source inode rounded up to a block; otherwise it is *len rounded up.
 */
if ( * len = = 0 & & ! ( remap_flags & REMAP_FILE_DEDUP ) )
wb_len = ALIGN ( inode_in - > i_size , bs ) - ALIGN_DOWN ( pos_in , bs ) ;
else
wb_len = ALIGN ( * len , bs ) ;
/*
* Since we don ' t lock ranges , wait for ongoing lockless dio writes ( as
* any in progress could create its ordered extents after we wait for
* existing ordered extents below ) .
*/
inode_dio_wait ( inode_in ) ;
2015-07-01 00:42:06 +03:00
if ( ! same_inode )
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, specially for the deduplication case where we used to lock all
the pages first and then if we found any dealloc for the range, or
ordered extent, we would unlock the pages trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
inode_dio_wait ( inode_out ) ;
2019-05-08 13:49:58 +03:00
/*
* Workaround to make sure NOCOW buffered write reach disk as NOCOW .
*
* Btrfs ' back references do not have a block level granularity , they
* work at the whole extent level .
* NOCOW buffered write without data space reserved may not be able
* to fall back to CoW due to lack of data space , thus could cause
* data loss .
*
* Here we take a shortcut by flushing the whole inode , so that all
* nocow write should reach disk as nocow before we increase the
* reference of the extent . We could do better by only flushing NOCOW
* data , but that needs extra accounting .
*
* Also we don ' t need to check ASYNC_EXTENT , as async extent will be
* CoWed anyway , not affecting nocow part .
*/
ret = filemap_flush ( inode_in - > i_mapping ) ;
if ( ret < 0 )
return ret ;
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, specially for the deduplication case where we used to lock all
the pages first and then if we found any dealloc for the range, or
ordered extent, we would unlock the pages trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
/* Wait on the source range, starting at the block that contains pos_in. */
ret = btrfs_wait_ordered_range ( inode_in , ALIGN_DOWN ( pos_in , bs ) ,
wb_len ) ;
if ( ret < 0 )
2019-02-25 22:07:44 +03:00
return ret ;
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, specially for the deduplication case where we used to lock all
the pages first and then if we found any dealloc for the range, or
ordered extent, we would unlock the pages trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
/* Same wait for the destination range, using the same length. */
ret = btrfs_wait_ordered_range ( inode_out , ALIGN_DOWN ( pos_out , bs ) ,
wb_len ) ;
if ( ret < 0 )
2019-02-25 22:07:44 +03:00
return ret ;
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, specially for the deduplication case where we used to lock all
the pages first and then if we found any dealloc for the range, or
ordered extent, we would unlock the pages trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
2019-02-25 22:07:44 +03:00
/* Hand the request, with the caller's len pointer, to the generic helper. */
return generic_remap_file_range_prep ( file_in , pos_in , file_out , pos_out ,
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, specially for the deduplication case where we used to lock all
the pages first and then if we found any dealloc for the range, or
ordered extent, we would unlock the pages trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
len , remap_flags ) ;
2015-11-11 00:53:32 +03:00
}
2018-10-30 02:41:49 +03:00
loff_t btrfs_remap_file_range ( struct file * src_file , loff_t off ,
struct file * dst_file , loff_t destoff , loff_t len ,
2018-10-30 02:41:21 +03:00
unsigned int remap_flags )
2015-11-11 00:53:32 +03:00
{
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and wait for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then if we found any delalloc for the range, or
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
struct inode * src_inode = file_inode ( src_file ) ;
struct inode * dst_inode = file_inode ( dst_file ) ;
bool same_inode = dst_inode = = src_inode ;
2018-10-30 02:41:49 +03:00
int ret ;
2018-10-30 02:41:21 +03:00
if ( remap_flags & ~ ( REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY ) )
return - EINVAL ;
2019-02-25 22:07:44 +03:00
if ( same_inode )
inode_lock ( src_inode ) ;
else
lock_two_nondirectories ( src_inode , dst_inode ) ;
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and wait for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then if we found any delalloc for the range, or
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
ret = btrfs_remap_file_range_prep ( src_file , off , dst_file , destoff ,
& len , remap_flags ) ;
if ( ret < 0 | | len = = 0 )
2019-02-25 22:07:44 +03:00
goto out_unlock ;
2018-10-30 02:41:21 +03:00
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and wait for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then if we found any delalloc for the range, or
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
if ( remap_flags & REMAP_FILE_DEDUP )
ret = btrfs_extent_same ( src_inode , off , len , dst_inode , destoff ) ;
else
2018-10-30 02:41:49 +03:00
ret = btrfs_clone_files ( dst_file , src_file , off , len , destoff ) ;
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and wait for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then, if we found any delalloc for the range, or an
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
2019-02-25 22:07:44 +03:00
out_unlock :
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then, if we found any delalloc for the range, or an
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
if ( same_inode )
inode_unlock ( src_inode ) ;
else
2019-02-26 15:06:09 +03:00
unlock_two_nondirectories ( src_inode , dst_inode ) ;
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
Since cloning and deduplication are no longer Btrfs specific operations, we
now have generic code to handle parameter validation, compare file ranges
used for deduplication, clear capabilities when cloning, etc. This change
makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a
few bugs, such as:
1) When cloning, the destination file's capabilities were not dropped
(the fstest generic/513 tests this);
2) We were not checking if the destination file is immutable;
3) Not checking if either the source or destination files are swap
files (swap file support is coming soon for Btrfs);
4) System limits were not checked (resource limits and O_LARGEFILE).
Note that the generic helper generic_remap_file_range_prep() does start
and waits for writeback by calling filemap_write_and_wait_range(), however
that is not enough for Btrfs for two reasons:
1) With compression, we need to start writeback twice in order to get the
pages marked for writeback and ordered extents created;
2) filemap_write_and_wait_range() (and all its other variants) only waits
for the IO to complete, but we need to wait for the ordered extents to
finish, so that when we do the actual reflinking operations the file
extent items are in the fs tree. This is also important due to the fact
that the generic helper, for the deduplication case, compares the
contents of the pages in the requested range, which might require
reading extents from disk in the very unlikely case that pages get
invalidated after writeback finishes (so the file extent items must be
up to date in the fs tree).
Since these reasons are specific to Btrfs we have to do it in the Btrfs
code before calling generic_remap_file_range_prep(). This also results
in a simpler way of dealing with existing delalloc in the source/target
ranges, especially for the deduplication case where we used to lock all
the pages first and then, if we found any delalloc for the range, or an
ordered extent, we would unlock the pages, trigger writeback and wait for
ordered extents to complete, then lock all the pages again and check if
deduplication can be done. So now we get a simpler approach: lock the
inodes, then trigger writeback and then wait for ordered extents to
complete.
So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it)
to eliminate duplicated code, fix a few bugs and benefit from future bug
fixes done there - for example the recent clone and dedupe bugs involving
reflinking a partial EOF block got a counterpart fix in the generic
helper, since it affected all filesystems supporting these operations,
so we no longer need special checks in Btrfs for them.
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 18:25:38 +03:00
2018-10-30 02:41:49 +03:00
return ret < 0 ? ret : len ;
2008-11-12 22:32:25 +03:00
}
2009-12-12 00:11:29 +03:00
/*
 * Make the subvolume named by userspace the target of the "default" dir
 * item stored in the tree root.
 *
 * @file: open file on the filesystem; supplies the inode/root and is used
 *        to take and drop mount write access
 * @argp: user pointer to a u64 root objectid; a value of 0 is replaced by
 *        BTRFS_FS_TREE_OBJECTID
 *
 * The caller must have CAP_SYS_ADMIN.
 *
 * Return: 0 on success, -EPERM without the capability, -EFAULT if the
 * objectid cannot be read, -ENOENT if the root is not an fs tree or the
 * "default" dir item cannot be looked up, -ENOMEM if no path can be
 * allocated, or the error reported by the mount-write, root-read or
 * transaction-start helpers.
 */
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_root *target_root;
	struct btrfs_dir_item *default_item;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key root_key;
	struct btrfs_disk_key disk_key;
	u64 subvol_id = 0;
	u64 dir_id;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Everything after this point must leave through "out". */
	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (copy_from_user(&subvol_id, argp, sizeof(subvol_id))) {
		ret = -EFAULT;
		goto out;
	}

	/* An objectid of zero selects the top-level fs tree. */
	if (subvol_id == 0)
		subvol_id = BTRFS_FS_TREE_OBJECTID;

	root_key.objectid = subvol_id;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;

	target_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(target_root)) {
		ret = PTR_ERR(target_root);
		goto out;
	}
	/* Only fs trees are accepted as the default. */
	if (!is_fstree(target_root->root_key.objectid)) {
		ret = -ENOENT;
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	path->leave_spinning = 1;

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		ret = PTR_ERR(trans);
		goto out;
	}

	/* Find the "default" dir item under the super block's root dir. */
	dir_id = btrfs_super_root_dir(fs_info->super_copy);
	default_item = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
					     dir_id, "default", 7, 1);
	if (IS_ERR_OR_NULL(default_item)) {
		btrfs_free_path(path);
		btrfs_end_transaction(trans);
		btrfs_err(fs_info,
			  "Umm, you don't have the default diritem, this isn't going to work");
		ret = -ENOENT;
		goto out;
	}

	/* Rewrite the item's key so that it refers to the chosen root. */
	btrfs_cpu_key_to_disk(&disk_key, &target_root->root_key);
	btrfs_set_dir_item_key(path->nodes[0], default_item, &disk_key);
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
	btrfs_end_transaction(trans);
out:
	mnt_drop_write_file(file);
	return ret;
}
2018-04-02 12:24:11 +03:00
static void get_block_group_info ( struct list_head * groups_list ,
struct btrfs_ioctl_space_info * space )
2010-09-29 19:22:36 +04:00
{
struct btrfs_block_group_cache * block_group ;
space - > total_bytes = 0 ;
space - > used_bytes = 0 ;
space - > flags = 0 ;
list_for_each_entry ( block_group , groups_list , list ) {
space - > flags = block_group - > flags ;
space - > total_bytes + = block_group - > key . offset ;
space - > used_bytes + =
btrfs_block_group_used ( & block_group - > item ) ;
}
}
2016-06-23 01:54:24 +03:00
/*
 * Report space usage to userspace, one record per non-empty block group
 * list of each known space_info type, plus one record for the global block
 * reserve.
 *
 * @fs_info: filesystem to report on
 * @arg:     user pointer to a struct btrfs_ioctl_space_args immediately
 *           followed by room for space_args.space_slots records of
 *           struct btrfs_ioctl_space_info
 *
 * If space_slots is 0 only the number of available records is returned in
 * total_spaces.  Otherwise up to space_slots records are written after the
 * header and total_spaces is set to the number actually written.
 *
 * Return: 0 on success, -EFAULT on a failed user copy, -ENOMEM if the
 * record buffer would exceed a page or cannot be allocated.
 */
static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
				   void __user *arg)
{
	struct btrfs_ioctl_space_args space_args;
	struct btrfs_ioctl_space_info space;
	struct btrfs_ioctl_space_info *dest;
	struct btrfs_ioctl_space_info *dest_orig;
	struct btrfs_ioctl_space_info __user *user_dest;
	struct btrfs_space_info *info;
	static const u64 types[] = {
		BTRFS_BLOCK_GROUP_DATA,
		BTRFS_BLOCK_GROUP_SYSTEM,
		BTRFS_BLOCK_GROUP_METADATA,
		BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA
	};
	int num_types = 4;
	int alloc_size;
	int ret = 0;
	u64 slot_count = 0;
	int i, c;

	if (copy_from_user(&space_args,
			   (struct btrfs_ioctl_space_args __user *)arg,
			   sizeof(space_args)))
		return -EFAULT;

	/* First pass: count how many records we could report. */
	for (i = 0; i < num_types; i++) {
		struct btrfs_space_info *tmp;

		info = NULL;
		rcu_read_lock();
		list_for_each_entry_rcu(tmp, &fs_info->space_info,
					list) {
			if (tmp->flags == types[i]) {
				info = tmp;
				break;
			}
		}
		rcu_read_unlock();

		if (!info)
			continue;

		down_read(&info->groups_sem);
		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
			if (!list_empty(&info->block_groups[c]))
				slot_count++;
		}
		up_read(&info->groups_sem);
	}

	/*
	 * Global block reserve, exported as a space_info
	 */
	slot_count++;

	/* space_slots == 0 means they are asking for a count */
	if (space_args.space_slots == 0) {
		space_args.total_spaces = slot_count;
		goto out;
	}

	slot_count = min_t(u64, space_args.space_slots, slot_count);

	alloc_size = sizeof(*dest) * slot_count;

	/* we generally have at most 6 or so space infos, one for each raid
	 * level.  So, a whole page should be more than enough for everyone
	 */
	if (alloc_size > PAGE_SIZE)
		return -ENOMEM;

	space_args.total_spaces = 0;
	dest = kmalloc(alloc_size, GFP_KERNEL);
	if (!dest)
		return -ENOMEM;
	dest_orig = dest;

	/* now we have a buffer to copy into */
	for (i = 0; i < num_types; i++) {
		struct btrfs_space_info *tmp;

		if (!slot_count)
			break;

		info = NULL;
		rcu_read_lock();
		list_for_each_entry_rcu(tmp, &fs_info->space_info,
					list) {
			if (tmp->flags == types[i]) {
				info = tmp;
				break;
			}
		}
		rcu_read_unlock();

		if (!info)
			continue;

		down_read(&info->groups_sem);
		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
			if (!list_empty(&info->block_groups[c])) {
				get_block_group_info(&info->block_groups[c],
						     &space);
				memcpy(dest, &space, sizeof(space));
				dest++;
				space_args.total_spaces++;
				slot_count--;
			}
			if (!slot_count)
				break;
		}
		up_read(&info->groups_sem);
	}

	/*
	 * Add global block reserve
	 */
	if (slot_count) {
		struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;

		spin_lock(&block_rsv->lock);
		space.total_bytes = block_rsv->size;
		space.used_bytes = block_rsv->size - block_rsv->reserved;
		spin_unlock(&block_rsv->lock);
		space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
		memcpy(dest, &space, sizeof(space));
		space_args.total_spaces++;
	}

	user_dest = (struct btrfs_ioctl_space_info __user *)
		(arg + sizeof(struct btrfs_ioctl_space_args));

	/*
	 * Copy out only the records that were actually filled in.  The
	 * buffer comes from kmalloc() and is sized from the first pass; if a
	 * block group list became empty before the second pass, fewer
	 * records were written than allocated, and copying alloc_size bytes
	 * would hand uninitialized kernel memory to userspace.
	 * total_spaces never exceeds the number of allocated records.
	 */
	if (copy_to_user(user_dest, dest_orig,
			 sizeof(*dest_orig) * space_args.total_spaces))
		ret = -EFAULT;

	kfree(dest_orig);
out:
	if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
		ret = -EFAULT;

	return ret;
}
2012-11-26 12:40:43 +04:00
/*
 * Kick off an asynchronous commit and optionally report the id of the
 * transaction being committed.
 *
 * @root: root used to attach to the transaction
 * @argp: optional user pointer to a u64 that receives the transaction id;
 *        may be NULL
 *
 * If attaching fails with -ENOENT (no running transaction), nothing is
 * committed and fs_info->last_trans_committed is reported instead.
 *
 * Return: 0 on success, -EFAULT if the id cannot be copied out, or the
 * error from attaching to / asynchronously committing the transaction.
 */
static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
					    void __user *argp)
{
	struct btrfs_trans_handle *trans;
	u64 transid;
	int ret;

	trans = btrfs_attach_transaction_barrier(root);
	if (!IS_ERR(trans)) {
		transid = trans->transid;
		ret = btrfs_commit_transaction_async(trans, 0);
		if (ret) {
			/* The async commit did not start; end the handle here. */
			btrfs_end_transaction(trans);
			return ret;
		}
	} else if (PTR_ERR(trans) == -ENOENT) {
		/* No running transaction, don't bother */
		transid = root->fs_info->last_trans_committed;
	} else {
		return PTR_ERR(trans);
	}

	if (argp && copy_to_user(argp, &transid, sizeof(transid)))
		return -EFAULT;

	return 0;
}
2016-06-23 01:54:24 +03:00
static noinline long btrfs_ioctl_wait_sync ( struct btrfs_fs_info * fs_info ,
2012-11-26 12:40:43 +04:00
void __user * argp )
Btrfs: add START_SYNC, WAIT_SYNC ioctls
START_SYNC will start a sync/commit, but not wait for it to
complete. Any modification started after the ioctl returns is
guaranteed not to be included in the commit. If a non-NULL
pointer is passed, the transaction id will be returned to
userspace.
WAIT_SYNC will wait for any in-progress commit to complete. If a
transaction id is specified, the ioctl will block and then
return (success) when the specified transaction has committed.
If it has already committed when we call the ioctl, it returns
immediately. If the specified transaction doesn't exist, it
returns EINVAL.
If no transaction id is specified, WAIT_SYNC will wait for the
currently committing transaction to finish it's commit to disk.
If there is no currently committing transaction, it returns
success.
These ioctls are useful for applications which want to impose an
ordering on when fs modifications reach disk, but do not want to
wait for the full (slow) commit process to do so.
Picky callers can take the transid returned by START_SYNC and
feed it to WAIT_SYNC, and be certain to wait only as long as
necessary for the transaction _they_ started to reach disk.
Sloppy callers can START_SYNC and WAIT_SYNC without a transid,
and provided they didn't wait too long between the calls, they
will get the same result. However, if a second commit starts
before they call WAIT_SYNC, they may end up waiting longer for
it to commit as well. Even so, a START_SYNC+WAIT_SYNC still
guarantees that any operation completed before the START_SYNC
reaches disk.
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-29 23:41:32 +04:00
{
u64 transid ;
if ( argp ) {
if ( copy_from_user ( & transid , argp , sizeof ( transid ) ) )
return - EFAULT ;
} else {
transid = 0 ; /* current trans */
}
2016-06-23 01:54:24 +03:00
return btrfs_wait_for_commit ( fs_info , transid ) ;
Btrfs: add START_SYNC, WAIT_SYNC ioctls
START_SYNC will start a sync/commit, but not wait for it to
complete. Any modification started after the ioctl returns is
guaranteed not to be included in the commit. If a non-NULL
pointer is passed, the transaction id will be returned to
userspace.
WAIT_SYNC will wait for any in-progress commit to complete. If a
transaction id is specified, the ioctl will block and then
return (success) when the specified transaction has committed.
If it has already committed when we call the ioctl, it returns
immediately. If the specified transaction doesn't exist, it
returns EINVAL.
If no transaction id is specified, WAIT_SYNC will wait for the
currently committing transaction to finish it's commit to disk.
If there is no currently committing transaction, it returns
success.
These ioctls are useful for applications which want to impose an
ordering on when fs modifications reach disk, but do not want to
wait for the full (slow) commit process to do so.
Picky callers can take the transid returned by START_SYNC and
feed it to WAIT_SYNC, and be certain to wait only as long as
necessary for the transaction _they_ started to reach disk.
Sloppy callers can START_SYNC and WAIT_SYNC without a transid,
and provided they didn't wait too long between the calls, they
will get the same result. However, if a second commit starts
before they call WAIT_SYNC, they may end up waiting longer for
it to commit as well. Even so, a START_SYNC+WAIT_SYNC still
guarantees that any operation completed before the START_SYNC
reaches disk.
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-29 23:41:32 +04:00
}
2012-11-26 12:48:01 +04:00
/*
 * Run a scrub on one device and hand the resulting arguments, including
 * the progress counters, back to userspace.
 *
 * @file: open file on the filesystem; used to take mount write access
 *        unless BTRFS_SCRUB_READONLY is set in the arguments
 * @arg:  user pointer to a struct btrfs_ioctl_scrub_args, read on entry
 *        and written back on exit
 *
 * The caller must have CAP_SYS_ADMIN.
 *
 * Return: the result of btrfs_scrub_dev(), -EPERM without the capability,
 * the error from reading the arguments or taking write access, or -EFAULT
 * if the arguments cannot be copied back.
 */
static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
	struct btrfs_ioctl_scrub_args *sa;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	sa = memdup_user(arg, sizeof(*sa));
	if (IS_ERR(sa))
		return PTR_ERR(sa);

	if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
		ret = mnt_want_write_file(file);
		if (ret)
			goto out;
	}

	ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
			      &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
			      0);

	/*
	 * Copy the scrub args back even when btrfs_scrub_dev() failed.  The
	 * progress counters are what lets userspace see how far an
	 * interrupted or cancelled scrub got and resume from there, so they
	 * must not be dropped on error.  If the copy itself fails, report
	 * -EFAULT because the user's structure may be only partly written.
	 */
	if (copy_to_user(arg, sa, sizeof(*sa)))
		ret = -EFAULT;

	if (!(sa->flags & BTRFS_SCRUB_READONLY))
		mnt_drop_write_file(file);
out:
	kfree(sa);
	return ret;
}
2016-06-23 01:54:24 +03:00
static long btrfs_ioctl_scrub_cancel ( struct btrfs_fs_info * fs_info )
2011-03-11 17:41:01 +03:00
{
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2016-06-23 01:54:24 +03:00
return btrfs_scrub_cancel ( fs_info ) ;
2011-03-11 17:41:01 +03:00
}
2016-06-23 01:54:24 +03:00
static long btrfs_ioctl_scrub_progress ( struct btrfs_fs_info * fs_info ,
2011-03-11 17:41:01 +03:00
void __user * arg )
{
struct btrfs_ioctl_scrub_args * sa ;
int ret ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
sa = memdup_user ( arg , sizeof ( * sa ) ) ;
if ( IS_ERR ( sa ) )
return PTR_ERR ( sa ) ;
2016-06-23 01:54:24 +03:00
ret = btrfs_scrub_progress ( fs_info , sa - > devid , & sa - > progress ) ;
2011-03-11 17:41:01 +03:00
2018-12-14 22:45:13 +03:00
if ( ret = = 0 & & copy_to_user ( arg , sa , sizeof ( * sa ) ) )
2011-03-11 17:41:01 +03:00
ret = - EFAULT ;
kfree ( sa ) ;
return ret ;
}
2016-06-23 01:54:24 +03:00
static long btrfs_ioctl_get_dev_stats ( struct btrfs_fs_info * fs_info ,
2012-06-22 16:30:39 +04:00
void __user * arg )
2012-05-25 18:06:09 +04:00
{
struct btrfs_ioctl_get_dev_stats * sa ;
int ret ;
sa = memdup_user ( arg , sizeof ( * sa ) ) ;
if ( IS_ERR ( sa ) )
return PTR_ERR ( sa ) ;
2012-06-22 16:30:39 +04:00
if ( ( sa - > flags & BTRFS_DEV_STATS_RESET ) & & ! capable ( CAP_SYS_ADMIN ) ) {
kfree ( sa ) ;
return - EPERM ;
}
2016-06-23 01:54:24 +03:00
ret = btrfs_get_dev_stats ( fs_info , sa ) ;
2012-05-25 18:06:09 +04:00
2018-12-14 22:45:22 +03:00
if ( ret = = 0 & & copy_to_user ( arg , sa , sizeof ( * sa ) ) )
2012-05-25 18:06:09 +04:00
ret = - EFAULT ;
kfree ( sa ) ;
return ret ;
}
2016-06-23 01:54:24 +03:00
/*
 * Device replace ioctl handler; dispatches on the cmd member of the user
 * supplied struct btrfs_ioctl_dev_replace_args.
 *
 * @fs_info: filesystem the request applies to
 * @arg:     user pointer to the arguments; written back when the command
 *           finished with 0 or -ECANCELED
 *
 * Requires CAP_SYS_ADMIN.
 *
 * Return: the command result, -EPERM without the capability, the
 * memdup_user() error, -EROFS for a start request on a read-only
 * superblock, BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS if BTRFS_FS_EXCL_OP is
 * already set, -EINVAL for an unknown command, or -EFAULT if the copy back
 * to user space fails.
 */
static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
				    void __user *arg)
{
	struct btrfs_ioctl_dev_replace_args *args;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args))
		return PTR_ERR(args);

	switch (args->cmd) {
	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
		if (sb_rdonly(fs_info->sb)) {
			/* -EROFS never matches the copy-back condition below. */
			err = -EROFS;
			break;
		}
		if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
			err = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
			break;
		}
		/* We own BTRFS_FS_EXCL_OP for the duration of the call. */
		err = btrfs_dev_replace_by_ioctl(fs_info, args);
		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
		break;
	case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
		btrfs_dev_replace_status(fs_info, args);
		err = 0;
		break;
	case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
		/* The cancel outcome travels in args->result, not in err. */
		args->result = btrfs_dev_replace_cancel(fs_info);
		err = 0;
		break;
	default:
		err = -EINVAL;
		break;
	}

	if (err == 0 || err == -ECANCELED) {
		if (copy_to_user(arg, args, sizeof(*args)))
			err = -EFAULT;
	}

	kfree(args);
	return err;
}
2011-07-07 18:48:38 +04:00
/*
 * Inode-to-path ioctl handler.
 *
 * @root: root passed on to init_ipath()
 * @arg:  user pointer to a struct btrfs_ioctl_ino_path_args; inum selects
 *        the inode, size bounds the result and fspath is the user address
 *        the result container is copied to
 *
 * Requires CAP_DAC_READ_SEARCH. The result size is capped at 4096 bytes.
 *
 * Return: 0 on success, -EPERM without the capability, -ENOMEM if no path
 * could be allocated, the memdup_user(), init_ipath() or negative
 * paths_from_inode() error, or -EFAULT if the copy to user space fails.
 */
static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
{
	struct btrfs_ioctl_ino_path_args *ipa = NULL;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_path *path;
	u64 base;
	int size;
	int ret;
	int i;

	if (!capable(CAP_DAC_READ_SEARCH))
		return -EPERM;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	ipa = memdup_user(arg, sizeof(*ipa));
	if (IS_ERR(ipa)) {
		ret = PTR_ERR(ipa);
		/* Keep the cleanup below from seeing an error pointer. */
		ipa = NULL;
		goto out;
	}

	size = min_t(u32, ipa->size, 4096);
	ipath = init_ipath(size, root, path);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto out;
	}

	ret = paths_from_inode(ipa->inum, ipath);
	if (ret < 0)
		goto out;

	/*
	 * Rewrite every val[] entry as an offset relative to the start of
	 * val[] before handing the container to user space.
	 * NOTE(review): presumably the entries hold kernel addresses of the
	 * path strings up to this point - confirm against paths_from_inode().
	 */
	base = (u64)(unsigned long)ipath->fspath->val;
	for (i = 0; i < ipath->fspath->elem_cnt; i++)
		ipath->fspath->val[i] -= base;

	if (copy_to_user((void __user *)(unsigned long)ipa->fspath,
			 ipath->fspath, size))
		ret = -EFAULT;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	free_ipath(ipath);
	kfree(ipa);

	return ret;
}
/*
 * Append one (inum, offset, root) triple to a btrfs_data_container.
 *
 * @inum:   first value of the triple
 * @offset: second value of the triple
 * @root:   third value of the triple
 * @ctx:    the struct btrfs_data_container to fill
 *
 * When fewer than three u64 slots worth of bytes are left, nothing is
 * stored; the shortfall is added to bytes_missing, bytes_left drops to 0
 * and elem_missed grows by 3 instead.
 *
 * Return: always 0.
 */
static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
{
	struct btrfs_data_container *dc = ctx;
	const size_t need = 3 * sizeof(u64);

	if (dc->bytes_left < need) {
		/* Out of room: only account for what did not fit. */
		dc->bytes_missing += need - dc->bytes_left;
		dc->bytes_left = 0;
		dc->elem_missed += 3;
		return 0;
	}

	dc->bytes_left -= need;
	dc->val[dc->elem_cnt + 0] = inum;
	dc->val[dc->elem_cnt + 1] = offset;
	dc->val[dc->elem_cnt + 2] = root;
	dc->elem_cnt += 3;

	return 0;
}
2016-06-23 01:54:24 +03:00
static long btrfs_ioctl_logical_to_ino ( struct btrfs_fs_info * fs_info ,
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 20:58:46 +03:00
void __user * arg , int version )
2011-07-07 18:48:38 +04:00
{
int ret = 0 ;
int size ;
struct btrfs_ioctl_logical_ino_args * loi ;
struct btrfs_data_container * inodes = NULL ;
struct btrfs_path * path = NULL ;
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 20:58:46 +03:00
bool ignore_offset ;
2011-07-07 18:48:38 +04:00
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
loi = memdup_user ( arg , sizeof ( * loi ) ) ;
2016-11-10 12:47:41 +03:00
if ( IS_ERR ( loi ) )
return PTR_ERR ( loi ) ;
2011-07-07 18:48:38 +04:00
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 20:58:46 +03:00
if ( version = = 1 ) {
ignore_offset = false ;
2017-09-22 20:58:47 +03:00
size = min_t ( u32 , loi - > size , SZ_64K ) ;
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 20:58:46 +03:00
} else {
/* All reserved bits must be 0 for now */
if ( memchr_inv ( loi - > reserved , 0 , sizeof ( loi - > reserved ) ) ) {
ret = - EINVAL ;
goto out_loi ;
}
/* Only accept flags we have defined so far */
if ( loi - > flags & ~ ( BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET ) ) {
ret = - EINVAL ;
goto out_loi ;
}
ignore_offset = loi - > flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET ;
2017-09-22 20:58:47 +03:00
size = min_t ( u32 , loi - > size , SZ_16M ) ;
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 20:58:46 +03:00
}
2011-07-07 18:48:38 +04:00
path = btrfs_alloc_path ( ) ;
if ( ! path ) {
ret = - ENOMEM ;
goto out ;
}
inodes = init_data_container ( size ) ;
if ( IS_ERR ( inodes ) ) {
ret = PTR_ERR ( inodes ) ;
inodes = NULL ;
goto out ;
}
2016-06-23 01:54:24 +03:00
ret = iterate_inodes_from_logical ( loi - > logical , fs_info , path ,
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 20:58:46 +03:00
build_ino_list , inodes , ignore_offset ) ;
2012-09-08 06:01:29 +04:00
if ( ret = = - EINVAL )
2011-07-07 18:48:38 +04:00
ret = - ENOENT ;
if ( ret < 0 )
goto out ;
2017-08-23 09:46:05 +03:00
ret = copy_to_user ( ( void __user * ) ( unsigned long ) loi - > inodes , inodes ,
size ) ;
2011-07-07 18:48:38 +04:00
if ( ret )
ret = - EFAULT ;
out :
btrfs_free_path ( path ) ;
2017-05-31 20:32:09 +03:00
kvfree ( inodes ) ;
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 20:58:46 +03:00
out_loi :
2011-07-07 18:48:38 +04:00
kfree ( loi ) ;
return ret ;
}
2018-03-21 04:05:27 +03:00
/*
 * Snapshot the current balance state of @fs_info into the ioctl argument
 * structure @bargs: the control flags, the running / pause-requested /
 * cancel-requested state bits, the data/meta/sys filter arguments and the
 * progress statistics.
 *
 * fs_info->balance_ctl is dereferenced without a NULL check, so the caller
 * has to make sure a balance control is present. The state bits are only
 * OR-ed into bargs->state and never cleared here, so bargs->state should
 * start out zeroed if stale bits are not wanted.
 */
void btrfs_update_ioctl_balance_args ( struct btrfs_fs_info * fs_info ,
2012-01-17 00:04:47 +04:00
struct btrfs_ioctl_balance_args * bargs )
{
struct btrfs_balance_control * bctl = fs_info - > balance_ctl ;
bargs - > flags = bctl - > flags ;
2018-03-21 03:31:04 +03:00
if ( test_bit ( BTRFS_FS_BALANCE_RUNNING , & fs_info - > flags ) )
2012-01-17 00:04:49 +04:00
bargs - > state | = BTRFS_BALANCE_STATE_RUNNING ;
if ( atomic_read ( & fs_info - > balance_pause_req ) )
bargs - > state | = BTRFS_BALANCE_STATE_PAUSE_REQ ;
2012-01-17 00:04:49 +04:00
if ( atomic_read ( & fs_info - > balance_cancel_req ) )
bargs - > state | = BTRFS_BALANCE_STATE_CANCEL_REQ ;
2012-01-17 00:04:49 +04:00
2012-01-17 00:04:47 +04:00
memcpy ( & bargs - > data , & bctl - > data , sizeof ( bargs - > data ) ) ;
memcpy ( & bargs - > meta , & bctl - > meta , sizeof ( bargs - > meta ) ) ;
memcpy ( & bargs - > sys , & bctl - > sys , sizeof ( bargs - > sys ) ) ;
2012-01-17 00:04:49 +04:00
2018-03-21 04:05:27 +03:00
/*
 * Only the stat copy is done under balance_lock.
 * NOTE(review): presumably because a running balance updates it
 * concurrently - confirm against the writers of bctl->stat.
 */
spin_lock ( & fs_info - > balance_lock ) ;
memcpy ( & bargs - > stat , & bctl - > stat , sizeof ( bargs - > stat ) ) ;
spin_unlock ( & fs_info - > balance_lock ) ;
2012-01-17 00:04:47 +04:00
}
2012-05-11 14:11:26 +04:00
/*
 * Balance ioctl handler: start a new balance or resume a paused one.
 *
 * @file: open file on the filesystem; write access to its mount is held
 *        for the duration of the call.
 * @arg:  user pointer to a struct btrfs_ioctl_balance_args, or NULL to
 *        balance everything without filters.
 *
 * Steps, in order:
 *  - require CAP_SYS_ADMIN (-EPERM otherwise);
 *  - either win the BTRFS_FS_EXCL_OP bit, or find that the bit is held on
 *    behalf of a paused balance and proceed without owning it;
 *  - copy in the arguments; with BTRFS_BALANCE_RESUME set, reuse the
 *    existing fs_info->balance_ctl (-ENOTCONN if there is none);
 *  - otherwise allocate a fresh control structure, fill it from the
 *    arguments and reject unknown flag bits with -EINVAL;
 *  - call btrfs_balance() and, when it returns 0 or -ECANCELED and @arg
 *    was given, copy the argument structure back to userspace.
 *
 * need_unlock records whether this call still owns BTRFS_FS_EXCL_OP and
 * therefore has to clear it on the way out. It is only read at out_unlock,
 * which is reached solely via the "locked" label, where it has been set.
 */
static long btrfs_ioctl_balance ( struct file * file , void __user * arg )
2012-01-17 00:04:47 +04:00
{
2013-01-24 02:07:38 +04:00
struct btrfs_root * root = BTRFS_I ( file_inode ( file ) ) - > root ;
2012-01-17 00:04:47 +04:00
struct btrfs_fs_info * fs_info = root - > fs_info ;
struct btrfs_ioctl_balance_args * bargs ;
struct btrfs_balance_control * bctl ;
2013-01-20 17:57:57 +04:00
bool need_unlock ; /* for mut. excl. ops lock */
2012-01-17 00:04:47 +04:00
int ret ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2012-06-29 13:58:48 +04:00
ret = mnt_want_write_file ( file ) ;
2012-05-11 14:11:26 +04:00
if ( ret )
return ret ;
2013-01-20 17:57:57 +04:00
again :
2017-03-28 15:44:21 +03:00
/*
 * Fast path: the exclusive-op bit was free and is now ours, so it has to
 * be cleared on exit unless btrfs_balance() takes it over.
 */
if ( ! test_and_set_bit ( BTRFS_FS_EXCL_OP , & fs_info - > flags ) ) {
2013-01-20 17:57:57 +04:00
mutex_lock ( & fs_info - > balance_mutex ) ;
need_unlock = true ;
goto locked ;
}
/*
2016-05-20 04:18:45 +03:00
* mut . excl . ops lock is locked . Three possibilities :
2013-01-20 17:57:57 +04:00
* ( 1 ) some other op is running
* ( 2 ) balance is running
* ( 3 ) balance is paused - - special case ( think resume )
*/
2012-01-17 00:04:47 +04:00
mutex_lock ( & fs_info - > balance_mutex ) ;
2013-01-20 17:57:57 +04:00
if ( fs_info - > balance_ctl ) {
/* this is either (2) or (3) */
2018-03-21 03:31:04 +03:00
if ( ! test_bit ( BTRFS_FS_BALANCE_RUNNING , & fs_info - > flags ) ) {
2013-01-20 17:57:57 +04:00
mutex_unlock ( & fs_info - > balance_mutex ) ;
2018-03-21 02:20:05 +03:00
/*
* Lock released to allow other waiters to continue ,
* we ' ll reexamine the status again .
*/
2013-01-20 17:57:57 +04:00
mutex_lock ( & fs_info - > balance_mutex ) ;
if ( fs_info - > balance_ctl & &
2018-03-21 03:31:04 +03:00
! test_bit ( BTRFS_FS_BALANCE_RUNNING , & fs_info - > flags ) ) {
2013-01-20 17:57:57 +04:00
/* this is (3) */
need_unlock = false ;
goto locked ;
}
mutex_unlock ( & fs_info - > balance_mutex ) ;
goto again ;
} else {
/* this is (2) */
mutex_unlock ( & fs_info - > balance_mutex ) ;
ret = - EINPROGRESS ;
goto out ;
}
} else {
/* this is (1) */
mutex_unlock ( & fs_info - > balance_mutex ) ;
2013-08-21 07:44:48 +04:00
/*
 * NOTE(review): assigned without negation, unlike the -E* codes used
 * elsewhere in this function.
 */
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS ;
2013-01-20 17:57:57 +04:00
goto out ;
}
locked :
2017-03-28 15:44:21 +03:00
BUG_ON ( ! test_bit ( BTRFS_FS_EXCL_OP , & fs_info - > flags ) ) ;
2012-01-17 00:04:47 +04:00
if ( arg ) {
bargs = memdup_user ( arg , sizeof ( * bargs ) ) ;
if ( IS_ERR ( bargs ) ) {
ret = PTR_ERR ( bargs ) ;
2013-01-20 17:57:57 +04:00
goto out_unlock ;
2012-01-17 00:04:47 +04:00
}
2012-01-17 00:04:49 +04:00
if ( bargs - > flags & BTRFS_BALANCE_RESUME ) {
if ( ! fs_info - > balance_ctl ) {
ret = - ENOTCONN ;
goto out_bargs ;
}
/* Resume reuses the existing control instead of allocating one. */
bctl = fs_info - > balance_ctl ;
spin_lock ( & fs_info - > balance_lock ) ;
bctl - > flags | = BTRFS_BALANCE_RESUME ;
spin_unlock ( & fs_info - > balance_lock ) ;
goto do_balance ;
}
2012-01-17 00:04:47 +04:00
} else {
bargs = NULL ;
}
2013-01-20 17:57:57 +04:00
/* Not a resume request, so an existing balance control is a conflict. */
if ( fs_info - > balance_ctl ) {
2012-01-17 00:04:49 +04:00
ret = - EINPROGRESS ;
goto out_bargs ;
}
2015-11-04 17:38:29 +03:00
bctl = kzalloc ( sizeof ( * bctl ) , GFP_KERNEL ) ;
2012-01-17 00:04:47 +04:00
if ( ! bctl ) {
ret = - ENOMEM ;
goto out_bargs ;
}
if ( arg ) {
memcpy ( & bctl - > data , & bargs - > data , sizeof ( bctl - > data ) ) ;
memcpy ( & bctl - > meta , & bargs - > meta , sizeof ( bctl - > meta ) ) ;
memcpy ( & bctl - > sys , & bargs - > sys , sizeof ( bctl - > sys ) ) ;
bctl - > flags = bargs - > flags ;
2012-01-17 00:04:47 +04:00
} else {
/* balance everything - no filters */
bctl - > flags | = BTRFS_BALANCE_TYPE_MASK ;
2012-01-17 00:04:47 +04:00
}
2015-10-12 17:55:54 +03:00
/* Reject any flag bit outside the known argument and type masks. */
if ( bctl - > flags & ~ ( BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK ) ) {
ret = - EINVAL ;
2015-10-21 01:50:06 +03:00
goto out_bctl ;
2015-10-12 17:55:54 +03:00
}
2012-01-17 00:04:49 +04:00
do_balance :
2012-01-17 00:04:47 +04:00
/*
2018-03-20 22:23:09 +03:00
* Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
* btrfs_balance . bctl is freed in reset_balance_state , or , if
* restriper was paused all the way until unmount , in free_fs_info .
* The flag should be cleared after reset_balance_state .
2012-01-17 00:04:47 +04:00
*/
2013-01-20 17:57:57 +04:00
need_unlock = false ;
2018-05-07 18:44:03 +03:00
ret = btrfs_balance ( fs_info , bctl , bargs ) ;
2015-10-21 01:50:06 +03:00
/* bctl was handed over above; NULL makes the kfree() at out_bctl a no-op. */
bctl = NULL ;
2013-01-20 17:57:57 +04:00
2019-01-08 14:42:01 +03:00
if ( ( ret = = 0 | | ret = = - ECANCELED ) & & arg ) {
2012-01-17 00:04:47 +04:00
if ( copy_to_user ( arg , bargs , sizeof ( * bargs ) ) )
ret = - EFAULT ;
}
2015-10-21 01:50:06 +03:00
out_bctl :
kfree ( bctl ) ;
2012-01-17 00:04:47 +04:00
out_bargs :
kfree ( bargs ) ;
2013-01-20 17:57:57 +04:00
out_unlock :
2012-01-17 00:04:47 +04:00
mutex_unlock ( & fs_info - > balance_mutex ) ;
2013-01-20 17:57:57 +04:00
if ( need_unlock )
2017-03-28 15:44:21 +03:00
clear_bit ( BTRFS_FS_EXCL_OP , & fs_info - > flags ) ;
2013-01-20 17:57:57 +04:00
out :
2012-06-29 13:58:48 +04:00
mnt_drop_write_file ( file ) ;
2012-01-17 00:04:47 +04:00
return ret ;
}
2016-06-23 01:54:24 +03:00
/*
 * Balance control ioctl handler: dispatch @cmd to the matching balance
 * operation.
 *
 * Returns -EPERM without CAP_SYS_ADMIN, the result of
 * btrfs_pause_balance() for BTRFS_BALANCE_CTL_PAUSE, the result of
 * btrfs_cancel_balance() for BTRFS_BALANCE_CTL_CANCEL, and -EINVAL for any
 * other command value.
 */
static long btrfs_ioctl_balance_ctl ( struct btrfs_fs_info * fs_info , int cmd )
2012-01-17 00:04:49 +04:00
{
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
switch ( cmd ) {
case BTRFS_BALANCE_CTL_PAUSE :
2016-06-23 01:54:23 +03:00
return btrfs_pause_balance ( fs_info ) ;
2012-01-17 00:04:49 +04:00
case BTRFS_BALANCE_CTL_CANCEL :
2016-06-23 01:54:23 +03:00
return btrfs_cancel_balance ( fs_info ) ;
2012-01-17 00:04:49 +04:00
}
return - EINVAL ;
}
2016-06-23 01:54:24 +03:00
/*
 * Balance progress ioctl handler: report the state of the current balance
 * to userspace.
 *
 * @arg: user pointer that receives a struct btrfs_ioctl_balance_args.
 *
 * The whole operation runs under fs_info->balance_mutex. Returns 0 on
 * success, -EPERM without CAP_SYS_ADMIN, -ENOTCONN when there is no
 * balance control, -ENOMEM if the temporary buffer cannot be allocated and
 * -EFAULT if the copy to userspace fails.
 */
static long btrfs_ioctl_balance_progress ( struct btrfs_fs_info * fs_info ,
2012-01-17 00:04:49 +04:00
void __user * arg )
{
struct btrfs_ioctl_balance_args * bargs ;
int ret = 0 ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
mutex_lock ( & fs_info - > balance_mutex ) ;
if ( ! fs_info - > balance_ctl ) {
ret = - ENOTCONN ;
goto out ;
}
2015-11-04 17:38:29 +03:00
/* Zeroed allocation: the update helper only ORs bits into bargs->state. */
bargs = kzalloc ( sizeof ( * bargs ) , GFP_KERNEL ) ;
2012-01-17 00:04:49 +04:00
if ( ! bargs ) {
ret = - ENOMEM ;
goto out ;
}
2018-03-21 04:05:27 +03:00
btrfs_update_ioctl_balance_args ( fs_info , bargs ) ;
2012-01-17 00:04:49 +04:00
if ( copy_to_user ( arg , bargs , sizeof ( * bargs ) ) )
ret = - EFAULT ;
kfree ( bargs ) ;
out :
mutex_unlock ( & fs_info - > balance_mutex ) ;
return ret ;
}
2012-11-26 12:50:11 +04:00
/*
 * Quota control ioctl handler: enable or disable quotas on the filesystem.
 *
 * @arg: user pointer to a struct btrfs_ioctl_quota_ctl_args; only its cmd
 *       field is read here.
 *
 * Requires CAP_SYS_ADMIN and write access to the mount. The enable /
 * disable call is made with fs_info->subvol_sem held for writing. Returns
 * the result of btrfs_quota_enable() or btrfs_quota_disable(), -EINVAL for
 * an unknown command, or the error from the permission, write-access or
 * argument-copy step.
 */
static long btrfs_ioctl_quota_ctl ( struct file * file , void __user * arg )
2011-09-14 17:53:51 +04:00
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2011-09-14 17:53:51 +04:00
struct btrfs_ioctl_quota_ctl_args * sa ;
int ret ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2012-11-26 12:50:11 +04:00
ret = mnt_want_write_file ( file ) ;
if ( ret )
return ret ;
2011-09-14 17:53:51 +04:00
sa = memdup_user ( arg , sizeof ( * sa ) ) ;
2012-11-26 12:50:11 +04:00
if ( IS_ERR ( sa ) ) {
ret = PTR_ERR ( sa ) ;
goto drop_write ;
}
2011-09-14 17:53:51 +04:00
2016-06-23 01:54:23 +03:00
down_write ( & fs_info - > subvol_sem ) ;
2011-09-14 17:53:51 +04:00
switch ( sa - > cmd ) {
case BTRFS_QUOTA_CTL_ENABLE :
2018-07-05 14:50:48 +03:00
ret = btrfs_quota_enable ( fs_info ) ;
2011-09-14 17:53:51 +04:00
break ;
case BTRFS_QUOTA_CTL_DISABLE :
2018-07-05 14:50:48 +03:00
ret = btrfs_quota_disable ( fs_info ) ;
2011-09-14 17:53:51 +04:00
break ;
default :
ret = - EINVAL ;
break ;
}
kfree ( sa ) ;
2016-06-23 01:54:23 +03:00
up_write ( & fs_info - > subvol_sem ) ;
2012-11-26 12:50:11 +04:00
drop_write :
mnt_drop_write_file ( file ) ;
2011-09-14 17:53:51 +04:00
return ret ;
}
2012-11-26 12:50:11 +04:00
/*
 * Qgroup assign ioctl handler: add or remove a relation between two
 * qgroups inside a joined transaction.
 *
 * @arg: user pointer to a struct btrfs_ioctl_qgroup_assign_args. A
 *       non-zero assign field adds the src/dst relation, zero removes it.
 *
 * Requires CAP_SYS_ADMIN and write access to the mount.
 *
 * Return value precedence: the result of the add/delete call wins; if that
 * was 0, an error from btrfs_end_transaction() is returned instead. A
 * failure of btrfs_run_qgroups() is not returned to the caller: it is only
 * passed to btrfs_handle_fs_error(), and err is then overwritten by the
 * btrfs_end_transaction() result.
 */
static long btrfs_ioctl_qgroup_assign ( struct file * file , void __user * arg )
2011-09-14 17:53:51 +04:00
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2011-09-14 17:53:51 +04:00
struct btrfs_ioctl_qgroup_assign_args * sa ;
struct btrfs_trans_handle * trans ;
int ret ;
int err ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2012-11-26 12:50:11 +04:00
ret = mnt_want_write_file ( file ) ;
if ( ret )
return ret ;
2011-09-14 17:53:51 +04:00
sa = memdup_user ( arg , sizeof ( * sa ) ) ;
2012-11-26 12:50:11 +04:00
if ( IS_ERR ( sa ) ) {
ret = PTR_ERR ( sa ) ;
goto drop_write ;
}
2011-09-14 17:53:51 +04:00
trans = btrfs_join_transaction ( root ) ;
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
goto out ;
}
if ( sa - > assign ) {
2018-07-18 09:45:30 +03:00
ret = btrfs_add_qgroup_relation ( trans , sa - > src , sa - > dst ) ;
2011-09-14 17:53:51 +04:00
} else {
2018-07-18 09:45:32 +03:00
ret = btrfs_del_qgroup_relation ( trans , sa - > src , sa - > dst ) ;
2011-09-14 17:53:51 +04:00
}
2015-02-27 11:24:28 +03:00
/* update qgroup status and info */
2018-07-18 09:45:40 +03:00
err = btrfs_run_qgroups ( trans ) ;
2015-02-27 11:24:28 +03:00
if ( err < 0 )
2016-06-23 01:54:23 +03:00
btrfs_handle_fs_error ( fs_info , err ,
" failed to update qgroup status and info " ) ;
2016-09-10 04:39:03 +03:00
/* The transaction is ended regardless of the outcome above. */
err = btrfs_end_transaction ( trans ) ;
2011-09-14 17:53:51 +04:00
if ( err & & ! ret )
ret = err ;
out :
kfree ( sa ) ;
2012-11-26 12:50:11 +04:00
drop_write :
mnt_drop_write_file ( file ) ;
2011-09-14 17:53:51 +04:00
return ret ;
}
2012-11-26 12:50:11 +04:00
/*
 * Qgroup create ioctl handler: create or remove a qgroup inside a joined
 * transaction.
 *
 * @arg: user pointer to a struct btrfs_ioctl_qgroup_create_args. A
 *       non-zero create field creates the qgroup, zero removes it.
 *
 * Requires CAP_SYS_ADMIN and write access to the mount. A qgroupid of 0 is
 * rejected with -EINVAL before any transaction is joined. The result of
 * the create/remove call takes precedence; if it was 0, an error from
 * btrfs_end_transaction() is returned instead.
 */
static long btrfs_ioctl_qgroup_create ( struct file * file , void __user * arg )
2011-09-14 17:53:51 +04:00
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2011-09-14 17:53:51 +04:00
struct btrfs_ioctl_qgroup_create_args * sa ;
struct btrfs_trans_handle * trans ;
int ret ;
int err ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2012-11-26 12:50:11 +04:00
ret = mnt_want_write_file ( file ) ;
if ( ret )
return ret ;
2011-09-14 17:53:51 +04:00
sa = memdup_user ( arg , sizeof ( * sa ) ) ;
2012-11-26 12:50:11 +04:00
if ( IS_ERR ( sa ) ) {
ret = PTR_ERR ( sa ) ;
goto drop_write ;
}
2011-09-14 17:53:51 +04:00
2012-11-15 15:35:41 +04:00
if ( ! sa - > qgroupid ) {
ret = - EINVAL ;
goto out ;
}
2011-09-14 17:53:51 +04:00
trans = btrfs_join_transaction ( root ) ;
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
goto out ;
}
if ( sa - > create ) {
2018-07-18 09:45:33 +03:00
ret = btrfs_create_qgroup ( trans , sa - > qgroupid ) ;
2011-09-14 17:53:51 +04:00
} else {
2018-07-18 09:45:34 +03:00
ret = btrfs_remove_qgroup ( trans , sa - > qgroupid ) ;
2011-09-14 17:53:51 +04:00
}
2016-09-10 04:39:03 +03:00
err = btrfs_end_transaction ( trans ) ;
2011-09-14 17:53:51 +04:00
if ( err & & ! ret )
ret = err ;
out :
kfree ( sa ) ;
2012-11-26 12:50:11 +04:00
drop_write :
mnt_drop_write_file ( file ) ;
2011-09-14 17:53:51 +04:00
return ret ;
}
2012-11-26 12:50:11 +04:00
/*
 * Qgroup limit ioctl handler: apply the limits in the argument structure
 * to a qgroup inside a joined transaction.
 *
 * @arg: user pointer to a struct btrfs_ioctl_qgroup_limit_args. When its
 *       qgroupid field is 0, the objectid of the root that @file belongs
 *       to is used as the qgroup id instead.
 *
 * Requires CAP_SYS_ADMIN and write access to the mount. The result of
 * btrfs_limit_qgroup() takes precedence; if it was 0, an error from
 * btrfs_end_transaction() is returned instead.
 */
static long btrfs_ioctl_qgroup_limit ( struct file * file , void __user * arg )
2011-09-14 17:53:51 +04:00
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2011-09-14 17:53:51 +04:00
struct btrfs_ioctl_qgroup_limit_args * sa ;
struct btrfs_trans_handle * trans ;
int ret ;
int err ;
u64 qgroupid ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2012-11-26 12:50:11 +04:00
ret = mnt_want_write_file ( file ) ;
if ( ret )
return ret ;
2011-09-14 17:53:51 +04:00
sa = memdup_user ( arg , sizeof ( * sa ) ) ;
2012-11-26 12:50:11 +04:00
if ( IS_ERR ( sa ) ) {
ret = PTR_ERR ( sa ) ;
goto drop_write ;
}
2011-09-14 17:53:51 +04:00
trans = btrfs_join_transaction ( root ) ;
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
goto out ;
}
qgroupid = sa - > qgroupid ;
if ( ! qgroupid ) {
/* take the current subvol as qgroup */
qgroupid = root - > root_key . objectid ;
}
2018-07-18 09:45:35 +03:00
ret = btrfs_limit_qgroup ( trans , qgroupid , & sa - > lim ) ;
2011-09-14 17:53:51 +04:00
2016-09-10 04:39:03 +03:00
err = btrfs_end_transaction ( trans ) ;
2011-09-14 17:53:51 +04:00
if ( err & & ! ret )
ret = err ;
out :
kfree ( sa ) ;
2012-11-26 12:50:11 +04:00
drop_write :
mnt_drop_write_file ( file ) ;
2011-09-14 17:53:51 +04:00
return ret ;
}
2013-04-25 20:04:51 +04:00
/*
 * Quota rescan ioctl handler: start a qgroup rescan.
 *
 * @arg: user pointer to a struct btrfs_ioctl_quota_rescan_args; its flags
 *       field must be 0, otherwise -EINVAL is returned.
 *
 * Requires CAP_SYS_ADMIN and write access to the mount. On the success
 * path the return value is whatever btrfs_qgroup_rescan() returns.
 */
static long btrfs_ioctl_quota_rescan ( struct file * file , void __user * arg )
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2013-04-25 20:04:51 +04:00
struct btrfs_ioctl_quota_rescan_args * qsa ;
int ret ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
ret = mnt_want_write_file ( file ) ;
if ( ret )
return ret ;
qsa = memdup_user ( arg , sizeof ( * qsa ) ) ;
if ( IS_ERR ( qsa ) ) {
ret = PTR_ERR ( qsa ) ;
goto drop_write ;
}
/* No flags are accepted by this handler. */
if ( qsa - > flags ) {
ret = - EINVAL ;
goto out ;
}
2016-06-23 01:54:23 +03:00
ret = btrfs_qgroup_rescan ( fs_info ) ;
2013-04-25 20:04:51 +04:00
out :
kfree ( qsa ) ;
drop_write :
mnt_drop_write_file ( file ) ;
return ret ;
}
/*
 * Quota rescan status ioctl handler: report whether a qgroup rescan is
 * flagged as in progress.
 *
 * @arg: user pointer that receives a struct btrfs_ioctl_quota_rescan_args.
 *
 * The structure is allocated zeroed. When BTRFS_QGROUP_STATUS_FLAG_RESCAN
 * is set in fs_info->qgroup_flags, flags is set to 1 and progress to the
 * objectid of fs_info->qgroup_rescan_progress; otherwise the structure is
 * returned all zero. Returns 0, -EPERM without CAP_SYS_ADMIN, -ENOMEM or
 * -EFAULT.
 */
static long btrfs_ioctl_quota_rescan_status ( struct file * file , void __user * arg )
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2013-04-25 20:04:51 +04:00
struct btrfs_ioctl_quota_rescan_args * qsa ;
int ret = 0 ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2015-11-04 17:38:29 +03:00
qsa = kzalloc ( sizeof ( * qsa ) , GFP_KERNEL ) ;
2013-04-25 20:04:51 +04:00
if ( ! qsa )
return - ENOMEM ;
2016-06-23 01:54:23 +03:00
if ( fs_info - > qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN ) {
2013-04-25 20:04:51 +04:00
qsa - > flags = 1 ;
2016-06-23 01:54:23 +03:00
qsa - > progress = fs_info - > qgroup_rescan_progress . objectid ;
2013-04-25 20:04:51 +04:00
}
if ( copy_to_user ( arg , qsa , sizeof ( * qsa ) ) )
ret = - EFAULT ;
kfree ( qsa ) ;
return ret ;
}
2013-05-06 23:14:17 +04:00
/*
 * Quota rescan wait ioctl handler.
 *
 * Returns -EPERM without CAP_SYS_ADMIN, otherwise the result of
 * btrfs_qgroup_wait_for_completion() called with its second argument set
 * to true. @arg is not used.
 *
 * NOTE(review): the second argument presumably selects an interruptible
 * wait - confirm against the helper's definition.
 */
static long btrfs_ioctl_quota_rescan_wait ( struct file * file , void __user * arg )
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2013-05-06 23:14:17 +04:00
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2016-06-23 01:54:23 +03:00
return btrfs_qgroup_wait_for_completion ( fs_info , true ) ;
2013-05-06 23:14:17 +04:00
}
2014-01-31 00:17:00 +04:00
static long _btrfs_ioctl_set_received_subvol ( struct file * file ,
struct btrfs_ioctl_received_subvol_args * sa )
2012-07-25 19:35:53 +04:00
{
2013-01-24 02:07:38 +04:00
struct inode * inode = file_inode ( file ) ;
2016-06-23 01:54:23 +03:00
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2012-07-25 19:35:53 +04:00
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct btrfs_root_item * root_item = & root - > root_item ;
struct btrfs_trans_handle * trans ;
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following Coccinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-09 05:36:02 +03:00
struct timespec64 ct = current_time ( inode ) ;
2012-07-25 19:35:53 +04:00
int ret = 0 ;
2013-08-15 19:11:20 +04:00
int received_uuid_changed ;
2012-07-25 19:35:53 +04:00
2014-01-16 18:50:22 +04:00
if ( ! inode_owner_or_capable ( inode ) )
return - EPERM ;
2012-07-25 19:35:53 +04:00
ret = mnt_want_write_file ( file ) ;
if ( ret < 0 )
return ret ;
2016-06-23 01:54:23 +03:00
down_write ( & fs_info - > subvol_sem ) ;
2012-07-25 19:35:53 +04:00
2017-01-10 21:35:31 +03:00
if ( btrfs_ino ( BTRFS_I ( inode ) ) ! = BTRFS_FIRST_FREE_OBJECTID ) {
2012-07-25 19:35:53 +04:00
ret = - EINVAL ;
goto out ;
}
if ( btrfs_root_readonly ( root ) ) {
ret = - EROFS ;
goto out ;
}
2013-08-15 19:11:20 +04:00
/*
* 1 - root item
* 2 - uuid items ( received uuid + subvol uuid )
*/
trans = btrfs_start_transaction ( root , 3 ) ;
2012-07-25 19:35:53 +04:00
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
trans = NULL ;
goto out ;
}
sa - > rtransid = trans - > transid ;
sa - > rtime . sec = ct . tv_sec ;
sa - > rtime . nsec = ct . tv_nsec ;
2013-08-15 19:11:20 +04:00
received_uuid_changed = memcmp ( root_item - > received_uuid , sa - > uuid ,
BTRFS_UUID_SIZE ) ;
if ( received_uuid_changed & &
2018-03-12 15:48:09 +03:00
! btrfs_is_empty_uuid ( root_item - > received_uuid ) ) {
2018-05-29 10:01:54 +03:00
ret = btrfs_uuid_tree_remove ( trans , root_item - > received_uuid ,
2018-03-12 15:48:09 +03:00
BTRFS_UUID_KEY_RECEIVED_SUBVOL ,
root - > root_key . objectid ) ;
if ( ret & & ret ! = - ENOENT ) {
btrfs_abort_transaction ( trans , ret ) ;
btrfs_end_transaction ( trans ) ;
goto out ;
}
}
2012-07-25 19:35:53 +04:00
memcpy ( root_item - > received_uuid , sa - > uuid , BTRFS_UUID_SIZE ) ;
btrfs_set_root_stransid ( root_item , sa - > stransid ) ;
btrfs_set_root_rtransid ( root_item , sa - > rtransid ) ;
2013-07-16 07:19:18 +04:00
btrfs_set_stack_timespec_sec ( & root_item - > stime , sa - > stime . sec ) ;
btrfs_set_stack_timespec_nsec ( & root_item - > stime , sa - > stime . nsec ) ;
btrfs_set_stack_timespec_sec ( & root_item - > rtime , sa - > rtime . sec ) ;
btrfs_set_stack_timespec_nsec ( & root_item - > rtime , sa - > rtime . nsec ) ;
2012-07-25 19:35:53 +04:00
2016-06-23 01:54:23 +03:00
ret = btrfs_update_root ( trans , fs_info - > tree_root ,
2012-07-25 19:35:53 +04:00
& root - > root_key , & root - > root_item ) ;
if ( ret < 0 ) {
2016-09-10 04:39:03 +03:00
btrfs_end_transaction ( trans ) ;
2012-07-25 19:35:53 +04:00
goto out ;
2013-08-15 19:11:20 +04:00
}
if ( received_uuid_changed & & ! btrfs_is_empty_uuid ( sa - > uuid ) ) {
2018-05-29 10:01:53 +03:00
ret = btrfs_uuid_tree_add ( trans , sa - > uuid ,
2013-08-15 19:11:20 +04:00
BTRFS_UUID_KEY_RECEIVED_SUBVOL ,
root - > root_key . objectid ) ;
if ( ret < 0 & & ret ! = - EEXIST ) {
2016-06-11 01:19:25 +03:00
btrfs_abort_transaction ( trans , ret ) ;
2017-09-28 11:45:26 +03:00
btrfs_end_transaction ( trans ) ;
2012-07-25 19:35:53 +04:00
goto out ;
2013-08-15 19:11:20 +04:00
}
}
2016-09-10 04:39:03 +03:00
ret = btrfs_commit_transaction ( trans ) ;
2014-01-31 00:17:00 +04:00
out :
2016-06-23 01:54:23 +03:00
up_write ( & fs_info - > subvol_sem ) ;
2014-01-31 00:17:00 +04:00
mnt_drop_write_file ( file ) ;
return ret ;
}
# ifdef CONFIG_64BIT
/*
 * Variant of the set-received-subvol ioctl for callers that pass the
 * btrfs_ioctl_received_subvol_args_32 layout.  The user structure is copied
 * in, converted field by field into the native argument structure, handed to
 * _btrfs_ioctl_set_received_subvol(), and on success converted back and
 * copied out to @arg.  Only compiled when CONFIG_64BIT is defined.
 *
 * Returns 0 on success, the PTR_ERR() of a failed memdup_user(), -ENOMEM if
 * the native structure cannot be allocated, the non-zero result of
 * _btrfs_ioctl_set_received_subvol(), or -EFAULT if the copy back to
 * userspace fails.
 */
static long btrfs_ioctl_set_received_subvol_32 ( struct file * file ,
void __user * arg )
{
struct btrfs_ioctl_received_subvol_args_32 * args32 = NULL ;
struct btrfs_ioctl_received_subvol_args * args64 = NULL ;
int ret = 0 ;
args32 = memdup_user ( arg , sizeof ( * args32 ) ) ;
2016-11-10 12:47:41 +03:00
if ( IS_ERR ( args32 ) )
return PTR_ERR ( args32 ) ;
2014-01-31 00:17:00 +04:00
2015-11-04 17:38:29 +03:00
args64 = kmalloc ( sizeof ( * args64 ) , GFP_KERNEL ) ;
2014-03-28 12:06:00 +04:00
if ( ! args64 ) {
ret = - ENOMEM ;
2014-01-31 00:17:00 +04:00
goto out ;
}
/* Translate the 32-bit layout into the native structure. */
memcpy ( args64 - > uuid , args32 - > uuid , BTRFS_UUID_SIZE ) ;
args64 - > stransid = args32 - > stransid ;
args64 - > rtransid = args32 - > rtransid ;
args64 - > stime . sec = args32 - > stime . sec ;
args64 - > stime . nsec = args32 - > stime . nsec ;
args64 - > rtime . sec = args32 - > rtime . sec ;
args64 - > rtime . nsec = args32 - > rtime . nsec ;
args64 - > flags = args32 - > flags ;
ret = _btrfs_ioctl_set_received_subvol ( file , args64 ) ;
if ( ret )
goto out ;
/* Translate the (possibly updated) native fields back for userspace. */
memcpy ( args32 - > uuid , args64 - > uuid , BTRFS_UUID_SIZE ) ;
args32 - > stransid = args64 - > stransid ;
args32 - > rtransid = args64 - > rtransid ;
args32 - > stime . sec = args64 - > stime . sec ;
args32 - > stime . nsec = args64 - > stime . nsec ;
args32 - > rtime . sec = args64 - > rtime . sec ;
args32 - > rtime . nsec = args64 - > rtime . nsec ;
args32 - > flags = args64 - > flags ;
ret = copy_to_user ( arg , args32 , sizeof ( * args32 ) ) ;
if ( ret )
ret = - EFAULT ;
out :
/* args64 is still NULL here if its allocation failed. */
kfree ( args32 ) ;
kfree ( args64 ) ;
return ret ;
}
# endif
/*
 * Native-layout entry point for the set-received-subvol ioctl: duplicate the
 * user's btrfs_ioctl_received_subvol_args, pass it to
 * _btrfs_ioctl_set_received_subvol(), and on success copy the structure back
 * to @arg.
 *
 * Returns 0 on success, the PTR_ERR() of a failed memdup_user(), the
 * non-zero result of _btrfs_ioctl_set_received_subvol(), or -EFAULT if the
 * copy back to userspace fails.
 */
static long btrfs_ioctl_set_received_subvol ( struct file * file ,
void __user * arg )
{
struct btrfs_ioctl_received_subvol_args * sa = NULL ;
int ret = 0 ;
sa = memdup_user ( arg , sizeof ( * sa ) ) ;
2016-11-10 12:47:41 +03:00
if ( IS_ERR ( sa ) )
return PTR_ERR ( sa ) ;
2014-01-31 00:17:00 +04:00
ret = _btrfs_ioctl_set_received_subvol ( file , sa ) ;
if ( ret )
goto out ;
2012-07-25 19:35:53 +04:00
/* Hand the structure, as left by the helper, back to the caller. */
ret = copy_to_user ( arg , sa , sizeof ( * sa ) ) ;
if ( ret )
ret = - EFAULT ;
out :
kfree ( sa ) ;
return ret ;
}
2013-01-05 06:48:01 +04:00
/*
 * Copy the filesystem label to the user buffer @arg.
 *
 * The label is first snapshotted from fs_info->super_copy into a local
 * buffer while holding super_lock, so the length computation and the copy to
 * userspace both operate on the same bytes without the lock held.
 *
 * Returns 0 on success or -EFAULT if the copy to userspace fails.
 */
static int btrfs_ioctl_get_fslabel ( struct file * file , void __user * arg )
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2013-07-19 13:39:32 +04:00
size_t len ;
2013-01-05 06:48:01 +04:00
int ret ;
2013-07-19 13:39:32 +04:00
char label [ BTRFS_LABEL_SIZE ] ;
2016-06-23 01:54:23 +03:00
spin_lock ( & fs_info - > super_lock ) ;
memcpy ( label , fs_info - > super_copy - > label , BTRFS_LABEL_SIZE ) ;
spin_unlock ( & fs_info - > super_lock ) ;
2013-07-19 13:39:32 +04:00
len = strnlen ( label , BTRFS_LABEL_SIZE ) ;
2013-01-05 06:48:01 +04:00
/*
 * No NUL found within BTRFS_LABEL_SIZE bytes: warn and shorten len by one
 * so that only the first BTRFS_LABEL_SIZE - 1 bytes are returned.
 */
if ( len = = BTRFS_LABEL_SIZE ) {
2016-06-23 01:54:23 +03:00
btrfs_warn ( fs_info ,
" label is too long, return the first %zu bytes " ,
- - len ) ;
2013-01-05 06:48:01 +04:00
}
/*
 * Exactly len bytes are copied, i.e. the label without a terminating NUL.
 * NOTE(review): presumably callers pre-zero their buffer - confirm.
 */
ret = copy_to_user ( arg , label , len ) ;
return ret ? - EFAULT : 0 ;
}
2013-01-05 06:48:08 +04:00
/*
 * Replace the filesystem label with the string supplied in @arg.
 *
 * Requires CAP_SYS_ADMIN.  BTRFS_LABEL_SIZE bytes are read from userspace
 * and the label is rejected unless a NUL terminator is found within them.
 * The new label is written into fs_info->super_copy under super_lock and a
 * transaction is committed.
 *
 * Returns 0 on success, -EPERM without the capability, -EFAULT if the user
 * buffer cannot be read, -EINVAL for an unterminated label, or the error
 * from mnt_want_write_file(), btrfs_start_transaction() or
 * btrfs_commit_transaction().
 */
static int btrfs_ioctl_set_fslabel ( struct file * file , void __user * arg )
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct btrfs_super_block * super_block = fs_info - > super_copy ;
2013-01-05 06:48:08 +04:00
struct btrfs_trans_handle * trans ;
char label [ BTRFS_LABEL_SIZE ] ;
int ret ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
if ( copy_from_user ( label , arg , sizeof ( label ) ) )
return - EFAULT ;
/* A label that fills the whole buffer has no room for its terminator. */
if ( strnlen ( label , BTRFS_LABEL_SIZE ) = = BTRFS_LABEL_SIZE ) {
2016-06-23 01:54:23 +03:00
btrfs_err ( fs_info ,
2016-09-20 17:05:00 +03:00
" unable to set label with more than %d bytes " ,
BTRFS_LABEL_SIZE - 1 ) ;
2013-01-05 06:48:08 +04:00
return - EINVAL ;
}
ret = mnt_want_write_file ( file ) ;
if ( ret )
return ret ;
trans = btrfs_start_transaction ( root , 0 ) ;
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
goto out_unlock ;
}
2016-06-23 01:54:23 +03:00
spin_lock ( & fs_info - > super_lock ) ;
2013-01-05 06:48:08 +04:00
/*
 * The strnlen() check above guarantees label is NUL-terminated within
 * BTRFS_LABEL_SIZE bytes, which bounds this strcpy().
 * NOTE(review): assumes super_block->label holds BTRFS_LABEL_SIZE bytes.
 */
strcpy ( super_block - > label , label ) ;
2016-06-23 01:54:23 +03:00
spin_unlock ( & fs_info - > super_lock ) ;
2016-09-10 04:39:03 +03:00
ret = btrfs_commit_transaction ( trans ) ;
2013-01-05 06:48:08 +04:00
out_unlock :
/* Pairs with the successful mnt_want_write_file() above. */
mnt_drop_write_file ( file ) ;
return ret ;
}
2013-11-16 00:33:55 +04:00
/*
 * Expand to a designated initializer whose compat, compat_ro and incompat
 * flag members are set to the BTRFS_FEATURE_COMPAT_<suffix>,
 * BTRFS_FEATURE_COMPAT_RO_<suffix> and BTRFS_FEATURE_INCOMPAT_<suffix>
 * constants respectively.
 */
# define INIT_FEATURE_FLAGS(suffix) \
{ . compat_flags = BTRFS_FEATURE_COMPAT_ # # suffix , \
. compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_ # # suffix , \
. incompat_flags = BTRFS_FEATURE_INCOMPAT_ # # suffix }
2016-02-17 17:26:27 +03:00
/*
 * Copy a fixed table of three btrfs_ioctl_feature_flags entries to @arg, in
 * this order: the SUPP masks, the SAFE_SET masks and the SAFE_CLEAR masks
 * (each built with INIT_FEATURE_FLAGS()).
 *
 * Returns 0 on success or -EFAULT if the copy to userspace fails.
 */
int btrfs_ioctl_get_supported_features ( void __user * arg )
2013-11-16 00:33:55 +04:00
{
2015-11-19 13:42:31 +03:00
static const struct btrfs_ioctl_feature_flags features [ 3 ] = {
2013-11-16 00:33:55 +04:00
INIT_FEATURE_FLAGS ( SUPP ) ,
INIT_FEATURE_FLAGS ( SAFE_SET ) ,
INIT_FEATURE_FLAGS ( SAFE_CLEAR )
} ;
if ( copy_to_user ( arg , & features , sizeof ( features ) ) )
return - EFAULT ;
return 0 ;
}
/*
 * Report the compat, compat_ro and incompat feature flags currently stored
 * in fs_info->super_copy by copying them to @arg as a single
 * btrfs_ioctl_feature_flags structure.
 *
 * Returns 0 on success or -EFAULT if the copy to userspace fails.
 */
static int btrfs_ioctl_get_features ( struct file * file , void __user * arg )
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
struct btrfs_super_block * super_block = fs_info - > super_copy ;
2013-11-16 00:33:55 +04:00
struct btrfs_ioctl_feature_flags features ;
features . compat_flags = btrfs_super_compat_flags ( super_block ) ;
features . compat_ro_flags = btrfs_super_compat_ro_flags ( super_block ) ;
features . incompat_flags = btrfs_super_incompat_flags ( super_block ) ;
if ( copy_to_user ( arg , & features , sizeof ( features ) ) )
return - EFAULT ;
return 0 ;
}
2016-06-23 01:54:24 +03:00
/*
 * Validate a requested change to one feature-flag word.
 *
 * @change_mask selects the bits the caller wants to modify and @flags gives
 * their desired values: bits set in both are to be set, bits set only in
 * @change_mask are to be cleared.
 *
 * Returns -EOPNOTSUPP if a bit to be set is missing from @supported_flags,
 * -EPERM if a bit to be set is missing from @safe_set or a bit to be cleared
 * is missing from @safe_clear, and 0 otherwise.  Each failure logs a warning
 * naming the offending features when btrfs_printable_features() returns a
 * string, or the raw bit mask together with the feature set name otherwise.
 */
static int check_feature_bits ( struct btrfs_fs_info * fs_info ,
2013-11-01 21:07:02 +04:00
enum btrfs_feature_set set ,
2013-11-16 00:33:55 +04:00
u64 change_mask , u64 flags , u64 supported_flags ,
u64 safe_set , u64 safe_clear )
{
2019-08-01 20:07:55 +03:00
const char * type = btrfs_feature_set_name ( set ) ;
2013-11-01 21:07:02 +04:00
char * names ;
2013-11-16 00:33:55 +04:00
u64 disallowed , unsupported ;
u64 set_mask = flags & change_mask ;
u64 clear_mask = ~ flags & change_mask ;
/* Bits to be set that are outside the supported mask. */
unsupported = set_mask & ~ supported_flags ;
if ( unsupported ) {
2013-11-01 21:07:02 +04:00
names = btrfs_printable_features ( set , unsupported ) ;
if ( names ) {
2016-06-23 01:54:23 +03:00
/* A comma in the name list selects the plural "bits". */
btrfs_warn ( fs_info ,
" this kernel does not support the %s feature bit%s " ,
names , strchr ( names , ' , ' ) ? " s " : " " ) ;
2013-11-01 21:07:02 +04:00
kfree ( names ) ;
} else
2016-06-23 01:54:23 +03:00
btrfs_warn ( fs_info ,
" this kernel does not support %s bits 0x%llx " ,
type , unsupported ) ;
2013-11-16 00:33:55 +04:00
return - EOPNOTSUPP ;
}
/* Bits to be set that are not in the safe-to-set mask. */
disallowed = set_mask & ~ safe_set ;
if ( disallowed ) {
2013-11-01 21:07:02 +04:00
names = btrfs_printable_features ( set , disallowed ) ;
if ( names ) {
2016-06-23 01:54:23 +03:00
btrfs_warn ( fs_info ,
" can't set the %s feature bit%s while mounted " ,
names , strchr ( names , ' , ' ) ? " s " : " " ) ;
2013-11-01 21:07:02 +04:00
kfree ( names ) ;
} else
2016-06-23 01:54:23 +03:00
btrfs_warn ( fs_info ,
" can't set %s bits 0x%llx while mounted " ,
type , disallowed ) ;
2013-11-16 00:33:55 +04:00
return - EPERM ;
}
/* Bits to be cleared that are not in the safe-to-clear mask. */
disallowed = clear_mask & ~ safe_clear ;
if ( disallowed ) {
2013-11-01 21:07:02 +04:00
names = btrfs_printable_features ( set , disallowed ) ;
if ( names ) {
2016-06-23 01:54:23 +03:00
btrfs_warn ( fs_info ,
" can't clear the %s feature bit%s while mounted " ,
names , strchr ( names , ' , ' ) ? " s " : " " ) ;
2013-11-01 21:07:02 +04:00
kfree ( names ) ;
} else
2016-06-23 01:54:23 +03:00
btrfs_warn ( fs_info ,
" can't clear %s bits 0x%llx while mounted " ,
type , disallowed ) ;
2013-11-16 00:33:55 +04:00
return - EPERM ;
}
return 0 ;
}
2016-06-23 01:54:24 +03:00
/*
 * Call check_feature_bits() for the feature word named by mask_base,
 * pasting mask_base into the FEAT_<mask_base> set identifier and into the
 * BTRFS_FEATURE_<mask_base>_SUPP, _SAFE_SET and _SAFE_CLEAR mask constants.
 */
# define check_feature(fs_info, change_mask, flags, mask_base) \
check_feature_bits ( fs_info , FEAT_ # # mask_base , change_mask , flags , \
2013-11-16 00:33:55 +04:00
BTRFS_FEATURE_ # # mask_base # # _SUPP , \
BTRFS_FEATURE_ # # mask_base # # _SAFE_SET , \
BTRFS_FEATURE_ # # mask_base # # _SAFE_CLEAR )
/*
 * Set and clear feature flags in the in-memory super block copy.
 *
 * @arg points to two btrfs_ioctl_feature_flags structures: flags[0] holds the
 * masks of bits to change and flags[1] the values those bits should take.
 * Requires CAP_SYS_ADMIN.  Every flag word is validated with check_feature()
 * before anything is modified; the update itself is done under super_lock
 * and followed by a transaction commit.
 *
 * Returns 0 on success (including when all change masks are zero), -EPERM
 * without the capability, -EFAULT if the user buffer cannot be read, the
 * error from check_feature(), or the error from mnt_want_write_file(),
 * btrfs_start_transaction() or btrfs_commit_transaction().
 */
static int btrfs_ioctl_set_features ( struct file * file , void __user * arg )
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct btrfs_super_block * super_block = fs_info - > super_copy ;
2013-11-16 00:33:55 +04:00
struct btrfs_ioctl_feature_flags flags [ 2 ] ;
struct btrfs_trans_handle * trans ;
u64 newflags ;
int ret ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
if ( copy_from_user ( flags , arg , sizeof ( flags ) ) )
return - EFAULT ;
/* Nothing to do */
if ( ! flags [ 0 ] . compat_flags & & ! flags [ 0 ] . compat_ro_flags & &
! flags [ 0 ] . incompat_flags )
return 0 ;
2016-06-23 01:54:24 +03:00
ret = check_feature ( fs_info , flags [ 0 ] . compat_flags ,
2013-11-16 00:33:55 +04:00
flags [ 1 ] . compat_flags , COMPAT ) ;
if ( ret )
return ret ;
2016-06-23 01:54:24 +03:00
ret = check_feature ( fs_info , flags [ 0 ] . compat_ro_flags ,
2013-11-16 00:33:55 +04:00
flags [ 1 ] . compat_ro_flags , COMPAT_RO ) ;
if ( ret )
return ret ;
2016-06-23 01:54:24 +03:00
ret = check_feature ( fs_info , flags [ 0 ] . incompat_flags ,
2013-11-16 00:33:55 +04:00
flags [ 1 ] . incompat_flags , INCOMPAT ) ;
if ( ret )
return ret ;
2016-05-04 12:32:00 +03:00
ret = mnt_want_write_file ( file ) ;
if ( ret )
return ret ;
2014-02-07 17:34:04 +04:00
trans = btrfs_start_transaction ( root , 0 ) ;
2016-05-04 12:32:00 +03:00
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
goto out_drop_write ;
}
2013-11-16 00:33:55 +04:00
2016-06-23 01:54:23 +03:00
spin_lock ( & fs_info - > super_lock ) ;
2013-11-16 00:33:55 +04:00
/*
 * For each flag word: OR in the bits that are both selected by the mask
 * and set in the requested value, then clear the bits that are selected
 * by the mask but unset in the requested value.
 */
newflags = btrfs_super_compat_flags ( super_block ) ;
newflags | = flags [ 0 ] . compat_flags & flags [ 1 ] . compat_flags ;
newflags & = ~ ( flags [ 0 ] . compat_flags & ~ flags [ 1 ] . compat_flags ) ;
btrfs_set_super_compat_flags ( super_block , newflags ) ;
newflags = btrfs_super_compat_ro_flags ( super_block ) ;
newflags | = flags [ 0 ] . compat_ro_flags & flags [ 1 ] . compat_ro_flags ;
newflags & = ~ ( flags [ 0 ] . compat_ro_flags & ~ flags [ 1 ] . compat_ro_flags ) ;
btrfs_set_super_compat_ro_flags ( super_block , newflags ) ;
newflags = btrfs_super_incompat_flags ( super_block ) ;
newflags | = flags [ 0 ] . incompat_flags & flags [ 1 ] . incompat_flags ;
newflags & = ~ ( flags [ 0 ] . incompat_flags & ~ flags [ 1 ] . incompat_flags ) ;
btrfs_set_super_incompat_flags ( super_block , newflags ) ;
2016-06-23 01:54:23 +03:00
spin_unlock ( & fs_info - > super_lock ) ;
2013-11-16 00:33:55 +04:00
2016-09-10 04:39:03 +03:00
ret = btrfs_commit_transaction ( trans ) ;
2016-05-04 12:32:00 +03:00
out_drop_write :
/* Pairs with the successful mnt_want_write_file() above. */
mnt_drop_write_file ( file ) ;
return ret ;
2013-11-16 00:33:55 +04:00
}
2017-09-27 17:43:13 +03:00
/*
 * Bring the send ioctl arguments in from userspace and run btrfs_ioctl_send().
 *
 * @file:   file the ioctl was issued on, passed through to btrfs_ioctl_send()
 * @argp:   user pointer to the argument structure
 * @compat: true when @argp uses the btrfs_ioctl_send_args_32 layout
 *
 * With @compat clear the native structure is duplicated straight from
 * userspace.  With @compat set the 32-bit structure is read and copied member
 * by member into a zeroed native structure; this path exists only when both
 * CONFIG_64BIT and CONFIG_COMPAT are defined and yields -ENOTTY otherwise.
 *
 * Returns the result of btrfs_ioctl_send(), or -EFAULT / -ENOMEM / the
 * PTR_ERR() of memdup_user() when the arguments cannot be obtained.
 */
static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
{
	struct btrfs_ioctl_send_args *send_args;
	int err;

	if (!compat) {
		/* Native layout: take a kernel copy of the user structure. */
		send_args = memdup_user(argp, sizeof(*send_args));
		if (IS_ERR(send_args))
			return PTR_ERR(send_args);
	} else {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
		struct btrfs_ioctl_send_args_32 compat_args;

		err = copy_from_user(&compat_args, argp, sizeof(compat_args));
		if (err)
			return -EFAULT;

		send_args = kzalloc(sizeof(*send_args), GFP_KERNEL);
		if (!send_args)
			return -ENOMEM;

		/* Carry every member of the 32-bit layout across. */
		send_args->send_fd = compat_args.send_fd;
		send_args->clone_sources_count = compat_args.clone_sources_count;
		send_args->clone_sources = compat_ptr(compat_args.clone_sources);
		send_args->parent_root = compat_args.parent_root;
		send_args->flags = compat_args.flags;
		memcpy(send_args->reserved, compat_args.reserved,
		       sizeof(compat_args.reserved));
#else
		return -ENOTTY;
#endif
	}

	err = btrfs_ioctl_send(file, send_args);
	kfree(send_args);

	return err;
}
2008-06-12 05:53:53 +04:00
long btrfs_ioctl ( struct file * file , unsigned int
cmd , unsigned long arg )
{
2016-06-23 01:54:23 +03:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2008-12-02 14:36:08 +03:00
void __user * argp = ( void __user * ) arg ;
2008-06-12 05:53:53 +04:00
switch ( cmd ) {
2009-04-17 12:37:41 +04:00
case FS_IOC_GETFLAGS :
return btrfs_ioctl_getflags ( file , argp ) ;
case FS_IOC_SETFLAGS :
return btrfs_ioctl_setflags ( file , argp ) ;
case FS_IOC_GETVERSION :
return btrfs_ioctl_getversion ( file , argp ) ;
2019-07-17 20:39:20 +03:00
case FS_IOC_GETFSLABEL :
return btrfs_ioctl_get_fslabel ( file , argp ) ;
case FS_IOC_SETFSLABEL :
return btrfs_ioctl_set_fslabel ( file , argp ) ;
2011-03-24 13:24:28 +03:00
case FITRIM :
return btrfs_ioctl_fitrim ( file , argp ) ;
2008-06-12 05:53:53 +04:00
case BTRFS_IOC_SNAP_CREATE :
2010-12-20 10:53:28 +03:00
return btrfs_ioctl_snap_create ( file , argp , 0 ) ;
2010-12-10 09:41:56 +03:00
case BTRFS_IOC_SNAP_CREATE_V2 :
2010-12-20 10:53:28 +03:00
return btrfs_ioctl_snap_create_v2 ( file , argp , 0 ) ;
2008-11-18 05:02:50 +03:00
case BTRFS_IOC_SUBVOL_CREATE :
2010-12-20 10:53:28 +03:00
return btrfs_ioctl_snap_create ( file , argp , 1 ) ;
2011-09-14 17:58:21 +04:00
case BTRFS_IOC_SUBVOL_CREATE_V2 :
return btrfs_ioctl_snap_create_v2 ( file , argp , 1 ) ;
2009-09-22 00:00:26 +04:00
case BTRFS_IOC_SNAP_DESTROY :
return btrfs_ioctl_snap_destroy ( file , argp ) ;
2010-12-20 11:30:25 +03:00
case BTRFS_IOC_SUBVOL_GETFLAGS :
return btrfs_ioctl_subvol_getflags ( file , argp ) ;
case BTRFS_IOC_SUBVOL_SETFLAGS :
return btrfs_ioctl_subvol_setflags ( file , argp ) ;
2009-12-12 00:11:29 +03:00
case BTRFS_IOC_DEFAULT_SUBVOL :
return btrfs_ioctl_default_subvol ( file , argp ) ;
2008-06-12 05:53:53 +04:00
case BTRFS_IOC_DEFRAG :
2010-03-11 17:42:04 +03:00
return btrfs_ioctl_defrag ( file , NULL ) ;
case BTRFS_IOC_DEFRAG_RANGE :
return btrfs_ioctl_defrag ( file , argp ) ;
2008-06-12 05:53:53 +04:00
case BTRFS_IOC_RESIZE :
2012-11-26 12:43:45 +04:00
return btrfs_ioctl_resize ( file , argp ) ;
2008-06-12 05:53:53 +04:00
case BTRFS_IOC_ADD_DEV :
2016-06-23 01:54:24 +03:00
return btrfs_ioctl_add_dev ( fs_info , argp ) ;
2008-06-12 05:53:53 +04:00
case BTRFS_IOC_RM_DEV :
2012-11-26 12:44:50 +04:00
return btrfs_ioctl_rm_dev ( file , argp ) ;
2016-02-13 05:01:39 +03:00
case BTRFS_IOC_RM_DEV_V2 :
return btrfs_ioctl_rm_dev_v2 ( file , argp ) ;
2011-03-11 17:41:01 +03:00
case BTRFS_IOC_FS_INFO :
2016-06-23 01:54:24 +03:00
return btrfs_ioctl_fs_info ( fs_info , argp ) ;
2011-03-11 17:41:01 +03:00
case BTRFS_IOC_DEV_INFO :
2016-06-23 01:54:24 +03:00
return btrfs_ioctl_dev_info ( fs_info , argp ) ;
2008-06-12 05:53:53 +04:00
case BTRFS_IOC_BALANCE :
2012-05-11 14:11:26 +04:00
return btrfs_ioctl_balance ( file , NULL ) ;
2010-02-28 23:39:26 +03:00
case BTRFS_IOC_TREE_SEARCH :
return btrfs_ioctl_tree_search ( file , argp ) ;
2014-01-30 19:24:03 +04:00
case BTRFS_IOC_TREE_SEARCH_V2 :
return btrfs_ioctl_tree_search_v2 ( file , argp ) ;
2010-02-28 23:39:26 +03:00
case BTRFS_IOC_INO_LOOKUP :
return btrfs_ioctl_ino_lookup ( file , argp ) ;
2011-07-07 18:48:38 +04:00
case BTRFS_IOC_INO_PATHS :
return btrfs_ioctl_ino_to_path ( root , argp ) ;
case BTRFS_IOC_LOGICAL_INO :
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 20:58:46 +03:00
return btrfs_ioctl_logical_to_ino ( fs_info , argp , 1 ) ;
case BTRFS_IOC_LOGICAL_INO_V2 :
return btrfs_ioctl_logical_to_ino ( fs_info , argp , 2 ) ;
2010-01-13 21:19:06 +03:00
case BTRFS_IOC_SPACE_INFO :
2016-06-23 01:54:24 +03:00
return btrfs_ioctl_space_info ( fs_info , argp ) ;
2013-09-23 14:35:11 +04:00
case BTRFS_IOC_SYNC : {
int ret ;
2018-04-23 10:54:13 +03:00
ret = btrfs_start_delalloc_roots ( fs_info , - 1 ) ;
2013-09-23 14:35:11 +04:00
if ( ret )
return ret ;
2016-06-23 01:54:23 +03:00
ret = btrfs_sync_fs ( inode - > i_sb , 1 ) ;
2014-07-23 16:39:35 +04:00
/*
* The transaction thread may want to do more work ,
2016-05-20 04:18:45 +03:00
* namely it pokes the cleaner kthread that will start
2014-07-23 16:39:35 +04:00
* processing uncleaned subvols .
*/
2016-06-23 01:54:23 +03:00
wake_up_process ( fs_info - > transaction_kthread ) ;
2013-09-23 14:35:11 +04:00
return ret ;
}
Btrfs: add START_SYNC, WAIT_SYNC ioctls
START_SYNC will start a sync/commit, but not wait for it to
complete. Any modification started after the ioctl returns is
guaranteed not to be included in the commit. If a non-NULL
pointer is passed, the transaction id will be returned to
userspace.
WAIT_SYNC will wait for any in-progress commit to complete. If a
transaction id is specified, the ioctl will block and then
return (success) when the specified transaction has committed.
If it has already committed when we call the ioctl, it returns
immediately. If the specified transaction doesn't exist, it
returns EINVAL.
If no transaction id is specified, WAIT_SYNC will wait for the
currently committing transaction to finish its commit to disk.
If there is no currently committing transaction, it returns
success.
These ioctls are useful for applications which want to impose an
ordering on when fs modifications reach disk, but do not want to
wait for the full (slow) commit process to do so.
Picky callers can take the transid returned by START_SYNC and
feed it to WAIT_SYNC, and be certain to wait only as long as
necessary for the transaction _they_ started to reach disk.
Sloppy callers can START_SYNC and WAIT_SYNC without a transid,
and provided they didn't wait too long between the calls, they
will get the same result. However, if a second commit starts
before they call WAIT_SYNC, they may end up waiting longer for
it to commit as well. Even so, a START_SYNC+WAIT_SYNC still
guarantees that any operation completed before the START_SYNC
reaches disk.
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-29 23:41:32 +04:00
case BTRFS_IOC_START_SYNC :
2012-11-26 12:40:43 +04:00
return btrfs_ioctl_start_sync ( root , argp ) ;
Btrfs: add START_SYNC, WAIT_SYNC ioctls
START_SYNC will start a sync/commit, but not wait for it to
complete. Any modification started after the ioctl returns is
guaranteed not to be included in the commit. If a non-NULL
pointer is passed, the transaction id will be returned to
userspace.
WAIT_SYNC will wait for any in-progress commit to complete. If a
transaction id is specified, the ioctl will block and then
return (success) when the specified transaction has committed.
If it has already committed when we call the ioctl, it returns
immediately. If the specified transaction doesn't exist, it
returns EINVAL.
If no transaction id is specified, WAIT_SYNC will wait for the
currently committing transaction to finish its commit to disk.
If there is no currently committing transaction, it returns
success.
These ioctls are useful for applications which want to impose an
ordering on when fs modifications reach disk, but do not want to
wait for the full (slow) commit process to do so.
Picky callers can take the transid returned by START_SYNC and
feed it to WAIT_SYNC, and be certain to wait only as long as
necessary for the transaction _they_ started to reach disk.
Sloppy callers can START_SYNC and WAIT_SYNC without a transid,
and provided they didn't wait too long between the calls, they
will get the same result. However, if a second commit starts
before they call WAIT_SYNC, they may end up waiting longer for
it to commit as well. Even so, a START_SYNC+WAIT_SYNC still
guarantees that any operation completed before the START_SYNC
reaches disk.
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-29 23:41:32 +04:00
case BTRFS_IOC_WAIT_SYNC :
2016-06-23 01:54:24 +03:00
return btrfs_ioctl_wait_sync ( fs_info , argp ) ;
2011-03-11 17:41:01 +03:00
case BTRFS_IOC_SCRUB :
2012-11-26 12:48:01 +04:00
return btrfs_ioctl_scrub ( file , argp ) ;
2011-03-11 17:41:01 +03:00
case BTRFS_IOC_SCRUB_CANCEL :
2016-06-23 01:54:24 +03:00
return btrfs_ioctl_scrub_cancel ( fs_info ) ;
2011-03-11 17:41:01 +03:00
case BTRFS_IOC_SCRUB_PROGRESS :
2016-06-23 01:54:24 +03:00
return btrfs_ioctl_scrub_progress ( fs_info , argp ) ;
2012-01-17 00:04:47 +04:00
case BTRFS_IOC_BALANCE_V2 :
2012-05-11 14:11:26 +04:00
return btrfs_ioctl_balance ( file , argp ) ;
2012-01-17 00:04:49 +04:00
case BTRFS_IOC_BALANCE_CTL :
2016-06-23 01:54:24 +03:00
return btrfs_ioctl_balance_ctl ( fs_info , arg ) ;
2012-01-17 00:04:49 +04:00
case BTRFS_IOC_BALANCE_PROGRESS :
2016-06-23 01:54:24 +03:00
return btrfs_ioctl_balance_progress ( fs_info , argp ) ;
2012-07-25 19:35:53 +04:00
case BTRFS_IOC_SET_RECEIVED_SUBVOL :
return btrfs_ioctl_set_received_subvol ( file , argp ) ;
2014-01-31 00:17:00 +04:00
# ifdef CONFIG_64BIT
case BTRFS_IOC_SET_RECEIVED_SUBVOL_32 :
return btrfs_ioctl_set_received_subvol_32 ( file , argp ) ;
# endif
2012-07-26 01:19:24 +04:00
case BTRFS_IOC_SEND :
2017-09-27 17:43:13 +03:00
return _btrfs_ioctl_send ( file , argp , false ) ;
# if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32 :
return _btrfs_ioctl_send ( file , argp , true ) ;
# endif
2012-05-25 18:06:09 +04:00
case BTRFS_IOC_GET_DEV_STATS :
2016-06-23 01:54:24 +03:00
return btrfs_ioctl_get_dev_stats ( fs_info , argp ) ;
2011-09-14 17:53:51 +04:00
case BTRFS_IOC_QUOTA_CTL :
2012-11-26 12:50:11 +04:00
return btrfs_ioctl_quota_ctl ( file , argp ) ;
2011-09-14 17:53:51 +04:00
case BTRFS_IOC_QGROUP_ASSIGN :
2012-11-26 12:50:11 +04:00
return btrfs_ioctl_qgroup_assign ( file , argp ) ;
2011-09-14 17:53:51 +04:00
case BTRFS_IOC_QGROUP_CREATE :
2012-11-26 12:50:11 +04:00
return btrfs_ioctl_qgroup_create ( file , argp ) ;
2011-09-14 17:53:51 +04:00
case BTRFS_IOC_QGROUP_LIMIT :
2012-11-26 12:50:11 +04:00
return btrfs_ioctl_qgroup_limit ( file , argp ) ;
2013-04-25 20:04:51 +04:00
case BTRFS_IOC_QUOTA_RESCAN :
return btrfs_ioctl_quota_rescan ( file , argp ) ;
case BTRFS_IOC_QUOTA_RESCAN_STATUS :
return btrfs_ioctl_quota_rescan_status ( file , argp ) ;
2013-05-06 23:14:17 +04:00
case BTRFS_IOC_QUOTA_RESCAN_WAIT :
return btrfs_ioctl_quota_rescan_wait ( file , argp ) ;
2012-11-06 18:08:53 +04:00
case BTRFS_IOC_DEV_REPLACE :
2016-06-23 01:54:24 +03:00
return btrfs_ioctl_dev_replace ( fs_info , argp ) ;
2013-11-16 00:33:55 +04:00
case BTRFS_IOC_GET_SUPPORTED_FEATURES :
2016-02-17 17:26:27 +03:00
return btrfs_ioctl_get_supported_features ( argp ) ;
2013-11-16 00:33:55 +04:00
case BTRFS_IOC_GET_FEATURES :
return btrfs_ioctl_get_features ( file , argp ) ;
case BTRFS_IOC_SET_FEATURES :
return btrfs_ioctl_set_features ( file , argp ) ;
2018-03-26 20:51:16 +03:00
case FS_IOC_FSGETXATTR :
return btrfs_ioctl_fsgetxattr ( file , argp ) ;
2018-03-26 20:51:16 +03:00
case FS_IOC_FSSETXATTR :
return btrfs_ioctl_fssetxattr ( file , argp ) ;
2018-05-21 04:09:42 +03:00
case BTRFS_IOC_GET_SUBVOL_INFO :
return btrfs_ioctl_get_subvol_info ( file , argp ) ;
2018-05-21 04:09:43 +03:00
case BTRFS_IOC_GET_SUBVOL_ROOTREF :
return btrfs_ioctl_get_subvol_rootref ( file , argp ) ;
2018-05-21 04:09:44 +03:00
case BTRFS_IOC_INO_LOOKUP_USER :
return btrfs_ioctl_ino_lookup_user ( file , argp ) ;
2008-06-12 05:53:53 +04:00
}
return - ENOTTY ;
}
2015-10-29 11:22:21 +03:00
# ifdef CONFIG_COMPAT
long btrfs_compat_ioctl ( struct file * file , unsigned int cmd , unsigned long arg )
{
2017-02-07 03:39:09 +03:00
/*
* These all access 32 - bit values anyway so no further
* handling is necessary .
*/
2015-10-29 11:22:21 +03:00
switch ( cmd ) {
case FS_IOC32_GETFLAGS :
cmd = FS_IOC_GETFLAGS ;
break ;
case FS_IOC32_SETFLAGS :
cmd = FS_IOC_SETFLAGS ;
break ;
case FS_IOC32_GETVERSION :
cmd = FS_IOC_GETVERSION ;
break ;
}
return btrfs_ioctl ( file , cmd , ( unsigned long ) compat_ptr ( arg ) ) ;
}
# endif