2018-04-03 19:23:33 +02:00
// SPDX-License-Identifier: GPL-2.0
2008-06-11 21:53:53 -04:00
/*
* Copyright ( C ) 2007 Oracle . All rights reserved .
*/
# include <linux/kernel.h>
# include <linux/bio.h>
# include <linux/file.h>
# include <linux/fs.h>
2008-10-09 13:39:39 -04:00
# include <linux/fsnotify.h>
2008-06-11 21:53:53 -04:00
# include <linux/pagemap.h>
# include <linux/highmem.h>
# include <linux/time.h>
# include <linux/string.h>
# include <linux/backing-dev.h>
2008-10-09 13:39:39 -04:00
# include <linux/mount.h>
# include <linux/namei.h>
2008-06-11 21:53:53 -04:00
# include <linux/writeback.h>
# include <linux/compat.h>
2008-10-09 13:39:39 -04:00
# include <linux/security.h>
2008-06-11 21:53:53 -04:00
# include <linux/xattr.h>
2017-05-31 19:32:09 +02:00
# include <linux/mm.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 17:04:11 +09:00
# include <linux/slab.h>
2011-03-24 10:24:28 +00:00
# include <linux/blkdev.h>
2012-07-25 17:35:53 +02:00
# include <linux/uuid.h>
2013-01-29 06:04:50 +00:00
# include <linux/btrfs.h>
2013-08-06 11:42:51 -07:00
# include <linux/uaccess.h>
2018-01-29 06:41:30 -05:00
# include <linux/iversion.h>
2021-04-07 14:36:43 +02:00
# include <linux/fileattr.h>
2008-06-11 21:53:53 -04:00
# include "ctree.h"
# include "disk-io.h"
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
# include "export.h"
2008-06-11 21:53:53 -04:00
# include "transaction.h"
# include "btrfs_inode.h"
# include "print-tree.h"
# include "volumes.h"
2008-06-25 16:01:30 -04:00
# include "locking.h"
2011-07-07 16:48:38 +02:00
# include "backref.h"
2012-06-04 14:03:51 -04:00
# include "rcu-string.h"
2012-07-25 23:19:24 +02:00
# include "send.h"
2012-11-06 15:08:53 +01:00
# include "dev-replace.h"
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
# include "props.h"
2013-11-01 13:07:02 -04:00
# include "sysfs.h"
2014-05-13 17:30:47 -07:00
# include "qgroup.h"
Btrfs: fix unreplayable log after snapshot delete + parent dir fsync
If we delete a snapshot, fsync its parent directory and crash/power fail
before the next transaction commit, on the next mount when we attempt to
replay the log tree of the root containing the parent directory we will
fail and prevent the filesystem from mounting, which is solvable by wiping
out the log trees with the btrfs-zero-log tool but very inconvenient as
we will lose any data and metadata fsynced before the parent directory
was fsynced.
For example:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt
$ mkdir /mnt/testdir
$ btrfs subvolume snapshot /mnt /mnt/testdir/snap
$ btrfs subvolume delete /mnt/testdir/snap
$ xfs_io -c "fsync" /mnt/testdir
< crash / power failure and reboot >
$ mount /dev/sdc /mnt
mount: mount(2) failed: No such file or directory
And in dmesg/syslog we get the following message and trace:
[192066.361162] BTRFS info (device dm-0): failed to delete reference to snap, inode 257 parent 257
[192066.363010] ------------[ cut here ]------------
[192066.365268] WARNING: CPU: 4 PID: 5130 at fs/btrfs/inode.c:3986 __btrfs_unlink_inode+0x17a/0x354 [btrfs]()
[192066.367250] BTRFS: Transaction aborted (error -2)
[192066.368401] Modules linked in: btrfs dm_flakey dm_mod ppdev sha256_generic xor raid6_pq hmac drbg ansi_cprng aesni_intel acpi_cpufreq tpm_tis aes_x86_64 tpm ablk_helper evdev cryptd sg parport_pc i2c_piix4 psmouse lrw parport i2c_core pcspkr gf128mul processor serio_raw glue_helper button loop autofs4 ext4 crc16 mbcache jbd2 sd_mod sr_mod cdrom ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring crc32c_intel scsi_mod e1000 virtio floppy [last unloaded: btrfs]
[192066.377154] CPU: 4 PID: 5130 Comm: mount Tainted: G W 4.4.0-rc6-btrfs-next-20+ #1
[192066.378875] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[192066.380889] 0000000000000000 ffff880143923670 ffffffff81257570 ffff8801439236b8
[192066.382561] ffff8801439236a8 ffffffff8104ec07 ffffffffa039dc2c 00000000fffffffe
[192066.384191] ffff8801ed31d000 ffff8801b9fc9c88 ffff8801086875e0 ffff880143923710
[192066.385827] Call Trace:
[192066.386373] [<ffffffff81257570>] dump_stack+0x4e/0x79
[192066.387387] [<ffffffff8104ec07>] warn_slowpath_common+0x99/0xb2
[192066.388429] [<ffffffffa039dc2c>] ? __btrfs_unlink_inode+0x17a/0x354 [btrfs]
[192066.389236] [<ffffffff8104ec68>] warn_slowpath_fmt+0x48/0x50
[192066.389884] [<ffffffffa039dc2c>] __btrfs_unlink_inode+0x17a/0x354 [btrfs]
[192066.390621] [<ffffffff81184b55>] ? iput+0xb0/0x266
[192066.391200] [<ffffffffa039ea25>] btrfs_unlink_inode+0x1c/0x3d [btrfs]
[192066.391930] [<ffffffffa03ca623>] check_item_in_log+0x1fe/0x29b [btrfs]
[192066.392715] [<ffffffffa03ca827>] replay_dir_deletes+0x167/0x1cf [btrfs]
[192066.393510] [<ffffffffa03cccc7>] replay_one_buffer+0x417/0x570 [btrfs]
[192066.394241] [<ffffffffa03ca164>] walk_up_log_tree+0x10e/0x1dc [btrfs]
[192066.394958] [<ffffffffa03cac72>] walk_log_tree+0xa5/0x190 [btrfs]
[192066.395628] [<ffffffffa03ce8b8>] btrfs_recover_log_trees+0x239/0x32c [btrfs]
[192066.396790] [<ffffffffa03cc8b0>] ? replay_one_extent+0x50a/0x50a [btrfs]
[192066.397891] [<ffffffffa0394041>] open_ctree+0x1d8b/0x2167 [btrfs]
[192066.398897] [<ffffffffa03706e1>] btrfs_mount+0x5ef/0x729 [btrfs]
[192066.399823] [<ffffffff8108ad98>] ? trace_hardirqs_on+0xd/0xf
[192066.400739] [<ffffffff8108959b>] ? lockdep_init_map+0xb9/0x1b3
[192066.401700] [<ffffffff811714b9>] mount_fs+0x67/0x131
[192066.402482] [<ffffffff81188560>] vfs_kern_mount+0x6c/0xde
[192066.403930] [<ffffffffa03702bd>] btrfs_mount+0x1cb/0x729 [btrfs]
[192066.404831] [<ffffffff8108ad98>] ? trace_hardirqs_on+0xd/0xf
[192066.405726] [<ffffffff8108959b>] ? lockdep_init_map+0xb9/0x1b3
[192066.406621] [<ffffffff811714b9>] mount_fs+0x67/0x131
[192066.407401] [<ffffffff81188560>] vfs_kern_mount+0x6c/0xde
[192066.408247] [<ffffffff8118ae36>] do_mount+0x893/0x9d2
[192066.409047] [<ffffffff8113009b>] ? strndup_user+0x3f/0x8c
[192066.409842] [<ffffffff8118b187>] SyS_mount+0x75/0xa1
[192066.410621] [<ffffffff8147e517>] entry_SYSCALL_64_fastpath+0x12/0x6b
[192066.411572] ---[ end trace 2de42126c1e0a0f0 ]---
[192066.412344] BTRFS: error (device dm-0) in __btrfs_unlink_inode:3986: errno=-2 No such entry
[192066.413748] BTRFS: error (device dm-0) in btrfs_replay_log:2464: errno=-2 No such entry (Failed to recover log tree)
[192066.415458] BTRFS error (device dm-0): cleaner transaction attach returned -30
[192066.444613] BTRFS: open_ctree failed
This happens because when we are replaying the log and processing the
directory entry pointing to the snapshot in the subvolume tree, we treat
its btrfs_dir_item item as having a location with a key type matching
BTRFS_INODE_ITEM_KEY, which is wrong because the type matches
BTRFS_ROOT_ITEM_KEY and therefore must be processed differently, as the
object id refers to a root number and not to an inode in the root
containing the parent directory.
So fix this by triggering a transaction commit if an fsync against the
parent directory is requested after deleting a snapshot. This is the
simplest approach for a rare use case. Some alternative that avoids the
transaction commit would require more code to explicitly delete the
snapshot at log replay time (factoring out common code from ioctl.c:
btrfs_ioctl_snap_destroy()), special care at fsync time to remove the
log tree of the snapshot's root from the log root of the root of tree
roots, amongst other steps.
A test case for xfstests that triggers the issue follows.
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
_cleanup_flakey
cd /
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
. ./common/dmflakey
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_dm_target flakey
_require_metadata_journaling $SCRATCH_DEV
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_init_flakey
_mount_flakey
# Create a snapshot at the root of our filesystem (mount point path), delete it,
# fsync the mount point path, crash and mount to replay the log. This should
# succeed and after the filesystem is mounted the snapshot should not be visible
# anymore.
_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT $SCRATCH_MNT/snap1
_run_btrfs_util_prog subvolume delete $SCRATCH_MNT/snap1
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT
_flakey_drop_and_remount
[ -e $SCRATCH_MNT/snap1 ] && \
echo "Snapshot snap1 still exists after log replay"
# Similar scenario as above, but this time the snapshot is created inside a
# directory and not directly under the root (mount point path).
mkdir $SCRATCH_MNT/testdir
_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT $SCRATCH_MNT/testdir/snap2
_run_btrfs_util_prog subvolume delete $SCRATCH_MNT/testdir/snap2
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir
_flakey_drop_and_remount
[ -e $SCRATCH_MNT/testdir/snap2 ] && \
echo "Snapshot snap2 still exists after log replay"
_unmount_flakey
echo "Silence is golden"
status=0
exit
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-02-10 10:42:25 +00:00
# include "tree-log.h"
2016-03-10 17:26:59 +08:00
# include "compression.h"
2019-06-18 16:09:16 -04:00
# include "space-info.h"
2019-06-19 15:12:00 -04:00
# include "delalloc-space.h"
2019-06-20 15:37:44 -04:00
# include "block-group.h"
2008-06-11 21:53:53 -04:00
2014-01-30 20:17:00 +00:00
# ifdef CONFIG_64BIT
/*
 * If we have a 32-bit userspace and 64-bit kernel, then the UAPI
 * structures are incorrect, as the timespec structure from userspace
 * is 4 bytes too small. We define these alternatives here to teach
 * the kernel about the 32-bit struct packing.
 */

/*
 * Timespec as packed by 32-bit userspace: a __u64 seconds field followed
 * directly by a __u32 nanoseconds field. The packed attribute means no
 * padding is added after nsec.
 */
struct btrfs_ioctl_timespec_32 {
__u64 sec ;
__u32 nsec ;
} __attribute__ ( ( __packed__ ) ) ;

/*
 * "Received subvolume" ioctl arguments in the 32-bit layout. The two time
 * fields use the packed 32-bit timespec above. The in/out markers on each
 * field give the direction of data flow between userspace and the kernel.
 */
struct btrfs_ioctl_received_subvol_args_32 {
char uuid [ BTRFS_UUID_SIZE ] ; /* in */
__u64 stransid ; /* in */
__u64 rtransid ; /* out */
struct btrfs_ioctl_timespec_32 stime ; /* in */
struct btrfs_ioctl_timespec_32 rtime ; /* out */
__u64 flags ; /* in */
__u64 reserved [ 16 ] ; /* in */
} __attribute__ ( ( __packed__ ) ) ;

/*
 * ioctl command for the 32-bit layout: _IOWR on BTRFS_IOCTL_MAGIC, number 37,
 * with the packed structure above as its argument type.
 */
# define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
struct btrfs_ioctl_received_subvol_args_32 )
# endif
2017-09-27 10:43:13 -04:00
# if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
/*
 * Send ioctl arguments in the packed layout used by compat callers. The
 * clone_sources field is a compat_uptr_t rather than a native pointer.
 * NOTE(review): presumably mirrors the native send args structure field for
 * field -- confirm against the UAPI header.
 */
struct btrfs_ioctl_send_args_32 {
__s64 send_fd ; /* in */
__u64 clone_sources_count ; /* in */
compat_uptr_t clone_sources ; /* in */
__u64 parent_root ; /* in */
__u64 flags ; /* in */
__u64 reserved [ 4 ] ; /* in */
} __attribute__ ( ( __packed__ ) ) ;

/*
 * ioctl command for the compat layout: _IOW on BTRFS_IOCTL_MAGIC, number 38,
 * with the packed structure above as its argument type.
 */
# define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
struct btrfs_ioctl_send_args_32 )
# endif
2014-01-30 20:17:00 +00:00
2009-04-17 10:37:41 +02:00
/* Mask out flags that are inappropriate for the given type of inode. */
2018-03-26 18:52:15 +02:00
static unsigned int btrfs_mask_fsflags_for_type ( struct inode * inode ,
unsigned int flags )
2009-04-17 10:37:41 +02:00
{
2018-03-26 18:52:15 +02:00
if ( S_ISDIR ( inode - > i_mode ) )
2009-04-17 10:37:41 +02:00
return flags ;
2018-03-26 18:52:15 +02:00
else if ( S_ISREG ( inode - > i_mode ) )
2009-04-17 10:37:41 +02:00
return flags & ~ FS_DIRSYNC_FL ;
else
return flags & ( FS_NODUMP_FL | FS_NOATIME_FL ) ;
}
/*
2018-03-26 19:12:25 +02:00
* Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
* ioctl .
2009-04-17 10:37:41 +02:00
*/
2018-03-26 19:12:25 +02:00
static unsigned int btrfs_inode_flags_to_fsflags ( unsigned int flags )
2009-04-17 10:37:41 +02:00
{
unsigned int iflags = 0 ;
if ( flags & BTRFS_INODE_SYNC )
iflags | = FS_SYNC_FL ;
if ( flags & BTRFS_INODE_IMMUTABLE )
iflags | = FS_IMMUTABLE_FL ;
if ( flags & BTRFS_INODE_APPEND )
iflags | = FS_APPEND_FL ;
if ( flags & BTRFS_INODE_NODUMP )
iflags | = FS_NODUMP_FL ;
if ( flags & BTRFS_INODE_NOATIME )
iflags | = FS_NOATIME_FL ;
if ( flags & BTRFS_INODE_DIRSYNC )
iflags | = FS_DIRSYNC_FL ;
2011-04-15 03:03:06 +00:00
if ( flags & BTRFS_INODE_NODATACOW )
iflags | = FS_NOCOW_FL ;
2016-03-15 09:09:59 +09:00
if ( flags & BTRFS_INODE_NOCOMPRESS )
2011-04-15 03:03:06 +00:00
iflags | = FS_NOCOMP_FL ;
2016-03-15 09:09:59 +09:00
else if ( flags & BTRFS_INODE_COMPRESS )
iflags | = FS_COMPR_FL ;
2009-04-17 10:37:41 +02:00
return iflags ;
}
/*
* Update inode - > i_flags based on the btrfs internal flags .
*/
2018-03-26 18:40:21 +02:00
void btrfs_sync_inode_flags_to_i_flags ( struct inode * inode )
2009-04-17 10:37:41 +02:00
{
2018-04-23 15:45:18 +02:00
struct btrfs_inode * binode = BTRFS_I ( inode ) ;
2014-06-25 22:36:02 +01:00
unsigned int new_fl = 0 ;
2009-04-17 10:37:41 +02:00
2018-04-23 15:45:18 +02:00
if ( binode - > flags & BTRFS_INODE_SYNC )
2014-06-25 22:36:02 +01:00
new_fl | = S_SYNC ;
2018-04-23 15:45:18 +02:00
if ( binode - > flags & BTRFS_INODE_IMMUTABLE )
2014-06-25 22:36:02 +01:00
new_fl | = S_IMMUTABLE ;
2018-04-23 15:45:18 +02:00
if ( binode - > flags & BTRFS_INODE_APPEND )
2014-06-25 22:36:02 +01:00
new_fl | = S_APPEND ;
2018-04-23 15:45:18 +02:00
if ( binode - > flags & BTRFS_INODE_NOATIME )
2014-06-25 22:36:02 +01:00
new_fl | = S_NOATIME ;
2018-04-23 15:45:18 +02:00
if ( binode - > flags & BTRFS_INODE_DIRSYNC )
2014-06-25 22:36:02 +01:00
new_fl | = S_DIRSYNC ;
set_mask_bits ( & inode - > i_flags ,
S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC ,
new_fl ) ;
2009-04-17 10:37:41 +02:00
}
2020-07-10 09:49:56 +02:00
/*
 * Check that @flags is a supported and valid set of FS_*_FL flags and that
 * it does not conflict with the flags currently set (@old_flags).
 *
 * Returns 0 when the new flags are acceptable, -EOPNOTSUPP when @flags
 * contains a bit outside the supported set, and -EINVAL when the requested
 * combination is contradictory.
 */
static int check_fsflags(unsigned int old_flags, unsigned int flags)
{
	const unsigned int supported = FS_IMMUTABLE_FL | FS_APPEND_FL |
				       FS_NOATIME_FL | FS_NODUMP_FL |
				       FS_SYNC_FL | FS_DIRSYNC_FL |
				       FS_NOCOMP_FL | FS_COMPR_FL |
				       FS_NOCOW_FL;
	const unsigned int compression = FS_COMPR_FL | FS_NOCOMP_FL;

	if (flags & ~supported)
		return -EOPNOTSUPP;

	/*
	 * COMPR and NOCOMP may differ between old and new flags, but the new
	 * set must not request both at once.
	 */
	if ((flags & compression) == compression)
		return -EINVAL;

	if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL))
		return -EINVAL;

	/* NOCOW and compression options are mutually exclusive */
	if ((old_flags & FS_NOCOW_FL) && (flags & compression))
		return -EINVAL;
	if ((flags & FS_NOCOW_FL) && (old_flags & compression))
		return -EINVAL;

	return 0;
}
2020-11-10 20:26:11 +09:00
/*
 * Check @flags against what this filesystem instance can honour.
 *
 * When btrfs_is_zoned() reports true for @fs_info, FS_NOCOW_FL is refused
 * with -EPERM. Returns 0 in every other case.
 */
static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
				    unsigned int flags)
{
	if (!btrfs_is_zoned(fs_info))
		return 0;

	return (flags & FS_NOCOW_FL) ? -EPERM : 0;
}
2021-04-07 14:36:43 +02:00
/*
* Set flags / xflags from the internal inode flags . The remaining items of
* fsxattr are zeroed .
*/
int btrfs_fileattr_get ( struct dentry * dentry , struct fileattr * fa )
2009-04-17 10:37:41 +02:00
{
2021-04-07 14:36:43 +02:00
struct btrfs_inode * binode = BTRFS_I ( d_inode ( dentry ) ) ;
fileattr_fill_flags ( fa , btrfs_inode_flags_to_fsflags ( binode - > flags ) ) ;
return 0 ;
}
int btrfs_fileattr_set ( struct user_namespace * mnt_userns ,
struct dentry * dentry , struct fileattr * fa )
{
struct inode * inode = d_inode ( dentry ) ;
2016-06-22 18:54:23 -04:00
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2018-04-23 15:45:18 +02:00
struct btrfs_inode * binode = BTRFS_I ( inode ) ;
struct btrfs_root * root = binode - > root ;
2009-04-17 10:37:41 +02:00
struct btrfs_trans_handle * trans ;
2019-07-01 08:25:34 -07:00
unsigned int fsflags , old_fsflags ;
2009-04-17 10:37:41 +02:00
int ret ;
2019-04-20 19:48:53 +08:00
const char * comp = NULL ;
2020-07-10 09:49:56 +02:00
u32 binode_flags ;
2009-04-17 10:37:41 +02:00
2010-12-20 16:04:08 +08:00
if ( btrfs_root_readonly ( root ) )
return - EROFS ;
2021-04-07 14:36:43 +02:00
if ( fileattr_has_fsx ( fa ) )
return - EOPNOTSUPP ;
2012-06-12 16:20:32 +02:00
2021-04-07 14:36:43 +02:00
fsflags = btrfs_mask_fsflags_for_type ( inode , fa - > flags ) ;
2019-07-01 08:25:34 -07:00
old_fsflags = btrfs_inode_flags_to_fsflags ( binode - > flags ) ;
2020-07-10 09:49:56 +02:00
ret = check_fsflags ( old_fsflags , fsflags ) ;
if ( ret )
2021-04-07 14:36:43 +02:00
return ret ;
2020-07-10 09:49:56 +02:00
2020-11-10 20:26:11 +09:00
ret = check_fsflags_compatible ( fs_info , fsflags ) ;
if ( ret )
2021-04-07 14:36:43 +02:00
return ret ;
2020-11-10 20:26:11 +09:00
2020-07-10 09:49:56 +02:00
binode_flags = binode - > flags ;
2018-04-23 15:45:18 +02:00
if ( fsflags & FS_SYNC_FL )
2019-04-20 19:48:55 +08:00
binode_flags | = BTRFS_INODE_SYNC ;
2009-04-17 10:37:41 +02:00
else
2019-04-20 19:48:55 +08:00
binode_flags & = ~ BTRFS_INODE_SYNC ;
2018-04-23 15:45:18 +02:00
if ( fsflags & FS_IMMUTABLE_FL )
2019-04-20 19:48:55 +08:00
binode_flags | = BTRFS_INODE_IMMUTABLE ;
2009-04-17 10:37:41 +02:00
else
2019-04-20 19:48:55 +08:00
binode_flags & = ~ BTRFS_INODE_IMMUTABLE ;
2018-04-23 15:45:18 +02:00
if ( fsflags & FS_APPEND_FL )
2019-04-20 19:48:55 +08:00
binode_flags | = BTRFS_INODE_APPEND ;
2009-04-17 10:37:41 +02:00
else
2019-04-20 19:48:55 +08:00
binode_flags & = ~ BTRFS_INODE_APPEND ;
2018-04-23 15:45:18 +02:00
if ( fsflags & FS_NODUMP_FL )
2019-04-20 19:48:55 +08:00
binode_flags | = BTRFS_INODE_NODUMP ;
2009-04-17 10:37:41 +02:00
else
2019-04-20 19:48:55 +08:00
binode_flags & = ~ BTRFS_INODE_NODUMP ;
2018-04-23 15:45:18 +02:00
if ( fsflags & FS_NOATIME_FL )
2019-04-20 19:48:55 +08:00
binode_flags | = BTRFS_INODE_NOATIME ;
2009-04-17 10:37:41 +02:00
else
2019-04-20 19:48:55 +08:00
binode_flags & = ~ BTRFS_INODE_NOATIME ;
2021-04-07 14:36:43 +02:00
/* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
if ( ! fa - > flags_valid ) {
/* 1 item for the inode */
trans = btrfs_start_transaction ( root , 1 ) ;
2021-04-30 21:30:55 +05:30
if ( IS_ERR ( trans ) )
return PTR_ERR ( trans ) ;
2021-04-07 14:36:43 +02:00
goto update_flags ;
}
2018-04-23 15:45:18 +02:00
if ( fsflags & FS_DIRSYNC_FL )
2019-04-20 19:48:55 +08:00
binode_flags | = BTRFS_INODE_DIRSYNC ;
2009-04-17 10:37:41 +02:00
else
2019-04-20 19:48:55 +08:00
binode_flags & = ~ BTRFS_INODE_DIRSYNC ;
2018-04-23 15:45:18 +02:00
if ( fsflags & FS_NOCOW_FL ) {
2019-04-20 19:48:57 +08:00
if ( S_ISREG ( inode - > i_mode ) ) {
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch is simple, but it has user-visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that actually allow changing the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) ditto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's opinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 05:56:55 -06:00
/*
* It ' s safe to turn csums off here , no extents exist .
* Otherwise we want the flag to reflect the real COW
* status of the file and will not set it .
*/
if ( inode - > i_size = = 0 )
2019-04-20 19:48:55 +08:00
binode_flags | = BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM ;
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch si simple, but it has user visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that acutally allow to change the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) dtto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 05:56:55 -06:00
} else {
2019-04-20 19:48:55 +08:00
binode_flags | = BTRFS_INODE_NODATACOW ;
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch si simple, but it has user visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that acutally allow to change the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) dtto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 05:56:55 -06:00
}
} else {
/*
2016-05-19 21:18:45 -04:00
* Revert back under same assumptions as above
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch si simple, but it has user visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that acutally allow to change the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) dtto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 05:56:55 -06:00
*/
2019-04-20 19:48:57 +08:00
if ( S_ISREG ( inode - > i_mode ) ) {
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch si simple, but it has user visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that acutally allow to change the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) dtto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 05:56:55 -06:00
if ( inode - > i_size = = 0 )
2019-04-20 19:48:55 +08:00
binode_flags & = ~ ( BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM ) ;
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch si simple, but it has user visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that acutally allow to change the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) dtto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 05:56:55 -06:00
} else {
2019-04-20 19:48:55 +08:00
binode_flags & = ~ BTRFS_INODE_NODATACOW ;
btrfs: allow setting NOCOW for a zero sized file via ioctl
Hi,
the patch si simple, but it has user visible impact and I'm not quite sure how
to resolve it.
In short, $subj says it, chattr -C supports it and we want to use it.
The conditions that acutally allow to change the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:
1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
warn that the COW flag has not been changed
2.2.1) dtto, no syslog message
Man page of chattr states that
"If it is set on a file which already has data blocks, it is undefined when
the blocks assigned to the file will be fully stable."
Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.
My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.
The patch implements 2.2.1.
david
-------------8<-------------------
From: David Sterba <dsterba@suse.cz>
It's safe to turn off checksums for a zero sized file.
http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030
"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.
Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."
Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 05:56:55 -06:00
}
}
2009-04-17 10:37:41 +02:00
Btrfs: Per file/directory controls for COW and compression
Data compression and data cow are controlled across the entire FS by mount
options right now. ioctls are needed to set this on a per file or per
directory basis. This has been proposed previously, but VFS developers
wanted us to use generic ioctls rather than btrfs-specific ones.
According to Chris's comment, there should be just one true compression
method(probably LZO) stored in the super. However, before this, we would
wait for that one method is stable enough to be adopted into the super.
So I list it as a long term goal, and just store it in ram today.
After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to
control file and directory's datacow and compression attribute.
NOTE:
- The compression type is selected by such rules:
If we mount btrfs with compress options, ie, zlib/lzo, the type is it.
Otherwise, we'll use the default compress type (zlib today).
v1->v2:
- rebase to the latest btrfs.
v2->v3:
- fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW
will be screwed by inheritance from parent directory.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-22 10:12:20 +00:00
/*
* The COMPRESS flag can only be changed by users , while the NOCOMPRESS
* flag may be changed automatically if compression code won ' t make
* things smaller .
*/
2018-04-23 15:45:18 +02:00
if ( fsflags & FS_NOCOMP_FL ) {
2019-04-20 19:48:55 +08:00
binode_flags & = ~ BTRFS_INODE_COMPRESS ;
binode_flags | = BTRFS_INODE_NOCOMPRESS ;
2018-04-23 15:45:18 +02:00
} else if ( fsflags & FS_COMPR_FL ) {
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
2021-04-07 14:36:43 +02:00
if ( IS_SWAPFILE ( inode ) )
return - ETXTBSY ;
Btrfs: prevent ioctls from interfering with a swap file
A later patch will implement swap file support for Btrfs, but before we
do that, we need to make sure that the various Btrfs ioctls cannot
change a swap file.
When a swap file is active, we must make sure that the extents of the
file are not moved and that they don't become shared. That means that
the following are not safe:
- chattr +c (enable compression)
- reflink
- dedupe
- snapshot
- defrag
Don't allow those to happen on an active swap file.
Additionally, balance, resize, device remove, and device replace are
also unsafe if they affect an active swapfile. Add a red-black tree of
block groups and devices which contain an active swapfile. Relocation
checks each block group against this tree and skips it or errors out for
balance or resize, respectively. Device remove and device replace check
the tree for the device they will operate on.
Note that we don't have to worry about chattr -C (disable nocow), which
we ignore for non-empty files, because an active swapfile must be
non-empty and can't be truncated. We also don't have to worry about
autodefrag because it's only done on COW files. Truncate and fallocate
are already taken care of by the generic code. Device add doesn't do
relocation so it's not an issue, either.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2016-11-03 10:28:12 -07:00
2019-04-20 19:48:55 +08:00
binode_flags | = BTRFS_INODE_COMPRESS ;
binode_flags & = ~ BTRFS_INODE_NOCOMPRESS ;
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
2017-10-31 17:32:41 +01:00
comp = btrfs_compress_type2str ( fs_info - > compress_type ) ;
if ( ! comp | | comp [ 0 ] = = 0 )
comp = btrfs_compress_type2str ( BTRFS_COMPRESS_ZLIB ) ;
2011-04-15 03:03:17 +00:00
} else {
2019-04-20 19:48:55 +08:00
binode_flags & = ~ ( BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS ) ;
Btrfs: Per file/directory controls for COW and compression
Data compression and data cow are controlled across the entire FS by mount
options right now. ioctls are needed to set this on a per file or per
directory basis. This has been proposed previously, but VFS developers
wanted us to use generic ioctls rather than btrfs-specific ones.
According to Chris's comment, there should be just one true compression
method (probably LZO) stored in the super. However, before this, we would
wait until that one method is stable enough to be adopted into the super.
So I list it as a long term goal, and just store it in ram today.
After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to
control file and directory's datacow and compression attribute.
NOTE:
- The compression type is selected by such rules:
If we mount btrfs with compress options, ie, zlib/lzo, the type is it.
Otherwise, we'll use the default compress type (zlib today).
v1->v2:
- rebase to the latest btrfs.
v2->v3:
- fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW
will be screwed by inheritance from parent directory.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-22 10:12:20 +00:00
}
2009-04-17 10:37:41 +02:00
2019-04-20 19:48:53 +08:00
/*
* 1 for inode item
* 2 for properties
*/
trans = btrfs_start_transaction ( root , 3 ) ;
2021-04-07 14:36:43 +02:00
if ( IS_ERR ( trans ) )
return PTR_ERR ( trans ) ;
2009-04-17 10:37:41 +02:00
2019-04-20 19:48:53 +08:00
if ( comp ) {
ret = btrfs_set_prop ( trans , inode , " btrfs.compression " , comp ,
strlen ( comp ) , 0 ) ;
if ( ret ) {
btrfs_abort_transaction ( trans , ret ) ;
goto out_end_trans ;
}
} else {
ret = btrfs_set_prop ( trans , inode , " btrfs.compression " , NULL ,
0 , 0 ) ;
if ( ret & & ret ! = - ENODATA ) {
btrfs_abort_transaction ( trans , ret ) ;
goto out_end_trans ;
}
}
2021-04-07 14:36:43 +02:00
update_flags :
2019-04-20 19:48:55 +08:00
binode - > flags = binode_flags ;
2018-03-26 18:40:21 +02:00
btrfs_sync_inode_flags_to_i_flags ( inode ) ;
2012-04-05 15:03:02 -04:00
inode_inc_iversion ( inode ) ;
2016-09-14 07:48:06 -07:00
inode - > i_ctime = current_time ( inode ) ;
2020-11-02 16:48:59 +02:00
ret = btrfs_update_inode ( trans , root , BTRFS_I ( inode ) ) ;
2009-04-17 10:37:41 +02:00
2019-04-20 19:48:53 +08:00
out_end_trans :
2016-09-09 21:39:03 -04:00
btrfs_end_transaction ( trans ) ;
2011-02-24 09:38:16 +00:00
return ret ;
2009-04-17 10:37:41 +02:00
}
2020-08-25 10:02:32 -05:00
bool btrfs_exclop_start ( struct btrfs_fs_info * fs_info ,
enum btrfs_exclusive_operation type )
{
return ! cmpxchg ( & fs_info - > exclusive_operation , BTRFS_EXCLOP_NONE , type ) ;
}
/*
 * Mark the current exclusive operation as finished.
 *
 * Stores BTRFS_EXCLOP_NONE into fs_info->exclusive_operation and afterwards
 * calls sysfs_notify() for the attribute named by the string literal below,
 * on the fsid kobject of the filesystem's fs_devices.
 */
void btrfs_exclop_finish ( struct btrfs_fs_info * fs_info )
{
WRITE_ONCE ( fs_info - > exclusive_operation , BTRFS_EXCLOP_NONE ) ;
2020-08-25 10:02:33 -05:00
/* NOTE(review): presumably wakes up pollers of the sysfs file - confirm against sysfs_notify() */
sysfs_notify ( & fs_info - > fs_devices - > fsid_kobj , NULL , " exclusive_operation " ) ;
2020-08-25 10:02:32 -05:00
}
2009-04-17 10:37:41 +02:00
/*
 * Report the generation of the inode behind @file to userspace.
 *
 * @file: open file whose inode is queried
 * @arg:  user pointer that receives inode->i_generation
 *
 * Return: the result of put_user().
 */
static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
{
	return put_user(file_inode(file)->i_generation, arg);
}
2008-06-11 21:53:53 -04:00
2019-10-10 21:23:11 -03:00
/*
 * FITRIM handler: trim free space of the filesystem.
 *
 * @fs_info: filesystem to operate on
 * @arg:     user pointer to a struct fstrim_range; it is read as the request
 *           and written back after btrfs_trim_fs() has returned without a
 *           negative value
 *
 * Return: 0 on success, -EPERM without CAP_SYS_ADMIN, -EOPNOTSUPP on a zoned
 * filesystem or when no device with a block device supports discard, -EROFS
 * when mounted with nologreplay, -EFAULT when the user buffer cannot be
 * copied, -EINVAL when the requested length is below the block size, or the
 * negative value returned by btrfs_trim_fs().
 */
static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
					void __user *arg)
{
	struct fstrim_range range;
	struct btrfs_device *dev;
	u64 granularity = ULLONG_MAX;
	u64 nr_discard_devs = 0;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * btrfs_trim_block_group() depends on space cache, which is not
	 * available in zoned filesystem.  So, disallow fitrim on a zoned
	 * filesystem for now.
	 */
	if (btrfs_is_zoned(fs_info))
		return -EOPNOTSUPP;

	/*
	 * If the fs is mounted with nologreplay, which requires it to be
	 * mounted in RO mode as well, we can not allow discard on free space
	 * inside block groups, because log trees refer to extents that are not
	 * pinned in a block group's free space cache (pinning the extents is
	 * precisely the first phase of replaying a log tree).
	 */
	if (btrfs_test_opt(fs_info, NOLOGREPLAY))
		return -EROFS;

	/*
	 * Count the devices that can discard and remember the smallest
	 * discard granularity among them.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
		struct request_queue *queue;

		if (!dev->bdev)
			continue;

		queue = bdev_get_queue(dev->bdev);
		if (!blk_queue_discard(queue))
			continue;

		nr_discard_devs++;
		granularity = min_t(u64, queue->limits.discard_granularity,
				    granularity);
	}
	rcu_read_unlock();

	if (nr_discard_devs == 0)
		return -EOPNOTSUPP;

	if (copy_from_user(&range, arg, sizeof(range)))
		return -EFAULT;

	/*
	 * NOTE: Don't truncate the range using super->total_bytes.  Bytenr of
	 * block group is in the logical address space, which can be any
	 * sectorsize aligned bytenr in the range [0, U64_MAX].
	 */
	if (range.len < fs_info->sb->s_blocksize)
		return -EINVAL;

	/* Never ask for less than the smallest granularity found above. */
	range.minlen = max(range.minlen, granularity);

	ret = btrfs_trim_fs(fs_info, &range);
	if (ret < 0)
		return ret;

	if (copy_to_user(arg, &range, sizeof(range)))
		return -EFAULT;

	return 0;
}
2019-10-01 19:57:39 +02:00
/*
 * Check whether a UUID consists solely of zero bytes.
 *
 * @uuid: buffer from which BTRFS_UUID_SIZE bytes are examined
 *
 * Return: 1 if every examined byte is zero, 0 as soon as a non-zero byte is
 * found.
 */
int __pure btrfs_is_empty_uuid(u8 *uuid)
{
	const u8 *end = uuid + BTRFS_UUID_SIZE;
	const u8 *pos;

	for (pos = uuid; pos < end; pos++) {
		if (*pos != 0)
			return 0;
	}

	return 1;
}
2013-02-28 10:04:33 +00:00
static noinline int create_subvol ( struct inode * dir ,
2008-10-09 13:39:39 -04:00
struct dentry * dentry ,
2017-02-14 18:33:53 +01:00
const char * name , int namelen ,
2013-02-07 06:02:44 +00:00
struct btrfs_qgroup_inherit * inherit )
2008-06-11 21:53:53 -04:00
{
2016-06-22 18:54:23 -04:00
struct btrfs_fs_info * fs_info = btrfs_sb ( dir - > i_sb ) ;
2008-06-11 21:53:53 -04:00
struct btrfs_trans_handle * trans ;
struct btrfs_key key ;
2016-03-24 17:49:22 +01:00
struct btrfs_root_item * root_item ;
2008-06-11 21:53:53 -04:00
struct btrfs_inode_item * inode_item ;
struct extent_buffer * leaf ;
2013-02-28 10:04:33 +00:00
struct btrfs_root * root = BTRFS_I ( dir ) - > root ;
2009-09-21 16:00:26 -04:00
struct btrfs_root * new_root ;
2013-02-28 10:04:33 +00:00
struct btrfs_block_rsv block_rsv ;
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following cocinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-08 19:36:02 -07:00
struct timespec64 cur_time = current_time ( dir ) ;
2013-12-13 09:51:42 +09:00
struct inode * inode ;
2008-06-11 21:53:53 -04:00
int ret ;
int err ;
2020-06-16 10:17:36 +08:00
dev_t anon_dev = 0 ;
2008-06-11 21:53:53 -04:00
u64 objectid ;
2008-11-17 21:02:50 -05:00
u64 index = 0 ;
2008-06-11 21:53:53 -04:00
2016-03-24 17:49:22 +01:00
root_item = kzalloc ( sizeof ( * root_item ) , GFP_KERNEL ) ;
if ( ! root_item )
return - ENOMEM ;
2020-12-07 17:32:33 +02:00
ret = btrfs_get_free_objectid ( fs_info - > tree_root , & objectid ) ;
2011-07-16 21:38:06 -04:00
if ( ret )
2016-03-24 17:49:22 +01:00
goto fail_free ;
2010-11-20 09:48:00 +00:00
2020-06-16 10:17:36 +08:00
ret = get_anon_bdev ( & anon_dev ) ;
if ( ret < 0 )
goto fail_free ;
2015-02-27 16:24:23 +08:00
/*
* Don ' t create subvolume whose level is not zero . Or qgroup will be
2016-05-19 21:18:45 -04:00
* screwed up since it assumes subvolume qgroup ' s level to be 0.
2015-02-27 16:24:23 +08:00
*/
2016-03-24 17:49:22 +01:00
if ( btrfs_qgroup_level ( objectid ) ) {
ret = - ENOSPC ;
goto fail_free ;
}
2015-02-27 16:24:23 +08:00
2013-02-28 10:04:33 +00:00
btrfs_init_block_rsv ( & block_rsv , BTRFS_BLOCK_RSV_TEMP ) ;
2009-09-11 16:12:44 -04:00
/*
2013-02-28 10:04:33 +00:00
* The same as the snapshot creation , please see the comment
* of create_snapshot ( ) .
2009-09-11 16:12:44 -04:00
*/
2018-05-30 11:00:38 +08:00
ret = btrfs_subvolume_reserve_metadata ( root , & block_rsv , 8 , false ) ;
2013-02-28 10:04:33 +00:00
if ( ret )
2016-03-24 17:49:22 +01:00
goto fail_free ;
2013-02-28 10:04:33 +00:00
trans = btrfs_start_transaction ( root , 0 ) ;
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations
[BUG]
When quota is enabled for TEST_DEV, generic/013 sometimes fails like this:
generic/013 14s ... _check_dmesg: something found in dmesg (see xfstests-dev/results//generic/013.dmesg)
And with the following metadata leak:
BTRFS warning (device dm-3): qgroup 0/1370 has unreleased space, type 2 rsv 49152
------------[ cut here ]------------
WARNING: CPU: 2 PID: 47912 at fs/btrfs/disk-io.c:4078 close_ctree+0x1dc/0x323 [btrfs]
Call Trace:
btrfs_put_super+0x15/0x17 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x17/0x30 [btrfs]
deactivate_locked_super+0x3b/0xa0
deactivate_super+0x40/0x50
cleanup_mnt+0x135/0x190
__cleanup_mnt+0x12/0x20
task_work_run+0x64/0xb0
__prepare_exit_to_usermode+0x1bc/0x1c0
__syscall_return_slowpath+0x47/0x230
do_syscall_64+0x64/0xb0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
---[ end trace a6cfd45ba80e4e06 ]---
BTRFS error (device dm-3): qgroup reserved space leaked
BTRFS info (device dm-3): disk space caching is enabled
BTRFS info (device dm-3): has skinny extents
[CAUSE]
The qgroup preallocated meta rsv operations of that offending root are:
btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
btrfs_subvolume_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=49152
btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
It's pretty obvious that, we reserve qgroup meta rsv in
btrfs_subvolume_reserve_metadata(), but doesn't have corresponding
release/convert calls in btrfs_subvolume_release_metadata().
This leads to the leakage.
[FIX]
To fix this bug, we should follow what we're doing in
btrfs_delalloc_reserve_metadata(), where we reserve qgroup space, and
add it to block_rsv->qgroup_rsv_reserved.
And free the qgroup reserved metadata space when releasing the
block_rsv.
To do this, we need to change the btrfs_subvolume_release_metadata() to
accept btrfs_root, and record the qgroup_to_release number, and call
btrfs_qgroup_convert_reserved_meta() for it.
Fixes: 733e03a0b26a ("btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-24 14:46:10 +08:00
btrfs_subvolume_release_metadata ( root , & block_rsv ) ;
2016-03-24 17:49:22 +01:00
goto fail_free ;
2013-02-28 10:04:33 +00:00
}
trans - > block_rsv = & block_rsv ;
trans - > bytes_reserved = block_rsv . size ;
2008-06-11 21:53:53 -04:00
2018-07-18 14:45:41 +08:00
ret = btrfs_qgroup_inherit ( trans , 0 , objectid , inherit ) ;
2011-09-14 15:58:21 +02:00
if ( ret )
goto fail ;
2020-08-20 11:46:03 -04:00
leaf = btrfs_alloc_tree_block ( trans , root , 0 , objectid , NULL , 0 , 0 , 0 ,
BTRFS_NESTING_NORMAL ) ;
2008-07-24 12:17:14 -04:00
if ( IS_ERR ( leaf ) ) {
ret = PTR_ERR ( leaf ) ;
goto fail ;
}
2008-06-11 21:53:53 -04:00
btrfs_mark_buffer_dirty ( leaf ) ;
2016-03-24 17:49:22 +01:00
inode_item = & root_item - > inode ;
2013-07-16 11:19:18 +08:00
btrfs_set_stack_inode_generation ( inode_item , 1 ) ;
btrfs_set_stack_inode_size ( inode_item , 3 ) ;
btrfs_set_stack_inode_nlink ( inode_item , 1 ) ;
2016-06-15 09:22:56 -04:00
btrfs_set_stack_inode_nbytes ( inode_item ,
2016-06-22 18:54:23 -04:00
fs_info - > nodesize ) ;
2013-07-16 11:19:18 +08:00
btrfs_set_stack_inode_mode ( inode_item , S_IFDIR | 0755 ) ;
2008-06-11 21:53:53 -04:00
2016-03-24 17:49:22 +01:00
btrfs_set_root_flags ( root_item , 0 ) ;
btrfs_set_root_limit ( root_item , 0 ) ;
2013-07-16 11:19:18 +08:00
btrfs_set_stack_inode_flags ( inode_item , BTRFS_INODE_ROOT_ITEM_INIT ) ;
2011-03-28 02:01:25 +00:00
2016-03-24 17:49:22 +01:00
btrfs_set_root_bytenr ( root_item , leaf - > start ) ;
btrfs_set_root_generation ( root_item , trans - > transid ) ;
btrfs_set_root_level ( root_item , 0 ) ;
btrfs_set_root_refs ( root_item , 1 ) ;
btrfs_set_root_used ( root_item , leaf - > len ) ;
btrfs_set_root_last_snapshot ( root_item , 0 ) ;
2008-06-11 21:53:53 -04:00
2016-03-24 17:49:22 +01:00
btrfs_set_root_generation_v2 ( root_item ,
btrfs_root_generation ( root_item ) ) ;
2020-02-24 17:37:51 +02:00
generate_random_guid ( root_item - > uuid ) ;
2016-03-24 17:49:22 +01:00
btrfs_set_stack_timespec_sec ( & root_item - > otime , cur_time . tv_sec ) ;
btrfs_set_stack_timespec_nsec ( & root_item - > otime , cur_time . tv_nsec ) ;
root_item - > ctime = root_item - > otime ;
btrfs_set_root_ctransid ( root_item , trans - > transid ) ;
btrfs_set_root_otransid ( root_item , trans - > transid ) ;
2008-06-11 21:53:53 -04:00
2008-06-25 16:01:30 -04:00
btrfs_tree_unlock ( leaf ) ;
2008-06-11 21:53:53 -04:00
2020-12-07 17:32:37 +02:00
btrfs_set_root_dirid ( root_item , BTRFS_FIRST_FREE_OBJECTID ) ;
2008-06-11 21:53:53 -04:00
key . objectid = objectid ;
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
key . offset = 0 ;
2014-06-04 18:41:45 +02:00
key . type = BTRFS_ROOT_ITEM_KEY ;
2016-06-22 18:54:23 -04:00
ret = btrfs_insert_root ( trans , fs_info - > tree_root , & key ,
2016-03-24 17:49:22 +01:00
root_item ) ;
2021-04-20 10:55:12 +01:00
if ( ret ) {
/*
* Since we don ' t abort the transaction in this case , free the
* tree block so that we don ' t leak space and leave the
* filesystem in an inconsistent state ( an extent item in the
* extent tree without backreferences ) . Also no need to have
* the tree block locked since it is not in any tree at this
* point , so no other task can find it and use it .
*/
btrfs_free_tree_block ( trans , root , leaf , 0 , 1 ) ;
free_extent_buffer ( leaf ) ;
2008-06-11 21:53:53 -04:00
goto fail ;
2021-04-20 10:55:12 +01:00
}
free_extent_buffer ( leaf ) ;
leaf = NULL ;
2008-06-11 21:53:53 -04:00
2009-09-21 16:00:26 -04:00
key . offset = ( u64 ) - 1 ;
2020-06-16 10:17:36 +08:00
new_root = btrfs_get_new_fs_root ( fs_info , objectid , anon_dev ) ;
2012-03-12 16:03:00 +01:00
if ( IS_ERR ( new_root ) ) {
2020-06-16 10:17:36 +08:00
free_anon_bdev ( anon_dev ) ;
2012-03-12 16:03:00 +01:00
ret = PTR_ERR ( new_root ) ;
2016-06-10 18:19:25 -04:00
btrfs_abort_transaction ( trans , ret ) ;
2012-03-12 16:03:00 +01:00
goto fail ;
}
2020-06-16 10:17:36 +08:00
/* Freeing will be done in btrfs_put_root() of new_root */
anon_dev = 0 ;
2009-09-21 16:00:26 -04:00
2021-03-12 15:25:06 -05:00
ret = btrfs_record_root_in_trans ( trans , new_root ) ;
if ( ret ) {
btrfs_put_root ( new_root ) ;
btrfs_abort_transaction ( trans , ret ) ;
goto fail ;
}
2009-09-21 16:00:26 -04:00
2020-12-07 17:32:37 +02:00
ret = btrfs_create_subvol_root ( trans , new_root , root ) ;
2020-01-24 09:33:01 -05:00
btrfs_put_root ( new_root ) ;
2011-07-26 11:32:23 -07:00
if ( ret ) {
/* We potentially lose an unused inode item here */
2016-06-10 18:19:25 -04:00
btrfs_abort_transaction ( trans , ret ) ;
2011-07-26 11:32:23 -07:00
goto fail ;
}
2008-06-11 21:53:53 -04:00
/*
* insert the directory item
*/
2017-02-20 13:50:33 +02:00
ret = btrfs_set_inode_index ( BTRFS_I ( dir ) , & index ) ;
2012-03-12 16:03:00 +01:00
if ( ret ) {
2016-06-10 18:19:25 -04:00
btrfs_abort_transaction ( trans , ret ) ;
2012-03-12 16:03:00 +01:00
goto fail ;
}
2008-11-17 21:02:50 -05:00
2018-08-04 21:10:57 +08:00
ret = btrfs_insert_dir_item ( trans , name , namelen , BTRFS_I ( dir ) , & key ,
2008-11-17 21:02:50 -05:00
BTRFS_FT_DIR , index ) ;
2012-03-12 16:03:00 +01:00
if ( ret ) {
2016-06-10 18:19:25 -04:00
btrfs_abort_transaction ( trans , ret ) ;
2008-06-11 21:53:53 -04:00
goto fail ;
2012-03-12 16:03:00 +01:00
}
2008-11-17 20:37:39 -05:00
2017-02-20 13:50:34 +02:00
btrfs_i_size_write ( BTRFS_I ( dir ) , dir - > i_size + namelen * 2 ) ;
2020-11-02 16:48:59 +02:00
ret = btrfs_update_inode ( trans , root , BTRFS_I ( dir ) ) ;
2019-12-06 09:37:15 -05:00
if ( ret ) {
btrfs_abort_transaction ( trans , ret ) ;
goto fail ;
}
2009-01-05 15:43:43 -05:00
2018-08-01 11:32:29 +08:00
ret = btrfs_add_root_ref ( trans , objectid , root - > root_key . objectid ,
2017-01-10 20:35:31 +02:00
btrfs_ino ( BTRFS_I ( dir ) ) , index , name , namelen ) ;
2019-12-06 09:37:15 -05:00
if ( ret ) {
btrfs_abort_transaction ( trans , ret ) ;
goto fail ;
}
2008-06-11 21:53:53 -04:00
2018-05-29 15:01:53 +08:00
ret = btrfs_uuid_tree_add ( trans , root_item - > uuid ,
2016-06-21 21:16:51 -04:00
BTRFS_UUID_KEY_SUBVOL , objectid ) ;
2013-08-15 17:11:20 +02:00
if ( ret )
2016-06-10 18:19:25 -04:00
btrfs_abort_transaction ( trans , ret ) ;
2013-08-15 17:11:20 +02:00
2008-06-11 21:53:53 -04:00
fail :
2016-03-24 17:49:22 +01:00
kfree ( root_item ) ;
2013-02-28 10:04:33 +00:00
trans - > block_rsv = NULL ;
trans - > bytes_reserved = 0 ;
btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations
[BUG]
When quota is enabled for TEST_DEV, generic/013 sometimes fails like this:
generic/013 14s ... _check_dmesg: something found in dmesg (see xfstests-dev/results//generic/013.dmesg)
And with the following metadata leak:
BTRFS warning (device dm-3): qgroup 0/1370 has unreleased space, type 2 rsv 49152
------------[ cut here ]------------
WARNING: CPU: 2 PID: 47912 at fs/btrfs/disk-io.c:4078 close_ctree+0x1dc/0x323 [btrfs]
Call Trace:
btrfs_put_super+0x15/0x17 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x17/0x30 [btrfs]
deactivate_locked_super+0x3b/0xa0
deactivate_super+0x40/0x50
cleanup_mnt+0x135/0x190
__cleanup_mnt+0x12/0x20
task_work_run+0x64/0xb0
__prepare_exit_to_usermode+0x1bc/0x1c0
__syscall_return_slowpath+0x47/0x230
do_syscall_64+0x64/0xb0
entry_SYSCALL_64_after_hwframe+0x44/0xa9
---[ end trace a6cfd45ba80e4e06 ]---
BTRFS error (device dm-3): qgroup reserved space leaked
BTRFS info (device dm-3): disk space caching is enabled
BTRFS info (device dm-3): has skinny extents
[CAUSE]
The qgroup preallocated meta rsv operations of that offending root are:
btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
btrfs_subvolume_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=49152
btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
It's pretty obvious that, we reserve qgroup meta rsv in
btrfs_subvolume_reserve_metadata(), but doesn't have corresponding
release/convert calls in btrfs_subvolume_release_metadata().
This leads to the leakage.
[FIX]
To fix this bug, we should follow what we're doing in
btrfs_delalloc_reserve_metadata(), where we reserve qgroup space, and
add it to block_rsv->qgroup_rsv_reserved.
And free the qgroup reserved metadata space when releasing the
block_rsv.
To do this, we need to change the btrfs_subvolume_release_metadata() to
accept btrfs_root, and record the qgroup_to_release number, and call
btrfs_qgroup_convert_reserved_meta() for it.
Fixes: 733e03a0b26a ("btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-24 14:46:10 +08:00
btrfs_subvolume_release_metadata ( root , & block_rsv ) ;
2014-01-09 14:57:06 +08:00
2020-03-13 17:23:20 +02:00
err = btrfs_commit_transaction ( trans ) ;
2008-06-11 21:53:53 -04:00
if ( err & & ! ret )
ret = err ;
2013-02-06 12:06:02 -05:00
2013-12-13 09:51:42 +09:00
if ( ! ret ) {
inode = btrfs_lookup_dentry ( dir , dentry ) ;
2014-01-09 14:57:06 +08:00
if ( IS_ERR ( inode ) )
return PTR_ERR ( inode ) ;
2013-12-13 09:51:42 +09:00
d_instantiate ( dentry , inode ) ;
}
2008-06-11 21:53:53 -04:00
return ret ;
2016-03-24 17:49:22 +01:00
fail_free :
2020-06-16 10:17:36 +08:00
if ( anon_dev )
free_anon_bdev ( anon_dev ) ;
2016-03-24 17:49:22 +01:00
kfree ( root_item ) ;
return ret ;
2008-06-11 21:53:53 -04:00
}
2013-02-28 10:01:15 +00:00
/*
 * Create a snapshot of @root named by @dentry inside directory @dir.
 *
 * A pending-snapshot record is allocated, queued on the running
 * transaction's pending_snapshots list and the transaction is committed.
 * The result is then read back from the record (->error and ->snap), so the
 * commit path is presumably what fills those in.
 *
 * @root:     subvolume to snapshot; must have BTRFS_ROOT_SHAREABLE set and no
 *            active swapfiles
 * @dir:      directory inode the snapshot is created in
 * @dentry:   dentry for the new snapshot; instantiated on success
 * @readonly: recorded in the pending snapshot
 * @inherit:  qgroup inheritance data, recorded in the pending snapshot
 *
 * Returns 0 on success or a negative errno.
 */
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
			   struct dentry *dentry, bool readonly,
			   struct btrfs_qgroup_inherit *inherit)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_pending_snapshot *pending;
	struct btrfs_trans_handle *trans;
	struct inode *snap_inode;
	int ret;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return -EINVAL;

	/* Snapshotting is refused while the subvolume has active swapfiles. */
	if (atomic_read(&root->nr_swapfiles) != 0) {
		btrfs_warn(fs_info,
			   " cannot snapshot subvolume with active swapfile ");
		return -ETXTBSY;
	}

	pending = kzalloc(sizeof(*pending), GFP_KERNEL);
	if (pending == NULL)
		return -ENOMEM;

	ret = get_anon_bdev(&pending->anon_dev);
	if (ret < 0)
		goto free_pending;

	/* Attempt both allocations first, then check them together. */
	pending->root_item = kzalloc(sizeof(struct btrfs_root_item),
				     GFP_KERNEL);
	pending->path = btrfs_alloc_path();
	if (pending->root_item == NULL || pending->path == NULL) {
		ret = -ENOMEM;
		goto free_pending;
	}

	btrfs_init_block_rsv(&pending->block_rsv, BTRFS_BLOCK_RSV_TEMP);

	/*
	 * Reserve room for 8 items:
	 *   1 - parent dir inode
	 *   2 - dir entries
	 *   1 - root item
	 *   2 - root ref/backref
	 *   1 - root of snapshot
	 *   1 - UUID item
	 *
	 * NOTE(review): the reservation is taken against the root of @dir,
	 * while the release at "fail" is issued against @root. Confirm this
	 * asymmetry is intended.
	 */
	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
					       &pending->block_rsv, 8,
					       false);
	if (ret != 0)
		goto free_pending;

	pending->root = root;
	pending->dir = dir;
	pending->dentry = dentry;
	pending->readonly = readonly;
	pending->inherit = inherit;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto fail;
	}

	/* Queue the request on the transaction, then commit it. */
	spin_lock(&fs_info->trans_lock);
	list_add(&pending->list, &trans->transaction->pending_snapshots);
	spin_unlock(&fs_info->trans_lock);

	ret = btrfs_commit_transaction(trans);
	if (ret != 0)
		goto fail;

	/* The commit succeeded; now pick up the snapshot's own status. */
	ret = pending->error;
	if (ret != 0)
		goto fail;

	ret = btrfs_orphan_cleanup(pending->snap);
	if (ret != 0)
		goto fail;

	snap_inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
	if (IS_ERR(snap_inode)) {
		ret = PTR_ERR(snap_inode);
		goto fail;
	}

	d_instantiate(dentry, snap_inode);
	ret = 0;
	/* Success: clear our copy so free_pending does not release it. */
	pending->anon_dev = 0;

fail:
	/*
	 * Prevent double freeing of anon_dev: on failure the id is released
	 * through pending->anon_dev below, so clear it on the snapshot root
	 * before dropping that root.
	 */
	if (ret != 0 && pending->snap != NULL)
		pending->snap->anon_dev = 0;
	btrfs_put_root(pending->snap);
	btrfs_subvolume_release_metadata(root, &pending->block_rsv);

free_pending:
	if (pending->anon_dev)
		free_anon_bdev(pending->anon_dev);
	kfree(pending->root_item);
	btrfs_free_path(pending->path);
	kfree(pending);

	return ret;
}
2010-10-29 15:46:43 -04:00
/* copy of may_delete in fs/namei.c()
* Check whether we can remove a link victim from directory dir , check
* whether the type of victim is right .
* 1. We can ' t do it if dir is read - only ( done in permission ( ) )
* 2. We should have write and exec permissions on dir
* 3. We can ' t remove anything from append - only dir
* 4. We can ' t do anything with immutable dir ( done in permission ( ) )
* 5. If the sticky bit on dir is set we should either
* a . be owner of dir , or
* b . be owner of victim , or
* c . have CAP_FOWNER capability
2016-05-19 21:18:45 -04:00
* 6. If the victim is append - only or immutable we can ' t do anything with
2010-10-29 15:46:43 -04:00
* links pointing to it .
* 7. If we were asked to remove a directory and victim isn ' t one - ENOTDIR .
* 8. If we were asked to remove a non - directory and victim isn ' t one - EISDIR .
* 9. We can ' t remove a root or mountpoint .
* 10. We don ' t allow removal of NFS sillyrenamed files ; it ' s handled by
* nfs_async_unlink ( ) .
*/
2013-10-31 10:33:04 +05:30
static int btrfs_may_delete ( struct inode * dir , struct dentry * victim , int isdir )
2010-10-29 15:46:43 -04:00
{
int error ;
2015-03-17 22:25:59 +00:00
if ( d_really_is_negative ( victim ) )
2010-10-29 15:46:43 -04:00
return - ENOENT ;
2015-03-17 22:25:59 +00:00
BUG_ON ( d_inode ( victim - > d_parent ) ! = dir ) ;
2012-10-10 15:25:25 -04:00
audit_inode_child ( dir , victim , AUDIT_TYPE_CHILD_DELETE ) ;
2010-10-29 15:46:43 -04:00
2021-01-21 14:19:24 +01:00
error = inode_permission ( & init_user_ns , dir , MAY_WRITE | MAY_EXEC ) ;
2010-10-29 15:46:43 -04:00
if ( error )
return error ;
if ( IS_APPEND ( dir ) )
return - EPERM ;
2021-01-21 14:19:31 +01:00
if ( check_sticky ( & init_user_ns , dir , d_inode ( victim ) ) | |
IS_APPEND ( d_inode ( victim ) ) | | IS_IMMUTABLE ( d_inode ( victim ) ) | |
IS_SWAPFILE ( d_inode ( victim ) ) )
2010-10-29 15:46:43 -04:00
return - EPERM ;
if ( isdir ) {
VFS: (Scripted) Convert S_ISLNK/DIR/REG(dentry->d_inode) to d_is_*(dentry)
Convert the following where appropriate:
(1) S_ISLNK(dentry->d_inode) to d_is_symlink(dentry).
(2) S_ISREG(dentry->d_inode) to d_is_reg(dentry).
(3) S_ISDIR(dentry->d_inode) to d_is_dir(dentry). This is actually more
complicated than it appears as some calls should be converted to
d_can_lookup() instead. The difference is whether the directory in
question is a real dir with a ->lookup op or whether it's a fake dir with
a ->d_automount op.
In some circumstances, we can subsume checks for dentry->d_inode not being
NULL into this, provided we the code isn't in a filesystem that expects
d_inode to be NULL if the dirent really *is* negative (ie. if we're going to
use d_inode() rather than d_backing_inode() to get the inode pointer).
Note that the dentry type field may be set to something other than
DCACHE_MISS_TYPE when d_inode is NULL in the case of unionmount, where the VFS
manages the fall-through from a negative dentry to a lower layer. In such a
case, the dentry type of the negative union dentry is set to the same as the
type of the lower dentry.
However, if you know d_inode is not NULL at the call site, then you can use
the d_is_xxx() functions even in a filesystem.
There is one further complication: a 0,0 chardev dentry may be labelled
DCACHE_WHITEOUT_TYPE rather than DCACHE_SPECIAL_TYPE. Strictly, this was
intended for special directory entry types that don't have attached inodes.
The following perl+coccinelle script was used:
use strict;
my @callers;
open($fd, 'git grep -l \'S_IS[A-Z].*->d_inode\' |') ||
die "Can't grep for S_ISDIR and co. callers";
@callers = <$fd>;
close($fd);
unless (@callers) {
print "No matches\n";
exit(0);
}
my @cocci = (
'@@',
'expression E;',
'@@',
'',
'- S_ISLNK(E->d_inode->i_mode)',
'+ d_is_symlink(E)',
'',
'@@',
'expression E;',
'@@',
'',
'- S_ISDIR(E->d_inode->i_mode)',
'+ d_is_dir(E)',
'',
'@@',
'expression E;',
'@@',
'',
'- S_ISREG(E->d_inode->i_mode)',
'+ d_is_reg(E)' );
my $coccifile = "tmp.sp.cocci";
open($fd, ">$coccifile") || die $coccifile;
print($fd "$_\n") || die $coccifile foreach (@cocci);
close($fd);
foreach my $file (@callers) {
chomp $file;
print "Processing ", $file, "\n";
system("spatch", "--sp-file", $coccifile, $file, "--in-place", "--no-show-diff") == 0 ||
die "spatch failed";
}
[AV: overlayfs parts skipped]
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-01-29 12:02:35 +00:00
if ( ! d_is_dir ( victim ) )
2010-10-29 15:46:43 -04:00
return - ENOTDIR ;
if ( IS_ROOT ( victim ) )
return - EBUSY ;
VFS: (Scripted) Convert S_ISLNK/DIR/REG(dentry->d_inode) to d_is_*(dentry)
Convert the following where appropriate:
(1) S_ISLNK(dentry->d_inode) to d_is_symlink(dentry).
(2) S_ISREG(dentry->d_inode) to d_is_reg(dentry).
(3) S_ISDIR(dentry->d_inode) to d_is_dir(dentry). This is actually more
complicated than it appears as some calls should be converted to
d_can_lookup() instead. The difference is whether the directory in
question is a real dir with a ->lookup op or whether it's a fake dir with
a ->d_automount op.
In some circumstances, we can subsume checks for dentry->d_inode not being
NULL into this, provided we the code isn't in a filesystem that expects
d_inode to be NULL if the dirent really *is* negative (ie. if we're going to
use d_inode() rather than d_backing_inode() to get the inode pointer).
Note that the dentry type field may be set to something other than
DCACHE_MISS_TYPE when d_inode is NULL in the case of unionmount, where the VFS
manages the fall-through from a negative dentry to a lower layer. In such a
case, the dentry type of the negative union dentry is set to the same as the
type of the lower dentry.
However, if you know d_inode is not NULL at the call site, then you can use
the d_is_xxx() functions even in a filesystem.
There is one further complication: a 0,0 chardev dentry may be labelled
DCACHE_WHITEOUT_TYPE rather than DCACHE_SPECIAL_TYPE. Strictly, this was
intended for special directory entry types that don't have attached inodes.
The following perl+coccinelle script was used:
use strict;
my @callers;
open($fd, 'git grep -l \'S_IS[A-Z].*->d_inode\' |') ||
die "Can't grep for S_ISDIR and co. callers";
@callers = <$fd>;
close($fd);
unless (@callers) {
print "No matches\n";
exit(0);
}
my @cocci = (
'@@',
'expression E;',
'@@',
'',
'- S_ISLNK(E->d_inode->i_mode)',
'+ d_is_symlink(E)',
'',
'@@',
'expression E;',
'@@',
'',
'- S_ISDIR(E->d_inode->i_mode)',
'+ d_is_dir(E)',
'',
'@@',
'expression E;',
'@@',
'',
'- S_ISREG(E->d_inode->i_mode)',
'+ d_is_reg(E)' );
my $coccifile = "tmp.sp.cocci";
open($fd, ">$coccifile") || die $coccifile;
print($fd "$_\n") || die $coccifile foreach (@cocci);
close($fd);
foreach my $file (@callers) {
chomp $file;
print "Processing ", $file, "\n";
system("spatch", "--sp-file", $coccifile, $file, "--in-place", "--no-show-diff") == 0 ||
die "spatch failed";
}
[AV: overlayfs parts skipped]
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-01-29 12:02:35 +00:00
} else if ( d_is_dir ( victim ) )
2010-10-29 15:46:43 -04:00
return - EISDIR ;
if ( IS_DEADDIR ( dir ) )
return - ENOENT ;
if ( victim - > d_flags & DCACHE_NFSFS_RENAMED )
return - EBUSY ;
return 0 ;
}
2008-10-09 13:39:39 -04:00
/* copy of may_create in fs/namei.c() */
static inline int btrfs_may_create ( struct inode * dir , struct dentry * child )
{
2015-03-17 22:25:59 +00:00
if ( d_really_is_positive ( child ) )
2008-10-09 13:39:39 -04:00
return - EEXIST ;
if ( IS_DEADDIR ( dir ) )
return - ENOENT ;
2021-01-21 14:19:24 +01:00
return inode_permission ( & init_user_ns , dir , MAY_WRITE | MAY_EXEC ) ;
2008-10-09 13:39:39 -04:00
}
/*
* Create a new subvolume below @ parent . This is largely modeled after
* sys_mkdirat and vfs_mkdir , but we only do a single component lookup
* inside this filesystem so it ' s quite a bit simpler .
*/
2016-11-20 19:34:31 -05:00
static noinline int btrfs_mksubvol ( const struct path * parent ,
2017-02-14 18:33:53 +01:00
const char * name , int namelen ,
2010-10-29 15:41:32 -04:00
struct btrfs_root * snap_src ,
2020-03-13 17:23:20 +02:00
bool readonly ,
2013-02-07 06:02:44 +00:00
struct btrfs_qgroup_inherit * inherit )
2008-10-09 13:39:39 -04:00
{
2016-06-22 18:54:23 -04:00
struct inode * dir = d_inode ( parent - > dentry ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( dir - > i_sb ) ;
2008-10-09 13:39:39 -04:00
struct dentry * dentry ;
int error ;
2016-05-26 00:05:12 -04:00
error = down_write_killable_nested ( & dir - > i_rwsem , I_MUTEX_PARENT ) ;
if ( error = = - EINTR )
return error ;
2008-10-09 13:39:39 -04:00
dentry = lookup_one_len ( name , parent - > dentry , namelen ) ;
error = PTR_ERR ( dentry ) ;
if ( IS_ERR ( dentry ) )
goto out_unlock ;
2009-09-21 16:00:26 -04:00
error = btrfs_may_create ( dir , dentry ) ;
2008-10-09 13:39:39 -04:00
if ( error )
2012-06-29 03:58:46 -06:00
goto out_dput ;
2008-10-09 13:39:39 -04:00
2012-12-17 14:26:57 -05:00
/*
* even if this name doesn ' t exist , we may get hash collisions .
* check for them now when we can safely fail
*/
error = btrfs_check_dir_item_collision ( BTRFS_I ( dir ) - > root ,
dir - > i_ino , name ,
namelen ) ;
if ( error )
goto out_dput ;
2016-06-22 18:54:23 -04:00
down_read ( & fs_info - > subvol_sem ) ;
2009-09-21 16:00:26 -04:00
if ( btrfs_root_refs ( & BTRFS_I ( dir ) - > root - > root_item ) = = 0 )
goto out_up_read ;
2020-03-13 17:23:20 +02:00
if ( snap_src )
error = create_snapshot ( snap_src , dir , dentry , readonly , inherit ) ;
else
error = create_subvol ( dir , dentry , name , namelen , inherit ) ;
2009-09-21 16:00:26 -04:00
if ( ! error )
fsnotify_mkdir ( dir , dentry ) ;
out_up_read :
2016-06-22 18:54:23 -04:00
up_read ( & fs_info - > subvol_sem ) ;
2008-10-09 13:39:39 -04:00
out_dput :
dput ( dentry ) ;
out_unlock :
2021-02-10 17:14:34 -05:00
btrfs_inode_unlock ( dir , 0 ) ;
2008-10-09 13:39:39 -04:00
return error ;
}
2020-05-14 17:19:18 +08:00
/*
 * Snapshot @root under @parent with the given @name (@namelen bytes).
 *
 * Wraps btrfs_mksubvol() with the steps needed before a snapshot: the
 * root's snapshot_lock is taken on the read side, delalloc is started,
 * snapshot_force_cow is raised and outstanding ordered extents are waited
 * for.
 *
 * Returns 0 or a negative errno.
 */
static noinline int btrfs_mksnapshot(const struct path *parent,
				     const char *name, int namelen,
				     struct btrfs_root *root,
				     bool readonly,
				     struct btrfs_qgroup_inherit *inherit)
{
	int ret;

	/*
	 * Force new buffered writes to reserve space even when NOCOW is
	 * possible, so that later writeback does not fall back to COW mode
	 * and unexpectedly fail with ENOSPC.
	 */
	btrfs_drew_read_lock(&root->snapshot_lock);

	ret = btrfs_start_delalloc_snapshot(root, false);
	if (ret)
		goto out_unlock;

	/*
	 * All previous writes have started writeback in NOCOW mode, so from
	 * here on force future writes to fall back to COW mode for the
	 * duration of the snapshot creation.
	 */
	atomic_inc(&root->snapshot_force_cow);

	btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);

	ret = btrfs_mksubvol(parent, name, namelen, root, readonly, inherit);

	/* Balances the atomic_inc() above; only reached once it was done. */
	atomic_dec(&root->snapshot_force_cow);
out_unlock:
	btrfs_drew_read_unlock(&root->snapshot_lock);
	return ret;
}
2011-05-24 15:35:30 -04:00
/*
* When we ' re defragging a range , we don ' t want to kick it off again
* if it is really just waiting for delalloc to send it down .
* If we find a nice big extent or delalloc range for the bytes in the
* file you want to defrag , we return 0 to let you know to skip this
* part of the file
*/
2014-07-29 17:32:10 +02:00
static int check_defrag_in_cache ( struct inode * inode , u64 offset , u32 thresh )
2011-05-24 15:35:30 -04:00
{
struct extent_io_tree * io_tree = & BTRFS_I ( inode ) - > io_tree ;
struct extent_map * em = NULL ;
struct extent_map_tree * em_tree = & BTRFS_I ( inode ) - > extent_tree ;
u64 end ;
read_lock ( & em_tree - > lock ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
em = lookup_extent_mapping ( em_tree , offset , PAGE_SIZE ) ;
2011-05-24 15:35:30 -04:00
read_unlock ( & em_tree - > lock ) ;
if ( em ) {
end = extent_map_end ( em ) ;
free_extent_map ( em ) ;
if ( end - offset > thresh )
return 0 ;
}
/* if we already have a nice delalloc here, just stop */
thresh / = 2 ;
end = count_range_bits ( io_tree , & offset , offset + thresh ,
thresh , EXTENT_DELALLOC , 1 ) ;
if ( end > = thresh )
return 0 ;
return 1 ;
}
/*
* helper function to walk through a file and find extents
* newer than a specific transid , and smaller than thresh .
*
* This is used by the defragging code to find new and small
* extents
*/
static int find_new_extents ( struct btrfs_root * root ,
struct inode * inode , u64 newer_than ,
2014-07-29 17:32:10 +02:00
u64 * off , u32 thresh )
2011-05-24 15:35:30 -04:00
{
struct btrfs_path * path ;
struct btrfs_key min_key ;
struct extent_buffer * leaf ;
struct btrfs_file_extent_item * extent ;
int type ;
int ret ;
2017-01-10 20:35:31 +02:00
u64 ino = btrfs_ino ( BTRFS_I ( inode ) ) ;
2011-05-24 15:35:30 -04:00
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
2011-05-31 17:08:14 +00:00
min_key . objectid = ino ;
2011-05-24 15:35:30 -04:00
min_key . type = BTRFS_EXTENT_DATA_KEY ;
min_key . offset = * off ;
2013-10-31 10:33:04 +05:30
while ( 1 ) {
2013-10-01 16:13:42 +01:00
ret = btrfs_search_forward ( root , & min_key , path , newer_than ) ;
2011-05-24 15:35:30 -04:00
if ( ret ! = 0 )
goto none ;
Btrfs: less fs tree lock contention when using autodefrag
When finding new extents during an autodefrag, don't do so many fs tree
lookups to find an extent with a size smaller then the target treshold.
Instead, after each fs tree forward search immediately unlock upper
levels and process the entire leaf while holding a read lock on the leaf,
since our leaf processing is very fast.
This reduces lock contention, allowing for higher concurrency when other
tasks want to write/update items related to other inodes in the fs tree,
as we're not holding read locks on upper tree levels while processing the
leaf and we do less tree searches.
Test:
sysbench --test=fileio --file-num=512 --file-total-size=16G \
--file-test-mode=rndrw --num-threads=32 --file-block-size=32768 \
--file-rw-ratio=3 --file-io-mode=sync --max-time=1800 \
--max-requests=10000000000 [prepare|run]
(fileystem mounted with -o autodefrag, averages of 5 runs)
Before this change: 58.852Mb/sec throughtput, read 77.589Gb, written 25.863Gb
After this change: 63.034Mb/sec throughtput, read 83.102Gb, written 27.701Gb
Test machine: quad core intel i5-3570K, 32Gb of RAM, SSD.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-03-12 01:28:24 +00:00
process_slot :
2011-05-31 17:08:14 +00:00
if ( min_key . objectid ! = ino )
2011-05-24 15:35:30 -04:00
goto none ;
if ( min_key . type ! = BTRFS_EXTENT_DATA_KEY )
goto none ;
leaf = path - > nodes [ 0 ] ;
extent = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
type = btrfs_file_extent_type ( leaf , extent ) ;
if ( type = = BTRFS_FILE_EXTENT_REG & &
btrfs_file_extent_num_bytes ( leaf , extent ) < thresh & &
check_defrag_in_cache ( inode , min_key . offset , thresh ) ) {
* off = min_key . offset ;
btrfs_free_path ( path ) ;
return 0 ;
}
Btrfs: less fs tree lock contention when using autodefrag
When finding new extents during an autodefrag, don't do so many fs tree
lookups to find an extent with a size smaller than the target threshold.
Instead, after each fs tree forward search immediately unlock upper
levels and process the entire leaf while holding a read lock on the leaf,
since our leaf processing is very fast.
This reduces lock contention, allowing for higher concurrency when other
tasks want to write/update items related to other inodes in the fs tree,
as we're not holding read locks on upper tree levels while processing the
leaf and we do less tree searches.
Test:
sysbench --test=fileio --file-num=512 --file-total-size=16G \
--file-test-mode=rndrw --num-threads=32 --file-block-size=32768 \
--file-rw-ratio=3 --file-io-mode=sync --max-time=1800 \
--max-requests=10000000000 [prepare|run]
(filesystem mounted with -o autodefrag, averages of 5 runs)
Before this change: 58.852Mb/sec throughput, read 77.589Gb, written 25.863Gb
After this change: 63.034Mb/sec throughput, read 83.102Gb, written 27.701Gb
Test machine: quad core intel i5-3570K, 32Gb of RAM, SSD.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-03-12 01:28:24 +00:00
path - > slots [ 0 ] + + ;
if ( path - > slots [ 0 ] < btrfs_header_nritems ( leaf ) ) {
btrfs_item_key_to_cpu ( leaf , & min_key , path - > slots [ 0 ] ) ;
goto process_slot ;
}
2011-05-24 15:35:30 -04:00
if ( min_key . offset = = ( u64 ) - 1 )
goto none ;
min_key . offset + + ;
btrfs_release_path ( path ) ;
}
none :
btrfs_free_path ( path ) ;
return - ENOENT ;
}
2012-06-11 16:03:35 +08:00
/*
 * Return the extent map for the PAGE_SIZE-long range that begins at @start.
 *
 * The inode's extent map tree is consulted first, under its read lock only.
 * If nothing is found there, the same range is locked in the inode's io tree
 * and btrfs_get_extent() is asked for the mapping instead.
 *
 * Returns the extent map, or NULL when btrfs_get_extent() hands back an error
 * pointer.  Callers in this file drop the result with free_extent_map().
 */
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
{
	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *lock_tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached = NULL;
	struct extent_map *found;
	const u64 len = PAGE_SIZE;
	const u64 end = start + len - 1;

	/* Fast path: the mapping may already be in the extent map tree. */
	read_lock(&map_tree->lock);
	found = lookup_extent_mapping(map_tree, start, len);
	read_unlock(&map_tree->lock);
	if (found)
		return found;

	/* Slow path: take the extent range lock and fetch the mapping. */
	lock_extent_bits(lock_tree, start, end, &cached);
	found = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
	unlock_extent_cached(lock_tree, start, end, &cached);

	return IS_ERR(found) ? NULL : found;
}
2012-03-29 09:57:45 -04:00
2012-06-11 16:03:35 +08:00
static bool defrag_check_next_extent ( struct inode * inode , struct extent_map * em )
{
struct extent_map * next ;
bool ret = true ;
/* this is the last extent */
if ( em - > start + em - > len > = i_size_read ( inode ) )
return false ;
next = defrag_lookup_extent ( inode , em - > start + em - > len ) ;
2014-08-26 13:55:54 -07:00
if ( ! next | | next - > block_start > = EXTENT_MAP_LAST_BYTE )
ret = false ;
else if ( ( em - > block_start + em - > block_len = = next - > block_start ) & &
2015-12-15 01:42:10 +09:00
( em - > block_len > SZ_128K & & next - > block_len > SZ_128K ) )
2012-06-11 16:03:35 +08:00
ret = false ;
free_extent_map ( next ) ;
2012-03-29 09:57:45 -04:00
return ret ;
}
2014-07-29 17:32:10 +02:00
/*
 * Decide whether the extent covering @start should be defragmented.
 *
 * @inode:      inode being defragmented
 * @start:      file offset to examine
 * @thresh:     extent length at or above which an extent counts as big
 * @last_len:   in/out; read to see whether a defrag run is in progress and
 *              reset to 0 whenever the answer is "do not defrag"
 * @skip:       out; cleared on entry (unless the early "keep going" return
 *              is taken) and set to the end of the extent when it is skipped
 * @defrag_end: in/out; end of the extent currently being defragmented, or 0
 *              when no extent is in progress
 * @compress:   non-zero disables the "not worth it" heuristic below
 *
 * Returns 1 if the range should be defragmented, 0 otherwise.
 */
static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
			       u64 *last_len, u64 *skip, u64 *defrag_end,
			       int compress)
{
	struct extent_map *em;
	int verdict = 1;

	/* Once an extent has been picked, keep going until its end. */
	if (start < *defrag_end)
		return 1;

	*skip = 0;

	em = defrag_lookup_extent(inode, start);
	if (!em)
		return 0;

	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
		/* Covers holes and inline extents. */
		verdict = 0;
	} else {
		/* A zero *defrag_end means the previous extent was not taken. */
		const bool prev_mergeable = (*defrag_end != 0);
		const bool next_mergeable = defrag_check_next_extent(inode, em);
		const bool run_idle_or_full = (*last_len == 0 ||
					       *last_len >= thresh);
		const bool big_or_isolated = (em->len >= thresh ||
					      (!next_mergeable &&
					       !prev_mergeable));

		/*
		 * A real extent that is already big, or that has no mergeable
		 * neighbour on either side, is not worth rewriting.
		 */
		if (!compress && run_idle_or_full && big_or_isolated)
			verdict = 0;
	}

	/*
	 * Per the original author's note, *last_len acts as a count of the
	 * bytes defragged so far.  It is reset each time an extent is
	 * rejected so that the next tiny extent forces a defrag; as a result,
	 * tiny extents ahead of one big extent force at least part of that
	 * big extent to be defragged.
	 */
	if (verdict) {
		*defrag_end = extent_map_end(em);
	} else {
		*last_len = 0;
		*skip = extent_map_end(em);
		*defrag_end = 0;
	}

	free_extent_map(em);
	return verdict;
}
2011-05-24 15:35:30 -04:00
/*
* it doesn ' t do much good to defrag one or two pages
* at a time . This pulls in a nice chunk of pages
* to COW and defrag .
*
* It also makes sure the delalloc code has enough
* dirty data to avoid making new small extents as part
* of the defrag
*
* It ' s a good idea to start RA on this range
* before calling this .
*/
static int cluster_pages_for_defrag ( struct inode * inode ,
struct page * * pages ,
unsigned long start_index ,
2014-01-21 11:18:29 -08:00
unsigned long num_pages )
2008-06-11 21:53:53 -04:00
{
2011-05-24 15:35:30 -04:00
unsigned long file_end ;
u64 isize = i_size_read ( inode ) ;
u64 page_start ;
u64 page_end ;
2012-03-29 09:57:44 -04:00
u64 page_cnt ;
2020-10-04 19:04:26 +01:00
u64 start = ( u64 ) start_index < < PAGE_SHIFT ;
btrfs: fix race when defragmenting leads to unnecessary IO
When defragmenting we skip ranges that have holes or inline extents, so that
we don't do unnecessary IO and waste space. We do this check when calling
should_defrag_range() at btrfs_defrag_file(). However we do it without
holding the inode's lock. The reason we do it like this is to avoid
blocking other tasks for too long, that possibly want to operate on other
file ranges, since after the call to should_defrag_range() and before
locking the inode, we trigger a synchronous page cache readahead. However
before we were able to lock the inode, some other task might have punched
a hole in our range, or we may now have an inline extent there, in which
case we should not set the range for defrag anymore since that would cause
unnecessary IO and make us waste space (i.e. allocating extents to contain
zeros for a hole).
So after we locked the inode and the range in the iotree, check again if
we have holes or an inline extent, and if we do, just skip the range.
I hit this while testing my next patch that fixes races when updating an
inode's number of bytes (subject "btrfs: update the number of bytes used
by an inode atomically"), and it depends on this change in order to work
correctly. Alternatively I could rework that other patch to detect holes
and flag their range with the 'new delalloc' bit, but this itself fixes
an efficiency problem due to a race that from a functional point of view is
not harmful (it could be triggered with btrfs/062 from fstests).
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:33 +00:00
u64 search_start ;
2011-05-24 15:35:30 -04:00
int ret ;
int i ;
int i_done ;
2008-07-24 11:57:52 -04:00
struct btrfs_ordered_extent * ordered ;
2011-05-24 15:35:30 -04:00
struct extent_state * cached_state = NULL ;
2012-02-16 15:01:24 +08:00
struct extent_io_tree * tree ;
2017-02-27 15:10:38 +08:00
struct extent_changeset * data_reserved = NULL ;
2011-09-21 15:05:58 -04:00
gfp_t mask = btrfs_alloc_write_mask ( inode - > i_mapping ) ;
2011-05-24 15:35:30 -04:00
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
file_end = ( isize - 1 ) > > PAGE_SHIFT ;
2012-03-29 09:57:44 -04:00
if ( ! isize | | start_index > file_end )
return 0 ;
page_cnt = min_t ( u64 , ( u64 ) num_pages , ( u64 ) file_end - start_index + 1 ) ;
2011-05-24 15:35:30 -04:00
2020-06-03 08:55:42 +03:00
ret = btrfs_delalloc_reserve_space ( BTRFS_I ( inode ) , & data_reserved ,
2020-10-04 19:04:26 +01:00
start , page_cnt < < PAGE_SHIFT ) ;
2011-05-24 15:35:30 -04:00
if ( ret )
return ret ;
i_done = 0 ;
2012-02-16 15:01:24 +08:00
tree = & BTRFS_I ( inode ) - > io_tree ;
2011-05-24 15:35:30 -04:00
/* step one, lock all the pages */
2012-03-29 09:57:44 -04:00
for ( i = 0 ; i < page_cnt ; i + + ) {
2011-05-24 15:35:30 -04:00
struct page * page ;
2012-02-16 15:01:24 +08:00
again :
2011-07-11 10:47:06 -04:00
page = find_or_create_page ( inode - > i_mapping ,
2012-02-16 15:01:24 +08:00
start_index + i , mask ) ;
2011-05-24 15:35:30 -04:00
if ( ! page )
break ;
2021-01-26 16:34:00 +08:00
ret = set_page_extent_mapped ( page ) ;
if ( ret < 0 ) {
unlock_page ( page ) ;
put_page ( page ) ;
break ;
}
2012-02-16 15:01:24 +08:00
page_start = page_offset ( page ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
page_end = page_start + PAGE_SIZE - 1 ;
2012-02-16 15:01:24 +08:00
while ( 1 ) {
2014-03-11 13:56:15 +00:00
lock_extent_bits ( tree , page_start , page_end ,
2015-12-03 14:30:40 +01:00
& cached_state ) ;
2020-06-03 08:55:03 +03:00
ordered = btrfs_lookup_ordered_extent ( BTRFS_I ( inode ) ,
2012-02-16 15:01:24 +08:00
page_start ) ;
2014-03-11 13:56:15 +00:00
unlock_extent_cached ( tree , page_start , page_end ,
2017-12-12 21:43:52 +01:00
& cached_state ) ;
2012-02-16 15:01:24 +08:00
if ( ! ordered )
break ;
unlock_page ( page ) ;
2020-09-18 12:15:53 +03:00
btrfs_start_ordered_extent ( ordered , 1 ) ;
2012-02-16 15:01:24 +08:00
btrfs_put_ordered_extent ( ordered ) ;
lock_page ( page ) ;
2012-03-29 09:57:44 -04:00
/*
* we unlocked the page above , so we need check if
* it was released or not .
*/
if ( page - > mapping ! = inode - > i_mapping ) {
unlock_page ( page ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
put_page ( page ) ;
2012-03-29 09:57:44 -04:00
goto again ;
}
2012-02-16 15:01:24 +08:00
}
2011-05-24 15:35:30 -04:00
if ( ! PageUptodate ( page ) ) {
btrfs_readpage ( NULL , page ) ;
lock_page ( page ) ;
if ( ! PageUptodate ( page ) ) {
unlock_page ( page ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
put_page ( page ) ;
2011-05-24 15:35:30 -04:00
ret = - EIO ;
break ;
}
}
2012-02-16 15:01:24 +08:00
if ( page - > mapping ! = inode - > i_mapping ) {
unlock_page ( page ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
put_page ( page ) ;
2012-02-16 15:01:24 +08:00
goto again ;
}
2011-05-24 15:35:30 -04:00
pages [ i ] = page ;
i_done + + ;
}
if ( ! i_done | | ret )
goto out ;
2017-11-27 13:05:09 -08:00
if ( ! ( inode - > i_sb - > s_flags & SB_ACTIVE ) )
2011-05-24 15:35:30 -04:00
goto out ;
/*
* so now we have a nice long stream of locked
* and up to date pages , lets wait on them
*/
for ( i = 0 ; i < i_done ; i + + )
wait_on_page_writeback ( pages [ i ] ) ;
page_start = page_offset ( pages [ 0 ] ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
page_end = page_offset ( pages [ i_done - 1 ] ) + PAGE_SIZE ;
2011-05-24 15:35:30 -04:00
lock_extent_bits ( & BTRFS_I ( inode ) - > io_tree ,
2015-12-03 14:30:40 +01:00
page_start , page_end - 1 , & cached_state ) ;
btrfs: fix race when defragmenting leads to unnecessary IO
When defragmenting we skip ranges that have holes or inline extents, so that
we don't do unnecessary IO and waste space. We do this check when calling
should_defrag_range() at btrfs_defrag_file(). However we do it without
holding the inode's lock. The reason we do it like this is to avoid
blocking other tasks for too long, that possibly want to operate on other
file ranges, since after the call to should_defrag_range() and before
locking the inode, we trigger a synchronous page cache readahead. However
before we were able to lock the inode, some other task might have punched
a hole in our range, or we may now have an inline extent there, in which
case we should not set the range for defrag anymore since that would cause
unnecessary IO and make us waste space (i.e. allocating extents to contain
zeros for a hole).
So after we locked the inode and the range in the iotree, check again if
we have holes or an inline extent, and if we do, just skip the range.
I hit this while testing my next patch that fixes races when updating an
inode's number of bytes (subject "btrfs: update the number of bytes used
by an inode atomically"), and it depends on this change in order to work
correctly. Alternatively I could rework that other patch to detect holes
and flag their range with the 'new delalloc' bit, but this itself fixes
an efficiency problem due to a race that from a functional point of view is
not harmful (it could be triggered with btrfs/062 from fstests).
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:33 +00:00
/*
* When defragmenting we skip ranges that have holes or inline extents ,
* ( check should_defrag_range ( ) ) , to avoid unnecessary IO and wasting
* space . At btrfs_defrag_file ( ) , we check if a range should be defragged
* before locking the inode and then , if it should , we trigger a sync
* page cache readahead - we lock the inode only after that to avoid
* blocking for too long other tasks that possibly want to operate on
* other file ranges . But before we were able to get the inode lock ,
* some other task may have punched a hole in the range , or we may have
* now an inline extent , in which case we should not defrag . So check
* for that here , where we have the inode and the range locked , and bail
* out if that happened .
*/
search_start = page_start ;
while ( search_start < page_end ) {
struct extent_map * em ;
em = btrfs_get_extent ( BTRFS_I ( inode ) , NULL , 0 , search_start ,
page_end - search_start ) ;
if ( IS_ERR ( em ) ) {
ret = PTR_ERR ( em ) ;
goto out_unlock_range ;
}
if ( em - > block_start > = EXTENT_MAP_LAST_BYTE ) {
free_extent_map ( em ) ;
/* Ok, 0 means we did not defrag anything */
ret = 0 ;
goto out_unlock_range ;
}
search_start = extent_map_end ( em ) ;
free_extent_map ( em ) ;
}
2011-05-24 15:35:30 -04:00
clear_extent_bit ( & BTRFS_I ( inode ) - > io_tree , page_start ,
2019-08-15 14:04:04 -07:00
page_end - 1 , EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG , 0 , 0 , & cached_state ) ;
2011-05-24 15:35:30 -04:00
2012-03-29 09:57:44 -04:00
if ( i_done ! = page_cnt ) {
2011-07-15 15:16:44 +00:00
spin_lock ( & BTRFS_I ( inode ) - > lock ) ;
2018-09-05 11:07:33 +08:00
btrfs_mod_outstanding_extents ( BTRFS_I ( inode ) , 1 ) ;
2011-07-15 15:16:44 +00:00
spin_unlock ( & BTRFS_I ( inode ) - > lock ) ;
2020-06-03 08:55:40 +03:00
btrfs_delalloc_release_space ( BTRFS_I ( inode ) , data_reserved ,
2020-10-04 19:04:26 +01:00
start , ( page_cnt - i_done ) < < PAGE_SHIFT , true ) ;
2011-05-24 15:35:30 -04:00
}
2012-09-05 19:10:51 -06:00
set_extent_defrag ( & BTRFS_I ( inode ) - > io_tree , page_start , page_end - 1 ,
2016-04-26 23:54:39 +02:00
& cached_state ) ;
2011-05-24 15:35:30 -04:00
unlock_extent_cached ( & BTRFS_I ( inode ) - > io_tree ,
2017-12-12 21:43:52 +01:00
page_start , page_end - 1 , & cached_state ) ;
2011-05-24 15:35:30 -04:00
for ( i = 0 ; i < i_done ; i + + ) {
clear_page_dirty_for_io ( pages [ i ] ) ;
ClearPageChecked ( pages [ i ] ) ;
set_page_dirty ( pages [ i ] ) ;
unlock_page ( pages [ i ] ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement the page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion as to whether
PAGE_CACHE_* or PAGE_* constants should be used in a particular case,
especially on the border between fs and mm.
Globally switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
the script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
put_page ( pages [ i ] ) ;
2011-05-24 15:35:30 -04:00
}
btrfs: qgroup: Always free PREALLOC META reserve in btrfs_delalloc_release_extents()
[Background]
Btrfs qgroup uses two types of reserved space for METADATA space,
PERTRANS and PREALLOC.
PERTRANS is metadata space reserved for each transaction started by
btrfs_start_transaction().
While PREALLOC is for delalloc, where we reserve space before joining a
transaction, and finally it will be converted to PERTRANS after the
writeback is done.
[Inconsistency]
However there is inconsistency in how we handle PREALLOC metadata space.
The most obvious one is:
In btrfs_buffered_write():
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, true);
We always free qgroup PREALLOC meta space.
While in btrfs_truncate_block():
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
We only free qgroup PREALLOC meta space when something went wrong.
[The Correct Behavior]
The correct behavior should be the one in btrfs_buffered_write(), we
should always free PREALLOC metadata space.
The reason is, the btrfs_delalloc_* mechanism works by:
- Reserve metadata first, even if it's not necessary
In btrfs_delalloc_reserve_metadata()
- Free the unused metadata space
Normally in:
btrfs_delalloc_release_extents()
|- btrfs_inode_rsv_release()
Here we do calculation on whether we should release or not.
E.g. for 64K buffered write, the metadata rsv works like:
/* The first page */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=0
total: num_bytes=calc_inode_reservations()
/* The first page caused one outstanding extent, thus needs metadata
rsv */
/* The 2nd page */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=calc_inode_reservations()
total: not changed
/* The 2nd page doesn't cause new outstanding extent, needs no new meta
rsv, so we free what we have reserved */
/* The 3rd~16th pages */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=calc_inode_reservations()
total: not changed (still space for one outstanding extent)
This means that if btrfs_delalloc_release_extents() decides to free some
space, then that space should be freed NOW.
So for qgroup, we should call btrfs_qgroup_free_meta_prealloc() rather
than btrfs_qgroup_convert_reserved_meta().
The good news is:
- The callers are not that hot
The hottest caller is in btrfs_buffered_write(), which is already
fixed by commit 336a8bb8e36a ("btrfs: Fix wrong
btrfs_delalloc_release_extents parameter"). Thus it's not that
easy to cause false EDQUOT.
- The trans commit in advance for qgroup would hide the bug
Since commit f5fef4593653 ("btrfs: qgroup: Make qgroup async transaction
commit more aggressive"), when btrfs qgroup metadata free space is low,
it will try to commit the transaction and free the wrongly converted
PERTRANS space, so it's not that easy to hit such a bug.
[FIX]
So to fix the problem, remove the @qgroup_free parameter for
btrfs_delalloc_release_extents(), and always pass true to
btrfs_inode_rsv_release().
Reported-by: Filipe Manana <fdmanana@suse.com>
Fixes: 43b18595d660 ("btrfs: qgroup: Use separate meta reservation type for delalloc")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-14 14:34:51 +08:00
btrfs_delalloc_release_extents ( BTRFS_I ( inode ) , page_cnt < < PAGE_SHIFT ) ;
2017-02-27 15:10:38 +08:00
extent_changeset_free ( data_reserved ) ;
2011-05-24 15:35:30 -04:00
return i_done ;
btrfs: fix race when defragmenting leads to unnecessary IO
When defragmenting we skip ranges that have holes or inline extents, so that
we don't do unnecessary IO and waste space. We do this check when calling
should_defrag_range() at btrfs_defrag_file(). However we do it without
holding the inode's lock. The reason we do it like this is to avoid
blocking other tasks for too long, that possibly want to operate on other
file ranges, since after the call to should_defrag_range() and before
locking the inode, we trigger a synchronous page cache readahead. However
before we were able to lock the inode, some other task might have punched
a hole in our range, or we may now have an inline extent there, in which
case we should not set the range for defrag anymore since that would cause
unnecessary IO and make us waste space (i.e. allocating extents to contain
zeros for a hole).
So after we locked the inode and the range in the iotree, check again if
we have holes or an inline extent, and if we do, just skip the range.
I hit this while testing my next patch that fixes races when updating an
inode's number of bytes (subject "btrfs: update the number of bytes used
by an inode atomically"), and it depends on this change in order to work
correctly. Alternatively I could rework that other patch to detect holes
and flag their range with the 'new delalloc' bit, but this itself fixes
an efficiency problem due to a race that from a functional point of view is
not harmful (it could be triggered with btrfs/062 from fstests).
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 11:07:33 +00:00
out_unlock_range :
unlock_extent_cached ( & BTRFS_I ( inode ) - > io_tree ,
page_start , page_end - 1 , & cached_state ) ;
2011-05-24 15:35:30 -04:00
out :
for ( i = 0 ; i < i_done ; i + + ) {
unlock_page ( pages [ i ] ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement the page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion as to whether
PAGE_CACHE_* or PAGE_* constants should be used in a particular case,
especially on the border between fs and mm.
Globally switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
the script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
put_page ( pages [ i ] ) ;
2011-05-24 15:35:30 -04:00
}
2020-06-03 08:55:40 +03:00
btrfs_delalloc_release_space ( BTRFS_I ( inode ) , data_reserved ,
2020-10-04 19:04:26 +01:00
start , page_cnt < < PAGE_SHIFT , true ) ;
btrfs: qgroup: Always free PREALLOC META reserve in btrfs_delalloc_release_extents()
[Background]
Btrfs qgroup uses two types of reserved space for METADATA space,
PERTRANS and PREALLOC.
PERTRANS is metadata space reserved for each transaction started by
btrfs_start_transaction().
While PREALLOC is for delalloc, where we reserve space before joining a
transaction, and finally it will be converted to PERTRANS after the
writeback is done.
[Inconsistency]
However there is inconsistency in how we handle PREALLOC metadata space.
The most obvious one is:
In btrfs_buffered_write():
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, true);
We always free qgroup PREALLOC meta space.
While in btrfs_truncate_block():
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
We only free qgroup PREALLOC meta space when something went wrong.
[The Correct Behavior]
The correct behavior should be the one in btrfs_buffered_write(), we
should always free PREALLOC metadata space.
The reason is, the btrfs_delalloc_* mechanism works by:
- Reserve metadata first, even if it's not necessary
In btrfs_delalloc_reserve_metadata()
- Free the unused metadata space
Normally in:
btrfs_delalloc_release_extents()
|- btrfs_inode_rsv_release()
Here we do calculation on whether we should release or not.
E.g. for 64K buffered write, the metadata rsv works like:
/* The first page */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=0
total: num_bytes=calc_inode_reservations()
/* The first page caused one outstanding extent, thus needs metadata
rsv */
/* The 2nd page */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=calc_inode_reservations()
total: not changed
/* The 2nd page doesn't cause new outstanding extent, needs no new meta
rsv, so we free what we have reserved */
/* The 3rd~16th pages */
reserve_meta: num_bytes=calc_inode_reservations()
free_meta: num_bytes=calc_inode_reservations()
total: not changed (still space for one outstanding extent)
This means that if btrfs_delalloc_release_extents() decides to free some
space, then that space should be freed NOW.
So for qgroup, we should call btrfs_qgroup_free_meta_prealloc() rather
than btrfs_qgroup_convert_reserved_meta().
The good news is:
- The callers are not that hot
The hottest caller is in btrfs_buffered_write(), which is already
fixed by commit 336a8bb8e36a ("btrfs: Fix wrong
btrfs_delalloc_release_extents parameter"). Thus it's not that
easy to cause false EDQUOT.
- The trans commit in advance for qgroup would hide the bug
Since commit f5fef4593653 ("btrfs: qgroup: Make qgroup async transaction
commit more aggressive"), when btrfs qgroup metadata free space is low,
it will try to commit the transaction and free the wrongly converted
PERTRANS space, so it's not that easy to hit such a bug.
[FIX]
So to fix the problem, remove the @qgroup_free parameter for
btrfs_delalloc_release_extents(), and always pass true to
btrfs_inode_rsv_release().
Reported-by: Filipe Manana <fdmanana@suse.com>
Fixes: 43b18595d660 ("btrfs: qgroup: Use separate meta reservation type for delalloc")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-14 14:34:51 +08:00
btrfs_delalloc_release_extents ( BTRFS_I ( inode ) , page_cnt < < PAGE_SHIFT ) ;
2017-02-27 15:10:38 +08:00
extent_changeset_free ( data_reserved ) ;
2011-05-24 15:35:30 -04:00
return ret ;
}
int btrfs_defrag_file ( struct inode * inode , struct file * file ,
struct btrfs_ioctl_defrag_range_args * range ,
u64 newer_than , unsigned long max_to_defrag )
{
2016-06-22 18:54:23 -04:00
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2011-05-24 15:35:30 -04:00
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct file_ra_state * ra = NULL ;
2008-06-11 21:53:53 -04:00
unsigned long last_index ;
2011-09-02 15:56:39 +08:00
u64 isize = i_size_read ( inode ) ;
2010-03-10 10:52:59 -05:00
u64 last_len = 0 ;
u64 skip = 0 ;
u64 defrag_end = 0 ;
2011-05-24 15:35:30 -04:00
u64 newer_off = range - > start ;
2008-06-11 21:53:53 -04:00
unsigned long i ;
2011-09-02 15:57:07 +08:00
unsigned long ra_index = 0 ;
2008-06-11 21:53:53 -04:00
int ret ;
2011-05-24 15:35:30 -04:00
int defrag_count = 0 ;
2010-10-25 15:12:50 +08:00
int compress_type = BTRFS_COMPRESS_ZLIB ;
2014-07-29 17:32:10 +02:00
u32 extent_thresh = range - > extent_thresh ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement the page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion as to whether
PAGE_CACHE_* or PAGE_* constants should be used in a particular case,
especially on the border between fs and mm.
Globally switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
the script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
unsigned long max_cluster = SZ_256K > > PAGE_SHIFT ;
2014-01-21 11:18:29 -08:00
unsigned long cluster = max_cluster ;
2015-12-15 01:42:10 +09:00
u64 new_align = ~ ( ( u64 ) SZ_128K - 1 ) ;
2011-05-24 15:35:30 -04:00
struct page * * pages = NULL ;
2017-07-17 20:01:59 +02:00
bool do_compress = range - > flags & BTRFS_DEFRAG_RANGE_COMPRESS ;
2011-05-24 15:35:30 -04:00
2013-04-16 09:20:28 +00:00
if ( isize = = 0 )
return 0 ;
if ( range - > start > = isize )
return - EINVAL ;
2010-10-25 15:12:50 +08:00
2017-07-17 20:01:59 +02:00
if ( do_compress ) {
2019-10-10 15:59:57 +08:00
if ( range - > compress_type > = BTRFS_NR_COMPRESS_TYPES )
2010-10-25 15:12:50 +08:00
return - EINVAL ;
if ( range - > compress_type )
compress_type = range - > compress_type ;
}
2008-06-11 21:53:53 -04:00
2013-04-16 09:20:28 +00:00
if ( extent_thresh = = 0 )
2015-12-15 01:42:10 +09:00
extent_thresh = SZ_256K ;
2010-03-10 10:52:59 -05:00
2011-05-24 15:35:30 -04:00
/*
2017-06-22 03:22:58 +02:00
* If we were not given a file , allocate a readahead context . As
* readahead is just an optimization , defrag will work without it so
* we don ' t error out .
2011-05-24 15:35:30 -04:00
*/
if ( ! file ) {
2017-06-22 03:13:02 +02:00
ra = kzalloc ( sizeof ( * ra ) , GFP_KERNEL ) ;
2017-06-22 03:22:58 +02:00
if ( ra )
file_ra_state_init ( ra , inode - > i_mapping ) ;
2011-05-24 15:35:30 -04:00
} else {
ra = & file - > f_ra ;
}
2017-06-22 03:13:02 +02:00
pages = kmalloc_array ( max_cluster , sizeof ( struct page * ) , GFP_KERNEL ) ;
2011-05-24 15:35:30 -04:00
if ( ! pages ) {
ret = - ENOMEM ;
goto out_ra ;
}
/* find the last page to defrag */
2010-03-11 09:42:04 -05:00
if ( range - > start + range - > len > range - > start ) {
2011-09-02 15:56:39 +08:00
last_index = min_t ( u64 , isize - 1 ,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement the page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion as to whether
PAGE_CACHE_* or PAGE_* constants should be used in a particular case,
especially on the border between fs and mm.
Globally switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
the script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
range - > start + range - > len - 1 ) > > PAGE_SHIFT ;
2010-03-11 09:42:04 -05:00
} else {
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement the page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion as to whether
PAGE_CACHE_* or PAGE_* constants should be used in a particular case,
especially on the border between fs and mm.
Globally switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
the script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
last_index = ( isize - 1 ) > > PAGE_SHIFT ;
2010-03-11 09:42:04 -05:00
}
2011-05-24 15:35:30 -04:00
if ( newer_than ) {
ret = find_new_extents ( root , inode , newer_than ,
2015-12-15 01:42:10 +09:00
& newer_off , SZ_64K ) ;
2011-05-24 15:35:30 -04:00
if ( ! ret ) {
range - > start = newer_off ;
/*
* we always align our defrag to help keep
* the extents in the file evenly spaced
*/
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement the page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion as to whether
PAGE_CACHE_* or PAGE_* constants should be used in a particular case,
especially on the border between fs and mm.
Globally switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
the script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
i = ( newer_off & new_align ) > > PAGE_SHIFT ;
2011-05-24 15:35:30 -04:00
} else
goto out_ra ;
} else {
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement the page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion as to whether
PAGE_CACHE_* or PAGE_* constants should be used in a particular case,
especially on the border between fs and mm.
Globally switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
the script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
i = range - > start > > PAGE_SHIFT ;
2011-05-24 15:35:30 -04:00
}
if ( ! max_to_defrag )
2015-06-09 10:35:11 +05:30
max_to_defrag = last_index - i + 1 ;
2011-05-24 15:35:30 -04:00
2011-10-10 15:43:34 -04:00
/*
* make writeback start from i , so the defrag range can be
* written sequentially .
*/
if ( i < inode - > i_mapping - > writeback_index )
inode - > i_mapping - > writeback_index = i ;
2011-10-11 11:41:40 -04:00
while ( i < = last_index & & defrag_count < max_to_defrag & &
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced a *long* time
ago with the promise that one day it would be possible to implement the page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized, and it is unlikely that it ever will.
We have many places where PAGE_CACHE_SIZE is assumed to be equal to
PAGE_SIZE. And it's a constant source of confusion as to whether
PAGE_CACHE_* or PAGE_* constants should be used in a particular case,
especially on the border between fs and mm.
Globally switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
the script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
( i < DIV_ROUND_UP ( i_size_read ( inode ) , PAGE_SIZE ) ) ) {
2011-05-24 15:35:30 -04:00
/*
* make sure we stop running if someone unmounts
* the FS
*/
2017-11-27 13:05:09 -08:00
if ( ! ( inode - > i_sb - > s_flags & SB_ACTIVE ) )
2011-05-24 15:35:30 -04:00
break ;
2016-06-22 18:54:23 -04:00
if ( btrfs_defrag_cancelled ( fs_info ) ) {
btrfs_debug ( fs_info , " defrag_file cancelled " ) ;
2013-02-09 23:38:06 +00:00
ret = - EAGAIN ;
break ;
}
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
if ( ! should_defrag_range ( inode , ( u64 ) i < < PAGE_SHIFT ,
2012-06-11 16:03:35 +08:00
extent_thresh , & last_len , & skip ,
2017-07-17 20:01:59 +02:00
& defrag_end , do_compress ) ) {
2010-03-10 10:52:59 -05:00
unsigned long next ;
/*
* the should_defrag function tells us how much to skip
* bump our counter by the suggested amount
*/
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
next = DIV_ROUND_UP ( skip , PAGE_SIZE ) ;
2010-03-10 10:52:59 -05:00
i = max ( i + 1 , next ) ;
continue ;
}
2011-09-02 15:57:07 +08:00
if ( ! newer_than ) {
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
cluster = ( PAGE_ALIGN ( defrag_end ) > >
PAGE_SHIFT ) - i ;
2011-09-02 15:57:07 +08:00
cluster = min ( cluster , max_cluster ) ;
} else {
cluster = max_cluster ;
}
if ( i + cluster > ra_index ) {
ra_index = max ( i , ra_index ) ;
2017-06-22 03:22:58 +02:00
if ( ra )
2017-06-22 03:35:28 +02:00
page_cache_sync_readahead ( inode - > i_mapping , ra ,
file , ra_index , cluster ) ;
2015-06-09 17:38:32 +05:30
ra_index + = cluster ;
2011-09-02 15:57:07 +08:00
}
2010-03-10 10:52:59 -05:00
2021-02-10 17:14:34 -05:00
btrfs_inode_lock ( inode , 0 ) ;
Btrfs: prevent ioctls from interfering with a swap file
A later patch will implement swap file support for Btrfs, but before we
do that, we need to make sure that the various Btrfs ioctls cannot
change a swap file.
When a swap file is active, we must make sure that the extents of the
file are not moved and that they don't become shared. That means that
the following are not safe:
- chattr +c (enable compression)
- reflink
- dedupe
- snapshot
- defrag
Don't allow those to happen on an active swap file.
Additionally, balance, resize, device remove, and device replace are
also unsafe if they affect an active swapfile. Add a red-black tree of
block groups and devices which contain an active swapfile. Relocation
checks each block group against this tree and skips it or errors out for
balance or resize, respectively. Device remove and device replace check
the tree for the device they will operate on.
Note that we don't have to worry about chattr -C (disable nocow), which
we ignore for non-empty files, because an active swapfile must be
non-empty and can't be truncated. We also don't have to worry about
autodefrag because it's only done on COW files. Truncate and fallocate
are already taken care of by the generic code. Device add doesn't do
relocation so it's not an issue, either.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2016-11-03 10:28:12 -07:00
if ( IS_SWAPFILE ( inode ) ) {
ret = - ETXTBSY ;
} else {
if ( do_compress )
BTRFS_I ( inode ) - > defrag_compress = compress_type ;
ret = cluster_pages_for_defrag ( inode , pages , i , cluster ) ;
}
2012-03-29 09:57:44 -04:00
if ( ret < 0 ) {
2021-02-10 17:14:34 -05:00
btrfs_inode_unlock ( inode , 0 ) ;
2011-05-24 15:35:30 -04:00
goto out_ra ;
2012-03-29 09:57:44 -04:00
}
2011-05-24 15:35:30 -04:00
defrag_count + = ret ;
2012-12-11 16:00:21 -08:00
balance_dirty_pages_ratelimited ( inode - > i_mapping ) ;
2021-02-10 17:14:34 -05:00
btrfs_inode_unlock ( inode , 0 ) ;
2011-05-24 15:35:30 -04:00
if ( newer_than ) {
if ( newer_off = = ( u64 ) - 1 )
break ;
2012-03-29 09:57:45 -04:00
if ( ret > 0 )
i + = ret ;
2011-05-24 15:35:30 -04:00
newer_off = max ( newer_off + 1 ,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
( u64 ) i < < PAGE_SHIFT ) ;
2011-05-24 15:35:30 -04:00
2015-12-15 01:42:10 +09:00
ret = find_new_extents ( root , inode , newer_than ,
& newer_off , SZ_64K ) ;
2011-05-24 15:35:30 -04:00
if ( ! ret ) {
range - > start = newer_off ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
i = ( newer_off & new_align ) > > PAGE_SHIFT ;
2011-05-24 15:35:30 -04:00
} else {
break ;
2008-06-11 21:53:53 -04:00
}
2011-05-24 15:35:30 -04:00
} else {
2011-09-02 15:57:07 +08:00
if ( ret > 0 ) {
2011-09-02 15:56:25 +08:00
i + = ret ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
last_len + = ret < < PAGE_SHIFT ;
2011-09-02 15:57:07 +08:00
} else {
2011-09-02 15:56:25 +08:00
i + + ;
2011-09-02 15:57:07 +08:00
last_len = 0 ;
}
2008-06-11 21:53:53 -04:00
}
}
2014-03-01 10:55:54 +00:00
if ( ( range - > flags & BTRFS_DEFRAG_RANGE_START_IO ) ) {
2010-03-11 09:42:04 -05:00
filemap_flush ( inode - > i_mapping ) ;
2014-03-01 10:55:54 +00:00
if ( test_bit ( BTRFS_INODE_HAS_ASYNC_EXTENT ,
& BTRFS_I ( inode ) - > runtime_flags ) )
filemap_flush ( inode - > i_mapping ) ;
}
2010-03-11 09:42:04 -05:00
2010-10-25 15:12:50 +08:00
if ( range - > compress_type = = BTRFS_COMPRESS_LZO ) {
2016-06-22 18:54:23 -04:00
btrfs_set_fs_incompat ( fs_info , COMPRESS_LZO ) ;
btrfs: Add zstd support
Add zstd compression and decompression support to BtrFS. zstd at its
fastest level compresses almost as well as zlib, while offering much
faster compression and decompression, approaching lzo speeds.
I benchmarked btrfs with zstd compression against no compression, lzo
compression, and zlib compression. I benchmarked two scenarios. Copying
a set of files to btrfs, and then reading the files. Copying a tarball
to btrfs, extracting it to btrfs, and then reading the extracted files.
After every operation, I call `sync` and include the sync time.
Between every pair of operations I unmount and remount the filesystem
to avoid caching. The benchmark files can be found in the upstream
zstd source repository under
`contrib/linux-kernel/{btrfs-benchmark.sh,btrfs-extract-benchmark.sh}`
[1] [2].
I ran the benchmarks on an Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM.
The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor,
16 GB of RAM, and an SSD.
The first compression benchmark is copying 10 copies of the unzipped
Silesia corpus [3] into a BtrFS filesystem mounted with
`-o compress-force=Method`. The decompression benchmark times how long
it takes to `tar` all 10 copies into `/dev/null`. The compression ratio is
measured by comparing the output of `df` and `du`. See the benchmark file
[1] for details. I benchmarked multiple zstd compression levels, although
the patch uses zstd level 1.
| Method | Ratio | Compression MB/s | Decompression speed |
|---------|-------|------------------|---------------------|
| None | 0.99 | 504 | 686 |
| lzo | 1.66 | 398 | 442 |
| zlib | 2.58 | 65 | 241 |
| zstd 1 | 2.57 | 260 | 383 |
| zstd 3 | 2.71 | 174 | 408 |
| zstd 6 | 2.87 | 70 | 398 |
| zstd 9 | 2.92 | 43 | 406 |
| zstd 12 | 2.93 | 21 | 408 |
| zstd 15 | 3.01 | 11 | 354 |
The next benchmark first copies `linux-4.11.6.tar` [4] to btrfs. Then it
measures the compression ratio, extracts the tar, and deletes the tar.
Then it measures the compression ratio again, and `tar`s the extracted
files into `/dev/null`. See the benchmark file [2] for details.
| Method | Tar Ratio | Extract Ratio | Copy (s) | Extract (s)| Read (s) |
|--------|-----------|---------------|----------|------------|----------|
| None | 0.97 | 0.78 | 0.981 | 5.501 | 8.807 |
| lzo | 2.06 | 1.38 | 1.631 | 8.458 | 8.585 |
| zlib | 3.40 | 1.86 | 7.750 | 21.544 | 11.744 |
| zstd 1 | 3.57 | 1.85 | 2.579 | 11.479 | 9.389 |
[1] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-benchmark.sh
[2] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-extract-benchmark.sh
[3] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
[4] https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.11.6.tar.xz
zstd source repository: https://github.com/facebook/zstd
Signed-off-by: Nick Terrell <terrelln@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2017-08-09 19:39:02 -07:00
} else if ( range - > compress_type = = BTRFS_COMPRESS_ZSTD ) {
btrfs_set_fs_incompat ( fs_info , COMPRESS_ZSTD ) ;
2010-10-25 15:12:50 +08:00
}
2011-09-01 16:33:57 +02:00
ret = defrag_count ;
2010-03-10 10:52:59 -05:00
2011-05-24 15:35:30 -04:00
out_ra :
2017-07-17 20:01:59 +02:00
if ( do_compress ) {
2021-02-10 17:14:34 -05:00
btrfs_inode_lock ( inode , 0 ) ;
2017-07-17 19:41:31 +02:00
BTRFS_I ( inode ) - > defrag_compress = BTRFS_COMPRESS_NONE ;
2021-02-10 17:14:34 -05:00
btrfs_inode_unlock ( inode , 0 ) ;
2013-08-16 15:23:33 +01:00
}
2011-05-24 15:35:30 -04:00
if ( ! file )
kfree ( ra ) ;
kfree ( pages ) ;
2010-03-10 10:52:59 -05:00
return ret ;
2008-06-11 21:53:53 -04:00
}
2012-11-26 08:43:45 +00:00
static noinline int btrfs_ioctl_resize ( struct file * file ,
2009-09-21 16:00:26 -04:00
void __user * arg )
2008-06-11 21:53:53 -04:00
{
2016-06-22 18:54:23 -04:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2008-06-11 21:53:53 -04:00
u64 new_size ;
u64 old_size ;
u64 devid = 1 ;
2016-06-22 18:54:23 -04:00
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2008-06-11 21:53:53 -04:00
struct btrfs_ioctl_vol_args * vol_args ;
struct btrfs_trans_handle * trans ;
struct btrfs_device * device = NULL ;
char * sizestr ;
2014-03-31 18:03:25 +08:00
char * retptr ;
2008-06-11 21:53:53 -04:00
char * devstr = NULL ;
int ret = 0 ;
int mod = 0 ;
2009-01-05 16:57:23 -05:00
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2012-11-26 08:43:45 +00:00
ret = mnt_want_write_file ( file ) ;
if ( ret )
return ret ;
2020-08-25 10:02:32 -05:00
if ( ! btrfs_exclop_start ( fs_info , BTRFS_EXCLOP_RESIZE ) ) {
2012-12-21 10:38:50 +00:00
mnt_drop_write_file ( file ) ;
2013-08-21 11:44:48 +08:00
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS ;
2012-01-16 22:04:47 +02:00
}
2009-04-08 15:06:54 +08:00
vol_args = memdup_user ( arg , sizeof ( * vol_args ) ) ;
2012-01-16 22:04:47 +02:00
if ( IS_ERR ( vol_args ) ) {
ret = PTR_ERR ( vol_args ) ;
goto out ;
}
2008-07-24 12:20:14 -04:00
vol_args - > name [ BTRFS_PATH_NAME_MAX ] = ' \0 ' ;
2008-06-11 21:53:53 -04:00
sizestr = vol_args - > name ;
devstr = strchr ( sizestr , ' : ' ) ;
if ( devstr ) {
sizestr = devstr + 1 ;
* devstr = ' \0 ' ;
devstr = vol_args - > name ;
2014-05-13 16:36:08 +08:00
ret = kstrtoull ( devstr , 10 , & devid ) ;
if ( ret )
goto out_free ;
2012-12-21 09:21:30 +00:00
if ( ! devid ) {
ret = - EINVAL ;
goto out_free ;
}
2016-06-22 18:54:23 -04:00
btrfs_info ( fs_info , " resizing devid %llu " , devid ) ;
2008-06-11 21:53:53 -04:00
}
2012-12-21 09:19:51 +00:00
2020-11-03 13:49:43 +08:00
device = btrfs_find_device ( fs_info - > fs_devices , devid , NULL , NULL ) ;
2008-06-11 21:53:53 -04:00
if ( ! device ) {
2016-06-22 18:54:23 -04:00
btrfs_info ( fs_info , " resizer unable to find device %llu " ,
devid ) ;
2012-12-21 09:21:30 +00:00
ret = - ENODEV ;
2012-01-16 22:04:47 +02:00
goto out_free ;
2008-06-11 21:53:53 -04:00
}
2012-12-21 09:19:51 +00:00
2017-12-04 12:54:52 +08:00
if ( ! test_bit ( BTRFS_DEV_STATE_WRITEABLE , & device - > dev_state ) ) {
2016-06-22 18:54:23 -04:00
btrfs_info ( fs_info ,
2013-12-20 11:37:06 -05:00
" resizer unable to apply on readonly device %llu " ,
2013-08-20 13:20:07 +02:00
devid ) ;
2012-12-21 09:21:30 +00:00
ret = - EPERM ;
2012-06-14 02:23:19 -06:00
goto out_free ;
}
2008-06-11 21:53:53 -04:00
if ( ! strcmp ( sizestr , " max " ) )
new_size = device - > bdev - > bd_inode - > i_size ;
else {
if ( sizestr [ 0 ] = = ' - ' ) {
mod = - 1 ;
sizestr + + ;
} else if ( sizestr [ 0 ] = = ' + ' ) {
mod = 1 ;
sizestr + + ;
}
2014-03-31 18:03:25 +08:00
new_size = memparse ( sizestr , & retptr ) ;
if ( * retptr ! = ' \0 ' | | new_size = = 0 ) {
2008-06-11 21:53:53 -04:00
ret = - EINVAL ;
2012-01-16 22:04:47 +02:00
goto out_free ;
2008-06-11 21:53:53 -04:00
}
}
2017-12-04 12:54:55 +08:00
if ( test_bit ( BTRFS_DEV_STATE_REPLACE_TGT , & device - > dev_state ) ) {
2012-12-21 09:21:30 +00:00
ret = - EPERM ;
2012-11-05 18:29:28 +01:00
goto out_free ;
}
2014-09-03 21:35:38 +08:00
old_size = btrfs_device_get_total_bytes ( device ) ;
2008-06-11 21:53:53 -04:00
if ( mod < 0 ) {
if ( new_size > old_size ) {
ret = - EINVAL ;
2012-01-16 22:04:47 +02:00
goto out_free ;
2008-06-11 21:53:53 -04:00
}
new_size = old_size - new_size ;
} else if ( mod > 0 ) {
2013-12-20 15:28:56 +08:00
if ( new_size > ULLONG_MAX - old_size ) {
2014-05-29 09:19:58 +08:00
ret = - ERANGE ;
2013-12-20 15:28:56 +08:00
goto out_free ;
}
2008-06-11 21:53:53 -04:00
new_size = old_size + new_size ;
}
2015-12-15 01:42:10 +09:00
if ( new_size < SZ_256M ) {
2008-06-11 21:53:53 -04:00
ret = - EINVAL ;
2012-01-16 22:04:47 +02:00
goto out_free ;
2008-06-11 21:53:53 -04:00
}
if ( new_size > device - > bdev - > bd_inode - > i_size ) {
ret = - EFBIG ;
2012-01-16 22:04:47 +02:00
goto out_free ;
2008-06-11 21:53:53 -04:00
}
2017-07-18 15:39:08 +03:00
new_size = round_down ( new_size , fs_info - > sectorsize ) ;
2008-06-11 21:53:53 -04:00
if ( new_size > old_size ) {
2010-05-16 10:48:46 -04:00
trans = btrfs_start_transaction ( root , 0 ) ;
2011-01-20 06:19:37 +00:00
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
2012-01-16 22:04:47 +02:00
goto out_free ;
2011-01-20 06:19:37 +00:00
}
2008-06-11 21:53:53 -04:00
ret = btrfs_grow_device ( trans , device , new_size ) ;
2016-09-09 21:39:03 -04:00
btrfs_commit_transaction ( trans ) ;
2011-11-18 18:55:01 +00:00
} else if ( new_size < old_size ) {
2008-06-11 21:53:53 -04:00
ret = btrfs_shrink_device ( device , new_size ) ;
2012-10-27 12:06:39 +00:00
} /* equal, nothing need to do */
2008-06-11 21:53:53 -04:00
2020-02-11 10:55:26 -03:00
if ( ret = = 0 & & new_size ! = old_size )
btrfs_info_in_rcu ( fs_info ,
" resize device %s (devid %llu) from %llu to %llu " ,
rcu_str_deref ( device - > name ) , device - > devid ,
old_size , new_size ) ;
2012-01-16 22:04:47 +02:00
out_free :
2008-06-11 21:53:53 -04:00
kfree ( vol_args ) ;
2012-01-16 22:04:47 +02:00
out :
2020-08-25 10:02:32 -05:00
btrfs_exclop_finish ( fs_info ) ;
2013-01-20 15:57:57 +02:00
mnt_drop_write_file ( file ) ;
2008-06-11 21:53:53 -04:00
return ret ;
}
2020-03-13 17:23:19 +02:00
static noinline int __btrfs_ioctl_snap_create ( struct file * file ,
2017-02-14 18:33:53 +01:00
const char * name , unsigned long fd , int subvol ,
2020-03-13 17:23:19 +02:00
bool readonly ,
2013-02-07 06:02:44 +00:00
struct btrfs_qgroup_inherit * inherit )
2008-06-11 21:53:53 -04:00
{
int namelen ;
2008-11-17 21:02:50 -05:00
int ret = 0 ;
2008-06-11 21:53:53 -04:00
2016-09-21 08:31:29 -04:00
if ( ! S_ISDIR ( file_inode ( file ) - > i_mode ) )
return - ENOTDIR ;
2012-06-29 03:58:46 -06:00
ret = mnt_want_write_file ( file ) ;
if ( ret )
goto out ;
2010-10-29 15:41:32 -04:00
namelen = strlen ( name ) ;
if ( strchr ( name , ' / ' ) ) {
2008-06-11 21:53:53 -04:00
ret = - EINVAL ;
2012-06-29 03:58:46 -06:00
goto out_drop_write ;
2008-06-11 21:53:53 -04:00
}
2012-02-20 22:14:55 -05:00
if ( name [ 0 ] = = ' . ' & &
( namelen = = 1 | | ( name [ 1 ] = = ' . ' & & namelen = = 2 ) ) ) {
ret = - EEXIST ;
2012-06-29 03:58:46 -06:00
goto out_drop_write ;
2012-02-20 22:14:55 -05:00
}
2008-11-17 21:02:50 -05:00
if ( subvol ) {
2010-10-29 15:41:32 -04:00
ret = btrfs_mksubvol ( & file - > f_path , name , namelen ,
2020-03-13 17:23:19 +02:00
NULL , readonly , inherit ) ;
2008-10-09 13:39:39 -04:00
} else {
2012-08-28 12:52:22 -04:00
struct fd src = fdget ( fd ) ;
2008-11-17 21:02:50 -05:00
struct inode * src_inode ;
2012-08-28 12:52:22 -04:00
if ( ! src . file ) {
2008-11-17 21:02:50 -05:00
ret = - EINVAL ;
2012-06-29 03:58:46 -06:00
goto out_drop_write ;
2008-11-17 21:02:50 -05:00
}
2013-01-23 17:07:38 -05:00
src_inode = file_inode ( src . file ) ;
if ( src_inode - > i_sb ! = file_inode ( file ) - > i_sb ) {
2016-03-25 10:02:41 -04:00
btrfs_info ( BTRFS_I ( file_inode ( file ) ) - > root - > fs_info ,
2013-12-20 11:37:06 -05:00
" Snapshot src from another FS " ) ;
2014-01-30 16:32:02 +09:00
ret = - EXDEV ;
2021-01-21 14:19:25 +01:00
} else if ( ! inode_owner_or_capable ( & init_user_ns , src_inode ) ) {
2014-01-15 18:15:52 +01:00
/*
* Subvolume creation is not restricted , but snapshots
* are limited to own subvolumes only
*/
ret = - EPERM ;
2012-08-26 21:20:24 -04:00
} else {
2020-05-14 17:19:18 +08:00
ret = btrfs_mksnapshot ( & file - > f_path , name , namelen ,
2012-08-26 21:20:24 -04:00
BTRFS_I ( src_inode ) - > root ,
2020-03-13 17:23:19 +02:00
readonly , inherit ) ;
2008-11-17 21:02:50 -05:00
}
2012-08-28 12:52:22 -04:00
fdput ( src ) ;
2008-10-09 13:39:39 -04:00
}
2012-06-29 03:58:46 -06:00
out_drop_write :
mnt_drop_write_file ( file ) ;
2008-06-11 21:53:53 -04:00
out :
2010-10-29 15:41:32 -04:00
return ret ;
}
static noinline int btrfs_ioctl_snap_create ( struct file * file ,
2010-12-20 15:53:28 +08:00
void __user * arg , int subvol )
2010-10-29 15:41:32 -04:00
{
2010-12-20 15:53:28 +08:00
struct btrfs_ioctl_vol_args * vol_args ;
2010-10-29 15:41:32 -04:00
int ret ;
2016-09-21 08:31:29 -04:00
if ( ! S_ISDIR ( file_inode ( file ) - > i_mode ) )
return - ENOTDIR ;
2010-12-20 15:53:28 +08:00
vol_args = memdup_user ( arg , sizeof ( * vol_args ) ) ;
if ( IS_ERR ( vol_args ) )
return PTR_ERR ( vol_args ) ;
vol_args - > name [ BTRFS_PATH_NAME_MAX ] = ' \0 ' ;
2010-10-29 15:41:32 -04:00
2020-03-13 17:23:19 +02:00
ret = __btrfs_ioctl_snap_create ( file , vol_args - > name , vol_args - > fd ,
subvol , false , NULL ) ;
2010-12-10 06:41:56 +00:00
2010-12-20 15:53:28 +08:00
kfree ( vol_args ) ;
return ret ;
}
2010-12-10 06:41:56 +00:00
2010-12-20 15:53:28 +08:00
/*
 * Create a subvolume or snapshot from a struct btrfs_ioctl_vol_args_v2.
 *
 * @file:   open directory that will contain the new entry
 * @arg:    user pointer to a struct btrfs_ioctl_vol_args_v2
 * @subvol: forwarded unchanged to __btrfs_ioctl_snap_create()
 *
 * Flags outside BTRFS_SUBVOL_CREATE_ARGS_MASK are rejected with -EOPNOTSUPP.
 * BTRFS_SUBVOL_RDONLY requests a read-only result.  With
 * BTRFS_SUBVOL_QGROUP_INHERIT a struct btrfs_qgroup_inherit of args->size
 * bytes is copied from user space and its counters are validated against
 * that size before use.
 *
 * Returns 0 on success or a negative errno.
 */
static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
					       void __user *arg, int subvol)
{
	struct btrfs_ioctl_vol_args_v2 *args;
	struct btrfs_qgroup_inherit *inherit = NULL;
	bool readonly;
	int err;

	if (!S_ISDIR(file_inode(file)->i_mode))
		return -ENOTDIR;

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args))
		return PTR_ERR(args);

	/* Force termination of the user supplied name. */
	args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';

	if (args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
		err = -EOPNOTSUPP;
		goto out_args;
	}

	readonly = (args->flags & BTRFS_SUBVOL_RDONLY) != 0;

	if (args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
		u64 nums;

		/* Bound the copy before touching user memory. */
		if (args->size < sizeof(*inherit) ||
		    args->size > PAGE_SIZE) {
			err = -EINVAL;
			goto out_args;
		}

		inherit = memdup_user(args->qgroup_inherit, args->size);
		if (IS_ERR(inherit)) {
			/* inherit holds an error pointer: must not be freed */
			err = PTR_ERR(inherit);
			goto out_args;
		}

		/* Cap each counter first so the sum below stays small. */
		if (inherit->num_qgroups > PAGE_SIZE ||
		    inherit->num_ref_copies > PAGE_SIZE ||
		    inherit->num_excl_copies > PAGE_SIZE) {
			err = -EINVAL;
			goto out_inherit;
		}

		nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
		       2 * inherit->num_excl_copies;

		/* The declared size must match the counters exactly. */
		if (args->size != struct_size(inherit, qgroups, nums)) {
			err = -EINVAL;
			goto out_inherit;
		}
	}

	err = __btrfs_ioctl_snap_create(file, args->name, args->fd,
					subvol, readonly, inherit);

out_inherit:
	kfree(inherit);
out_args:
	kfree(args);
	return err;
}
2010-12-20 16:30:25 +08:00
/*
 * Report the flags of the subvolume that @file is the top directory of.
 *
 * The only flag reported is BTRFS_SUBVOL_RDONLY; it is sampled while holding
 * fs_info->subvol_sem for reading.
 *
 * Returns 0 on success, -EINVAL if @file is not the first object of its
 * root (inode number BTRFS_FIRST_FREE_OBJECTID), or -EFAULT if the result
 * cannot be copied to @arg.
 */
static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
						void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 flags;

	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
		return -EINVAL;

	down_read(&fs_info->subvol_sem);
	flags = btrfs_root_readonly(BTRFS_I(inode)->root) ?
		BTRFS_SUBVOL_RDONLY : 0;
	up_read(&fs_info->subvol_sem);

	return copy_to_user(arg, &flags, sizeof(flags)) ? -EFAULT : 0;
}
/*
 * Set or clear the read-only flag of the subvolume that @file is the top
 * directory of.
 *
 * @arg points to a u64 flag word; only BTRFS_SUBVOL_RDONLY is accepted.
 * The new root flags are written to the root tree and the transaction is
 * committed; if any of that fails the in-memory root flags are restored.
 *
 * Returns 0 on success (including when the flag already has the requested
 * value), -EPERM if the caller is not allowed to change the inode or a send
 * is in progress on a subvolume being switched to read-write, -EINVAL if
 * @file is not the top directory of a subvolume, -EFAULT on a bad @arg,
 * -EOPNOTSUPP on unknown flag bits, or an error from the write-access or
 * transaction helpers.
 */
static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
						void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	u64 root_flags;
	u64 flags;
	bool want_ro;
	int ret;

	if (!inode_owner_or_capable(&init_user_ns, inode))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	if (copy_from_user(&flags, arg, sizeof(flags))) {
		ret = -EFAULT;
		goto out_drop_write;
	}

	if (flags & ~BTRFS_SUBVOL_RDONLY) {
		ret = -EOPNOTSUPP;
		goto out_drop_write;
	}
	want_ro = (flags & BTRFS_SUBVOL_RDONLY) != 0;

	down_write(&fs_info->subvol_sem);

	/* The subvolume is already in the requested state: nothing to do. */
	if (want_ro == btrfs_root_readonly(root))
		goto out_drop_sem;

	/* Remember the old flags so they can be restored on failure. */
	root_flags = btrfs_root_flags(&root->root_item);
	if (want_ro) {
		btrfs_set_root_flags(&root->root_item,
				     root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
	} else {
		bool sending;

		/*
		 * Block the RO -> RW transition if this subvolume is involved
		 * in a send.
		 */
		spin_lock(&root->root_item_lock);
		sending = (root->send_in_progress != 0);
		if (!sending)
			btrfs_set_root_flags(&root->root_item,
				root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
		spin_unlock(&root->root_item_lock);

		if (sending) {
			btrfs_warn(fs_info,
		"Attempt to set subvolume %llu read-write during send",
				   root->root_key.objectid);
			ret = -EPERM;
			goto out_drop_sem;
		}
	}

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_reset;
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);
	if (ret < 0) {
		btrfs_end_transaction(trans);
		goto out_reset;
	}

	ret = btrfs_commit_transaction(trans);

out_reset:
	/* Undo the in-memory flag change if it could not be persisted. */
	if (ret)
		btrfs_set_root_flags(&root->root_item, root_flags);
out_drop_sem:
	up_write(&fs_info->subvol_sem);
out_drop_write:
	mnt_drop_write_file(file);
	return ret;
}
2010-02-28 15:39:26 -05:00
static noinline int key_in_sk ( struct btrfs_key * key ,
struct btrfs_ioctl_search_key * sk )
{
2010-03-18 12:10:08 -04:00
struct btrfs_key test ;
int ret ;
test . objectid = sk - > min_objectid ;
test . type = sk - > min_type ;
test . offset = sk - > min_offset ;
ret = btrfs_comp_cpu_keys ( key , & test ) ;
if ( ret < 0 )
2010-02-28 15:39:26 -05:00
return 0 ;
2010-03-18 12:10:08 -04:00
test . objectid = sk - > max_objectid ;
test . type = sk - > max_type ;
test . offset = sk - > max_offset ;
ret = btrfs_comp_cpu_keys ( key , & test ) ;
if ( ret > 0 )
2010-02-28 15:39:26 -05:00
return 0 ;
return 1 ;
}
2016-06-21 20:18:21 -04:00
static noinline int copy_to_sk ( struct btrfs_path * path ,
2010-02-28 15:39:26 -05:00
struct btrfs_key * key ,
struct btrfs_ioctl_search_key * sk ,
2014-01-30 16:24:00 +01:00
size_t * buf_size ,
2014-01-30 16:24:02 +01:00
char __user * ubuf ,
2010-02-28 15:39:26 -05:00
unsigned long * sk_offset ,
int * num_found )
{
u64 found_transid ;
struct extent_buffer * leaf ;
struct btrfs_ioctl_search_header sh ;
2015-06-30 11:25:43 +09:00
struct btrfs_key test ;
2010-02-28 15:39:26 -05:00
unsigned long item_off ;
unsigned long item_len ;
int nritems ;
int i ;
int slot ;
int ret = 0 ;
leaf = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
nritems = btrfs_header_nritems ( leaf ) ;
if ( btrfs_header_generation ( leaf ) > sk - > max_transid ) {
i = nritems ;
goto advance_key ;
}
found_transid = btrfs_header_generation ( leaf ) ;
for ( i = slot ; i < nritems ; i + + ) {
item_off = btrfs_item_ptr_offset ( leaf , i ) ;
item_len = btrfs_item_size_nr ( leaf , i ) ;
2013-05-06 17:40:18 +00:00
btrfs_item_key_to_cpu ( leaf , key , i ) ;
if ( ! key_in_sk ( key , sk ) )
continue ;
2014-01-30 16:24:00 +01:00
if ( sizeof ( sh ) + item_len > * buf_size ) {
2014-01-30 16:23:59 +01:00
if ( * num_found ) {
ret = 1 ;
goto out ;
}
/*
* return one empty item back for v1 , which does not
* handle - EOVERFLOW
*/
2014-01-30 16:24:00 +01:00
* buf_size = sizeof ( sh ) + item_len ;
2010-02-28 15:39:26 -05:00
item_len = 0 ;
2014-01-30 16:23:59 +01:00
ret = - EOVERFLOW ;
}
2010-02-28 15:39:26 -05:00
2014-01-30 16:24:00 +01:00
if ( sizeof ( sh ) + item_len + * sk_offset > * buf_size ) {
2010-02-28 15:39:26 -05:00
ret = 1 ;
2014-01-30 16:23:57 +01:00
goto out ;
2010-02-28 15:39:26 -05:00
}
sh . objectid = key - > objectid ;
sh . offset = key - > offset ;
sh . type = key - > type ;
sh . len = item_len ;
sh . transid = found_transid ;
2020-08-10 11:42:27 -04:00
/*
* Copy search result header . If we fault then loop again so we
* can fault in the pages and - EFAULT there if there ' s a
* problem . Otherwise we ' ll fault and then copy the buffer in
* properly this next time through
*/
if ( copy_to_user_nofault ( ubuf + * sk_offset , & sh , sizeof ( sh ) ) ) {
ret = 0 ;
2014-01-30 16:24:02 +01:00
goto out ;
}
2010-02-28 15:39:26 -05:00
* sk_offset + = sizeof ( sh ) ;
if ( item_len ) {
2014-01-30 16:24:02 +01:00
char __user * up = ubuf + * sk_offset ;
2020-08-10 11:42:27 -04:00
/*
* Copy the item , same behavior as above , but reset the
* * sk_offset so we copy the full thing again .
*/
if ( read_extent_buffer_to_user_nofault ( leaf , up ,
item_off , item_len ) ) {
ret = 0 ;
* sk_offset - = sizeof ( sh ) ;
2014-01-30 16:24:02 +01:00
goto out ;
}
2010-02-28 15:39:26 -05:00
* sk_offset + = item_len ;
}
2011-05-14 17:43:41 +00:00
( * num_found ) + + ;
2010-02-28 15:39:26 -05:00
2014-01-30 16:23:59 +01:00
if ( ret ) /* -EOVERFLOW from above */
goto out ;
2014-01-30 16:23:57 +01:00
if ( * num_found > = sk - > nr_items ) {
ret = 1 ;
goto out ;
}
2010-02-28 15:39:26 -05:00
}
advance_key :
2010-03-18 12:10:08 -04:00
ret = 0 ;
2015-06-30 11:25:43 +09:00
test . objectid = sk - > max_objectid ;
test . type = sk - > max_type ;
test . offset = sk - > max_offset ;
if ( btrfs_comp_cpu_keys ( key , & test ) > = 0 )
ret = 1 ;
else if ( key - > offset < ( u64 ) - 1 )
2010-02-28 15:39:26 -05:00
key - > offset + + ;
2015-06-30 11:25:43 +09:00
else if ( key - > type < ( u8 ) - 1 ) {
2010-03-18 12:10:08 -04:00
key - > offset = 0 ;
2010-02-28 15:39:26 -05:00
key - > type + + ;
2015-06-30 11:25:43 +09:00
} else if ( key - > objectid < ( u64 ) - 1 ) {
2010-03-18 12:10:08 -04:00
key - > offset = 0 ;
key - > type = 0 ;
2010-02-28 15:39:26 -05:00
key - > objectid + + ;
2010-03-18 12:10:08 -04:00
} else
ret = 1 ;
2014-01-30 16:23:57 +01:00
out :
2014-01-30 16:24:02 +01:00
/*
* 0 : all items from this leaf copied , continue with next
* 1 : * more items can be copied , but unused buffer is too small
* * all items were found
* Either way , it will stops the loop which iterates to the next
* leaf
* - EOVERFLOW : item was to large for buffer
* - EFAULT : could not copy extent buffer back to userspace
*/
2010-02-28 15:39:26 -05:00
return ret ;
}
static noinline int search_ioctl ( struct inode * inode ,
2014-01-30 16:23:58 +01:00
struct btrfs_ioctl_search_key * sk ,
2014-01-30 16:24:00 +01:00
size_t * buf_size ,
2014-01-30 16:24:02 +01:00
char __user * ubuf )
2010-02-28 15:39:26 -05:00
{
2016-06-22 18:54:23 -04:00
struct btrfs_fs_info * info = btrfs_sb ( inode - > i_sb ) ;
2010-02-28 15:39:26 -05:00
struct btrfs_root * root ;
struct btrfs_key key ;
struct btrfs_path * path ;
int ret ;
int num_found = 0 ;
unsigned long sk_offset = 0 ;
2014-01-30 16:24:00 +01:00
if ( * buf_size < sizeof ( struct btrfs_ioctl_search_header ) ) {
* buf_size = sizeof ( struct btrfs_ioctl_search_header ) ;
2014-01-30 16:23:58 +01:00
return - EOVERFLOW ;
2014-01-30 16:24:00 +01:00
}
2014-01-30 16:23:58 +01:00
2010-02-28 15:39:26 -05:00
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
if ( sk - > tree_id = = 0 ) {
/* search the root of the inode that was passed */
2020-01-24 09:33:01 -05:00
root = btrfs_grab_root ( BTRFS_I ( inode ) - > root ) ;
2010-02-28 15:39:26 -05:00
} else {
2020-05-15 19:35:55 +02:00
root = btrfs_get_fs_root ( info , sk - > tree_id , true ) ;
2010-02-28 15:39:26 -05:00
if ( IS_ERR ( root ) ) {
btrfs_free_path ( path ) ;
2018-05-21 13:57:27 +09:00
return PTR_ERR ( root ) ;
2010-02-28 15:39:26 -05:00
}
}
key . objectid = sk - > min_objectid ;
key . type = sk - > min_type ;
key . offset = sk - > min_offset ;
2013-10-31 10:33:04 +05:30
while ( 1 ) {
2020-09-14 09:01:04 +01:00
ret = fault_in_pages_writeable ( ubuf + sk_offset ,
* buf_size - sk_offset ) ;
2020-08-10 11:42:27 -04:00
if ( ret )
break ;
2013-10-01 16:13:42 +01:00
ret = btrfs_search_forward ( root , & key , path , sk - > min_transid ) ;
2010-02-28 15:39:26 -05:00
if ( ret ! = 0 ) {
if ( ret > 0 )
ret = 0 ;
goto err ;
}
2016-06-21 20:18:21 -04:00
ret = copy_to_sk ( path , & key , sk , buf_size , ubuf ,
2010-02-28 15:39:26 -05:00
& sk_offset , & num_found ) ;
2011-04-21 01:20:15 +02:00
btrfs_release_path ( path ) ;
2014-01-30 16:23:57 +01:00
if ( ret )
2010-02-28 15:39:26 -05:00
break ;
}
2014-01-30 16:23:59 +01:00
if ( ret > 0 )
ret = 0 ;
2010-02-28 15:39:26 -05:00
err :
sk - > nr_items = num_found ;
2020-01-24 09:33:01 -05:00
btrfs_put_root ( root ) ;
2010-02-28 15:39:26 -05:00
btrfs_free_path ( path ) ;
return ret ;
}
/*
 * Tree search ioctl, version 1 (fixed-size result buffer).
 *
 * Requires CAP_SYS_ADMIN.  Reads the search key from @argp, runs the search
 * into the buffer embedded in the user structure and writes the updated key
 * (including the number of items found) back.
 *
 * Returns 0 on success, -EPERM without the capability, -EFAULT on a bad
 * user pointer, or a negative error from search_ioctl().
 */
static noinline int btrfs_ioctl_tree_search(struct file *file,
					    void __user *argp)
{
	struct btrfs_ioctl_search_args __user *uargs;
	struct btrfs_ioctl_search_key sk;
	size_t buf_size;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	uargs = (struct btrfs_ioctl_search_args __user *)argp;
	if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
		return -EFAULT;

	buf_size = sizeof(uargs->buf);
	ret = search_ioctl(file_inode(file), &sk, &buf_size, uargs->buf);

	/*
	 * In the original implementation an overflow is handled by returning
	 * a search header with a len of zero, so -EOVERFLOW is treated as
	 * success here.
	 */
	if (ret != 0 && ret != -EOVERFLOW)
		return ret;

	if (copy_to_user(&uargs->key, &sk, sizeof(sk)))
		return -EFAULT;

	return 0;
}
2014-01-30 16:24:03 +01:00
/*
 * Tree search ioctl, version 2 (caller-sized result buffer).
 *
 * Requires CAP_SYS_ADMIN.  Reads the search key and buffer size from @argp,
 * limits the buffer to 16MB and runs the search into the buffer that follows
 * the header in user memory.
 *
 * On success the updated key (including the number of items found) is
 * written back.  If the buffer is too small, -EOVERFLOW is returned and the
 * required size is stored in the user's buf_size field.
 *
 * Returns 0, -EPERM, -EFAULT, -EOVERFLOW or another negative error from
 * search_ioctl().
 */
static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
					       void __user *argp)
{
	struct btrfs_ioctl_search_args_v2 __user *uarg;
	struct btrfs_ioctl_search_args_v2 args;
	struct inode *inode;
	const u64 buf_limit = SZ_16M;
	size_t buf_size;
	u64 needed;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* copy search header and buffer size */
	uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
	if (copy_from_user(&args, uarg, sizeof(args)))
		return -EFAULT;

	/*
	 * Limit the result size to 16MB.  The comparison is done on the full
	 * 64-bit user value, before narrowing to size_t, so that a huge size
	 * is clamped rather than truncated where size_t is 32 bits wide.
	 */
	if (args.buf_size > buf_limit)
		buf_size = buf_limit;
	else
		buf_size = args.buf_size;

	inode = file_inode(file);
	ret = search_ioctl(inode, &args.key, &buf_size,
			   (char __user *)(&uarg->buf[0]));
	if (ret == 0) {
		if (copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
			ret = -EFAULT;
	} else if (ret == -EOVERFLOW) {
		/*
		 * The user-visible buf_size field is 64 bits wide; write all
		 * of it instead of only sizeof(size_t) bytes, which would
		 * leave part of the field stale (and update the wrong half
		 * on 32-bit big-endian systems).
		 */
		needed = buf_size;
		if (copy_to_user(&uarg->buf_size, &needed, sizeof(needed)))
			ret = -EFAULT;
	}

	return ret;
}
2009-11-18 05:42:14 +00:00
/*
2010-02-28 15:39:26 -05:00
* Search INODE_REFs to identify path name of ' dirid ' directory
* in a ' tree_id ' tree . and sets path name to ' name ' .
*/
2009-11-18 05:42:14 +00:00
static noinline int btrfs_search_path_in_tree ( struct btrfs_fs_info * info ,
u64 tree_id , u64 dirid , char * name )
{
struct btrfs_root * root ;
struct btrfs_key key ;
2010-02-28 15:39:26 -05:00
char * ptr ;
2009-11-18 05:42:14 +00:00
int ret = - 1 ;
int slot ;
int len ;
int total_len = 0 ;
struct btrfs_inode_ref * iref ;
struct extent_buffer * l ;
struct btrfs_path * path ;
if ( dirid = = BTRFS_FIRST_FREE_OBJECTID ) {
name [ 0 ] = ' \0 ' ;
return 0 ;
}
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
2017-12-01 11:19:42 +02:00
ptr = & name [ BTRFS_INO_LOOKUP_PATH_MAX - 1 ] ;
2009-11-18 05:42:14 +00:00
2020-05-15 19:35:55 +02:00
root = btrfs_get_fs_root ( info , tree_id , true ) ;
2009-11-18 05:42:14 +00:00
if ( IS_ERR ( root ) ) {
2018-05-21 13:57:27 +09:00
ret = PTR_ERR ( root ) ;
2020-01-24 09:32:34 -05:00
root = NULL ;
goto out ;
}
2009-11-18 05:42:14 +00:00
key . objectid = dirid ;
key . type = BTRFS_INODE_REF_KEY ;
2010-03-18 12:23:10 -04:00
key . offset = ( u64 ) - 1 ;
2009-11-18 05:42:14 +00:00
2013-10-31 10:33:04 +05:30
while ( 1 ) {
2009-11-18 05:42:14 +00:00
ret = btrfs_search_slot ( NULL , root , & key , path , 0 , 0 ) ;
if ( ret < 0 )
goto out ;
2013-08-14 03:00:21 +01:00
else if ( ret > 0 ) {
ret = btrfs_previous_item ( root , path , dirid ,
BTRFS_INODE_REF_KEY ) ;
if ( ret < 0 )
goto out ;
else if ( ret > 0 ) {
ret = - ENOENT ;
goto out ;
}
}
2009-11-18 05:42:14 +00:00
l = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
btrfs_item_key_to_cpu ( l , & key , slot ) ;
iref = btrfs_item_ptr ( l , slot , struct btrfs_inode_ref ) ;
len = btrfs_inode_ref_name_len ( l , iref ) ;
ptr - = len + 1 ;
total_len + = len + 1 ;
2013-08-14 03:00:20 +01:00
if ( ptr < name ) {
ret = - ENAMETOOLONG ;
2009-11-18 05:42:14 +00:00
goto out ;
2013-08-14 03:00:20 +01:00
}
2009-11-18 05:42:14 +00:00
* ( ptr + len ) = ' / ' ;
2013-10-31 10:33:04 +05:30
read_extent_buffer ( l , ptr , ( unsigned long ) ( iref + 1 ) , len ) ;
2009-11-18 05:42:14 +00:00
if ( key . offset = = BTRFS_FIRST_FREE_OBJECTID )
break ;
2011-04-21 01:20:15 +02:00
btrfs_release_path ( path ) ;
2009-11-18 05:42:14 +00:00
key . objectid = key . offset ;
2010-03-18 12:23:10 -04:00
key . offset = ( u64 ) - 1 ;
2009-11-18 05:42:14 +00:00
dirid = key . objectid ;
}
2011-07-14 03:16:00 +00:00
memmove ( name , ptr , total_len ) ;
2013-10-31 10:33:04 +05:30
name [ total_len ] = ' \0 ' ;
2009-11-18 05:42:14 +00:00
ret = 0 ;
out :
2020-01-24 09:33:01 -05:00
btrfs_put_root ( root ) ;
2009-11-18 05:42:14 +00:00
btrfs_free_path ( path ) ;
2010-02-28 15:39:26 -05:00
return ret ;
}
2018-05-21 10:09:44 +09:00
static int btrfs_search_path_in_tree_user ( struct inode * inode ,
struct btrfs_ioctl_ino_lookup_user_args * args )
{
struct btrfs_fs_info * fs_info = BTRFS_I ( inode ) - > root - > fs_info ;
struct super_block * sb = inode - > i_sb ;
struct btrfs_key upper_limit = BTRFS_I ( inode ) - > location ;
u64 treeid = BTRFS_I ( inode ) - > root - > root_key . objectid ;
u64 dirid = args - > dirid ;
unsigned long item_off ;
unsigned long item_len ;
struct btrfs_inode_ref * iref ;
struct btrfs_root_ref * rref ;
2020-01-24 09:32:35 -05:00
struct btrfs_root * root = NULL ;
2018-05-21 10:09:44 +09:00
struct btrfs_path * path ;
struct btrfs_key key , key2 ;
struct extent_buffer * leaf ;
struct inode * temp_inode ;
char * ptr ;
int slot ;
int len ;
int total_len = 0 ;
int ret ;
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
/*
* If the bottom subvolume does not exist directly under upper_limit ,
* construct the path in from the bottom up .
*/
if ( dirid ! = upper_limit . objectid ) {
ptr = & args - > path [ BTRFS_INO_LOOKUP_USER_PATH_MAX - 1 ] ;
2020-05-15 19:35:55 +02:00
root = btrfs_get_fs_root ( fs_info , treeid , true ) ;
2018-05-21 10:09:44 +09:00
if ( IS_ERR ( root ) ) {
ret = PTR_ERR ( root ) ;
goto out ;
}
key . objectid = dirid ;
key . type = BTRFS_INODE_REF_KEY ;
key . offset = ( u64 ) - 1 ;
while ( 1 ) {
ret = btrfs_search_slot ( NULL , root , & key , path , 0 , 0 ) ;
if ( ret < 0 ) {
2020-01-24 09:32:35 -05:00
goto out_put ;
2018-05-21 10:09:44 +09:00
} else if ( ret > 0 ) {
ret = btrfs_previous_item ( root , path , dirid ,
BTRFS_INODE_REF_KEY ) ;
if ( ret < 0 ) {
2020-01-24 09:32:35 -05:00
goto out_put ;
2018-05-21 10:09:44 +09:00
} else if ( ret > 0 ) {
ret = - ENOENT ;
2020-01-24 09:32:35 -05:00
goto out_put ;
2018-05-21 10:09:44 +09:00
}
}
leaf = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & key , slot ) ;
iref = btrfs_item_ptr ( leaf , slot , struct btrfs_inode_ref ) ;
len = btrfs_inode_ref_name_len ( leaf , iref ) ;
ptr - = len + 1 ;
total_len + = len + 1 ;
if ( ptr < args - > path ) {
ret = - ENAMETOOLONG ;
2020-01-24 09:32:35 -05:00
goto out_put ;
2018-05-21 10:09:44 +09:00
}
* ( ptr + len ) = ' / ' ;
read_extent_buffer ( leaf , ptr ,
( unsigned long ) ( iref + 1 ) , len ) ;
/* Check the read+exec permission of this directory */
ret = btrfs_previous_item ( root , path , dirid ,
BTRFS_INODE_ITEM_KEY ) ;
if ( ret < 0 ) {
2020-01-24 09:32:35 -05:00
goto out_put ;
2018-05-21 10:09:44 +09:00
} else if ( ret > 0 ) {
ret = - ENOENT ;
2020-01-24 09:32:35 -05:00
goto out_put ;
2018-05-21 10:09:44 +09:00
}
leaf = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & key2 , slot ) ;
if ( key2 . objectid ! = dirid ) {
ret = - ENOENT ;
2020-01-24 09:32:35 -05:00
goto out_put ;
2018-05-21 10:09:44 +09:00
}
2020-05-15 19:35:59 +02:00
temp_inode = btrfs_iget ( sb , key2 . objectid , root ) ;
2018-06-04 16:41:07 +09:00
if ( IS_ERR ( temp_inode ) ) {
ret = PTR_ERR ( temp_inode ) ;
2020-01-24 09:32:35 -05:00
goto out_put ;
2018-06-04 16:41:07 +09:00
}
2021-01-21 14:19:24 +01:00
ret = inode_permission ( & init_user_ns , temp_inode ,
MAY_READ | MAY_EXEC ) ;
2018-05-21 10:09:44 +09:00
iput ( temp_inode ) ;
if ( ret ) {
ret = - EACCES ;
2020-01-24 09:32:35 -05:00
goto out_put ;
2018-05-21 10:09:44 +09:00
}
if ( key . offset = = upper_limit . objectid )
break ;
if ( key . objectid = = BTRFS_FIRST_FREE_OBJECTID ) {
ret = - EACCES ;
2020-01-24 09:32:35 -05:00
goto out_put ;
2018-05-21 10:09:44 +09:00
}
btrfs_release_path ( path ) ;
key . objectid = key . offset ;
key . offset = ( u64 ) - 1 ;
dirid = key . objectid ;
}
memmove ( args - > path , ptr , total_len ) ;
args - > path [ total_len ] = ' \0 ' ;
2020-01-24 09:33:01 -05:00
btrfs_put_root ( root ) ;
2020-01-24 09:32:35 -05:00
root = NULL ;
2018-05-21 10:09:44 +09:00
btrfs_release_path ( path ) ;
}
/* Get the bottom subvolume's name from ROOT_REF */
key . objectid = treeid ;
key . type = BTRFS_ROOT_REF_KEY ;
key . offset = args - > treeid ;
2020-01-24 09:32:35 -05:00
ret = btrfs_search_slot ( NULL , fs_info - > tree_root , & key , path , 0 , 0 ) ;
2018-05-21 10:09:44 +09:00
if ( ret < 0 ) {
goto out ;
} else if ( ret > 0 ) {
ret = - ENOENT ;
goto out ;
}
leaf = path - > nodes [ 0 ] ;
slot = path - > slots [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & key , slot ) ;
item_off = btrfs_item_ptr_offset ( leaf , slot ) ;
item_len = btrfs_item_size_nr ( leaf , slot ) ;
/* Check if dirid in ROOT_REF corresponds to passed dirid */
rref = btrfs_item_ptr ( leaf , slot , struct btrfs_root_ref ) ;
if ( args - > dirid ! = btrfs_root_ref_dirid ( leaf , rref ) ) {
ret = - EINVAL ;
goto out ;
}
/* Copy subvolume's name */
item_off + = sizeof ( struct btrfs_root_ref ) ;
item_len - = sizeof ( struct btrfs_root_ref ) ;
read_extent_buffer ( leaf , args - > name , item_off , item_len ) ;
args - > name [ item_len ] = 0 ;
2020-01-24 09:32:35 -05:00
out_put :
2020-01-24 09:33:01 -05:00
btrfs_put_root ( root ) ;
2018-05-21 10:09:44 +09:00
out :
btrfs_free_path ( path ) ;
return ret ;
}
2010-02-28 15:39:26 -05:00
/*
 * Inode lookup ioctl: resolve the path of args->objectid inside the tree
 * args->treeid (0 selects the tree that @file lives in).
 *
 * Asking for BTRFS_FIRST_FREE_OBJECTID is allowed without privileges and
 * only reports the containing subvolume id with an empty name; every other
 * lookup requires CAP_SYS_ADMIN.
 *
 * Returns 0 on success, -EPERM without the capability, -EFAULT if the
 * result cannot be copied back, or a negative error from memdup_user() or
 * btrfs_search_path_in_tree().
 */
static noinline int btrfs_ioctl_ino_lookup(struct file *file,
					   void __user *argp)
{
	struct btrfs_ioctl_ino_lookup_args *args;
	struct inode *inode = file_inode(file);
	int ret = 0;

	args = memdup_user(argp, sizeof(*args));
	if (IS_ERR(args))
		return PTR_ERR(args);

	if (args->treeid == 0)
		args->treeid = BTRFS_I(inode)->root->root_key.objectid;

	if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
		/*
		 * Unprivileged query to obtain the containing subvolume root
		 * id.  The path is reset so it's consistent with
		 * btrfs_search_path_in_tree.
		 */
		args->name[0] = 0;
	} else if (!capable(CAP_SYS_ADMIN)) {
		ret = -EPERM;
	} else {
		ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
						args->treeid, args->objectid,
						args->name);
	}

	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
		ret = -EFAULT;

	kfree(args);
	return ret;
}
2018-05-21 10:09:44 +09:00
/*
 * Version of the ino_lookup ioctl for unprivileged users.
 *
 * The main differences from the ino_lookup ioctl are:
 *
 * 1. Read+Exec permission is checked with inode_permission() during path
 *    construction.  -EACCES is returned in case of failure.
 * 2. Path construction stops at the inode that corresponds to the fd this
 *    ioctl is called with.  If the constructed path does not exist under
 *    the fd's inode, -EACCES is returned.
 * 3. The name of the bottom subvolume is also looked up and filled in.
 */
static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
{
	struct btrfs_ioctl_ino_lookup_user_args *args;
	struct inode *inode = file_inode(file);
	int ret;

	args = memdup_user(argp, sizeof(*args));
	if (IS_ERR(args))
		return PTR_ERR(args);

	/*
	 * The subvolume does not exist under the fd with which this is
	 * called.
	 */
	if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
	    BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
		ret = -EACCES;
		goto out_free;
	}

	ret = btrfs_search_path_in_tree_user(inode, args);
	if (!ret && copy_to_user(argp, args, sizeof(*args)))
		ret = -EFAULT;

out_free:
	kfree(args);
	return ret;
}
2018-05-21 10:09:42 +09:00
/*
 * Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF.
 *
 * Fills a struct btrfs_ioctl_get_subvol_info_args for the subvolume that
 * contains @file: ids, generation, flags, UUIDs and timestamps come from the
 * root item; parent id, dirid and name come from the ROOT_BACKREF item in
 * the root tree (not looked up for BTRFS_FS_TREE_OBJECTID).
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -ENOENT if the
 * backref is missing, -EUCLEAN if the search runs off the end of the tree,
 * -EFAULT if the result cannot be copied to @argp, or another negative
 * error from the root lookup or the tree search.
 */
static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
{
	struct btrfs_ioctl_get_subvol_info_args *subvol_info;
	struct btrfs_fs_info *fs_info;
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root_item *root_item;
	struct btrfs_root_ref *rref;
	struct extent_buffer *leaf;
	unsigned long item_off;
	unsigned long item_len;
	struct inode *inode;
	int slot;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Zeroed so that fields which are not filled in read as 0. */
	subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
	if (!subvol_info) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	inode = file_inode(file);
	fs_info = BTRFS_I(inode)->root->fs_info;

	/* Get root_item of inode's subvolume */
	key.objectid = BTRFS_I(inode)->root->root_key.objectid;
	root = btrfs_get_fs_root(fs_info, key.objectid, true);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);
		goto out_free;
	}
	root_item = &root->root_item;

	subvol_info->treeid = key.objectid;

	subvol_info->generation = btrfs_root_generation(root_item);
	subvol_info->flags = btrfs_root_flags(root_item);

	memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
	memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
						    BTRFS_UUID_SIZE);
	memcpy(subvol_info->received_uuid, root_item->received_uuid,
						    BTRFS_UUID_SIZE);

	subvol_info->ctransid = btrfs_root_ctransid(root_item);
	subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
	subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);

	subvol_info->otransid = btrfs_root_otransid(root_item);
	subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
	subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);

	subvol_info->stransid = btrfs_root_stransid(root_item);
	subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
	subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);

	subvol_info->rtransid = btrfs_root_rtransid(root_item);
	subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
	subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);

	if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
		/* Search root tree for ROOT_BACKREF of this subvolume */
		key.type = BTRFS_ROOT_BACKREF_KEY;
		key.offset = 0;
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
		if (ret < 0) {
			goto out;
		} else if (path->slots[0] >=
			   btrfs_header_nritems(path->nodes[0])) {
			/* Landed past the last item: move to the next leaf. */
			ret = btrfs_next_leaf(fs_info->tree_root, path);
			if (ret < 0) {
				goto out;
			} else if (ret > 0) {
				ret = -EUCLEAN;
				goto out;
			}
		}

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid == subvol_info->treeid &&
		    key.type == BTRFS_ROOT_BACKREF_KEY) {
			subvol_info->parent_id = key.offset;

			rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
			subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);

			/* The name is stored right behind the root ref. */
			item_off = btrfs_item_ptr_offset(leaf, slot)
					+ sizeof(struct btrfs_root_ref);
			item_len = btrfs_item_size_nr(leaf, slot)
					- sizeof(struct btrfs_root_ref);
			read_extent_buffer(leaf, subvol_info->name,
					   item_off, item_len);
		} else {
			ret = -ENOENT;
			goto out;
		}
	}

	/*
	 * Free the path before copying to user space: copy_to_user() may
	 * fault, and that must not happen while the path still holds on to
	 * tree blocks.
	 */
	btrfs_free_path(path);
	path = NULL;

	/*
	 * btrfs_search_slot() leaves a positive value in ret when the key
	 * was not matched exactly; that must not leak out as the ioctl
	 * return value.
	 */
	ret = 0;
	if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
		ret = -EFAULT;

out:
	btrfs_put_root(root);
out_free:
	btrfs_free_path(path);
	kfree(subvol_info);
	return ret;
}
2018-05-21 10:09:43 +09:00
/*
 * Return ROOT_REF information of the subvolume containing this inode
 * except the subvolume name.
 *
 * Starting at rootrefs->min_treeid, up to BTRFS_MAX_ROOTREF_BUFFER_NUM
 * (treeid, dirid) pairs are collected from the ROOT_REF items of the
 * subvolume in the root tree.  num_items is set to the number of entries
 * returned and min_treeid is advanced so a following call continues behind
 * the last entry.
 *
 * Returns 0 when all references were returned, -EOVERFLOW when the buffer
 * was filled and more references remain (the result is still copied out),
 * -ENOMEM on allocation failure, -EUCLEAN if the search runs off the end of
 * the tree, -EFAULT on a bad user pointer, or another negative error from
 * the tree search.
 */
static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
{
	struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
	struct btrfs_root_ref *rref;
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct inode *inode;
	u64 objectid;
	int slot;
	int ret;
	u8 found;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	rootrefs = memdup_user(argp, sizeof(*rootrefs));
	if (IS_ERR(rootrefs)) {
		btrfs_free_path(path);
		return PTR_ERR(rootrefs);
	}

	inode = file_inode(file);
	root = BTRFS_I(inode)->root->fs_info->tree_root;
	objectid = BTRFS_I(inode)->root->root_key.objectid;

	key.objectid = objectid;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = rootrefs->min_treeid;
	found = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (path->slots[0] >=
		   btrfs_header_nritems(path->nodes[0])) {
		/* Landed past the last item: move to the next leaf. */
		ret = btrfs_next_leaf(root, path);
		if (ret < 0) {
			goto out;
		} else if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* No more ROOT_REF items for this subvolume. */
		if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
			ret = 0;
			goto out;
		}

		/* Buffer full, but there is at least one more reference. */
		if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
			ret = -EOVERFLOW;
			goto out;
		}

		rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
		rootrefs->rootref[found].treeid = key.offset;
		rootrefs->rootref[found].dirid =
				  btrfs_root_ref_dirid(leaf, rref);
		found++;

		ret = btrfs_next_item(root, path);
		if (ret < 0) {
			goto out;
		} else if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	/*
	 * Free the path before copying to user space: copy_to_user() may
	 * fault, and that must not happen while the path still holds on to
	 * tree blocks.
	 */
	btrfs_free_path(path);

	if (!ret || ret == -EOVERFLOW) {
		rootrefs->num_items = found;
		/* update min_treeid for next search */
		if (found)
			rootrefs->min_treeid =
				rootrefs->rootref[found - 1].treeid + 1;
		if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
			ret = -EFAULT;
	}

	kfree(rootrefs);

	return ret;
}
2009-09-21 16:00:26 -04:00
static noinline int btrfs_ioctl_snap_destroy ( struct file * file ,
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
void __user * arg ,
bool destroy_v2 )
2009-09-21 16:00:26 -04:00
{
2013-09-01 15:57:51 -04:00
struct dentry * parent = file - > f_path . dentry ;
2016-06-22 18:54:23 -04:00
struct btrfs_fs_info * fs_info = btrfs_sb ( parent - > d_sb ) ;
2009-09-21 16:00:26 -04:00
struct dentry * dentry ;
2015-03-17 22:25:59 +00:00
struct inode * dir = d_inode ( parent ) ;
2009-09-21 16:00:26 -04:00
struct inode * inode ;
struct btrfs_root * root = BTRFS_I ( dir ) - > root ;
struct btrfs_root * dest = NULL ;
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
struct btrfs_ioctl_vol_args * vol_args = NULL ;
struct btrfs_ioctl_vol_args_v2 * vol_args2 = NULL ;
char * subvol_name , * subvol_name_ptr = NULL ;
int subvol_namelen ;
2009-09-21 16:00:26 -04:00
int err = 0 ;
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
bool destroy_parent = false ;
2009-09-21 16:00:26 -04:00
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
if ( destroy_v2 ) {
vol_args2 = memdup_user ( arg , sizeof ( * vol_args2 ) ) ;
if ( IS_ERR ( vol_args2 ) )
return PTR_ERR ( vol_args2 ) ;
2016-09-21 08:31:29 -04:00
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
if ( vol_args2 - > flags & ~ BTRFS_SUBVOL_DELETE_ARGS_MASK ) {
err = - EOPNOTSUPP ;
goto out ;
}
2009-09-21 16:00:26 -04:00
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
/*
* If SPEC_BY_ID is not set , we are looking for the subvolume by
* name , same as v1 currently does .
*/
if ( ! ( vol_args2 - > flags & BTRFS_SUBVOL_SPEC_BY_ID ) ) {
vol_args2 - > name [ BTRFS_SUBVOL_NAME_MAX ] = 0 ;
subvol_name = vol_args2 - > name ;
err = mnt_want_write_file ( file ) ;
if ( err )
goto out ;
} else {
if ( vol_args2 - > subvolid < BTRFS_FIRST_FREE_OBJECTID ) {
err = - EINVAL ;
goto out ;
}
err = mnt_want_write_file ( file ) ;
if ( err )
goto out ;
dentry = btrfs_get_dentry ( fs_info - > sb ,
BTRFS_FIRST_FREE_OBJECTID ,
vol_args2 - > subvolid , 0 , 0 ) ;
if ( IS_ERR ( dentry ) ) {
err = PTR_ERR ( dentry ) ;
goto out_drop_write ;
}
/*
* Change the default parent since the subvolume being
* deleted can be outside of the current mount point .
*/
parent = btrfs_get_parent ( dentry ) ;
/*
* At this point dentry - > d_name can point to ' / ' if the
* subvolume we want to destroy is outsite of the
* current mount point , so we need to release the
* current dentry and execute the lookup to return a new
* one with - > d_name pointing to the
* < mount point > / subvol_name .
*/
dput ( dentry ) ;
if ( IS_ERR ( parent ) ) {
err = PTR_ERR ( parent ) ;
goto out_drop_write ;
}
dir = d_inode ( parent ) ;
/*
* If v2 was used with SPEC_BY_ID , a new parent was
* allocated since the subvolume can be outside of the
* current mount point . Later on we need to release this
* new parent dentry .
*/
destroy_parent = true ;
subvol_name_ptr = btrfs_get_subvol_name_from_objectid (
fs_info , vol_args2 - > subvolid ) ;
if ( IS_ERR ( subvol_name_ptr ) ) {
err = PTR_ERR ( subvol_name_ptr ) ;
goto free_parent ;
}
/* subvol_name_ptr is already NULL termined */
subvol_name = ( char * ) kbasename ( subvol_name_ptr ) ;
}
} else {
vol_args = memdup_user ( arg , sizeof ( * vol_args ) ) ;
if ( IS_ERR ( vol_args ) )
return PTR_ERR ( vol_args ) ;
vol_args - > name [ BTRFS_PATH_NAME_MAX ] = 0 ;
subvol_name = vol_args - > name ;
err = mnt_want_write_file ( file ) ;
if ( err )
goto out ;
2009-09-21 16:00:26 -04:00
}
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
subvol_namelen = strlen ( subvol_name ) ;
2009-09-21 16:00:26 -04:00
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
if ( strchr ( subvol_name , ' / ' ) | |
strncmp ( subvol_name , " .. " , subvol_namelen ) = = 0 ) {
err = - EINVAL ;
goto free_subvol_name ;
}
if ( ! S_ISDIR ( dir - > i_mode ) ) {
err = - ENOTDIR ;
goto free_subvol_name ;
}
2014-04-15 16:41:44 +02:00
2016-05-26 00:05:12 -04:00
err = down_write_killable_nested ( & dir - > i_rwsem , I_MUTEX_PARENT ) ;
if ( err = = - EINTR )
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
goto free_subvol_name ;
dentry = lookup_one_len ( subvol_name , parent , subvol_namelen ) ;
2009-09-21 16:00:26 -04:00
if ( IS_ERR ( dentry ) ) {
err = PTR_ERR ( dentry ) ;
goto out_unlock_dir ;
}
2015-03-17 22:25:59 +00:00
if ( d_really_is_negative ( dentry ) ) {
2009-09-21 16:00:26 -04:00
err = - ENOENT ;
goto out_dput ;
}
2015-03-17 22:25:59 +00:00
inode = d_inode ( dentry ) ;
2010-10-29 15:46:43 -04:00
dest = BTRFS_I ( inode ) - > root ;
2013-10-31 10:33:04 +05:30
if ( ! capable ( CAP_SYS_ADMIN ) ) {
2010-10-29 15:46:43 -04:00
/*
* Regular user . Only allow this with a special mount
* option , when the user has write + exec access to the
* subvol root , and when rmdir ( 2 ) would have been
* allowed .
*
* Note that this is _not_ check that the subvol is
* empty or doesn ' t contain data that we wouldn ' t
* otherwise be able to delete .
*
* Users who want to delete empty subvols should try
* rmdir ( 2 ) .
*/
err = - EPERM ;
2016-06-22 18:54:23 -04:00
if ( ! btrfs_test_opt ( fs_info , USER_SUBVOL_RM_ALLOWED ) )
2010-10-29 15:46:43 -04:00
goto out_dput ;
/*
* Do not allow deletion if the parent dir is the same
* as the dir to be deleted . That means the ioctl
* must be called on the dentry referencing the root
* of the subvol , not a random directory contained
* within it .
*/
err = - EINVAL ;
if ( root = = dest )
goto out_dput ;
2021-01-21 14:19:24 +01:00
err = inode_permission ( & init_user_ns , inode ,
MAY_WRITE | MAY_EXEC ) ;
2010-10-29 15:46:43 -04:00
if ( err )
goto out_dput ;
}
2012-10-22 11:39:53 +00:00
/* check if subvolume may be deleted by a user */
err = btrfs_may_delete ( dir , dentry , 1 ) ;
if ( err )
goto out_dput ;
2017-01-10 20:35:31 +02:00
if ( btrfs_ino ( BTRFS_I ( inode ) ) ! = BTRFS_FIRST_FREE_OBJECTID ) {
2009-09-21 16:00:26 -04:00
err = - EINVAL ;
goto out_dput ;
}
2021-02-10 17:14:34 -05:00
btrfs_inode_lock ( inode , 0 ) ;
2018-04-18 11:34:52 +09:00
err = btrfs_delete_subvolume ( dir , dentry ) ;
2021-02-10 17:14:34 -05:00
btrfs_inode_unlock ( inode , 0 ) ;
2019-05-26 17:34:03 +03:00
if ( ! err ) {
fsnotify_rmdir ( dir , dentry ) ;
2009-09-21 16:00:26 -04:00
d_delete ( dentry ) ;
2019-05-26 17:34:03 +03:00
}
Btrfs: fix cleaner thread not working with inode cache option
Right now inode cache inode is treated as the same as space cache
inode, ie. keep inode in memory till putting super.
But this leads to an awkward situation.
If we're going to delete a snapshot/subvolume, btrfs will not
actually delete it and return free space, but will add it to dead
roots list until the last inode on this snap/subvol being destroyed.
Then we'll fetch deleted roots and cleanup them via cleaner thread.
So here is the problem, if we enable inode cache option, each
snap/subvol has a cached inode which is used to store inode allcation
information. And this cache inode will be kept in memory, as the above
said. So with inode cache, snap/subvol can only be added into
dead roots list during freeing roots stage in umount, so that we can
ONLY get space back after another remount(we cleanup dead roots on mount).
But the real thing is we'll no more use the snap/subvol if we mark it
deleted, so we can safely iput its cache inode when we delete snap/subvol.
Another thing is that we need to change the rules of droping inode, we
don't keep snap/subvol's cache inode in memory till end so that we can
add snap/subvol into dead roots list in time.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-02-20 14:10:23 +00:00
2009-09-21 16:00:26 -04:00
out_dput :
dput ( dentry ) ;
out_unlock_dir :
2021-02-10 17:14:34 -05:00
btrfs_inode_unlock ( dir , 0 ) ;
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
free_subvol_name :
kfree ( subvol_name_ptr ) ;
free_parent :
if ( destroy_parent )
dput ( parent ) ;
2016-05-26 00:05:12 -04:00
out_drop_write :
2011-12-09 08:06:57 -05:00
mnt_drop_write_file ( file ) ;
2009-09-21 16:00:26 -04:00
out :
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
kfree ( vol_args2 ) ;
2009-09-21 16:00:26 -04:00
kfree ( vol_args ) ;
return err ;
}
2010-03-11 09:42:04 -05:00
/*
 * Defragment the object the ioctl was issued on.
 *
 * @file: target of the ioctl; a directory triggers btrfs_defrag_root() on
 *        its root, a regular file triggers btrfs_defrag_file()
 * @argp: optional user pointer to struct btrfs_ioctl_defrag_range_args,
 *        only consulted for regular files; may be NULL
 *
 * Return: 0 on success, a negative errno otherwise (-EROFS for a read-only
 * root, -EINVAL for any other file type).
 */
static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ioctl_defrag_range_args *range;
	int ret;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (btrfs_root_readonly(root)) {
		ret = -EROFS;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/* Defragmenting a whole root is restricted to the admin. */
		if (!capable(CAP_SYS_ADMIN)) {
			ret = -EPERM;
			goto out;
		}
		ret = btrfs_defrag_root(root);
	} else if (S_ISREG(inode->i_mode)) {
		/*
		 * The file descriptor itself is not checked for write access.
		 * This prevents defragmenting executables that are running
		 * and allows defrag on files opened read-only.
		 */
		if (!capable(CAP_SYS_ADMIN) &&
		    inode_permission(&init_user_ns, inode, MAY_WRITE)) {
			ret = -EPERM;
			goto out;
		}

		range = kzalloc(sizeof(*range), GFP_KERNEL);
		if (!range) {
			ret = -ENOMEM;
			goto out;
		}

		if (!argp) {
			/*
			 * No arguments supplied: use the maximum length, all
			 * remaining fields stay zero from kzalloc().
			 */
			range->len = (u64)-1;
		} else {
			if (copy_from_user(range, argp, sizeof(*range))) {
				kfree(range);
				ret = -EFAULT;
				goto out;
			}
			/* Compression requires us to start the IO. */
			if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
				range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
				range->extent_thresh = (u32)-1;
			}
		}

		ret = btrfs_defrag_file(inode, file, range,
					BTRFS_OLDEST_GENERATION, 0);
		/* Positive return values are not errors; report success. */
		if (ret > 0)
			ret = 0;
		kfree(range);
	} else {
		ret = -EINVAL;
	}

out:
	mnt_drop_write_file(file);
	return ret;
}
2016-06-22 18:54:24 -04:00
/*
 * Add the device named in the user supplied btrfs_ioctl_vol_args to the
 * filesystem.
 *
 * @fs_info: filesystem to extend
 * @arg:     user pointer to struct btrfs_ioctl_vol_args; ->name carries the
 *           device path
 *
 * Requires CAP_SYS_ADMIN and runs as the BTRFS_EXCLOP_DEV_ADD exclusive
 * operation.
 *
 * Return: 0 on success, -EPERM without the capability,
 * BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS if the exclusive operation could not
 * be started, or the error from copying the arguments / adding the device.
 */
static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
{
	struct btrfs_ioctl_vol_args *args;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args)) {
		ret = PTR_ERR(args);
	} else {
		/* Never trust user space to terminate the path. */
		args->name[BTRFS_PATH_NAME_MAX] = '\0';

		ret = btrfs_init_new_device(fs_info, args->name);
		if (ret == 0)
			btrfs_info(fs_info, "disk added %s", args->name);

		kfree(args);
	}

	btrfs_exclop_finish(fs_info);
	return ret;
}
2016-02-13 10:01:39 +08:00
/*
 * Remove a device from the filesystem, v2 interface.
 *
 * @file: file the ioctl was issued on
 * @arg:  user pointer to struct btrfs_ioctl_vol_args_v2; the device is
 *        identified by ->devid when BTRFS_DEVICE_SPEC_BY_ID is set in
 *        ->flags and by ->name otherwise
 *
 * Requires CAP_SYS_ADMIN and runs as the BTRFS_EXCLOP_DEV_REMOVE exclusive
 * operation.
 *
 * Return: 0 on success, -EPERM without the capability, -EOPNOTSUPP for flags
 * outside BTRFS_DEVICE_REMOVE_ARGS_MASK,
 * BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS if the exclusive operation could not
 * be started, or another error from the helpers called.
 */
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ioctl_vol_args_v2 *args;
	bool by_id;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args)) {
		ret = PTR_ERR(args);
		goto drop_write;
	}

	if (args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
		ret = -EOPNOTSUPP;
		goto free_args;
	}

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
		goto free_args;
	}

	by_id = !!(args->flags & BTRFS_DEVICE_SPEC_BY_ID);
	if (by_id) {
		ret = btrfs_rm_device(fs_info, NULL, args->devid);
	} else {
		/* Never trust user space to terminate the name. */
		args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
		ret = btrfs_rm_device(fs_info, args->name, 0);
	}
	btrfs_exclop_finish(fs_info);

	/* Log only successful removals, after the exclusive op has ended. */
	if (ret == 0) {
		if (by_id)
			btrfs_info(fs_info, "device deleted: id %llu",
				   args->devid);
		else
			btrfs_info(fs_info, "device deleted: %s",
				   args->name);
	}

free_args:
	kfree(args);
drop_write:
	mnt_drop_write_file(file);
	return ret;
}
2012-11-26 08:44:50 +00:00
/*
 * Remove the device named in the user supplied btrfs_ioctl_vol_args from
 * the filesystem (v1 interface, lookup by name only).
 *
 * @file: file the ioctl was issued on
 * @arg:  user pointer to struct btrfs_ioctl_vol_args; ->name identifies the
 *        device
 *
 * Requires CAP_SYS_ADMIN and runs as the BTRFS_EXCLOP_DEV_REMOVE exclusive
 * operation.
 *
 * Return: 0 on success, -EPERM without the capability,
 * BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS if the exclusive operation could not
 * be started, or another error from the helpers called.
 */
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ioctl_vol_args *args;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
		goto drop_write;
	}

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args)) {
		ret = PTR_ERR(args);
	} else {
		/* Never trust user space to terminate the path. */
		args->name[BTRFS_PATH_NAME_MAX] = '\0';

		ret = btrfs_rm_device(fs_info, args->name, 0);
		if (ret == 0)
			btrfs_info(fs_info, "disk deleted %s", args->name);

		kfree(args);
	}

	btrfs_exclop_finish(fs_info);
drop_write:
	mnt_drop_write_file(file);

	return ret;
}
2016-06-22 18:54:24 -04:00
/*
 * Report filesystem-wide information to user space.
 *
 * @fs_info: filesystem to describe
 * @arg:     user pointer to struct btrfs_ioctl_fs_info_args; ->flags is read
 *           as the set of optional fields requested, then the whole
 *           structure is cleared, filled in and copied back
 *
 * The device count, highest device id, fsid, nodesize, sectorsize and
 * clone_alignment are always reported.  Checksum info, generation and
 * metadata UUID are reported only when the matching BTRFS_FS_INFO_FLAG_* bit
 * was set on input; the same bit is then set in the returned ->flags.
 *
 * Return: 0 on success, -EFAULT if the result cannot be copied out, or the
 * error from copying the arguments in.
 */
static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
				void __user *arg)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_ioctl_fs_info_args *info;
	struct btrfs_device *dev;
	u64 requested;
	u64 highest_devid = 0;
	int ret = 0;

	info = memdup_user(arg, sizeof(*info));
	if (IS_ERR(info))
		return PTR_ERR(info);

	/* Only the flags are input; everything else is output. */
	requested = info->flags;
	memset(info, 0, sizeof(*info));

	rcu_read_lock();
	info->num_devices = fs_devices->num_devices;
	list_for_each_entry_rcu(dev, &fs_devices->devices, dev_list) {
		if (dev->devid > highest_devid)
			highest_devid = dev->devid;
	}
	rcu_read_unlock();
	info->max_id = highest_devid;

	memcpy(&info->fsid, fs_devices->fsid, sizeof(info->fsid));
	info->nodesize = fs_info->nodesize;
	info->sectorsize = fs_info->sectorsize;
	info->clone_alignment = fs_info->sectorsize;

	/* Optional fields, each acknowledged by echoing its flag back. */
	if (requested & BTRFS_FS_INFO_FLAG_CSUM_INFO) {
		info->csum_type = btrfs_super_csum_type(fs_info->super_copy);
		info->csum_size = btrfs_super_csum_size(fs_info->super_copy);
		info->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO;
	}

	if (requested & BTRFS_FS_INFO_FLAG_GENERATION) {
		info->generation = fs_info->generation;
		info->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
	}

	if (requested & BTRFS_FS_INFO_FLAG_METADATA_UUID) {
		memcpy(&info->metadata_uuid, fs_devices->metadata_uuid,
		       sizeof(info->metadata_uuid));
		info->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID;
	}

	if (copy_to_user(arg, info, sizeof(*info)))
		ret = -EFAULT;

	kfree(info);
	return ret;
}
2016-06-22 18:54:24 -04:00
/*
 * Report information about a single device to user space.
 *
 * @fs_info: filesystem the device belongs to
 * @arg:     user pointer to struct btrfs_ioctl_dev_info_args; ->devid and,
 *           unless it is the empty UUID, ->uuid select the device.  On
 *           success devid, bytes_used, total_bytes, uuid and path are
 *           filled in and the structure is copied back.
 *
 * Return: 0 on success, -ENODEV if no matching device is found, -EFAULT if
 * the result cannot be copied out, or the error from copying the arguments
 * in.
 */
static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
				 void __user *arg)
{
	struct btrfs_ioctl_dev_info_args *info;
	struct btrfs_device *dev;
	char *uuid_filter = NULL;
	int ret = 0;

	info = memdup_user(arg, sizeof(*info));
	if (IS_ERR(info))
		return PTR_ERR(info);

	/* An all-empty UUID means "match on devid only". */
	if (!btrfs_is_empty_uuid(info->uuid))
		uuid_filter = info->uuid;

	rcu_read_lock();
	dev = btrfs_find_device(fs_info->fs_devices, info->devid, uuid_filter,
				NULL);
	if (dev) {
		info->devid = dev->devid;
		info->bytes_used = btrfs_device_get_bytes_used(dev);
		info->total_bytes = btrfs_device_get_total_bytes(dev);
		memcpy(info->uuid, dev->uuid, sizeof(info->uuid));

		if (!dev->name) {
			info->path[0] = '\0';
		} else {
			/*
			 * Bounded copy; the last byte is set explicitly so
			 * the path is terminated even when truncated.
			 */
			strncpy(info->path, rcu_str_deref(dev->name),
				sizeof(info->path) - 1);
			info->path[sizeof(info->path) - 1] = 0;
		}
	} else {
		ret = -ENODEV;
	}
	rcu_read_unlock();

	/* Copy out only after leaving the RCU read-side section. */
	if (ret == 0 && copy_to_user(arg, info, sizeof(*info)))
		ret = -EFAULT;

	kfree(info);
	return ret;
}
2009-12-11 21:11:29 +00:00
/*
 * Point the "default" dir item of the tree root at another subvolume.
 *
 * A u64 root objectid is read from @argp; 0 is replaced by
 * BTRFS_FS_TREE_OBJECTID.  The root must exist and be an fs tree.  Inside a
 * transaction the key stored in the "default" dir item is rewritten to the
 * new root's key and the DEFAULT_SUBVOL incompat flag is set.
 *
 * Requires CAP_SYS_ADMIN and write access to the mount.
 *
 * Returns 0 on success or a negative errno.
 */
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_root *target;
	struct btrfs_trans_handle *trans;
	struct btrfs_dir_item *default_item;
	struct btrfs_disk_key target_key;
	struct btrfs_path *path = NULL;
	u64 subvol_id = 0;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (copy_from_user(&subvol_id, argp, sizeof(subvol_id))) {
		ret = -EFAULT;
		goto drop_write;
	}

	if (subvol_id == 0)
		subvol_id = BTRFS_FS_TREE_OBJECTID;

	target = btrfs_get_fs_root(fs_info, subvol_id, true);
	if (IS_ERR(target)) {
		ret = PTR_ERR(target);
		goto drop_write;
	}

	if (!is_fstree(target->root_key.objectid)) {
		ret = -ENOENT;
		goto put_target;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto put_target;
	}

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto put_target;
	}

	default_item = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
				btrfs_super_root_dir(fs_info->super_copy),
				"default", 7, 1);
	if (IS_ERR_OR_NULL(default_item)) {
		btrfs_release_path(path);
		btrfs_end_transaction(trans);
		btrfs_err(fs_info,
	"Umm, you don't have the default diritem, this isn't going to work");
		ret = -ENOENT;
		goto put_target;
	}

	/* Rewrite the item's key in place and mark the leaf dirty. */
	btrfs_cpu_key_to_disk(&target_key, &target->root_key);
	btrfs_set_dir_item_key(path->nodes[0], default_item, &target_key);
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);

	btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
	btrfs_end_transaction(trans);
	/* ret still holds the 0 returned by mnt_want_write_file(). */

put_target:
	btrfs_put_root(target);
	btrfs_free_path(path);
drop_write:
	mnt_drop_write_file(file);
	return ret;
}
2018-04-02 17:24:11 +08:00
static void get_block_group_info ( struct list_head * groups_list ,
struct btrfs_ioctl_space_info * space )
2010-09-29 11:22:36 -04:00
{
2019-10-29 19:20:18 +01:00
struct btrfs_block_group * block_group ;
2010-09-29 11:22:36 -04:00
space - > total_bytes = 0 ;
space - > used_bytes = 0 ;
space - > flags = 0 ;
list_for_each_entry ( block_group , groups_list , list ) {
space - > flags = block_group - > flags ;
2019-10-23 18:48:22 +02:00
space - > total_bytes + = block_group - > length ;
2019-10-23 18:48:11 +02:00
space - > used_bytes + = block_group - > used ;
2010-09-29 11:22:36 -04:00
}
}
2016-06-22 18:54:24 -04:00
/*
 * Report space usage per block group type/profile, plus the global block
 * reserve, to user space.
 *
 * A struct btrfs_ioctl_space_args header is read from @arg.  If its
 * space_slots field is 0 the caller only wants a count: total_spaces is set
 * to the number of entries available and the header is copied back.
 * Otherwise up to space_slots entries of struct btrfs_ioctl_space_info are
 * written to the user buffer directly behind the header, and total_spaces is
 * set to the number of entries actually filled in.
 *
 * Returns 0 on success, -EFAULT if user memory cannot be read or written,
 * -ENOMEM if the reply buffer would exceed a page or cannot be allocated.
 */
static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
				   void __user *arg)
{
	struct btrfs_ioctl_space_args space_args;
	struct btrfs_ioctl_space_info space;
	struct btrfs_ioctl_space_info *dest;
	struct btrfs_ioctl_space_info *dest_orig;
	struct btrfs_ioctl_space_info __user *user_dest;
	struct btrfs_space_info *info;
	static const u64 types[] = {
		BTRFS_BLOCK_GROUP_DATA,
		BTRFS_BLOCK_GROUP_SYSTEM,
		BTRFS_BLOCK_GROUP_METADATA,
		BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA
	};
	int num_types = ARRAY_SIZE(types);
	int alloc_size;
	int ret = 0;
	u64 slot_count = 0;
	int i, c;

	if (copy_from_user(&space_args,
			   (struct btrfs_ioctl_space_args __user *)arg,
			   sizeof(space_args)))
		return -EFAULT;

	/* First pass: count the non-empty block group lists. */
	for (i = 0; i < num_types; i++) {
		struct btrfs_space_info *tmp;

		info = NULL;
		list_for_each_entry(tmp, &fs_info->space_info, list) {
			if (tmp->flags == types[i]) {
				info = tmp;
				break;
			}
		}

		if (!info)
			continue;

		down_read(&info->groups_sem);
		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
			if (!list_empty(&info->block_groups[c]))
				slot_count++;
		}
		up_read(&info->groups_sem);
	}

	/*
	 * Global block reserve, exported as a space_info
	 */
	slot_count++;

	/* space_slots == 0 means they are asking for a count */
	if (space_args.space_slots == 0) {
		space_args.total_spaces = slot_count;
		goto out;
	}

	slot_count = min_t(u64, space_args.space_slots, slot_count);

	alloc_size = sizeof(*dest) * slot_count;

	/* we generally have at most 6 or so space infos, one for each raid
	 * level.  So, a whole page should be more than enough for everyone
	 */
	if (alloc_size > PAGE_SIZE)
		return -ENOMEM;

	space_args.total_spaces = 0;
	/*
	 * Zero the buffer: groups_sem is dropped between the counting pass
	 * above and the fill pass below, so fewer slots may end up being
	 * filled than were counted, yet all alloc_size bytes are copied to
	 * user space.  Unfilled slots must not carry stale heap contents.
	 */
	dest = kzalloc(alloc_size, GFP_KERNEL);
	if (!dest)
		return -ENOMEM;
	dest_orig = dest;

	/* now we have a buffer to copy into */
	for (i = 0; i < num_types; i++) {
		struct btrfs_space_info *tmp;

		if (!slot_count)
			break;

		info = NULL;
		list_for_each_entry(tmp, &fs_info->space_info, list) {
			if (tmp->flags == types[i]) {
				info = tmp;
				break;
			}
		}

		if (!info)
			continue;

		down_read(&info->groups_sem);
		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
			if (!list_empty(&info->block_groups[c])) {
				get_block_group_info(&info->block_groups[c],
						     &space);
				memcpy(dest, &space, sizeof(space));
				dest++;
				space_args.total_spaces++;
				slot_count--;
			}
			if (!slot_count)
				break;
		}
		up_read(&info->groups_sem);
	}

	/*
	 * Add global block reserve
	 */
	if (slot_count) {
		struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;

		spin_lock(&block_rsv->lock);
		space.total_bytes = block_rsv->size;
		space.used_bytes = block_rsv->size - block_rsv->reserved;
		spin_unlock(&block_rsv->lock);
		space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
		memcpy(dest, &space, sizeof(space));
		space_args.total_spaces++;
	}

	/* The entries live in user memory right after the args header. */
	user_dest = (struct btrfs_ioctl_space_info __user *)
		(arg + sizeof(struct btrfs_ioctl_space_args));

	if (copy_to_user(user_dest, dest_orig, alloc_size))
		ret = -EFAULT;

	kfree(dest_orig);
out:
	if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
		ret = -EFAULT;

	return ret;
}
2012-11-26 08:40:43 +00:00
/*
 * Kick off an asynchronous commit of the current transaction.
 *
 * If a transaction can be attached, its id is recorded and an async commit
 * is started.  If there is none (-ENOENT), the id of the last committed
 * transaction is used instead.  When @argp is non-NULL the chosen
 * transaction id is copied to it.
 *
 * Returns 0 on success, -EFAULT if the id cannot be copied out, or the
 * error from attaching to / committing the transaction.
 */
static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
					    void __user *argp)
{
	struct btrfs_trans_handle *trans;
	u64 transid;
	int ret;

	trans = btrfs_attach_transaction_barrier(root);
	if (!IS_ERR(trans)) {
		/* Read the id before the handle goes to the commit. */
		transid = trans->transid;
		ret = btrfs_commit_transaction_async(trans, 0);
		if (ret) {
			btrfs_end_transaction(trans);
			return ret;
		}
	} else if (PTR_ERR(trans) == -ENOENT) {
		/* No running transaction, don't bother */
		transid = root->fs_info->last_trans_committed;
	} else {
		return PTR_ERR(trans);
	}

	if (argp && copy_to_user(argp, &transid, sizeof(transid)))
		return -EFAULT;

	return 0;
}
2016-06-22 18:54:24 -04:00
static noinline long btrfs_ioctl_wait_sync ( struct btrfs_fs_info * fs_info ,
2012-11-26 08:40:43 +00:00
void __user * argp )
Btrfs: add START_SYNC, WAIT_SYNC ioctls
START_SYNC will start a sync/commit, but not wait for it to
complete. Any modification started after the ioctl returns is
guaranteed not to be included in the commit. If a non-NULL
pointer is passed, the transaction id will be returned to
userspace.
WAIT_SYNC will wait for any in-progress commit to complete. If a
transaction id is specified, the ioctl will block and then
return (success) when the specified transaction has committed.
If it has already committed when we call the ioctl, it returns
immediately. If the specified transaction doesn't exist, it
returns EINVAL.
If no transaction id is specified, WAIT_SYNC will wait for the
currently committing transaction to finish it's commit to disk.
If there is no currently committing transaction, it returns
success.
These ioctls are useful for applications which want to impose an
ordering on when fs modifications reach disk, but do not want to
wait for the full (slow) commit process to do so.
Picky callers can take the transid returned by START_SYNC and
feed it to WAIT_SYNC, and be certain to wait only as long as
necessary for the transaction _they_ started to reach disk.
Sloppy callers can START_SYNC and WAIT_SYNC without a transid,
and provided they didn't wait too long between the calls, they
will get the same result. However, if a second commit starts
before they call WAIT_SYNC, they may end up waiting longer for
it to commit as well. Even so, a START_SYNC+WAIT_SYNC still
guarantees that any operation completed before the START_SYNC
reaches disk.
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-29 15:41:32 -04:00
{
u64 transid ;
if ( argp ) {
if ( copy_from_user ( & transid , argp , sizeof ( transid ) ) )
return - EFAULT ;
} else {
transid = 0 ; /* current trans */
}
2016-06-22 18:54:24 -04:00
return btrfs_wait_for_commit ( fs_info , transid ) ;
Btrfs: add START_SYNC, WAIT_SYNC ioctls
START_SYNC will start a sync/commit, but not wait for it to
complete. Any modification started after the ioctl returns is
guaranteed not to be included in the commit. If a non-NULL
pointer is passed, the transaction id will be returned to
userspace.
WAIT_SYNC will wait for any in-progress commit to complete. If a
transaction id is specified, the ioctl will block and then
return (success) when the specified transaction has committed.
If it has already committed when we call the ioctl, it returns
immediately. If the specified transaction doesn't exist, it
returns EINVAL.
If no transaction id is specified, WAIT_SYNC will wait for the
currently committing transaction to finish it's commit to disk.
If there is no currently committing transaction, it returns
success.
These ioctls are useful for applications which want to impose an
ordering on when fs modifications reach disk, but do not want to
wait for the full (slow) commit process to do so.
Picky callers can take the transid returned by START_SYNC and
feed it to WAIT_SYNC, and be certain to wait only as long as
necessary for the transaction _they_ started to reach disk.
Sloppy callers can START_SYNC and WAIT_SYNC without a transid,
and provided they didn't wait too long between the calls, they
will get the same result. However, if a second commit starts
before they call WAIT_SYNC, they may end up waiting longer for
it to commit as well. Even so, a START_SYNC+WAIT_SYNC still
guarantees that any operation completed before the START_SYNC
reaches disk.
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-29 15:41:32 -04:00
}
2012-11-26 08:48:01 +00:00
/*
 * Run a scrub on one device, as described by the btrfs_ioctl_scrub_args
 * structure copied in from @arg.
 *
 * Requires CAP_SYS_ADMIN.  Unless BTRFS_SCRUB_READONLY is set in the flags,
 * write access to the mount is taken for the duration of the scrub.
 *
 * Returns the result of btrfs_scrub_dev(), -EFAULT if the arguments cannot
 * be copied back, or the error from an earlier setup step.
 */
static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
	struct btrfs_ioctl_scrub_args *args;
	u64 readonly;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args))
		return PTR_ERR(args);

	readonly = args->flags & BTRFS_SCRUB_READONLY;

	if (!readonly) {
		ret = mnt_want_write_file(file);
		if (ret)
			goto free_args;
	}

	ret = btrfs_scrub_dev(fs_info, args->devid, args->start, args->end,
			      &args->progress, readonly, 0);

	/*
	 * The scrub args go back to user space even when btrfs_scrub_dev()
	 * failed, so that user space can see how far scrub got.  E.g. a
	 * canceled scrub yields -ECANCELED, which is returned to the caller;
	 * user space can later read the progress from the structure and
	 * resume where scrub left off (btrfs-progs does this).
	 * If copying the structure out fails, -EFAULT is returned to signal
	 * that it was not copied or may be corrupt and unreliable due to a
	 * partial copy.
	 */
	if (copy_to_user(arg, args, sizeof(*args)))
		ret = -EFAULT;

	if (!readonly)
		mnt_drop_write_file(file);
free_args:
	kfree(args);
	return ret;
}
2016-06-22 18:54:24 -04:00
static long btrfs_ioctl_scrub_cancel ( struct btrfs_fs_info * fs_info )
2011-03-11 15:41:01 +01:00
{
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2016-06-22 18:54:24 -04:00
return btrfs_scrub_cancel ( fs_info ) ;
2011-03-11 15:41:01 +01:00
}
2016-06-22 18:54:24 -04:00
static long btrfs_ioctl_scrub_progress ( struct btrfs_fs_info * fs_info ,
2011-03-11 15:41:01 +01:00
void __user * arg )
{
struct btrfs_ioctl_scrub_args * sa ;
int ret ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
sa = memdup_user ( arg , sizeof ( * sa ) ) ;
if ( IS_ERR ( sa ) )
return PTR_ERR ( sa ) ;
2016-06-22 18:54:24 -04:00
ret = btrfs_scrub_progress ( fs_info , sa - > devid , & sa - > progress ) ;
2011-03-11 15:41:01 +01:00
2018-12-14 19:45:13 +00:00
if ( ret = = 0 & & copy_to_user ( arg , sa , sizeof ( * sa ) ) )
2011-03-11 15:41:01 +01:00
ret = - EFAULT ;
kfree ( sa ) ;
return ret ;
}
2016-06-22 18:54:24 -04:00
static long btrfs_ioctl_get_dev_stats ( struct btrfs_fs_info * fs_info ,
2012-06-22 06:30:39 -06:00
void __user * arg )
2012-05-25 16:06:09 +02:00
{
struct btrfs_ioctl_get_dev_stats * sa ;
int ret ;
sa = memdup_user ( arg , sizeof ( * sa ) ) ;
if ( IS_ERR ( sa ) )
return PTR_ERR ( sa ) ;
2012-06-22 06:30:39 -06:00
if ( ( sa - > flags & BTRFS_DEV_STATS_RESET ) & & ! capable ( CAP_SYS_ADMIN ) ) {
kfree ( sa ) ;
return - EPERM ;
}
2016-06-22 18:54:24 -04:00
ret = btrfs_get_dev_stats ( fs_info , sa ) ;
2012-05-25 16:06:09 +02:00
2018-12-14 19:45:22 +00:00
if ( ret = = 0 & & copy_to_user ( arg , sa , sizeof ( * sa ) ) )
2012-05-25 16:06:09 +02:00
ret = - EFAULT ;
kfree ( sa ) ;
return ret ;
}
2016-06-22 18:54:24 -04:00
/*
 * Dispatch a device-replace command (start, status or cancel) taken from
 * the btrfs_ioctl_dev_replace_args structure at @arg.
 *
 * Requires CAP_SYS_ADMIN.  Starting a replace is refused with -EROFS on a
 * read-only superblock and reports BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS when
 * the exclusive operation cannot be started.  The structure is copied back
 * to user space only when the result is 0 or -ECANCELED.
 *
 * Returns the command result, -EINVAL for an unknown command, or -EFAULT if
 * the copy back fails.
 */
static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
				    void __user *arg)
{
	struct btrfs_ioctl_dev_replace_args *args;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args))
		return PTR_ERR(args);

	switch (args->cmd) {
	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
		if (sb_rdonly(fs_info->sb)) {
			/* -EROFS never passes the copy-back check below. */
			ret = -EROFS;
			break;
		}
		if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
			ret = btrfs_dev_replace_by_ioctl(fs_info, args);
			btrfs_exclop_finish(fs_info);
		} else {
			ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
		}
		break;
	case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
		btrfs_dev_replace_status(fs_info, args);
		ret = 0;
		break;
	case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
		args->result = btrfs_dev_replace_cancel(fs_info);
		ret = 0;
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret == 0 || ret == -ECANCELED) {
		if (copy_to_user(arg, args, sizeof(*args)))
			ret = -EFAULT;
	}

	kfree(args);
	return ret;
}
2011-07-07 16:48:38 +02:00
/*
 * Resolve an inode number to its paths and hand them to user space.
 *
 * The request (btrfs_ioctl_ino_path_args) is copied in from @arg.  The
 * result container is capped at 4096 bytes.  After paths_from_inode() has
 * filled it, each val[] entry is converted into an offset relative to the
 * start of val[], and the container is copied to the user address given in
 * the request's fspath field.
 *
 * Requires CAP_DAC_READ_SEARCH.
 *
 * Returns 0 on success or a negative errno.
 */
static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
{
	struct btrfs_ioctl_ino_path_args *req = NULL;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_path *path;
	u64 val_base;
	int size;
	int ret = 0;
	int i;

	if (!capable(CAP_DAC_READ_SEARCH))
		return -EPERM;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	req = memdup_user(arg, sizeof(*req));
	if (IS_ERR(req)) {
		ret = PTR_ERR(req);
		req = NULL;	/* keep kfree() below away from the ERR_PTR */
		goto out;
	}

	size = min_t(u32, req->size, 4096);
	ipath = init_ipath(size, root, path);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		ipath = NULL;	/* same for free_ipath() */
		goto out;
	}

	ret = paths_from_inode(req->inum, ipath);
	if (ret < 0)
		goto out;

	/* Turn the stored values into offsets from the start of val[]. */
	val_base = (u64)(unsigned long)ipath->fspath->val;
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		ipath->fspath->val[i] -= val_base;

	if (copy_to_user((void __user *)(unsigned long)req->fspath,
			 ipath->fspath, size))
		ret = -EFAULT;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	free_ipath(ipath);
	kfree(req);

	return ret;
}
/*
 * Append one (inum, offset, root) triple to the data container in @ctx.
 *
 * If fewer than three u64 slots' worth of bytes are left, nothing is stored;
 * the shortfall is added to bytes_missing, bytes_left is set to 0 and
 * elem_missed grows by 3.  Always returns 0.
 */
static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
{
	struct btrfs_data_container *inodes = ctx;
	const size_t triple_bytes = 3 * sizeof(u64);
	u64 *slot;

	if (inodes->bytes_left < triple_bytes) {
		/* No room: only account for what could not be stored. */
		inodes->bytes_missing += triple_bytes - inodes->bytes_left;
		inodes->bytes_left = 0;
		inodes->elem_missed += 3;
		return 0;
	}

	inodes->bytes_left -= triple_bytes;

	slot = &inodes->val[inodes->elem_cnt];
	slot[0] = inum;
	slot[1] = offset;
	slot[2] = root;
	inodes->elem_cnt += 3;

	return 0;
}
2016-06-22 18:54:24 -04:00
static long btrfs_ioctl_logical_to_ino ( struct btrfs_fs_info * fs_info ,
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 13:58:46 -04:00
void __user * arg , int version )
2011-07-07 16:48:38 +02:00
{
int ret = 0 ;
int size ;
struct btrfs_ioctl_logical_ino_args * loi ;
struct btrfs_data_container * inodes = NULL ;
struct btrfs_path * path = NULL ;
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 13:58:46 -04:00
bool ignore_offset ;
2011-07-07 16:48:38 +02:00
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
loi = memdup_user ( arg , sizeof ( * loi ) ) ;
2016-11-10 15:17:41 +05:30
if ( IS_ERR ( loi ) )
return PTR_ERR ( loi ) ;
2011-07-07 16:48:38 +02:00
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 13:58:46 -04:00
if ( version = = 1 ) {
ignore_offset = false ;
2017-09-22 13:58:47 -04:00
size = min_t ( u32 , loi - > size , SZ_64K ) ;
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 13:58:46 -04:00
} else {
/* All reserved bits must be 0 for now */
if ( memchr_inv ( loi - > reserved , 0 , sizeof ( loi - > reserved ) ) ) {
ret = - EINVAL ;
goto out_loi ;
}
/* Only accept flags we have defined so far */
if ( loi - > flags & ~ ( BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET ) ) {
ret = - EINVAL ;
goto out_loi ;
}
ignore_offset = loi - > flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET ;
2017-09-22 13:58:47 -04:00
size = min_t ( u32 , loi - > size , SZ_16M ) ;
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 13:58:46 -04:00
}
2011-07-07 16:48:38 +02:00
path = btrfs_alloc_path ( ) ;
if ( ! path ) {
ret = - ENOMEM ;
goto out ;
}
inodes = init_data_container ( size ) ;
if ( IS_ERR ( inodes ) ) {
ret = PTR_ERR ( inodes ) ;
inodes = NULL ;
goto out ;
}
2016-06-22 18:54:24 -04:00
ret = iterate_inodes_from_logical ( loi - > logical , fs_info , path ,
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 13:58:46 -04:00
build_ino_list , inodes , ignore_offset ) ;
2012-09-07 20:01:29 -06:00
if ( ret = = - EINVAL )
2011-07-07 16:48:38 +02:00
ret = - ENOENT ;
if ( ret < 0 )
goto out ;
2017-08-22 23:46:05 -07:00
ret = copy_to_user ( ( void __user * ) ( unsigned long ) loi - > inodes , inodes ,
size ) ;
2011-07-07 16:48:38 +02:00
if ( ret )
ret = - EFAULT ;
out :
btrfs_free_path ( path ) ;
2017-05-31 19:32:09 +02:00
kvfree ( inodes ) ;
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 13:58:46 -04:00
out_loi :
2011-07-07 16:48:38 +02:00
kfree ( loi ) ;
return ret ;
}
2018-03-21 02:05:27 +01:00
void btrfs_update_ioctl_balance_args ( struct btrfs_fs_info * fs_info ,
2012-01-16 22:04:47 +02:00
struct btrfs_ioctl_balance_args * bargs )
{
struct btrfs_balance_control * bctl = fs_info - > balance_ctl ;
bargs - > flags = bctl - > flags ;
2018-03-21 01:31:04 +01:00
if ( test_bit ( BTRFS_FS_BALANCE_RUNNING , & fs_info - > flags ) )
2012-01-16 22:04:49 +02:00
bargs - > state | = BTRFS_BALANCE_STATE_RUNNING ;
if ( atomic_read ( & fs_info - > balance_pause_req ) )
bargs - > state | = BTRFS_BALANCE_STATE_PAUSE_REQ ;
2012-01-16 22:04:49 +02:00
if ( atomic_read ( & fs_info - > balance_cancel_req ) )
bargs - > state | = BTRFS_BALANCE_STATE_CANCEL_REQ ;
2012-01-16 22:04:49 +02:00
2012-01-16 22:04:47 +02:00
memcpy ( & bargs - > data , & bctl - > data , sizeof ( bargs - > data ) ) ;
memcpy ( & bargs - > meta , & bctl - > meta , sizeof ( bargs - > meta ) ) ;
memcpy ( & bargs - > sys , & bctl - > sys , sizeof ( bargs - > sys ) ) ;
2012-01-16 22:04:49 +02:00
2018-03-21 02:05:27 +01:00
spin_lock ( & fs_info - > balance_lock ) ;
memcpy ( & bargs - > stat , & bctl - > stat , sizeof ( bargs - > stat ) ) ;
spin_unlock ( & fs_info - > balance_lock ) ;
2012-01-16 22:04:47 +02:00
}
2012-05-11 18:11:26 +08:00
/*
 * Balance ioctl: start a new balance or resume an existing one.
 *
 * @file: open file on the filesystem; used for the write reference and to
 *        reach fs_info.
 * @arg:  optional user pointer to struct btrfs_ioctl_balance_args. NULL means
 *        "balance everything, no filters".
 *
 * Requires CAP_SYS_ADMIN. Returns the result of btrfs_balance() on the main
 * path, or one of: -EPERM, the mnt_want_write_file() error, -EINPROGRESS,
 * BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS, the memdup_user() error, -ENOTCONN
 * (resume requested but no balance control exists), -ENOMEM, -EINVAL
 * (unknown flag bits) or -EFAULT (copy back to user space failed).
 */
static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(file_inode(file))->root->fs_info;
	struct btrfs_ioctl_balance_args *bargs;
	struct btrfs_balance_control *bctl;
	bool release_exclop;	/* true while we still own the exclusive-op slot */
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

again:
	if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		mutex_lock(&fs_info->balance_mutex);
		release_exclop = true;
		goto locked;
	}

	/*
	 * The exclusive-op slot is taken. Either:
	 *   (1) some other operation is running,
	 *   (2) balance is running, or
	 *   (3) balance is paused -- the special case that allows a resume.
	 */
	mutex_lock(&fs_info->balance_mutex);

	if (!fs_info->balance_ctl) {
		/* case (1) */
		mutex_unlock(&fs_info->balance_mutex);
		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
		goto out;
	}

	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		/* case (2) */
		mutex_unlock(&fs_info->balance_mutex);
		ret = -EINPROGRESS;
		goto out;
	}

	/*
	 * Looks like case (3). Drop the mutex so other waiters can make
	 * progress, then take it again and re-check the state.
	 */
	mutex_unlock(&fs_info->balance_mutex);
	mutex_lock(&fs_info->balance_mutex);

	if (fs_info->balance_ctl &&
	    !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
		/* still case (3): the slot is not ours to release */
		release_exclop = false;
		goto locked;
	}

	mutex_unlock(&fs_info->balance_mutex);
	goto again;

locked:
	if (arg) {
		bargs = memdup_user(arg, sizeof(*bargs));
		if (IS_ERR(bargs)) {
			ret = PTR_ERR(bargs);
			goto out_unlock;
		}

		if (bargs->flags & BTRFS_BALANCE_RESUME) {
			if (!fs_info->balance_ctl) {
				ret = -ENOTCONN;
				goto out_bargs;
			}

			/* Reuse the existing control and mark it as resumed. */
			bctl = fs_info->balance_ctl;
			spin_lock(&fs_info->balance_lock);
			bctl->flags |= BTRFS_BALANCE_RESUME;
			spin_unlock(&fs_info->balance_lock);

			goto do_balance;
		}
	} else {
		bargs = NULL;
	}

	/* Not a resume: refuse if a balance control already exists. */
	if (fs_info->balance_ctl) {
		ret = -EINPROGRESS;
		goto out_bargs;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
	if (!bctl) {
		ret = -ENOMEM;
		goto out_bargs;
	}

	if (arg) {
		memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
		memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
		memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));

		bctl->flags = bargs->flags;
	} else {
		/* no filters: balance every block group type */
		bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
	}

	if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
		ret = -EINVAL;
		goto out_bctl;
	}

do_balance:
	/*
	 * From here on btrfs_balance owns both bctl and the exclusive
	 * operation. bctl is freed in reset_balance_state, or in free_fs_info
	 * if the restriper stayed paused all the way until unmount. The flag
	 * should be cleared after reset_balance_state.
	 */
	release_exclop = false;

	ret = btrfs_balance(fs_info, bctl, bargs);
	/* bctl is no longer ours; make the kfree() below a no-op. */
	bctl = NULL;

	if (arg && (ret == 0 || ret == -ECANCELED)) {
		if (copy_to_user(arg, bargs, sizeof(*bargs)))
			ret = -EFAULT;
	}

out_bctl:
	kfree(bctl);
out_bargs:
	kfree(bargs);
out_unlock:
	mutex_unlock(&fs_info->balance_mutex);
	if (release_exclop)
		btrfs_exclop_finish(fs_info);
out:
	mnt_drop_write_file(file);
	return ret;
}
2016-06-22 18:54:24 -04:00
static long btrfs_ioctl_balance_ctl ( struct btrfs_fs_info * fs_info , int cmd )
2012-01-16 22:04:49 +02:00
{
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
switch ( cmd ) {
case BTRFS_BALANCE_CTL_PAUSE :
2016-06-22 18:54:23 -04:00
return btrfs_pause_balance ( fs_info ) ;
2012-01-16 22:04:49 +02:00
case BTRFS_BALANCE_CTL_CANCEL :
2016-06-22 18:54:23 -04:00
return btrfs_cancel_balance ( fs_info ) ;
2012-01-16 22:04:49 +02:00
}
return - EINVAL ;
}
2016-06-22 18:54:24 -04:00
static long btrfs_ioctl_balance_progress ( struct btrfs_fs_info * fs_info ,
2012-01-16 22:04:49 +02:00
void __user * arg )
{
struct btrfs_ioctl_balance_args * bargs ;
int ret = 0 ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
mutex_lock ( & fs_info - > balance_mutex ) ;
if ( ! fs_info - > balance_ctl ) {
ret = - ENOTCONN ;
goto out ;
}
2015-11-04 15:38:29 +01:00
bargs = kzalloc ( sizeof ( * bargs ) , GFP_KERNEL ) ;
2012-01-16 22:04:49 +02:00
if ( ! bargs ) {
ret = - ENOMEM ;
goto out ;
}
2018-03-21 02:05:27 +01:00
btrfs_update_ioctl_balance_args ( fs_info , bargs ) ;
2012-01-16 22:04:49 +02:00
if ( copy_to_user ( arg , bargs , sizeof ( * bargs ) ) )
ret = - EFAULT ;
kfree ( bargs ) ;
out :
mutex_unlock ( & fs_info - > balance_mutex ) ;
return ret ;
}
2012-11-26 08:50:11 +00:00
/*
 * Enable or disable quotas according to the cmd field of the user-supplied
 * struct btrfs_ioctl_quota_ctl_args.
 *
 * The enable/disable call is made with subvol_sem held for writing.
 *
 * Returns -EPERM without CAP_SYS_ADMIN, the mnt_want_write_file() or
 * memdup_user() error, -EINVAL for an unknown command, or the result of
 * btrfs_quota_enable() / btrfs_quota_disable().
 */
static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
	struct btrfs_ioctl_quota_ctl_args *args;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args)) {
		ret = PTR_ERR(args);
		goto drop_write;
	}

	down_write(&fs_info->subvol_sem);

	switch (args->cmd) {
	case BTRFS_QUOTA_CTL_ENABLE:
		ret = btrfs_quota_enable(fs_info);
		break;
	case BTRFS_QUOTA_CTL_DISABLE:
		ret = btrfs_quota_disable(fs_info);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	kfree(args);
	up_write(&fs_info->subvol_sem);
drop_write:
	mnt_drop_write_file(file);
	return ret;
}
2012-11-26 08:50:11 +00:00
/*
 * Add or remove a qgroup relation between sa->src and sa->dst, depending on
 * sa->assign, inside a joined transaction.
 *
 * After the relation change the qgroups are run; a failure there is reported
 * through btrfs_handle_fs_error() and does not change the return value.
 *
 * Returns the result of the add/del call; if that succeeded, the result of
 * btrfs_end_transaction(). Earlier failures return -EPERM, or the error from
 * mnt_want_write_file(), memdup_user() or btrfs_join_transaction().
 */
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ioctl_qgroup_assign_args *sa;
	struct btrfs_trans_handle *trans;
	int ret;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	sa = memdup_user(arg, sizeof(*sa));
	if (IS_ERR(sa)) {
		ret = PTR_ERR(sa);
		goto drop_write;
	}

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto free_args;
	}

	if (sa->assign)
		ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst);
	else
		ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst);

	/* Bring qgroup status and info up to date. */
	err = btrfs_run_qgroups(trans);
	if (err < 0)
		btrfs_handle_fs_error(fs_info, err,
				      " failed to update qgroup status and info ");

	/* An end-transaction error is reported only if nothing failed earlier. */
	err = btrfs_end_transaction(trans);
	if (!ret && err)
		ret = err;

free_args:
	kfree(sa);
drop_write:
	mnt_drop_write_file(file);
	return ret;
}
2012-11-26 08:50:11 +00:00
/*
 * Create (sa->create != 0) or remove (sa->create == 0) the qgroup named by
 * sa->qgroupid, inside a joined transaction.
 *
 * Returns -EPERM without CAP_SYS_ADMIN, -EINVAL for a zero qgroupid, the
 * error from mnt_want_write_file(), memdup_user() or
 * btrfs_join_transaction(), or the result of the create/remove call; if that
 * succeeded, the result of btrfs_end_transaction().
 */
static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_ioctl_qgroup_create_args *sa;
	struct btrfs_trans_handle *trans;
	int ret;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	sa = memdup_user(arg, sizeof(*sa));
	if (IS_ERR(sa)) {
		ret = PTR_ERR(sa);
		goto drop_write;
	}

	/* A qgroup id of zero is rejected. */
	if (!sa->qgroupid) {
		ret = -EINVAL;
		goto free_args;
	}

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto free_args;
	}

	if (sa->create)
		ret = btrfs_create_qgroup(trans, sa->qgroupid);
	else
		ret = btrfs_remove_qgroup(trans, sa->qgroupid);

	err = btrfs_end_transaction(trans);
	if (!ret && err)
		ret = err;

free_args:
	kfree(sa);
drop_write:
	mnt_drop_write_file(file);
	return ret;
}
2012-11-26 08:50:11 +00:00
/*
 * Apply the limits in sa->lim to a qgroup, inside a joined transaction.
 *
 * A zero sa->qgroupid selects the objectid of the root that @file belongs to.
 *
 * Returns -EPERM without CAP_SYS_ADMIN, the error from mnt_want_write_file(),
 * memdup_user() or btrfs_join_transaction(), or the result of
 * btrfs_limit_qgroup(); if that succeeded, the result of
 * btrfs_end_transaction().
 */
static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_ioctl_qgroup_limit_args *sa;
	struct btrfs_trans_handle *trans;
	u64 target;
	int ret;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	sa = memdup_user(arg, sizeof(*sa));
	if (IS_ERR(sa)) {
		ret = PTR_ERR(sa);
		goto drop_write;
	}

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto free_args;
	}

	/* No explicit qgroup given: use the current subvolume's id. */
	target = sa->qgroupid ? sa->qgroupid : root->root_key.objectid;

	ret = btrfs_limit_qgroup(trans, target, &sa->lim);

	err = btrfs_end_transaction(trans);
	if (!ret && err)
		ret = err;

free_args:
	kfree(sa);
drop_write:
	mnt_drop_write_file(file);
	return ret;
}
2013-04-25 16:04:51 +00:00
/*
 * Start a qgroup rescan.
 *
 * The user-supplied struct btrfs_ioctl_quota_rescan_args must have flags == 0.
 *
 * Returns -EPERM without CAP_SYS_ADMIN, the error from mnt_want_write_file()
 * or memdup_user(), -EINVAL for non-zero flags, or the result of
 * btrfs_qgroup_rescan().
 */
static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
	struct btrfs_ioctl_quota_rescan_args *args;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args)) {
		ret = PTR_ERR(args);
		goto drop_write;
	}

	if (args->flags)
		ret = -EINVAL;
	else
		ret = btrfs_qgroup_rescan(fs_info);

	kfree(args);
drop_write:
	mnt_drop_write_file(file);
	return ret;
}
2019-10-10 21:23:11 -03:00
/*
 * Report qgroup rescan status to user space.
 *
 * The reply is zero-filled; when BTRFS_QGROUP_STATUS_FLAG_RESCAN is set in
 * fs_info->qgroup_flags, flags is set to 1 and progress to the objectid of
 * fs_info->qgroup_rescan_progress.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -ENOMEM, or -EFAULT if
 * the copy to @arg fails.
 */
static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
					    void __user *arg)
{
	struct btrfs_ioctl_quota_rescan_args *reply;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	reply = kzalloc(sizeof(*reply), GFP_KERNEL);
	if (!reply)
		return -ENOMEM;

	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
		reply->flags = 1;
		reply->progress = fs_info->qgroup_rescan_progress.objectid;
	}

	ret = copy_to_user(arg, reply, sizeof(*reply)) ? -EFAULT : 0;

	kfree(reply);
	return ret;
}
2019-10-10 21:23:11 -03:00
/*
 * Wait for qgroup rescan completion.
 *
 * @arg is not used. Returns -EPERM without CAP_SYS_ADMIN, otherwise the
 * result of btrfs_qgroup_wait_for_completion(fs_info, true).
 */
static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
					  void __user *arg)
{
	if (capable(CAP_SYS_ADMIN))
		return btrfs_qgroup_wait_for_completion(fs_info, true);

	return -EPERM;
}
2014-01-30 20:17:00 +00:00
static long _btrfs_ioctl_set_received_subvol ( struct file * file ,
struct btrfs_ioctl_received_subvol_args * sa )
2012-07-25 17:35:53 +02:00
{
2013-01-23 17:07:38 -05:00
struct inode * inode = file_inode ( file ) ;
2016-06-22 18:54:23 -04:00
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
2012-07-25 17:35:53 +02:00
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct btrfs_root_item * root_item = & root - > root_item ;
struct btrfs_trans_handle * trans ;
vfs: change inode times to use struct timespec64
struct timespec is not y2038 safe. Transition vfs to use
y2038 safe struct timespec64 instead.
The change was made with the help of the following cocinelle
script. This catches about 80% of the changes.
All the header file and logic changes are included in the
first 5 rules. The rest are trivial substitutions.
I avoid changing any of the function signatures or any other
filesystem specific data structures to keep the patch simple
for review.
The script can be a little shorter by combining different cases.
But, this version was sufficient for my usecase.
virtual patch
@ depends on patch @
identifier now;
@@
- struct timespec
+ struct timespec64
current_time ( ... )
{
- struct timespec now = current_kernel_time();
+ struct timespec64 now = current_kernel_time64();
...
- return timespec_trunc(
+ return timespec64_trunc(
... );
}
@ depends on patch @
identifier xtime;
@@
struct \( iattr \| inode \| kstat \) {
...
- struct timespec xtime;
+ struct timespec64 xtime;
...
}
@ depends on patch @
identifier t;
@@
struct inode_operations {
...
int (*update_time) (...,
- struct timespec t,
+ struct timespec64 t,
...);
...
}
@ depends on patch @
identifier t;
identifier fn_update_time =~ "update_time$";
@@
fn_update_time (...,
- struct timespec *t,
+ struct timespec64 *t,
...) { ... }
@ depends on patch @
identifier t;
@@
lease_get_mtime( ... ,
- struct timespec *t
+ struct timespec64 *t
) { ... }
@te depends on patch forall@
identifier ts;
local idexpression struct inode *inode_node;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn_update_time =~ "update_time$";
identifier fn;
expression e, E3;
local idexpression struct inode *node1;
local idexpression struct inode *node2;
local idexpression struct iattr *attr1;
local idexpression struct iattr *attr2;
local idexpression struct iattr attr;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
@@
(
(
- struct timespec ts;
+ struct timespec64 ts;
|
- struct timespec ts = current_time(inode_node);
+ struct timespec64 ts = current_time(inode_node);
)
<+... when != ts
(
- timespec_equal(&inode_node->i_xtime, &ts)
+ timespec64_equal(&inode_node->i_xtime, &ts)
|
- timespec_equal(&ts, &inode_node->i_xtime)
+ timespec64_equal(&ts, &inode_node->i_xtime)
|
- timespec_compare(&inode_node->i_xtime, &ts)
+ timespec64_compare(&inode_node->i_xtime, &ts)
|
- timespec_compare(&ts, &inode_node->i_xtime)
+ timespec64_compare(&ts, &inode_node->i_xtime)
|
ts = current_time(e)
|
fn_update_time(..., &ts,...)
|
inode_node->i_xtime = ts
|
node1->i_xtime = ts
|
ts = inode_node->i_xtime
|
<+... attr1->ia_xtime ...+> = ts
|
ts = attr1->ia_xtime
|
ts.tv_sec
|
ts.tv_nsec
|
btrfs_set_stack_timespec_sec(..., ts.tv_sec)
|
btrfs_set_stack_timespec_nsec(..., ts.tv_nsec)
|
- ts = timespec64_to_timespec(
+ ts =
...
-)
|
- ts = ktime_to_timespec(
+ ts = ktime_to_timespec64(
...)
|
- ts = E3
+ ts = timespec_to_timespec64(E3)
|
- ktime_get_real_ts(&ts)
+ ktime_get_real_ts64(&ts)
|
fn(...,
- ts
+ timespec64_to_timespec(ts)
,...)
)
...+>
(
<... when != ts
- return ts;
+ return timespec64_to_timespec(ts);
...>
)
|
- timespec_equal(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_equal(&node1->i_xtime2, &node2->i_xtime2)
|
- timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2)
+ timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2)
|
- timespec_compare(&node1->i_xtime1, &node2->i_xtime2)
+ timespec64_compare(&node1->i_xtime1, &node2->i_xtime2)
|
node1->i_xtime1 =
- timespec_trunc(attr1->ia_xtime1,
+ timespec64_trunc(attr1->ia_xtime1,
...)
|
- attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2,
+ attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2,
...)
|
- ktime_get_real_ts(&attr1->ia_xtime1)
+ ktime_get_real_ts64(&attr1->ia_xtime1)
|
- ktime_get_real_ts(&attr.ia_xtime1)
+ ktime_get_real_ts64(&attr.ia_xtime1)
)
@ depends on patch @
struct inode *node;
struct iattr *attr;
identifier fn;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
expression e;
@@
(
- fn(node->i_xtime);
+ fn(timespec64_to_timespec(node->i_xtime));
|
fn(...,
- node->i_xtime);
+ timespec64_to_timespec(node->i_xtime));
|
- e = fn(attr->ia_xtime);
+ e = fn(timespec64_to_timespec(attr->ia_xtime));
)
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
identifier i_xtime =~ "^i_[acm]time$";
identifier ia_xtime =~ "^ia_[acm]time$";
identifier fn;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
fn (...,
- &attr->ia_xtime,
+ &ts,
...);
)
...+>
}
@ depends on patch forall @
struct inode *node;
struct iattr *attr;
struct kstat *stat;
identifier ia_xtime =~ "^ia_[acm]time$";
identifier i_xtime =~ "^i_[acm]time$";
identifier xtime =~ "^[acm]time$";
identifier fn, ret;
@@
{
+ struct timespec ts;
<+...
(
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(node->i_xtime);
ret = fn (...,
- &node->i_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime,
+ &ts,
...);
|
+ ts = timespec64_to_timespec(attr->ia_xtime);
ret = fn (...,
- &attr->ia_xtime);
+ &ts);
|
+ ts = timespec64_to_timespec(stat->xtime);
ret = fn (...,
- &stat->xtime);
+ &ts);
)
...+>
}
@ depends on patch @
struct inode *node;
struct inode *node2;
identifier i_xtime1 =~ "^i_[acm]time$";
identifier i_xtime2 =~ "^i_[acm]time$";
identifier i_xtime3 =~ "^i_[acm]time$";
struct iattr *attrp;
struct iattr *attrp2;
struct iattr attr ;
identifier ia_xtime1 =~ "^ia_[acm]time$";
identifier ia_xtime2 =~ "^ia_[acm]time$";
struct kstat *stat;
struct kstat stat1;
struct timespec64 ts;
identifier xtime =~ "^[acmb]time$";
expression e;
@@
(
( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ;
|
node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \);
|
stat->xtime = node2->i_xtime1;
|
stat1.xtime = node2->i_xtime1;
|
( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ;
|
( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2;
|
- e = node->i_xtime1;
+ e = timespec64_to_timespec( node->i_xtime1 );
|
- e = attrp->ia_xtime1;
+ e = timespec64_to_timespec( attrp->ia_xtime1 );
|
node->i_xtime1 = current_time(...);
|
node->i_xtime2 = node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
node->i_xtime1 = node->i_xtime3 =
- e;
+ timespec_to_timespec64(e);
|
- node->i_xtime1 = e;
+ node->i_xtime1 = timespec_to_timespec64(e);
)
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: <anton@tuxera.com>
Cc: <balbi@kernel.org>
Cc: <bfields@fieldses.org>
Cc: <darrick.wong@oracle.com>
Cc: <dhowells@redhat.com>
Cc: <dsterba@suse.com>
Cc: <dwmw2@infradead.org>
Cc: <hch@lst.de>
Cc: <hirofumi@mail.parknet.co.jp>
Cc: <hubcap@omnibond.com>
Cc: <jack@suse.com>
Cc: <jaegeuk@kernel.org>
Cc: <jaharkes@cs.cmu.edu>
Cc: <jslaby@suse.com>
Cc: <keescook@chromium.org>
Cc: <mark@fasheh.com>
Cc: <miklos@szeredi.hu>
Cc: <nico@linaro.org>
Cc: <reiserfs-devel@vger.kernel.org>
Cc: <richard@nod.at>
Cc: <sage@redhat.com>
Cc: <sfrench@samba.org>
Cc: <swhiteho@redhat.com>
Cc: <tj@kernel.org>
Cc: <trond.myklebust@primarydata.com>
Cc: <tytso@mit.edu>
Cc: <viro@zeniv.linux.org.uk>
2018-05-08 19:36:02 -07:00
struct timespec64 ct = current_time ( inode ) ;
2012-07-25 17:35:53 +02:00
int ret = 0 ;
2013-08-15 17:11:20 +02:00
int received_uuid_changed ;
2012-07-25 17:35:53 +02:00
2021-01-21 14:19:25 +01:00
if ( ! inode_owner_or_capable ( & init_user_ns , inode ) )
2014-01-16 15:50:22 +01:00
return - EPERM ;
2012-07-25 17:35:53 +02:00
ret = mnt_want_write_file ( file ) ;
if ( ret < 0 )
return ret ;
2016-06-22 18:54:23 -04:00
down_write ( & fs_info - > subvol_sem ) ;
2012-07-25 17:35:53 +02:00
2017-01-10 20:35:31 +02:00
if ( btrfs_ino ( BTRFS_I ( inode ) ) ! = BTRFS_FIRST_FREE_OBJECTID ) {
2012-07-25 17:35:53 +02:00
ret = - EINVAL ;
goto out ;
}
if ( btrfs_root_readonly ( root ) ) {
ret = - EROFS ;
goto out ;
}
2013-08-15 17:11:20 +02:00
/*
* 1 - root item
* 2 - uuid items ( received uuid + subvol uuid )
*/
trans = btrfs_start_transaction ( root , 3 ) ;
2012-07-25 17:35:53 +02:00
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
trans = NULL ;
goto out ;
}
sa - > rtransid = trans - > transid ;
sa - > rtime . sec = ct . tv_sec ;
sa - > rtime . nsec = ct . tv_nsec ;
2013-08-15 17:11:20 +02:00
received_uuid_changed = memcmp ( root_item - > received_uuid , sa - > uuid ,
BTRFS_UUID_SIZE ) ;
if ( received_uuid_changed & &
2018-03-12 14:48:09 +02:00
! btrfs_is_empty_uuid ( root_item - > received_uuid ) ) {
2018-05-29 15:01:54 +08:00
ret = btrfs_uuid_tree_remove ( trans , root_item - > received_uuid ,
2018-03-12 14:48:09 +02:00
BTRFS_UUID_KEY_RECEIVED_SUBVOL ,
root - > root_key . objectid ) ;
if ( ret & & ret ! = - ENOENT ) {
btrfs_abort_transaction ( trans , ret ) ;
btrfs_end_transaction ( trans ) ;
goto out ;
}
}
2012-07-25 17:35:53 +02:00
memcpy ( root_item - > received_uuid , sa - > uuid , BTRFS_UUID_SIZE ) ;
btrfs_set_root_stransid ( root_item , sa - > stransid ) ;
btrfs_set_root_rtransid ( root_item , sa - > rtransid ) ;
2013-07-16 11:19:18 +08:00
btrfs_set_stack_timespec_sec ( & root_item - > stime , sa - > stime . sec ) ;
btrfs_set_stack_timespec_nsec ( & root_item - > stime , sa - > stime . nsec ) ;
btrfs_set_stack_timespec_sec ( & root_item - > rtime , sa - > rtime . sec ) ;
btrfs_set_stack_timespec_nsec ( & root_item - > rtime , sa - > rtime . nsec ) ;
2012-07-25 17:35:53 +02:00
2016-06-22 18:54:23 -04:00
ret = btrfs_update_root ( trans , fs_info - > tree_root ,
2012-07-25 17:35:53 +02:00
& root - > root_key , & root - > root_item ) ;
if ( ret < 0 ) {
2016-09-09 21:39:03 -04:00
btrfs_end_transaction ( trans ) ;
2012-07-25 17:35:53 +02:00
goto out ;
2013-08-15 17:11:20 +02:00
}
if ( received_uuid_changed & & ! btrfs_is_empty_uuid ( sa - > uuid ) ) {
2018-05-29 15:01:53 +08:00
ret = btrfs_uuid_tree_add ( trans , sa - > uuid ,
2013-08-15 17:11:20 +02:00
BTRFS_UUID_KEY_RECEIVED_SUBVOL ,
root - > root_key . objectid ) ;
if ( ret < 0 & & ret ! = - EEXIST ) {
2016-06-10 18:19:25 -04:00
btrfs_abort_transaction ( trans , ret ) ;
2017-09-28 11:45:26 +03:00
btrfs_end_transaction ( trans ) ;
2012-07-25 17:35:53 +02:00
goto out ;
2013-08-15 17:11:20 +02:00
}
}
2016-09-09 21:39:03 -04:00
ret = btrfs_commit_transaction ( trans ) ;
2014-01-30 20:17:00 +00:00
out :
2016-06-22 18:54:23 -04:00
up_write ( & fs_info - > subvol_sem ) ;
2014-01-30 20:17:00 +00:00
mnt_drop_write_file ( file ) ;
return ret ;
}
#ifdef CONFIG_64BIT
/*
 * Variant of the set-received-subvol ioctl that takes the
 * struct btrfs_ioctl_received_subvol_args_32 layout. Only built when
 * CONFIG_64BIT is defined.
 *
 * The user argument is converted field by field into the regular argument
 * structure, handed to _btrfs_ioctl_set_received_subvol(), then converted
 * back and copied out to @arg.
 *
 * Returns 0 on success, the memdup_user() error, -ENOMEM, the error from
 * _btrfs_ioctl_set_received_subvol(), or -EFAULT if the copy back fails.
 */
static long btrfs_ioctl_set_received_subvol_32(struct file *file,
						void __user *arg)
{
	struct btrfs_ioctl_received_subvol_args_32 *narrow = NULL;
	struct btrfs_ioctl_received_subvol_args *wide = NULL;
	int ret = 0;

	narrow = memdup_user(arg, sizeof(*narrow));
	if (IS_ERR(narrow))
		return PTR_ERR(narrow);

	wide = kmalloc(sizeof(*wide), GFP_KERNEL);
	if (!wide) {
		ret = -ENOMEM;
		goto out;
	}

	/* 32-bit layout -> native layout */
	memcpy(wide->uuid, narrow->uuid, BTRFS_UUID_SIZE);
	wide->stransid = narrow->stransid;
	wide->rtransid = narrow->rtransid;
	wide->stime.sec = narrow->stime.sec;
	wide->stime.nsec = narrow->stime.nsec;
	wide->rtime.sec = narrow->rtime.sec;
	wide->rtime.nsec = narrow->rtime.nsec;
	wide->flags = narrow->flags;

	ret = _btrfs_ioctl_set_received_subvol(file, wide);
	if (ret)
		goto out;

	/* native layout -> 32-bit layout, for the reply */
	memcpy(narrow->uuid, wide->uuid, BTRFS_UUID_SIZE);
	narrow->stransid = wide->stransid;
	narrow->rtransid = wide->rtransid;
	narrow->stime.sec = wide->stime.sec;
	narrow->stime.nsec = wide->stime.nsec;
	narrow->rtime.sec = wide->rtime.sec;
	narrow->rtime.nsec = wide->rtime.nsec;
	narrow->flags = wide->flags;

	if (copy_to_user(arg, narrow, sizeof(*narrow)))
		ret = -EFAULT;

out:
	kfree(narrow);
	kfree(wide);
	return ret;
}
#endif
/*
 * "Set received subvol" ioctl entry point for the regular argument layout.
 *
 * Duplicates the user argument structure, runs the common handler on it and,
 * if that succeeded, copies the structure back out to @arg.
 *
 * Returns 0 on success, -EFAULT if the copy back fails, or the error from
 * memdup_user() / the common handler.
 */
static long btrfs_ioctl_set_received_subvol(struct file *file,
					    void __user *arg)
{
	struct btrfs_ioctl_received_subvol_args *args;
	int err;

	args = memdup_user(arg, sizeof(*args));
	if (IS_ERR(args))
		return PTR_ERR(args);

	err = _btrfs_ioctl_set_received_subvol(file, args);
	/* Only report the structure back when the handler succeeded. */
	if (!err && copy_to_user(arg, args, sizeof(*args)))
		err = -EFAULT;

	kfree(args);
	return err;
}
2019-10-10 21:23:11 -03:00
/*
 * Copy the filesystem label to user space.
 *
 * The label is snapshotted from fs_info->super_copy while holding super_lock
 * and then measured on the private copy.  If no NUL byte is found within
 * BTRFS_LABEL_SIZE bytes a warning is logged and only the first
 * BTRFS_LABEL_SIZE - 1 bytes are returned.
 *
 * Note that exactly @len label bytes are written to @arg; no terminating NUL
 * is appended by this function.
 *
 * Returns 0 on success or -EFAULT if the copy to user space fails.
 */
static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
				   void __user *arg)
{
	char label[BTRFS_LABEL_SIZE];
	size_t len;

	spin_lock(&fs_info->super_lock);
	memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE);
	spin_unlock(&fs_info->super_lock);

	len = strnlen(label, BTRFS_LABEL_SIZE);
	if (len == BTRFS_LABEL_SIZE) {
		/*
		 * Shrink the length before logging rather than inside the
		 * btrfs_warn() argument list, so the number of bytes copied
		 * out never depends on whether the logging macro evaluates
		 * its arguments.
		 */
		len--;
		btrfs_warn(fs_info,
			   "label is too long, return the first %zu bytes",
			   len);
	}

	if (copy_to_user(arg, label, len))
		return -EFAULT;

	return 0;
}
2013-01-05 02:48:08 +00:00
/*
 * Replace the filesystem label with the one supplied by user space.
 *
 * Requires CAP_SYS_ADMIN.  A full BTRFS_LABEL_SIZE buffer is read from @arg
 * and must contain a NUL byte, i.e. the label may be at most
 * BTRFS_LABEL_SIZE - 1 bytes long.  The new label is stored in the in-memory
 * superblock copy under super_lock and a transaction is committed.
 *
 * Returns 0 on success, -EPERM without the capability, -EFAULT if the user
 * buffer cannot be read, -EINVAL for an unterminated label, or the error from
 * mnt_want_write_file() / btrfs_start_transaction() /
 * btrfs_commit_transaction().
 */
static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_super_block *sb = fs_info->super_copy;
	struct btrfs_trans_handle *trans;
	char new_label[BTRFS_LABEL_SIZE];
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(new_label, arg, sizeof(new_label)))
		return -EFAULT;

	if (strnlen(new_label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
		btrfs_err(fs_info,
			  "unable to set label with more than %d bytes",
			  BTRFS_LABEL_SIZE - 1);
		return -EINVAL;
	}

	err = mnt_want_write_file(file);
	if (err)
		return err;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
	} else {
		/* The length check above guarantees new_label is terminated. */
		spin_lock(&fs_info->super_lock);
		strcpy(sb->label, new_label);
		spin_unlock(&fs_info->super_lock);

		err = btrfs_commit_transaction(trans);
	}

	mnt_drop_write_file(file);
	return err;
}
2013-11-15 15:33:55 -05:00
/*
 * Expand to a brace initializer for one struct btrfs_ioctl_feature_flags,
 * filling each member from the BTRFS_FEATURE_{COMPAT,COMPAT_RO,INCOMPAT}_<kind>
 * constant of the same family.
 */
#define INIT_FEATURE_FLAGS(kind)					\
	{								\
		.compat_flags = BTRFS_FEATURE_COMPAT_##kind,		\
		.compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##kind,	\
		.incompat_flags = BTRFS_FEATURE_INCOMPAT_##kind		\
	}
2016-02-17 15:26:27 +01:00
/*
 * Report the feature bits known to this build.
 *
 * Copies three struct btrfs_ioctl_feature_flags entries to @arg, in this
 * order: the SUPP masks, the SAFE_SET masks and the SAFE_CLEAR masks.
 *
 * Returns 0 on success or -EFAULT if the copy to user space fails.
 */
int btrfs_ioctl_get_supported_features(void __user *arg)
{
	static const struct btrfs_ioctl_feature_flags supported[3] = {
		INIT_FEATURE_FLAGS(SUPP),
		INIT_FEATURE_FLAGS(SAFE_SET),
		INIT_FEATURE_FLAGS(SAFE_CLEAR)
	};

	return copy_to_user(arg, &supported, sizeof(supported)) ? -EFAULT : 0;
}
2019-10-10 21:23:11 -03:00
/*
 * Report the feature bits currently recorded in the in-memory superblock
 * copy (compat, compat_ro and incompat) to user space.
 *
 * Returns 0 on success or -EFAULT if the copy to user space fails.
 */
static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
				    void __user *arg)
{
	struct btrfs_super_block *sb = fs_info->super_copy;
	struct btrfs_ioctl_feature_flags current_flags = {
		.compat_flags = btrfs_super_compat_flags(sb),
		.compat_ro_flags = btrfs_super_compat_ro_flags(sb),
		.incompat_flags = btrfs_super_incompat_flags(sb),
	};

	return copy_to_user(arg, &current_flags, sizeof(current_flags)) ?
		-EFAULT : 0;
}
2016-06-22 18:54:24 -04:00
/*
 * Check whether a requested change to one feature set is permitted.
 *
 * @set:             which feature set the bits belong to (used for messages)
 * @change_mask:     bits the caller wants to modify
 * @flags:           desired values for the bits in @change_mask
 * @supported_flags: bits this kernel supports
 * @safe_set:        bits that may be turned on
 * @safe_clear:      bits that may be turned off
 *
 * Bits selected by @change_mask that are set in @flags are "to be set"; those
 * that are clear in @flags are "to be cleared".  Each failed check logs a
 * warning, using the printable feature names when
 * btrfs_printable_features() returns them and the raw bit mask otherwise.
 *
 * Returns 0 if the change is allowed, -EOPNOTSUPP if a bit to be set is not
 * in @supported_flags, and -EPERM if a bit to be set is not in @safe_set or a
 * bit to be cleared is not in @safe_clear.
 */
static int check_feature_bits(struct btrfs_fs_info *fs_info,
			      enum btrfs_feature_set set,
			      u64 change_mask, u64 flags, u64 supported_flags,
			      u64 safe_set, u64 safe_clear)
{
	const char *set_name = btrfs_feature_set_name(set);
	const u64 to_set = change_mask & flags;
	const u64 to_clear = change_mask & ~flags;
	char *names;
	u64 bad;

	/* 1. Everything being turned on must be known to this kernel. */
	bad = to_set & ~supported_flags;
	if (bad) {
		names = btrfs_printable_features(set, bad);
		if (!names) {
			btrfs_warn(fs_info,
				   "this kernel does not support %s bits 0x%llx",
				   set_name, bad);
		} else {
			btrfs_warn(fs_info,
				   "this kernel does not support the %s feature bit%s",
				   names, strchr(names, ',') ? "s" : "");
			kfree(names);
		}
		return -EOPNOTSUPP;
	}

	/* 2. Everything being turned on must be in the safe-to-set mask. */
	bad = to_set & ~safe_set;
	if (bad) {
		names = btrfs_printable_features(set, bad);
		if (!names) {
			btrfs_warn(fs_info,
				   "can't set %s bits 0x%llx while mounted",
				   set_name, bad);
		} else {
			btrfs_warn(fs_info,
				   "can't set the %s feature bit%s while mounted",
				   names, strchr(names, ',') ? "s" : "");
			kfree(names);
		}
		return -EPERM;
	}

	/* 3. Everything being turned off must be in the safe-to-clear mask. */
	bad = to_clear & ~safe_clear;
	if (bad) {
		names = btrfs_printable_features(set, bad);
		if (!names) {
			btrfs_warn(fs_info,
				   "can't clear %s bits 0x%llx while mounted",
				   set_name, bad);
		} else {
			btrfs_warn(fs_info,
				   "can't clear the %s feature bit%s while mounted",
				   names, strchr(names, ',') ? "s" : "");
			kfree(names);
		}
		return -EPERM;
	}

	return 0;
}
2016-06-22 18:54:24 -04:00
/*
 * Convenience wrapper around check_feature_bits(): @base (COMPAT, COMPAT_RO
 * or INCOMPAT at the call sites in this file) selects both the FEAT_<base>
 * set identifier and the matching BTRFS_FEATURE_<base>_{SUPP,SAFE_SET,
 * SAFE_CLEAR} masks.
 */
#define check_feature(info, mask, bits, base)				\
	check_feature_bits(info, FEAT_##base, mask, bits,		\
			   BTRFS_FEATURE_##base##_SUPP,			\
			   BTRFS_FEATURE_##base##_SAFE_SET,		\
			   BTRFS_FEATURE_##base##_SAFE_CLEAR)
/*
 * Change feature bits in the in-memory superblock copy and commit.
 *
 * User space passes two struct btrfs_ioctl_feature_flags entries: the first
 * is the mask of bits to change, the second holds the desired value of those
 * bits.  Requires CAP_SYS_ADMIN.  An all-zero mask is a successful no-op.
 * Each of the three feature sets is validated with check_feature() before
 * anything is modified.
 *
 * Returns 0 on success, -EPERM without the capability, -EFAULT if the user
 * buffer cannot be read, or the error from check_feature(),
 * mnt_want_write_file(), btrfs_start_transaction() or
 * btrfs_commit_transaction().
 */
static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_super_block *sb = fs_info->super_copy;
	struct btrfs_ioctl_feature_flags req[2];
	const struct btrfs_ioctl_feature_flags *mask = &req[0];
	const struct btrfs_ioctl_feature_flags *want = &req[1];
	struct btrfs_trans_handle *trans;
	u64 bits;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(req, arg, sizeof(req)))
		return -EFAULT;

	/* Nothing to do */
	if (!(mask->compat_flags || mask->compat_ro_flags ||
	      mask->incompat_flags))
		return 0;

	err = check_feature(fs_info, mask->compat_flags,
			    want->compat_flags, COMPAT);
	if (err)
		return err;

	err = check_feature(fs_info, mask->compat_ro_flags,
			    want->compat_ro_flags, COMPAT_RO);
	if (err)
		return err;

	err = check_feature(fs_info, mask->incompat_flags,
			    want->incompat_flags, INCOMPAT);
	if (err)
		return err;

	err = mnt_want_write_file(file);
	if (err)
		return err;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
	} else {
		/*
		 * For each set: turn on the masked bits that are wanted, then
		 * turn off the masked bits that are not wanted.
		 */
		spin_lock(&fs_info->super_lock);

		bits = btrfs_super_compat_flags(sb);
		bits = (bits | (mask->compat_flags & want->compat_flags)) &
		       ~(mask->compat_flags & ~want->compat_flags);
		btrfs_set_super_compat_flags(sb, bits);

		bits = btrfs_super_compat_ro_flags(sb);
		bits = (bits | (mask->compat_ro_flags & want->compat_ro_flags)) &
		       ~(mask->compat_ro_flags & ~want->compat_ro_flags);
		btrfs_set_super_compat_ro_flags(sb, bits);

		bits = btrfs_super_incompat_flags(sb);
		bits = (bits | (mask->incompat_flags & want->incompat_flags)) &
		       ~(mask->incompat_flags & ~want->incompat_flags);
		btrfs_set_super_incompat_flags(sb, bits);

		spin_unlock(&fs_info->super_lock);

		err = btrfs_commit_transaction(trans);
	}

	mnt_drop_write_file(file);
	return err;
}
2017-09-27 10:43:13 -04:00
/*
 * Fetch the send arguments from user space and run btrfs_ioctl_send().
 *
 * @compat: false - @argp points to a struct btrfs_ioctl_send_args, which is
 *                  duplicated as is;
 *          true  - @argp points to a struct btrfs_ioctl_send_args_32, which
 *                  is converted into a zero-initialized
 *                  struct btrfs_ioctl_send_args.  Only available when both
 *                  CONFIG_64BIT and CONFIG_COMPAT are defined; otherwise
 *                  -ENOTTY is returned.
 *
 * Returns the result of btrfs_ioctl_send(), or -EFAULT / -ENOMEM / the
 * memdup_user() error if the arguments cannot be obtained.
 */
static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
{
	struct btrfs_ioctl_send_args *arg;
	int ret;

	if (!compat) {
		arg = memdup_user(argp, sizeof(*arg));
		if (IS_ERR(arg))
			return PTR_ERR(arg);
	} else {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
		struct btrfs_ioctl_send_args_32 args32;

		if (copy_from_user(&args32, argp, sizeof(args32)))
			return -EFAULT;

		arg = kzalloc(sizeof(*arg), GFP_KERNEL);
		if (!arg)
			return -ENOMEM;

		arg->send_fd = args32.send_fd;
		arg->clone_sources_count = args32.clone_sources_count;
		/* The compat struct carries the pointer in compat form. */
		arg->clone_sources = compat_ptr(args32.clone_sources);
		arg->parent_root = args32.parent_root;
		arg->flags = args32.flags;
		memcpy(arg->reserved, args32.reserved,
		       sizeof(args32.reserved));
#else
		return -ENOTTY;
#endif
	}

	ret = btrfs_ioctl_send(file, arg);
	kfree(arg);
	return ret;
}
2008-06-11 21:53:53 -04:00
long btrfs_ioctl ( struct file * file , unsigned int
cmd , unsigned long arg )
{
2016-06-22 18:54:23 -04:00
struct inode * inode = file_inode ( file ) ;
struct btrfs_fs_info * fs_info = btrfs_sb ( inode - > i_sb ) ;
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2008-12-02 06:36:08 -05:00
void __user * argp = ( void __user * ) arg ;
2008-06-11 21:53:53 -04:00
switch ( cmd ) {
2009-04-17 10:37:41 +02:00
case FS_IOC_GETVERSION :
return btrfs_ioctl_getversion ( file , argp ) ;
2019-07-17 12:39:20 -05:00
case FS_IOC_GETFSLABEL :
2019-10-10 21:23:11 -03:00
return btrfs_ioctl_get_fslabel ( fs_info , argp ) ;
2019-07-17 12:39:20 -05:00
case FS_IOC_SETFSLABEL :
return btrfs_ioctl_set_fslabel ( file , argp ) ;
2011-03-24 10:24:28 +00:00
case FITRIM :
2019-10-10 21:23:11 -03:00
return btrfs_ioctl_fitrim ( fs_info , argp ) ;
2008-06-11 21:53:53 -04:00
case BTRFS_IOC_SNAP_CREATE :
2010-12-20 15:53:28 +08:00
return btrfs_ioctl_snap_create ( file , argp , 0 ) ;
2010-12-10 06:41:56 +00:00
case BTRFS_IOC_SNAP_CREATE_V2 :
2010-12-20 15:53:28 +08:00
return btrfs_ioctl_snap_create_v2 ( file , argp , 0 ) ;
2008-11-17 21:02:50 -05:00
case BTRFS_IOC_SUBVOL_CREATE :
2010-12-20 15:53:28 +08:00
return btrfs_ioctl_snap_create ( file , argp , 1 ) ;
2011-09-14 15:58:21 +02:00
case BTRFS_IOC_SUBVOL_CREATE_V2 :
return btrfs_ioctl_snap_create_v2 ( file , argp , 1 ) ;
2009-09-21 16:00:26 -04:00
case BTRFS_IOC_SNAP_DESTROY :
btrfs: add new BTRFS_IOC_SNAP_DESTROY_V2 ioctl
This ioctl will be responsible for deleting a subvolume using its id.
This can be used when a system has a file system mounted from a
subvolume, rather than the root file system, like below:
/
@subvol1/
@subvol2/
@subvol_default/
If only @subvol_default is mounted, we have no path to reach @subvol1
and @subvol2, thus no way to delete them. Current subvolume delete ioctl
takes a file handle point as argument, and if @subvol_default is
mounted, we can't reach @subvol1 and @subvol2 from the same mount point.
This patch introduces a new ioctl BTRFS_IOC_SNAP_DESTROY_V2 that takes
the extended structure with flags to allow to delete subvolume using
subvolid.
Now, we can use this new ioctl specifying the subvolume id and refer to
the same mount point. It doesn't matter which subvolume was mounted,
since we can reach to the desired one using the subvolume id, and then
delete it.
The full path to the subvolume id is resolved internally and access is
verified as if the subvolume was accessed by path.
The volume args v2 structure is extended to use the existing union for
subvolume id specification, that's valid in case the
BTRFS_SUBVOL_SPEC_BY_ID is set.
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-02-07 10:05:46 -03:00
return btrfs_ioctl_snap_destroy ( file , argp , false ) ;
case BTRFS_IOC_SNAP_DESTROY_V2 :
return btrfs_ioctl_snap_destroy ( file , argp , true ) ;
2010-12-20 16:30:25 +08:00
case BTRFS_IOC_SUBVOL_GETFLAGS :
return btrfs_ioctl_subvol_getflags ( file , argp ) ;
case BTRFS_IOC_SUBVOL_SETFLAGS :
return btrfs_ioctl_subvol_setflags ( file , argp ) ;
2009-12-11 21:11:29 +00:00
case BTRFS_IOC_DEFAULT_SUBVOL :
return btrfs_ioctl_default_subvol ( file , argp ) ;
2008-06-11 21:53:53 -04:00
case BTRFS_IOC_DEFRAG :
2010-03-11 09:42:04 -05:00
return btrfs_ioctl_defrag ( file , NULL ) ;
case BTRFS_IOC_DEFRAG_RANGE :
return btrfs_ioctl_defrag ( file , argp ) ;
2008-06-11 21:53:53 -04:00
case BTRFS_IOC_RESIZE :
2012-11-26 08:43:45 +00:00
return btrfs_ioctl_resize ( file , argp ) ;
2008-06-11 21:53:53 -04:00
case BTRFS_IOC_ADD_DEV :
2016-06-22 18:54:24 -04:00
return btrfs_ioctl_add_dev ( fs_info , argp ) ;
2008-06-11 21:53:53 -04:00
case BTRFS_IOC_RM_DEV :
2012-11-26 08:44:50 +00:00
return btrfs_ioctl_rm_dev ( file , argp ) ;
2016-02-13 10:01:39 +08:00
case BTRFS_IOC_RM_DEV_V2 :
return btrfs_ioctl_rm_dev_v2 ( file , argp ) ;
2011-03-11 15:41:01 +01:00
case BTRFS_IOC_FS_INFO :
2016-06-22 18:54:24 -04:00
return btrfs_ioctl_fs_info ( fs_info , argp ) ;
2011-03-11 15:41:01 +01:00
case BTRFS_IOC_DEV_INFO :
2016-06-22 18:54:24 -04:00
return btrfs_ioctl_dev_info ( fs_info , argp ) ;
2008-06-11 21:53:53 -04:00
case BTRFS_IOC_BALANCE :
2012-05-11 18:11:26 +08:00
return btrfs_ioctl_balance ( file , NULL ) ;
2010-02-28 15:39:26 -05:00
case BTRFS_IOC_TREE_SEARCH :
return btrfs_ioctl_tree_search ( file , argp ) ;
2014-01-30 16:24:03 +01:00
case BTRFS_IOC_TREE_SEARCH_V2 :
return btrfs_ioctl_tree_search_v2 ( file , argp ) ;
2010-02-28 15:39:26 -05:00
case BTRFS_IOC_INO_LOOKUP :
return btrfs_ioctl_ino_lookup ( file , argp ) ;
2011-07-07 16:48:38 +02:00
case BTRFS_IOC_INO_PATHS :
return btrfs_ioctl_ino_to_path ( root , argp ) ;
case BTRFS_IOC_LOGICAL_INO :
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2
Now that check_extent_in_eb()'s extent offset filter can be turned off,
we need a way to do it from userspace.
Add a 'flags' field to the btrfs_logical_ino_args structure to disable
extent offset filtering, taking the place of one of the existing
reserved[] fields.
Previous versions of LOGICAL_INO neglected to check whether any of the
reserved fields have non-zero values. Assigning meaning to those fields
now may change the behavior of existing programs that left these fields
uninitialized. The lack of a zero check also means that new programs
have no way to know whether the kernel is honoring the flags field.
To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can
use the same argument layout as LOGICAL_INO, but shorten the reserved[]
array by one element and turn it into the 'flags' field. The V2 ioctl
explicitly checks that reserved fields and unsupported flag bits are zero
so that userspace can negotiate future feature bits as they are defined.
Since the memory layouts of the two ioctls' arguments are compatible,
there is no need for a separate function for logical_to_ino_v2 (contrast
with tree_search_v2 vs tree_search where the layout and code are quite
different). A version parameter and an 'if' statement will suffice.
Now that we have a flags field in logical_ino_args, add a flag
BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
and pass it down the stack to iterate_inodes_from_logical.
Motivation and background, copied from the patchset cover letter:
Suppose we have a file with one extent:
root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a
root@tester:~# sync
Split the extent by overwriting it in the middle:
root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a
We should now have 3 extent refs to 2 extents, with one block unreachable.
The extent tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 2
[...]
item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53
extent refs 2 gen 29 flags DATA
extent data backref root 5 objectid 261 offset 0 count 2
[...]
item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53
extent refs 1 gen 30 flags DATA
extent data backref root 5 objectid 261 offset 8192 count 1
[...]
and the ref tree looks like:
root@tester:~# btrfs-debug-tree /dev/vdc -t 5
[...]
item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 0 nr 8192 ram 73728
extent compression(none)
item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53
extent data disk byte 1103175680 nr 4096
extent data offset 0 nr 4096 ram 4096
extent compression(none)
item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53
extent data disk byte 1103101952 nr 73728
extent data offset 12288 nr 61440 ram 73728
extent compression(none)
[...]
There are two references to the same extent with different, non-overlapping
byte offsets:
[------------------72K extent at 1103101952----------------------]
[--8K----------------|--4K unreachable----|--60K-----------------]
^ ^
| |
[--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--]
|
v
[-----4K extent-----] at 1103175680
We want to find all of the references to extent bytenr 1103101952.
Without the patch (and without running btrfs-debug-tree), we have to
do it with 18 LOGICAL_INO calls:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO
inode 261 offset 0 root 5
root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode
inode 261 offset 0 root 5
inode 261 offset 4096 root 5 <- same extent ref as offset 0
(offset 8192 returns empty set, not reachable)
inode 261 offset 12288 root 5
inode 261 offset 16384 root 5 \
inode 261 offset 20480 root 5 |
inode 261 offset 24576 root 5 |
inode 261 offset 28672 root 5 |
inode 261 offset 32768 root 5 |
inode 261 offset 36864 root 5 \
inode 261 offset 40960 root 5 > all the same extent ref as offset 12288.
inode 261 offset 45056 root 5 / More processing required in userspace
inode 261 offset 49152 root 5 | to figure out these are all duplicates.
inode 261 offset 53248 root 5 |
inode 261 offset 57344 root 5 |
inode 261 offset 61440 root 5 |
inode 261 offset 65536 root 5 |
inode 261 offset 69632 root 5 /
In the worst case the extents are 128MB long, and we have to do 32768
iterations of the loop to find one 4K extent ref.
With the patch, we just use one call to map all refs to the extent at once:
root@tester:~# btrfs ins log 1103101952 -P /test/
Using LOGICAL_INO_V2
inode 261 offset 0 root 5
inode 261 offset 12288 root 5
The TREE_SEARCH ioctl allows userspace to retrieve the offset and
extent bytenr fields easily once the root, inode and offset are known.
This is sufficient information to build a complete map of the extent
and all of its references. Userspace can use this information to make
better choices to dedup or defrag.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
[ copy background and motivation from cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 13:58:46 -04:00
return btrfs_ioctl_logical_to_ino ( fs_info , argp , 1 ) ;
case BTRFS_IOC_LOGICAL_INO_V2 :
return btrfs_ioctl_logical_to_ino ( fs_info , argp , 2 ) ;
2010-01-13 18:19:06 +00:00
case BTRFS_IOC_SPACE_INFO :
2016-06-22 18:54:24 -04:00
return btrfs_ioctl_space_info ( fs_info , argp ) ;
2013-09-23 11:35:11 +01:00
case BTRFS_IOC_SYNC : {
int ret ;
2021-01-11 12:58:11 +02:00
ret = btrfs_start_delalloc_roots ( fs_info , LONG_MAX , false ) ;
2013-09-23 11:35:11 +01:00
if ( ret )
return ret ;
2016-06-22 18:54:23 -04:00
ret = btrfs_sync_fs ( inode - > i_sb , 1 ) ;
2014-07-23 14:39:35 +02:00
/*
* The transaction thread may want to do more work ,
2016-05-19 21:18:45 -04:00
* namely it pokes the cleaner kthread that will start
2014-07-23 14:39:35 +02:00
* processing uncleaned subvols .
*/
2016-06-22 18:54:23 -04:00
wake_up_process ( fs_info - > transaction_kthread ) ;
2013-09-23 11:35:11 +01:00
return ret ;
}
Btrfs: add START_SYNC, WAIT_SYNC ioctls
START_SYNC will start a sync/commit, but not wait for it to
complete. Any modification started after the ioctl returns is
guaranteed not to be included in the commit. If a non-NULL
pointer is passed, the transaction id will be returned to
userspace.
WAIT_SYNC will wait for any in-progress commit to complete. If a
transaction id is specified, the ioctl will block and then
return (success) when the specified transaction has committed.
If it has already committed when we call the ioctl, it returns
immediately. If the specified transaction doesn't exist, it
returns EINVAL.
If no transaction id is specified, WAIT_SYNC will wait for the
currently committing transaction to finish it's commit to disk.
If there is no currently committing transaction, it returns
success.
These ioctls are useful for applications which want to impose an
ordering on when fs modifications reach disk, but do not want to
wait for the full (slow) commit process to do so.
Picky callers can take the transid returned by START_SYNC and
feed it to WAIT_SYNC, and be certain to wait only as long as
necessary for the transaction _they_ started to reach disk.
Sloppy callers can START_SYNC and WAIT_SYNC without a transid,
and provided they didn't wait too long between the calls, they
will get the same result. However, if a second commit starts
before they call WAIT_SYNC, they may end up waiting longer for
it to commit as well. Even so, a START_SYNC+WAIT_SYNC still
guarantees that any operation completed before the START_SYNC
reaches disk.
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-29 15:41:32 -04:00
case BTRFS_IOC_START_SYNC :
2012-11-26 08:40:43 +00:00
return btrfs_ioctl_start_sync ( root , argp ) ;
Btrfs: add START_SYNC, WAIT_SYNC ioctls
START_SYNC will start a sync/commit, but not wait for it to
complete. Any modification started after the ioctl returns is
guaranteed not to be included in the commit. If a non-NULL
pointer is passed, the transaction id will be returned to
userspace.
WAIT_SYNC will wait for any in-progress commit to complete. If a
transaction id is specified, the ioctl will block and then
return (success) when the specified transaction has committed.
If it has already committed when we call the ioctl, it returns
immediately. If the specified transaction doesn't exist, it
returns EINVAL.
If no transaction id is specified, WAIT_SYNC will wait for the
currently committing transaction to finish it's commit to disk.
If there is no currently committing transaction, it returns
success.
These ioctls are useful for applications which want to impose an
ordering on when fs modifications reach disk, but do not want to
wait for the full (slow) commit process to do so.
Picky callers can take the transid returned by START_SYNC and
feed it to WAIT_SYNC, and be certain to wait only as long as
necessary for the transaction _they_ started to reach disk.
Sloppy callers can START_SYNC and WAIT_SYNC without a transid,
and provided they didn't wait too long between the calls, they
will get the same result. However, if a second commit starts
before they call WAIT_SYNC, they may end up waiting longer for
it to commit as well. Even so, a START_SYNC+WAIT_SYNC still
guarantees that any operation completed before the START_SYNC
reaches disk.
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-29 15:41:32 -04:00
case BTRFS_IOC_WAIT_SYNC :
2016-06-22 18:54:24 -04:00
return btrfs_ioctl_wait_sync ( fs_info , argp ) ;
2011-03-11 15:41:01 +01:00
case BTRFS_IOC_SCRUB :
2012-11-26 08:48:01 +00:00
return btrfs_ioctl_scrub ( file , argp ) ;
2011-03-11 15:41:01 +01:00
case BTRFS_IOC_SCRUB_CANCEL :
2016-06-22 18:54:24 -04:00
return btrfs_ioctl_scrub_cancel ( fs_info ) ;
2011-03-11 15:41:01 +01:00
case BTRFS_IOC_SCRUB_PROGRESS :
2016-06-22 18:54:24 -04:00
return btrfs_ioctl_scrub_progress ( fs_info , argp ) ;
2012-01-16 22:04:47 +02:00
case BTRFS_IOC_BALANCE_V2 :
2012-05-11 18:11:26 +08:00
return btrfs_ioctl_balance ( file , argp ) ;
2012-01-16 22:04:49 +02:00
case BTRFS_IOC_BALANCE_CTL :
2016-06-22 18:54:24 -04:00
return btrfs_ioctl_balance_ctl ( fs_info , arg ) ;
2012-01-16 22:04:49 +02:00
case BTRFS_IOC_BALANCE_PROGRESS :
2016-06-22 18:54:24 -04:00
return btrfs_ioctl_balance_progress ( fs_info , argp ) ;
2012-07-25 17:35:53 +02:00
case BTRFS_IOC_SET_RECEIVED_SUBVOL :
return btrfs_ioctl_set_received_subvol ( file , argp ) ;
2014-01-30 20:17:00 +00:00
# ifdef CONFIG_64BIT
case BTRFS_IOC_SET_RECEIVED_SUBVOL_32 :
return btrfs_ioctl_set_received_subvol_32 ( file , argp ) ;
# endif
2012-07-25 23:19:24 +02:00
case BTRFS_IOC_SEND :
2017-09-27 10:43:13 -04:00
return _btrfs_ioctl_send ( file , argp , false ) ;
# if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32 :
return _btrfs_ioctl_send ( file , argp , true ) ;
# endif
2012-05-25 16:06:09 +02:00
case BTRFS_IOC_GET_DEV_STATS :
2016-06-22 18:54:24 -04:00
return btrfs_ioctl_get_dev_stats ( fs_info , argp ) ;
2011-09-14 15:53:51 +02:00
case BTRFS_IOC_QUOTA_CTL :
2012-11-26 08:50:11 +00:00
return btrfs_ioctl_quota_ctl ( file , argp ) ;
2011-09-14 15:53:51 +02:00
case BTRFS_IOC_QGROUP_ASSIGN :
2012-11-26 08:50:11 +00:00
return btrfs_ioctl_qgroup_assign ( file , argp ) ;
2011-09-14 15:53:51 +02:00
case BTRFS_IOC_QGROUP_CREATE :
2012-11-26 08:50:11 +00:00
return btrfs_ioctl_qgroup_create ( file , argp ) ;
2011-09-14 15:53:51 +02:00
case BTRFS_IOC_QGROUP_LIMIT :
2012-11-26 08:50:11 +00:00
return btrfs_ioctl_qgroup_limit ( file , argp ) ;
2013-04-25 16:04:51 +00:00
case BTRFS_IOC_QUOTA_RESCAN :
return btrfs_ioctl_quota_rescan ( file , argp ) ;
case BTRFS_IOC_QUOTA_RESCAN_STATUS :
2019-10-10 21:23:11 -03:00
return btrfs_ioctl_quota_rescan_status ( fs_info , argp ) ;
2013-05-06 19:14:17 +00:00
case BTRFS_IOC_QUOTA_RESCAN_WAIT :
2019-10-10 21:23:11 -03:00
return btrfs_ioctl_quota_rescan_wait ( fs_info , argp ) ;
2012-11-06 15:08:53 +01:00
case BTRFS_IOC_DEV_REPLACE :
2016-06-22 18:54:24 -04:00
return btrfs_ioctl_dev_replace ( fs_info , argp ) ;
2013-11-15 15:33:55 -05:00
case BTRFS_IOC_GET_SUPPORTED_FEATURES :
2016-02-17 15:26:27 +01:00
return btrfs_ioctl_get_supported_features ( argp ) ;
2013-11-15 15:33:55 -05:00
case BTRFS_IOC_GET_FEATURES :
2019-10-10 21:23:11 -03:00
return btrfs_ioctl_get_features ( fs_info , argp ) ;
2013-11-15 15:33:55 -05:00
case BTRFS_IOC_SET_FEATURES :
return btrfs_ioctl_set_features ( file , argp ) ;
2018-05-21 10:09:42 +09:00
case BTRFS_IOC_GET_SUBVOL_INFO :
return btrfs_ioctl_get_subvol_info ( file , argp ) ;
2018-05-21 10:09:43 +09:00
case BTRFS_IOC_GET_SUBVOL_ROOTREF :
return btrfs_ioctl_get_subvol_rootref ( file , argp ) ;
2018-05-21 10:09:44 +09:00
case BTRFS_IOC_INO_LOOKUP_USER :
return btrfs_ioctl_ino_lookup_user ( file , argp ) ;
2008-06-11 21:53:53 -04:00
}
return - ENOTTY ;
}
2015-10-29 08:22:21 +00:00
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: maps the 32-bit command number that differs from
 * its native counterpart, converts @arg with compat_ptr() and hands
 * everything to btrfs_ioctl().
 */
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	/*
	 * These all access 32-bit values anyway so no further
	 * handling is necessary.
	 */
	if (cmd == FS_IOC32_GETVERSION)
		cmd = FS_IOC_GETVERSION;

	return btrfs_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#endif