2018-04-03 19:23:33 +02:00
// SPDX-License-Identifier: GPL-2.0
2007-11-16 11:45:54 -05:00
/*
* Copyright ( C ) 2007 Red Hat . All rights reserved .
*/
# include <linux/init.h>
# include <linux/fs.h>
# include <linux/slab.h>
# include <linux/rwsem.h>
# include <linux/xattr.h>
2009-02-04 09:29:13 -05:00
# include <linux/security.h>
2013-12-20 05:16:43 -08:00
# include <linux/posix_acl_xattr.h>
2018-01-29 06:41:30 -05:00
# include <linux/iversion.h>
2018-12-10 17:53:35 +00:00
# include <linux/sched/mm.h>
2007-11-16 11:45:54 -05:00
# include "ctree.h"
# include "btrfs_inode.h"
# include "transaction.h"
# include "xattr.h"
# include "disk-io.h"
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
Then unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essentially because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir index items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 11:47:46 +00:00
# include "props.h"
2014-11-09 08:38:39 +00:00
# include "locking.h"
2008-07-24 12:16:36 -04:00
2018-02-27 15:48:57 +01:00
/*
 * Read the value of the xattr @name of @inode.
 *
 * @inode:  inode whose xattr is looked up
 * @name:   full xattr name (prefix included)
 * @buffer: destination for the value, only written when @size is non-zero
 * @size:   capacity of @buffer; 0 means "only report the value length"
 *
 * Returns the length of the xattr value on success, -ENODATA if the xattr
 * does not exist, -ERANGE if the value does not fit in @size bytes, -ENOMEM
 * if no path could be allocated, or the error returned by the lookup.
 */
int btrfs_getxattr(struct inode *inode, const char *name,
		   void *buffer, size_t size)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_path *path;
	unsigned long value_off;
	u16 value_len;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Look the xattr up by name. */
	di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)),
				name, strlen(name), 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	}
	if (!di) {
		ret = -ENODATA;
		goto out;
	}

	leaf = path->nodes[0];
	value_len = btrfs_dir_data_len(leaf, di);

	/* A zero size is a query for the length of the value. */
	if (!size) {
		ret = value_len;
		goto out;
	}

	if (value_len > size) {
		ret = -ERANGE;
		goto out;
	}

	/*
	 * Items are packed into the leaf as
	 *   | struct btrfs_dir_item | name | data |
	 * where name is the xattr name (e.g. security.foo) and data is the
	 * xattr content, so the value starts right after the name.
	 */
	value_off = (unsigned long)((char *)(di + 1) +
				    btrfs_dir_name_len(leaf, di));
	read_extent_buffer(leaf, buffer, value_off, value_len);
	ret = value_len;
out:
	btrfs_free_path(path);
	return ret;
}
2019-04-12 16:02:56 +08:00
/*
 * Overwrite the value of an existing xattr in place.
 *
 * @path must point at the leaf item holding @di. The replace has to be
 * atomic: at any point in time the tree holds either the old or the new
 * value, so that readers (getxattr and listxattrs) never miss the xattr,
 * which is especially important for ACLs.
 *
 * Returns 0 on success, -ENOSPC if the leaf cannot hold the larger value,
 * or the error from removing the old entry.
 */
static int xattr_replace_in_leaf(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_dir_item *di, size_t name_len,
				 const void *value, size_t size)
{
	struct extent_buffer *leaf = path->nodes[0];
	const int slot = path->slots[0];
	const u16 old_data_len = btrfs_dir_data_len(leaf, di);
	const u32 item_size = btrfs_item_size_nr(leaf, slot);
	const u32 data_size = sizeof(*di) + name_len + size;
	struct btrfs_item *item;
	unsigned long data_ptr;
	char *ptr;
	int ret;

	/* Growing the value needs that much extra room in the leaf. */
	if (size > old_data_len &&
	    btrfs_leaf_free_space(leaf) < (size - old_data_len))
		return -ENOSPC;

	if (old_data_len + name_len + sizeof(*di) == item_size) {
		/* No other xattrs packed in the same leaf item. */
		if (size > old_data_len)
			btrfs_extend_item(path, size - old_data_len);
		else if (size < old_data_len)
			btrfs_truncate_item(path, data_size, 1);
	} else {
		/* There are other xattrs packed in the same item. */
		ret = btrfs_delete_one_dir_name(trans, root, path, di);
		if (ret)
			return ret;
		btrfs_extend_item(path, data_size);
	}

	/* The entry being (re)written occupies the tail of the item. */
	item = btrfs_item_nr(slot);
	ptr = btrfs_item_ptr(leaf, slot, char);
	ptr += btrfs_item_size(leaf, item) - data_size;
	di = (struct btrfs_dir_item *)ptr;
	btrfs_set_dir_data_len(leaf, di, size);
	data_ptr = ((unsigned long)(di + 1)) + name_len;
	write_extent_buffer(leaf, value, data_ptr, size);
	btrfs_mark_buffer_dirty(leaf);

	return 0;
}

/*
 * Create, replace or delete the xattr @name of @inode using the transaction
 * handle @trans, which must not be NULL.
 *
 * @value: new content; NULL deletes the xattr
 * @size:  length of @value
 * @flags: XATTR_CREATE and/or XATTR_REPLACE, or 0
 *
 * Returns 0 on success. Errors produced here are -ENOSPC (name plus value
 * exceed BTRFS_MAX_XATTR_SIZE, or the leaf has no room), -ENOMEM,
 * -ENODATA (XATTR_REPLACE on a missing xattr) and -EEXIST (XATTR_CREATE on
 * an existing one); other errors come from the tree helpers.
 */
int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
		   const char *name, const void *value, size_t size, int flags)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	const u64 ino = btrfs_ino(BTRFS_I(inode));
	const size_t name_len = strlen(name);
	struct btrfs_dir_item *di = NULL;
	struct btrfs_path *path;
	int ret = 0;

	ASSERT(trans);

	if (name_len + size > BTRFS_MAX_XATTR_SIZE(fs_info))
		return -ENOSPC;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->skip_release_on_error = 1;

	/* A NULL value means "remove the xattr". */
	if (!value) {
		di = btrfs_lookup_xattr(trans, root, path, ino, name,
					name_len, -1);
		if (IS_ERR(di))
			ret = PTR_ERR(di);
		else if (di)
			ret = btrfs_delete_one_dir_name(trans, root, path, di);
		else if (flags & XATTR_REPLACE)
			ret = -ENODATA;
		goto out;
	}

	/*
	 * For a replace we can't just do the insert blindly. Do a read-only
	 * lookup first and bail out if the xattr doesn't exist. If it does,
	 * fall through to the insert/replace path below - we can't race with
	 * a concurrent xattr delete, because the VFS locks the inode's
	 * i_mutex before calling setxattr or removexattr.
	 */
	if (flags & XATTR_REPLACE) {
		ASSERT(inode_is_locked(inode));
		di = btrfs_lookup_xattr(NULL, root, path, ino, name,
					name_len, 0);
		if (!di) {
			ret = -ENODATA;
			goto out;
		}
		if (IS_ERR(di)) {
			ret = PTR_ERR(di);
			goto out;
		}
		btrfs_release_path(path);
		di = NULL;
	}

	ret = btrfs_insert_xattr_item(trans, root, path, ino, name, name_len,
				      value, size);
	switch (ret) {
	case 0:
		break;
	case -EOVERFLOW:
		/*
		 * We have an existing item in a leaf, split_leaf couldn't
		 * expand it. That item might or might not have a dir_item
		 * that matches our target xattr, so let's check.
		 */
		ret = 0;
		btrfs_assert_tree_locked(path->nodes[0]);
		di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
		if (!di && !(flags & XATTR_REPLACE)) {
			ret = -ENOSPC;
			goto out;
		}
		break;
	case -EEXIST:
		ret = 0;
		di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
		ASSERT(di); /* logic error */
		break;
	default:
		goto out;
	}

	/*
	 * Without a matching dir_item this was a plain insert that had space
	 * for the xattr: path->slots[0] is where our xattr dir_item is and
	 * btrfs_insert_xattr_item() already filled it.
	 */
	if (!di)
		goto out;

	if (flags & XATTR_CREATE) {
		ret = -EEXIST;
		goto out;
	}

	ret = xattr_replace_in_leaf(trans, root, path, di, name_len, value,
				    size);
out:
	btrfs_free_path(path);
	if (!ret) {
		/*
		 * Setting this bit makes a later fsync delete all xattrs from
		 * the log and copy all xattrs from the subvolume tree to the
		 * log tree, so xattr deletions are persisted too. The bit is
		 * set here, and nowhere else, so every caller gets it.
		 */
		set_bit(BTRFS_INODE_COPY_EVERYTHING,
			&BTRFS_I(inode)->runtime_flags);
		clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags);
	}
	return ret;
}
2011-09-11 10:52:25 -04:00
/*
* @ value : " " makes the attribute to empty , NULL removes it
*/
2019-04-12 16:02:59 +08:00
int btrfs_setxattr_trans ( struct inode * inode , const char * name ,
2019-04-12 16:02:54 +08:00
const void * value , size_t size , int flags )
2009-11-12 09:35:27 +00:00
{
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2019-04-12 16:02:59 +08:00
struct btrfs_trans_handle * trans ;
2021-02-26 17:51:44 +00:00
const bool start_trans = ( current - > journal_info = = NULL ) ;
2009-11-12 09:35:27 +00:00
int ret ;
2021-02-26 17:51:44 +00:00
if ( start_trans ) {
/*
* 1 unit for inserting / updating / deleting the xattr
* 1 unit for the inode item update
*/
trans = btrfs_start_transaction ( root , 2 ) ;
if ( IS_ERR ( trans ) )
return PTR_ERR ( trans ) ;
} else {
/*
* This can happen when smack is enabled and a directory is being
* created . It happens through d_instantiate_new ( ) , which calls
* smack_d_instantiate ( ) , which in turn calls __vfs_setxattr ( ) to
* set the transmute xattr ( XATTR_NAME_SMACKTRANSMUTE ) on the
* inode . We have already reserved space for the xattr and inode
* update at btrfs_mkdir ( ) , so just use the transaction handle .
* We don ' t join or start a transaction , as that will reset the
* block_rsv of the handle and trigger a warning for the start
* case .
*/
ASSERT ( strncmp ( name , XATTR_SECURITY_PREFIX ,
XATTR_SECURITY_PREFIX_LEN ) = = 0 ) ;
trans = current - > journal_info ;
}
2007-11-16 11:45:54 -05:00
2019-04-12 16:02:55 +08:00
ret = btrfs_setxattr ( trans , inode , name , value , size , flags ) ;
2009-11-12 09:35:27 +00:00
if ( ret )
goto out ;
2012-04-05 15:03:02 -04:00
inode_inc_iversion ( inode ) ;
2016-09-14 07:48:06 -07:00
inode - > i_ctime = current_time ( inode ) ;
2020-11-02 16:48:59 +02:00
ret = btrfs_update_inode ( trans , root , BTRFS_I ( inode ) ) ;
2009-11-12 09:35:27 +00:00
BUG_ON ( ret ) ;
out :
2021-02-26 17:51:44 +00:00
if ( start_trans )
btrfs_end_transaction ( trans ) ;
2007-11-16 11:45:54 -05:00
return ret ;
}
/*
 * List the names of all xattrs of the inode behind @dentry.
 *
 * @buffer: receives the NUL-terminated names back to back
 * @size:   capacity of @buffer; 0 means "only compute the needed size"
 *
 * Returns the total number of bytes needed for all names (terminators
 * included), -ERANGE if @size is non-zero but @buffer is NULL or too small,
 * -ENOMEM if no path could be allocated, or a tree search error.
 */
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
	struct inode *inode = d_inode(dentry);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	size_t total_size = 0;
	size_t size_left = size;
	int ret;

	/*
	 * We want every xattr item of this inode. The offset is 0 so that the
	 * search lands on the first xattr and we can walk forward from there.
	 */
	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = BTRFS_XATTR_ITEM_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	/* Search for our xattrs. */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	for (;;) {
		struct extent_buffer *leaf = path->nodes[0];
		const int slot = path->slots[0];
		struct btrfs_key found_key;

		if (slot >= btrfs_header_nritems(leaf)) {
			/* Past the last slot of this leaf, move to the next. */
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* Stop once we are past this inode's xattr items. */
		if (found_key.objectid != key.objectid ||
		    found_key.type > BTRFS_XATTR_ITEM_KEY)
			break;

		/*
		 * Keys with a smaller type are skipped instead of ending the
		 * walk. Listing does not hold the inode's i_mutex, so a
		 * concurrent hard link can append an INODE_REF item to the
		 * previous leaf while btrfs_next_leaf() has the path released,
		 * leaving us pointing at that new item. Breaking out there
		 * would report no xattrs at all.
		 */
		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
			struct btrfs_dir_item *di;
			const u32 item_size = btrfs_item_size_nr(leaf, slot);
			u32 cur = 0;

			/* One item may pack several xattr entries. */
			di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
			while (cur < item_size) {
				const u16 name_len = btrfs_dir_name_len(leaf, di);
				const u16 data_len = btrfs_dir_data_len(leaf, di);
				const u32 this_len = sizeof(*di) + name_len +
						     data_len;

				total_size += name_len + 1;

				/*
				 * With a zero size we are only measuring how
				 * big the buffer needs to be.
				 */
				if (size) {
					if (!buffer ||
					    (name_len + 1) > size_left) {
						ret = -ERANGE;
						goto out;
					}

					/* The name follows the dir_item header. */
					read_extent_buffer(leaf, buffer,
						(unsigned long)(di + 1),
						name_len);
					buffer[name_len] = '\0';

					size_left -= name_len + 1;
					buffer += name_len + 1;
				}

				cur += this_len;
				di = (struct btrfs_dir_item *)((char *)di +
							       this_len);
			}
		}

		path->slots[0]++;
	}
	ret = total_size;
out:
	btrfs_free_path(path);
	return ret;
}
2015-12-02 14:44:37 +01:00
/*
 * ->get callback shared by the btrfs xattr handlers: rebuild the full
 * attribute name from the handler and @name and read the value with
 * btrfs_getxattr(). The dentry argument is not used.
 */
static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
				   struct dentry *unused, struct inode *inode,
				   const char *name, void *buffer, size_t size)
{
	const char *full_name = xattr_full_name(handler, name);

	return btrfs_getxattr(inode, full_name, buffer, size);
}
2015-12-02 14:44:37 +01:00
/*
 * ->set callback for the security, trusted and user handlers: rebuild the
 * full attribute name and hand the request to btrfs_setxattr_trans(). The
 * user namespace and dentry arguments are not used.
 */
static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
				   struct user_namespace *mnt_userns,
				   struct dentry *unused, struct inode *inode,
				   const char *name, const void *buffer,
				   size_t size, int flags)
{
	const char *full_name = xattr_full_name(handler, name);

	return btrfs_setxattr_trans(inode, full_name, buffer, size, flags);
}
2007-11-16 11:45:54 -05:00
2015-12-02 14:44:37 +01:00
/*
 * ->set callback for the btrfs property handler: validate the property,
 * then set it and update the inode item inside a new transaction.
 *
 * Returns 0 on success or a negative errno from validation, from starting
 * the transaction, from btrfs_set_prop() or from the inode update. A failed
 * inode update aborts the transaction instead of crashing the kernel.
 */
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
					struct user_namespace *mnt_userns,
					struct dentry *unused, struct inode *inode,
					const char *name, const void *value,
					size_t size, int flags)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	int ret;

	name = xattr_full_name(handler, name);

	/* Reject bad properties before reserving any transaction space. */
	ret = btrfs_validate_prop(name, value, size);
	if (ret)
		return ret;

	trans = btrfs_start_transaction(root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_set_prop(trans, inode, name, value, size, flags);
	if (!ret) {
		inode_inc_iversion(inode);
		inode->i_ctime = current_time(inode);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
		/*
		 * The property is already set in the tree; abort rather than
		 * BUG_ON() so the error reaches the caller and the
		 * inconsistent state is not committed.
		 */
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	btrfs_end_transaction(trans);

	return ret;
}
2007-11-16 11:45:54 -05:00
2015-12-02 14:44:37 +01:00
/* Handler for xattr names starting with XATTR_SECURITY_PREFIX. */
static const struct xattr_handler btrfs_security_xattr_handler = {
. prefix = XATTR_SECURITY_PREFIX ,
. get = btrfs_xattr_handler_get ,
. set = btrfs_xattr_handler_set ,
} ;
/* Handler for xattr names starting with XATTR_TRUSTED_PREFIX. */
static const struct xattr_handler btrfs_trusted_xattr_handler = {
. prefix = XATTR_TRUSTED_PREFIX ,
. get = btrfs_xattr_handler_get ,
. set = btrfs_xattr_handler_set ,
} ;
/* Handler for xattr names starting with XATTR_USER_PREFIX. */
static const struct xattr_handler btrfs_user_xattr_handler = {
. prefix = XATTR_USER_PREFIX ,
. get = btrfs_xattr_handler_get ,
. set = btrfs_xattr_handler_set ,
} ;
/*
 * Handler for xattr names starting with XATTR_BTRFS_PREFIX. Reads go through
 * the common getter; writes use btrfs_xattr_handler_set_prop(), which
 * validates the property before setting it.
 */
static const struct xattr_handler btrfs_btrfs_xattr_handler = {
. prefix = XATTR_BTRFS_PREFIX ,
. get = btrfs_xattr_handler_get ,
. set = btrfs_xattr_handler_set_prop ,
} ;
/*
 * NULL-terminated table of the xattr handlers defined above. The POSIX ACL
 * handlers are only included when CONFIG_BTRFS_FS_POSIX_ACL is set.
 */
const struct xattr_handler * btrfs_xattr_handlers [ ] = {
& btrfs_security_xattr_handler ,
# ifdef CONFIG_BTRFS_FS_POSIX_ACL
& posix_acl_access_xattr_handler ,
& posix_acl_default_xattr_handler ,
# endif
& btrfs_trusted_xattr_handler ,
& btrfs_user_xattr_handler ,
& btrfs_btrfs_xattr_handler ,
NULL ,
} ;
2013-04-25 20:41:01 +00:00
static int btrfs_initxattrs ( struct inode * inode ,
2019-03-01 12:34:49 +08:00
const struct xattr * xattr_array , void * fs_private )
2009-02-04 09:29:13 -05:00
{
2019-03-01 12:34:49 +08:00
struct btrfs_trans_handle * trans = fs_private ;
2011-06-06 15:29:25 -04:00
const struct xattr * xattr ;
2018-12-10 17:53:35 +00:00
unsigned int nofs_flag ;
2009-02-04 09:29:13 -05:00
char * name ;
2011-06-06 15:29:25 -04:00
int err = 0 ;
2009-02-04 09:29:13 -05:00
2018-12-10 17:53:35 +00:00
/*
* We ' re holding a transaction handle , so use a NOFS memory allocation
* context to avoid deadlock if reclaim happens .
*/
nofs_flag = memalloc_nofs_save ( ) ;
2011-06-06 15:29:25 -04:00
for ( xattr = xattr_array ; xattr - > name ! = NULL ; xattr + + ) {
name = kmalloc ( XATTR_SECURITY_PREFIX_LEN +
2015-12-03 12:49:48 +01:00
strlen ( xattr - > name ) + 1 , GFP_KERNEL ) ;
2011-06-06 15:29:25 -04:00
if ( ! name ) {
err = - ENOMEM ;
break ;
}
2009-02-04 09:29:13 -05:00
strcpy ( name , XATTR_SECURITY_PREFIX ) ;
2011-06-06 15:29:25 -04:00
strcpy ( name + XATTR_SECURITY_PREFIX_LEN , xattr - > name ) ;
2019-04-12 16:02:58 +08:00
err = btrfs_setxattr ( trans , inode , name , xattr - > value ,
xattr - > value_len , 0 ) ;
2009-02-04 09:29:13 -05:00
kfree ( name ) ;
2011-06-06 15:29:25 -04:00
if ( err < 0 )
break ;
2009-02-04 09:29:13 -05:00
}
2018-12-10 17:53:35 +00:00
memalloc_nofs_restore ( nofs_flag ) ;
2009-02-04 09:29:13 -05:00
return err ;
}
2011-06-06 15:29:25 -04:00
/*
 * Ask the security layer to initialize the security xattrs of @inode, which
 * is being created under @dir with the name @qstr. The resulting xattrs are
 * stored through btrfs_initxattrs(), which receives @trans as its private
 * data.
 */
int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
			      struct inode *inode, struct inode *dir,
			      const struct qstr *qstr)
{
	int ret;

	ret = security_inode_init_security(inode, dir, qstr,
					   &btrfs_initxattrs, trans);
	return ret;
}