f2f121ab50
Every time we log an inode we lookup in the fs/subvol tree for xattrs and if we have any, log them into the log tree. However it is very common to have inodes without any xattrs, so doing the search wastes times, but more importantly it adds contention on the fs/subvol tree locks, either making the logging code block and wait for tree locks or making the logging code making other concurrent operations block and wait. The most typical use cases where xattrs are used are when capabilities or ACLs are defined for an inode, or when SELinux is enabled. This change makes the logging code detect when an inode does not have xattrs and skip the xattrs search the next time the inode is logged, unless the inode is evicted and loaded again or a xattr is added to the inode. Therefore skipping the search for xattrs on inodes that don't ever have xattrs and are fsynced with some frequency. The following script that calls dbench was used to measure the impact of this change on a VM with 8 CPUs, 16Gb of ram, using a raw NVMe device directly (no intermediary filesystem on the host) and using a non-debug kernel (default configuration on Debian distributions): $ cat test.sh #!/bin/bash DEV=/dev/sdk MNT=/mnt/sdk MOUNT_OPTIONS="-o ssd" mkfs.btrfs -f -m single -d single $DEV mount $MOUNT_OPTIONS $DEV $MNT dbench -D $MNT -t 200 40 umount $MNT The results before this change: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 5761605 0.172 312.057 Close 4232452 0.002 10.927 Rename 243937 1.406 277.344 Unlink 1163456 0.631 298.402 Deltree 160 11.581 221.107 Mkdir 80 0.003 0.005 Qpathinfo 5221410 0.065 122.309 Qfileinfo 915432 0.001 3.333 Qfsinfo 957555 0.003 3.992 Sfileinfo 469244 0.023 20.494 Find 2018865 0.448 123.659 WriteX 2874851 0.049 118.529 ReadX 9030579 0.004 21.654 LockX 18754 0.003 4.423 UnlockX 18754 0.002 0.331 Flush 403792 10.944 359.494 Throughput 908.444 MB/sec 40 clients 40 procs max_latency=359.500 ms The results after this change: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 6442521 0.159 230.693 Close 4732357 0.002 10.972 Rename 272809 1.293 227.398 Unlink 1301059 0.563 218.500 Deltree 160 7.796 54.887 Mkdir 80 0.008 0.478 Qpathinfo 5839452 0.047 124.330 Qfileinfo 1023199 0.001 4.996 Qfsinfo 1070760 0.003 5.709 Sfileinfo 524790 0.033 21.765 Find 2257658 0.314 125.611 WriteX 3211520 0.040 232.135 ReadX 10098969 0.004 25.340 LockX 20974 0.003 1.569 UnlockX 20974 0.002 3.475 Flush 451553 10.287 331.037 Throughput 1011.77 MB/sec 40 clients 40 procs max_latency=331.045 ms +10.8% throughput, -8.2% max latency Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
480 lines
12 KiB
C
480 lines
12 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Copyright (C) 2007 Red Hat. All rights reserved.
|
|
*/
|
|
|
|
#include <linux/init.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/rwsem.h>
|
|
#include <linux/xattr.h>
|
|
#include <linux/security.h>
|
|
#include <linux/posix_acl_xattr.h>
|
|
#include <linux/iversion.h>
|
|
#include <linux/sched/mm.h>
|
|
#include "ctree.h"
|
|
#include "btrfs_inode.h"
|
|
#include "transaction.h"
|
|
#include "xattr.h"
|
|
#include "disk-io.h"
|
|
#include "props.h"
|
|
#include "locking.h"
|
|
|
|
int btrfs_getxattr(struct inode *inode, const char *name,
|
|
void *buffer, size_t size)
|
|
{
|
|
struct btrfs_dir_item *di;
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
struct btrfs_path *path;
|
|
struct extent_buffer *leaf;
|
|
int ret = 0;
|
|
unsigned long data_ptr;
|
|
|
|
path = btrfs_alloc_path();
|
|
if (!path)
|
|
return -ENOMEM;
|
|
|
|
/* lookup the xattr by name */
|
|
di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)),
|
|
name, strlen(name), 0);
|
|
if (!di) {
|
|
ret = -ENODATA;
|
|
goto out;
|
|
} else if (IS_ERR(di)) {
|
|
ret = PTR_ERR(di);
|
|
goto out;
|
|
}
|
|
|
|
leaf = path->nodes[0];
|
|
/* if size is 0, that means we want the size of the attr */
|
|
if (!size) {
|
|
ret = btrfs_dir_data_len(leaf, di);
|
|
goto out;
|
|
}
|
|
|
|
/* now get the data out of our dir_item */
|
|
if (btrfs_dir_data_len(leaf, di) > size) {
|
|
ret = -ERANGE;
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* The way things are packed into the leaf is like this
|
|
* |struct btrfs_dir_item|name|data|
|
|
* where name is the xattr name, so security.foo, and data is the
|
|
* content of the xattr. data_ptr points to the location in memory
|
|
* where the data starts in the in memory leaf
|
|
*/
|
|
data_ptr = (unsigned long)((char *)(di + 1) +
|
|
btrfs_dir_name_len(leaf, di));
|
|
read_extent_buffer(leaf, buffer, data_ptr,
|
|
btrfs_dir_data_len(leaf, di));
|
|
ret = btrfs_dir_data_len(leaf, di);
|
|
|
|
out:
|
|
btrfs_free_path(path);
|
|
return ret;
|
|
}
|
|
|
|
int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
|
|
const char *name, const void *value, size_t size, int flags)
|
|
{
|
|
struct btrfs_dir_item *di = NULL;
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
struct btrfs_path *path;
|
|
size_t name_len = strlen(name);
|
|
int ret = 0;
|
|
|
|
ASSERT(trans);
|
|
|
|
if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info))
|
|
return -ENOSPC;
|
|
|
|
path = btrfs_alloc_path();
|
|
if (!path)
|
|
return -ENOMEM;
|
|
path->skip_release_on_error = 1;
|
|
|
|
if (!value) {
|
|
di = btrfs_lookup_xattr(trans, root, path,
|
|
btrfs_ino(BTRFS_I(inode)), name, name_len, -1);
|
|
if (!di && (flags & XATTR_REPLACE))
|
|
ret = -ENODATA;
|
|
else if (IS_ERR(di))
|
|
ret = PTR_ERR(di);
|
|
else if (di)
|
|
ret = btrfs_delete_one_dir_name(trans, root, path, di);
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* For a replace we can't just do the insert blindly.
|
|
* Do a lookup first (read-only btrfs_search_slot), and return if xattr
|
|
* doesn't exist. If it exists, fall down below to the insert/replace
|
|
* path - we can't race with a concurrent xattr delete, because the VFS
|
|
* locks the inode's i_mutex before calling setxattr or removexattr.
|
|
*/
|
|
if (flags & XATTR_REPLACE) {
|
|
ASSERT(inode_is_locked(inode));
|
|
di = btrfs_lookup_xattr(NULL, root, path,
|
|
btrfs_ino(BTRFS_I(inode)), name, name_len, 0);
|
|
if (!di)
|
|
ret = -ENODATA;
|
|
else if (IS_ERR(di))
|
|
ret = PTR_ERR(di);
|
|
if (ret)
|
|
goto out;
|
|
btrfs_release_path(path);
|
|
di = NULL;
|
|
}
|
|
|
|
ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(BTRFS_I(inode)),
|
|
name, name_len, value, size);
|
|
if (ret == -EOVERFLOW) {
|
|
/*
|
|
* We have an existing item in a leaf, split_leaf couldn't
|
|
* expand it. That item might have or not a dir_item that
|
|
* matches our target xattr, so lets check.
|
|
*/
|
|
ret = 0;
|
|
btrfs_assert_tree_locked(path->nodes[0]);
|
|
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
|
|
if (!di && !(flags & XATTR_REPLACE)) {
|
|
ret = -ENOSPC;
|
|
goto out;
|
|
}
|
|
} else if (ret == -EEXIST) {
|
|
ret = 0;
|
|
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
|
|
ASSERT(di); /* logic error */
|
|
} else if (ret) {
|
|
goto out;
|
|
}
|
|
|
|
if (di && (flags & XATTR_CREATE)) {
|
|
ret = -EEXIST;
|
|
goto out;
|
|
}
|
|
|
|
if (di) {
|
|
/*
|
|
* We're doing a replace, and it must be atomic, that is, at
|
|
* any point in time we have either the old or the new xattr
|
|
* value in the tree. We don't want readers (getxattr and
|
|
* listxattrs) to miss a value, this is specially important
|
|
* for ACLs.
|
|
*/
|
|
const int slot = path->slots[0];
|
|
struct extent_buffer *leaf = path->nodes[0];
|
|
const u16 old_data_len = btrfs_dir_data_len(leaf, di);
|
|
const u32 item_size = btrfs_item_size_nr(leaf, slot);
|
|
const u32 data_size = sizeof(*di) + name_len + size;
|
|
struct btrfs_item *item;
|
|
unsigned long data_ptr;
|
|
char *ptr;
|
|
|
|
if (size > old_data_len) {
|
|
if (btrfs_leaf_free_space(leaf) <
|
|
(size - old_data_len)) {
|
|
ret = -ENOSPC;
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
if (old_data_len + name_len + sizeof(*di) == item_size) {
|
|
/* No other xattrs packed in the same leaf item. */
|
|
if (size > old_data_len)
|
|
btrfs_extend_item(path, size - old_data_len);
|
|
else if (size < old_data_len)
|
|
btrfs_truncate_item(path, data_size, 1);
|
|
} else {
|
|
/* There are other xattrs packed in the same item. */
|
|
ret = btrfs_delete_one_dir_name(trans, root, path, di);
|
|
if (ret)
|
|
goto out;
|
|
btrfs_extend_item(path, data_size);
|
|
}
|
|
|
|
item = btrfs_item_nr(slot);
|
|
ptr = btrfs_item_ptr(leaf, slot, char);
|
|
ptr += btrfs_item_size(leaf, item) - data_size;
|
|
di = (struct btrfs_dir_item *)ptr;
|
|
btrfs_set_dir_data_len(leaf, di, size);
|
|
data_ptr = ((unsigned long)(di + 1)) + name_len;
|
|
write_extent_buffer(leaf, value, data_ptr, size);
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
} else {
|
|
/*
|
|
* Insert, and we had space for the xattr, so path->slots[0] is
|
|
* where our xattr dir_item is and btrfs_insert_xattr_item()
|
|
* filled it.
|
|
*/
|
|
}
|
|
out:
|
|
btrfs_free_path(path);
|
|
if (!ret) {
|
|
set_bit(BTRFS_INODE_COPY_EVERYTHING,
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* @value: "" makes the attribute to empty, NULL removes it
|
|
*/
|
|
int btrfs_setxattr_trans(struct inode *inode, const char *name,
|
|
const void *value, size_t size, int flags)
|
|
{
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
struct btrfs_trans_handle *trans;
|
|
int ret;
|
|
|
|
trans = btrfs_start_transaction(root, 2);
|
|
if (IS_ERR(trans))
|
|
return PTR_ERR(trans);
|
|
|
|
ret = btrfs_setxattr(trans, inode, name, value, size, flags);
|
|
if (ret)
|
|
goto out;
|
|
|
|
inode_inc_iversion(inode);
|
|
inode->i_ctime = current_time(inode);
|
|
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
|
|
BUG_ON(ret);
|
|
out:
|
|
btrfs_end_transaction(trans);
|
|
return ret;
|
|
}
|
|
|
|
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
|
|
{
|
|
struct btrfs_key key;
|
|
struct inode *inode = d_inode(dentry);
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
struct btrfs_path *path;
|
|
int ret = 0;
|
|
size_t total_size = 0, size_left = size;
|
|
|
|
/*
|
|
* ok we want all objects associated with this id.
|
|
* NOTE: we set key.offset = 0; because we want to start with the
|
|
* first xattr that we find and walk forward
|
|
*/
|
|
key.objectid = btrfs_ino(BTRFS_I(inode));
|
|
key.type = BTRFS_XATTR_ITEM_KEY;
|
|
key.offset = 0;
|
|
|
|
path = btrfs_alloc_path();
|
|
if (!path)
|
|
return -ENOMEM;
|
|
path->reada = READA_FORWARD;
|
|
|
|
/* search for our xattrs */
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
if (ret < 0)
|
|
goto err;
|
|
|
|
while (1) {
|
|
struct extent_buffer *leaf;
|
|
int slot;
|
|
struct btrfs_dir_item *di;
|
|
struct btrfs_key found_key;
|
|
u32 item_size;
|
|
u32 cur;
|
|
|
|
leaf = path->nodes[0];
|
|
slot = path->slots[0];
|
|
|
|
/* this is where we start walking through the path */
|
|
if (slot >= btrfs_header_nritems(leaf)) {
|
|
/*
|
|
* if we've reached the last slot in this leaf we need
|
|
* to go to the next leaf and reset everything
|
|
*/
|
|
ret = btrfs_next_leaf(root, path);
|
|
if (ret < 0)
|
|
goto err;
|
|
else if (ret > 0)
|
|
break;
|
|
continue;
|
|
}
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
|
|
|
/* check to make sure this item is what we want */
|
|
if (found_key.objectid != key.objectid)
|
|
break;
|
|
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
|
|
break;
|
|
if (found_key.type < BTRFS_XATTR_ITEM_KEY)
|
|
goto next_item;
|
|
|
|
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
|
|
item_size = btrfs_item_size_nr(leaf, slot);
|
|
cur = 0;
|
|
while (cur < item_size) {
|
|
u16 name_len = btrfs_dir_name_len(leaf, di);
|
|
u16 data_len = btrfs_dir_data_len(leaf, di);
|
|
u32 this_len = sizeof(*di) + name_len + data_len;
|
|
unsigned long name_ptr = (unsigned long)(di + 1);
|
|
|
|
total_size += name_len + 1;
|
|
/*
|
|
* We are just looking for how big our buffer needs to
|
|
* be.
|
|
*/
|
|
if (!size)
|
|
goto next;
|
|
|
|
if (!buffer || (name_len + 1) > size_left) {
|
|
ret = -ERANGE;
|
|
goto err;
|
|
}
|
|
|
|
read_extent_buffer(leaf, buffer, name_ptr, name_len);
|
|
buffer[name_len] = '\0';
|
|
|
|
size_left -= name_len + 1;
|
|
buffer += name_len + 1;
|
|
next:
|
|
cur += this_len;
|
|
di = (struct btrfs_dir_item *)((char *)di + this_len);
|
|
}
|
|
next_item:
|
|
path->slots[0]++;
|
|
}
|
|
ret = total_size;
|
|
|
|
err:
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
|
|
struct dentry *unused, struct inode *inode,
|
|
const char *name, void *buffer, size_t size)
|
|
{
|
|
name = xattr_full_name(handler, name);
|
|
return btrfs_getxattr(inode, name, buffer, size);
|
|
}
|
|
|
|
static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
|
|
struct dentry *unused, struct inode *inode,
|
|
const char *name, const void *buffer,
|
|
size_t size, int flags)
|
|
{
|
|
name = xattr_full_name(handler, name);
|
|
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
|
|
}
|
|
|
|
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
|
|
struct dentry *unused, struct inode *inode,
|
|
const char *name, const void *value,
|
|
size_t size, int flags)
|
|
{
|
|
int ret;
|
|
struct btrfs_trans_handle *trans;
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
name = xattr_full_name(handler, name);
|
|
ret = btrfs_validate_prop(name, value, size);
|
|
if (ret)
|
|
return ret;
|
|
|
|
trans = btrfs_start_transaction(root, 2);
|
|
if (IS_ERR(trans))
|
|
return PTR_ERR(trans);
|
|
|
|
ret = btrfs_set_prop(trans, inode, name, value, size, flags);
|
|
if (!ret) {
|
|
inode_inc_iversion(inode);
|
|
inode->i_ctime = current_time(inode);
|
|
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
|
|
BUG_ON(ret);
|
|
}
|
|
|
|
btrfs_end_transaction(trans);
|
|
|
|
return ret;
|
|
}
|
|
|
|
static const struct xattr_handler btrfs_security_xattr_handler = {
|
|
.prefix = XATTR_SECURITY_PREFIX,
|
|
.get = btrfs_xattr_handler_get,
|
|
.set = btrfs_xattr_handler_set,
|
|
};
|
|
|
|
static const struct xattr_handler btrfs_trusted_xattr_handler = {
|
|
.prefix = XATTR_TRUSTED_PREFIX,
|
|
.get = btrfs_xattr_handler_get,
|
|
.set = btrfs_xattr_handler_set,
|
|
};
|
|
|
|
static const struct xattr_handler btrfs_user_xattr_handler = {
|
|
.prefix = XATTR_USER_PREFIX,
|
|
.get = btrfs_xattr_handler_get,
|
|
.set = btrfs_xattr_handler_set,
|
|
};
|
|
|
|
static const struct xattr_handler btrfs_btrfs_xattr_handler = {
|
|
.prefix = XATTR_BTRFS_PREFIX,
|
|
.get = btrfs_xattr_handler_get,
|
|
.set = btrfs_xattr_handler_set_prop,
|
|
};
|
|
|
|
const struct xattr_handler *btrfs_xattr_handlers[] = {
|
|
&btrfs_security_xattr_handler,
|
|
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
|
|
&posix_acl_access_xattr_handler,
|
|
&posix_acl_default_xattr_handler,
|
|
#endif
|
|
&btrfs_trusted_xattr_handler,
|
|
&btrfs_user_xattr_handler,
|
|
&btrfs_btrfs_xattr_handler,
|
|
NULL,
|
|
};
|
|
|
|
static int btrfs_initxattrs(struct inode *inode,
|
|
const struct xattr *xattr_array, void *fs_private)
|
|
{
|
|
struct btrfs_trans_handle *trans = fs_private;
|
|
const struct xattr *xattr;
|
|
unsigned int nofs_flag;
|
|
char *name;
|
|
int err = 0;
|
|
|
|
/*
|
|
* We're holding a transaction handle, so use a NOFS memory allocation
|
|
* context to avoid deadlock if reclaim happens.
|
|
*/
|
|
nofs_flag = memalloc_nofs_save();
|
|
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
|
|
name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
|
|
strlen(xattr->name) + 1, GFP_KERNEL);
|
|
if (!name) {
|
|
err = -ENOMEM;
|
|
break;
|
|
}
|
|
strcpy(name, XATTR_SECURITY_PREFIX);
|
|
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
|
|
err = btrfs_setxattr(trans, inode, name, xattr->value,
|
|
xattr->value_len, 0);
|
|
kfree(name);
|
|
if (err < 0)
|
|
break;
|
|
}
|
|
memalloc_nofs_restore(nofs_flag);
|
|
return err;
|
|
}
|
|
|
|
int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
|
|
struct inode *inode, struct inode *dir,
|
|
const struct qstr *qstr)
|
|
{
|
|
return security_inode_init_security(inode, dir, qstr,
|
|
&btrfs_initxattrs, trans);
|
|
}
|