9caf696142
Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple kernfs
operations. On a large system with a few hundred CPUs and a few hundred
applications simultaneously trying to access sysfs, this results in multiple
sys_open(s) contending on kernfs_rwsem via kernfs_iop_permission and
kernfs_dop_revalidate.

For example, on a system with 384 cores, if I run 200 instances of an
application which is mostly executing the following loop:

  for (int loop = 0; loop < 100; loop++) {
    for (int port_num = 1; port_num < 2; port_num++) {
      for (int gid_index = 0; gid_index < 254; gid_index++) {
        char ret_buf[64], ret_buf_lo[64];
        char gid_file_path[1024];
        int ret_len;
        int ret_fd;
        ssize_t ret_rd;
        ub4 i, saved_errno;

        memset(ret_buf, 0, sizeof(ret_buf));
        memset(gid_file_path, 0, sizeof(gid_file_path));

        ret_len = snprintf(gid_file_path, sizeof(gid_file_path),
                           "/sys/class/infiniband/%s/ports/%d/gids/%d",
                           dev_name, port_num, gid_index);

        ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC);
        if (ret_fd < 0) {
          printf("Failed to open %s\n", gid_file_path);
          continue;
        }

        /* Read the GID */
        ret_rd = read(ret_fd, ret_buf, 40);
        if (ret_rd == -1) {
          printf("Failed to read from file %s, errno: %u\n",
                 gid_file_path, saved_errno);
          continue;
        }

        close(ret_fd);
      }
    }
  }

I see contention around kernfs_rwsem as follows:

 path_openat
 |
 |----link_path_walk.part.0.constprop.0
 |    |
 |    |--49.92%--inode_permission
 |    |          |
 |    |           --48.69%--kernfs_iop_permission
 |    |                     |
 |    |                     |--18.16%--down_read
 |    |                     |
 |    |                     |--15.38%--up_read
 |    |                     |
 |    |                      --14.58%--_raw_spin_lock
 |    |
 |    |--29.08%--walk_component
 |    |          |
 |    |           --29.02%--lookup_fast
 |    |                     |
 |    |                     |--24.26%--kernfs_dop_revalidate
 |    |                     |          |
 |    |                     |          |--14.97%--down_read
 |    |                     |          |
 |    |                     |           --9.01%--up_read
 |    |                     |
 |    |                      --4.74%--__d_lookup
 |    |                                |
 |    |                                 --4.64%--_raw_spin_lock

Having a separate per-fs rwsem to protect kernfs inode attributes avoids the
above-mentioned contention and results in better performance, as can be seen
below:

 path_openat
 |
 |----link_path_walk.part.0.constprop.0
 |    |
 |    |--27.06%--inode_permission
 |    |          |
 |    |           --25.84%--kernfs_iop_permission
 |    |                     |
 |    |                     |--9.29%--up_read
 |    |                     |
 |    |                     |--8.19%--down_read
 |    |                     |
 |    |                      --7.89%--_raw_spin_lock
 |    |
 |    |--22.42%--walk_component
 |    |          |
 |    |           --22.36%--lookup_fast
 |    |                     |
 |    |                     |--16.07%--__d_lookup
 |    |                     |          |
 |    |                     |           --16.01%--_raw_spin_lock
 |    |                     |
 |    |                      --6.28%--kernfs_dop_revalidate
 |    |                                |
 |    |                                |--3.76%--down_read
 |    |                                |
 |    |                                 --2.26%--up_read

As can be seen from the above data, the overhead due to both
kernfs_iop_permission and kernfs_dop_revalidate has gone down, which also
reduces the overall run time of the loop mentioned earlier.

Signed-off-by: Imran Khan <imran.f.khan@oracle.com>
Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
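The file below implements this by taking a dedicated root->kernfs_iattr_rwsem around inode-attribute work (kernfs_iop_permission(), kernfs_iop_getattr(), kernfs_setattr()) rather than the coarse kernfs_rwsem. As a rough, purely illustrative userspace sketch of that locking split (not kernel code; the names fs_root, fs_permission, fs_setattr and fs_rename are invented for this example):

/*
 * Illustrative analogue of splitting one coarse rwsem into a dedicated
 * attribute lock plus the original lock for everything else.
 */
#include <pthread.h>
#include <stdio.h>

struct fs_root {
        pthread_rwlock_t rwsem;        /* coarse lock: topology changes etc. */
        pthread_rwlock_t iattr_rwsem;  /* dedicated lock: inode attributes */
        int mode;                      /* attribute guarded by iattr_rwsem */
};

/* Permission check: read-locks only the attribute lock. */
static int fs_permission(struct fs_root *root, int mask)
{
        int ok;

        pthread_rwlock_rdlock(&root->iattr_rwsem);
        ok = (root->mode & mask) == mask;
        pthread_rwlock_unlock(&root->iattr_rwsem);
        return ok;
}

/* Attribute update: write-locks the attribute lock. */
static void fs_setattr(struct fs_root *root, int mode)
{
        pthread_rwlock_wrlock(&root->iattr_rwsem);
        root->mode = mode;
        pthread_rwlock_unlock(&root->iattr_rwsem);
}

/* Topology change: still serialized by the coarse lock, but concurrent
 * fs_permission() callers no longer queue up behind it. */
static void fs_rename(struct fs_root *root)
{
        pthread_rwlock_wrlock(&root->rwsem);
        /* ... modify directory structure ... */
        pthread_rwlock_unlock(&root->rwsem);
}

int main(void)
{
        struct fs_root root = { .mode = 0444 };

        pthread_rwlock_init(&root.rwsem, NULL);
        pthread_rwlock_init(&root.iattr_rwsem, NULL);

        fs_rename(&root);
        fs_setattr(&root, 0644);
        printf("readable: %d\n", fs_permission(&root, 0444));

        pthread_rwlock_destroy(&root.rwsem);
        pthread_rwlock_destroy(&root.iattr_rwsem);
        return 0;
}

With the split, permission and revalidation readers only serialize against attribute updates, not against every other operation holding the coarse lock, which is where the reduced down_read/up_read overhead in the second profile comes from.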
443 lines · 11 KiB · C
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/inode.c - kernfs inode implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include <linux/security.h>

#include "kernfs-internal.h"

static const struct inode_operations kernfs_iops = {
        .permission     = kernfs_iop_permission,
        .setattr        = kernfs_iop_setattr,
        .getattr        = kernfs_iop_getattr,
        .listxattr      = kernfs_iop_listxattr,
};

static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
{
        static DEFINE_MUTEX(iattr_mutex);
        struct kernfs_iattrs *ret;

        mutex_lock(&iattr_mutex);

        if (kn->iattr || !alloc)
                goto out_unlock;

        kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
        if (!kn->iattr)
                goto out_unlock;

        /* assign default attributes */
        kn->iattr->ia_uid = GLOBAL_ROOT_UID;
        kn->iattr->ia_gid = GLOBAL_ROOT_GID;

        ktime_get_real_ts64(&kn->iattr->ia_atime);
        kn->iattr->ia_mtime = kn->iattr->ia_atime;
        kn->iattr->ia_ctime = kn->iattr->ia_atime;

        simple_xattrs_init(&kn->iattr->xattrs);
        atomic_set(&kn->iattr->nr_user_xattrs, 0);
        atomic_set(&kn->iattr->user_xattr_size, 0);
out_unlock:
        ret = kn->iattr;
        mutex_unlock(&iattr_mutex);
        return ret;
}

static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
{
        return __kernfs_iattrs(kn, 1);
}

static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn)
{
        return __kernfs_iattrs(kn, 0);
}

int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
        struct kernfs_iattrs *attrs;
        unsigned int ia_valid = iattr->ia_valid;

        attrs = kernfs_iattrs(kn);
        if (!attrs)
                return -ENOMEM;

        if (ia_valid & ATTR_UID)
                attrs->ia_uid = iattr->ia_uid;
        if (ia_valid & ATTR_GID)
                attrs->ia_gid = iattr->ia_gid;
        if (ia_valid & ATTR_ATIME)
                attrs->ia_atime = iattr->ia_atime;
        if (ia_valid & ATTR_MTIME)
                attrs->ia_mtime = iattr->ia_mtime;
        if (ia_valid & ATTR_CTIME)
                attrs->ia_ctime = iattr->ia_ctime;
        if (ia_valid & ATTR_MODE)
                kn->mode = iattr->ia_mode;
        return 0;
}

/**
 * kernfs_setattr - set iattr on a node
 * @kn: target node
 * @iattr: iattr to set
 *
 * Return: %0 on success, -errno on failure.
 */
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
        int ret;
        struct kernfs_root *root = kernfs_root(kn);

        down_write(&root->kernfs_iattr_rwsem);
        ret = __kernfs_setattr(kn, iattr);
        up_write(&root->kernfs_iattr_rwsem);
        return ret;
}

int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                       struct iattr *iattr)
{
        struct inode *inode = d_inode(dentry);
        struct kernfs_node *kn = inode->i_private;
        struct kernfs_root *root;
        int error;

        if (!kn)
                return -EINVAL;

        root = kernfs_root(kn);
        down_write(&root->kernfs_iattr_rwsem);
        error = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
        if (error)
                goto out;

        error = __kernfs_setattr(kn, iattr);
        if (error)
                goto out;

        /* this ignores size changes */
        setattr_copy(&nop_mnt_idmap, inode, iattr);

out:
        up_write(&root->kernfs_iattr_rwsem);
        return error;
}

ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
{
        struct kernfs_node *kn = kernfs_dentry_node(dentry);
        struct kernfs_iattrs *attrs;

        attrs = kernfs_iattrs(kn);
        if (!attrs)
                return -ENOMEM;

        return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
}

static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
{
        inode->i_mode = mode;
        inode->i_atime = inode->i_mtime =
                inode->i_ctime = current_time(inode);
}

static inline void set_inode_attr(struct inode *inode,
                                  struct kernfs_iattrs *attrs)
{
        inode->i_uid = attrs->ia_uid;
        inode->i_gid = attrs->ia_gid;
        inode->i_atime = attrs->ia_atime;
        inode->i_mtime = attrs->ia_mtime;
        inode->i_ctime = attrs->ia_ctime;
}

static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
{
        struct kernfs_iattrs *attrs = kn->iattr;

        inode->i_mode = kn->mode;
        if (attrs)
                /*
                 * kernfs_node has non-default attributes get them from
                 * persistent copy in kernfs_node.
                 */
                set_inode_attr(inode, attrs);

        if (kernfs_type(kn) == KERNFS_DIR)
                set_nlink(inode, kn->dir.subdirs + 2);
}

int kernfs_iop_getattr(struct mnt_idmap *idmap,
                       const struct path *path, struct kstat *stat,
                       u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct kernfs_node *kn = inode->i_private;
        struct kernfs_root *root = kernfs_root(kn);

        down_read(&root->kernfs_iattr_rwsem);
        kernfs_refresh_inode(kn, inode);
        generic_fillattr(&nop_mnt_idmap, inode, stat);
        up_read(&root->kernfs_iattr_rwsem);

        return 0;
}

static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
{
        kernfs_get(kn);
        inode->i_private = kn;
        inode->i_mapping->a_ops = &ram_aops;
        inode->i_op = &kernfs_iops;
        inode->i_generation = kernfs_gen(kn);

        set_default_inode_attr(inode, kn->mode);
        kernfs_refresh_inode(kn, inode);

        /* initialize inode according to type */
        switch (kernfs_type(kn)) {
        case KERNFS_DIR:
                inode->i_op = &kernfs_dir_iops;
                inode->i_fop = &kernfs_dir_fops;
                if (kn->flags & KERNFS_EMPTY_DIR)
                        make_empty_dir_inode(inode);
                break;
        case KERNFS_FILE:
                inode->i_size = kn->attr.size;
                inode->i_fop = &kernfs_file_fops;
                break;
        case KERNFS_LINK:
                inode->i_op = &kernfs_symlink_iops;
                break;
        default:
                BUG();
        }

        unlock_new_inode(inode);
}

/**
 * kernfs_get_inode - get inode for kernfs_node
 * @sb: super block
 * @kn: kernfs_node to allocate inode for
 *
 * Get inode for @kn. If such inode doesn't exist, a new inode is
 * allocated and basics are initialized. New inode is returned
 * locked.
 *
 * Locking:
 * Kernel thread context (may sleep).
 *
 * Return:
 * Pointer to allocated inode on success, %NULL on failure.
 */
struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
{
        struct inode *inode;

        inode = iget_locked(sb, kernfs_ino(kn));
        if (inode && (inode->i_state & I_NEW))
                kernfs_init_inode(kn, inode);

        return inode;
}

/*
 * The kernfs_node serves as both an inode and a directory entry for
 * kernfs. To prevent the kernfs inode numbers from being freed
 * prematurely we take a reference to kernfs_node from the kernfs inode. A
 * super_operations.evict_inode() implementation is needed to drop that
 * reference upon inode destruction.
 */
void kernfs_evict_inode(struct inode *inode)
{
        struct kernfs_node *kn = inode->i_private;

        truncate_inode_pages_final(&inode->i_data);
        clear_inode(inode);
        kernfs_put(kn);
}

int kernfs_iop_permission(struct mnt_idmap *idmap,
                          struct inode *inode, int mask)
{
        struct kernfs_node *kn;
        struct kernfs_root *root;
        int ret;

        if (mask & MAY_NOT_BLOCK)
                return -ECHILD;

        kn = inode->i_private;
        root = kernfs_root(kn);

        down_read(&root->kernfs_iattr_rwsem);
        kernfs_refresh_inode(kn, inode);
        ret = generic_permission(&nop_mnt_idmap, inode, mask);
        up_read(&root->kernfs_iattr_rwsem);

        return ret;
}

int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
                     void *value, size_t size)
{
        struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn);
        if (!attrs)
                return -ENODATA;

        return simple_xattr_get(&attrs->xattrs, name, value, size);
}

int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
                     const void *value, size_t size, int flags)
{
        struct kernfs_iattrs *attrs = kernfs_iattrs(kn);
        if (!attrs)
                return -ENOMEM;

        return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL);
}

static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
                                struct dentry *unused, struct inode *inode,
                                const char *suffix, void *value, size_t size)
{
        const char *name = xattr_full_name(handler, suffix);
        struct kernfs_node *kn = inode->i_private;

        return kernfs_xattr_get(kn, name, value, size);
}

static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
                                struct mnt_idmap *idmap,
                                struct dentry *unused, struct inode *inode,
                                const char *suffix, const void *value,
                                size_t size, int flags)
{
        const char *name = xattr_full_name(handler, suffix);
        struct kernfs_node *kn = inode->i_private;

        return kernfs_xattr_set(kn, name, value, size, flags);
}

static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
                                     const char *full_name,
                                     struct simple_xattrs *xattrs,
                                     const void *value, size_t size, int flags)
{
        atomic_t *sz = &kn->iattr->user_xattr_size;
        atomic_t *nr = &kn->iattr->nr_user_xattrs;
        ssize_t removed_size;
        int ret;

        if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) {
                ret = -ENOSPC;
                goto dec_count_out;
        }

        if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) {
                ret = -ENOSPC;
                goto dec_size_out;
        }

        ret = simple_xattr_set(xattrs, full_name, value, size, flags,
                               &removed_size);

        if (!ret && removed_size >= 0)
                size = removed_size;
        else if (!ret)
                return 0;
dec_size_out:
        atomic_sub(size, sz);
dec_count_out:
        atomic_dec(nr);
        return ret;
}

static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
                                    const char *full_name,
                                    struct simple_xattrs *xattrs,
                                    const void *value, size_t size, int flags)
{
        atomic_t *sz = &kn->iattr->user_xattr_size;
        atomic_t *nr = &kn->iattr->nr_user_xattrs;
        ssize_t removed_size;
        int ret;

        ret = simple_xattr_set(xattrs, full_name, value, size, flags,
                               &removed_size);

        if (removed_size >= 0) {
                atomic_sub(removed_size, sz);
                atomic_dec(nr);
        }

        return ret;
}

static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
                                     struct mnt_idmap *idmap,
                                     struct dentry *unused, struct inode *inode,
                                     const char *suffix, const void *value,
                                     size_t size, int flags)
{
        const char *full_name = xattr_full_name(handler, suffix);
        struct kernfs_node *kn = inode->i_private;
        struct kernfs_iattrs *attrs;

        if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR))
                return -EOPNOTSUPP;

        attrs = kernfs_iattrs(kn);
        if (!attrs)
                return -ENOMEM;

        if (value)
                return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs,
                                                 value, size, flags);
        else
                return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs,
                                                value, size, flags);

}

static const struct xattr_handler kernfs_trusted_xattr_handler = {
        .prefix = XATTR_TRUSTED_PREFIX,
        .get = kernfs_vfs_xattr_get,
        .set = kernfs_vfs_xattr_set,
};

static const struct xattr_handler kernfs_security_xattr_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
        .get = kernfs_vfs_xattr_get,
        .set = kernfs_vfs_xattr_set,
};

static const struct xattr_handler kernfs_user_xattr_handler = {
        .prefix = XATTR_USER_PREFIX,
        .get = kernfs_vfs_xattr_get,
        .set = kernfs_vfs_user_xattr_set,
};

const struct xattr_handler *kernfs_xattr_handlers[] = {
        &kernfs_trusted_xattr_handler,
        &kernfs_security_xattr_handler,
        &kernfs_user_xattr_handler,
        NULL
};