linux/fs/afs/dynroot.c

394 lines
8.6 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/* AFS dynamic root handling
*
* Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/dns_resolver.h>
#include "internal.h"
afs: Build an abstraction around an "operation" concept Turn the afs_operation struct into the main way that most fileserver operations are managed. Various things are added to the struct, including the following: (1) All the parameters and results of the relevant operations are moved into it, removing corresponding fields from the afs_call struct. afs_call gets a pointer to the op. (2) The target volume is made the main focus of the operation, rather than the target vnode(s), and a bunch of op->vnode->volume are made op->volume instead. (3) Two vnode records are defined (op->file[]) for the vnode(s) involved in most operations. The vnode record (struct afs_vnode_param) contains: - The vnode pointer. - The fid of the vnode to be included in the parameters or that was returned in the reply (eg. FS.MakeDir). - The status and callback information that may be returned in the reply about the vnode. - Callback break and data version tracking for detecting simultaneous third-parth changes. (4) Pointers to dentries to be updated with new inodes. (5) An operations table pointer. The table includes pointers to functions for issuing AFS and YFS-variant RPCs, handling the success and abort of an operation and handling post-I/O-lock local editing of a directory. To make this work, the following function restructuring is made: (A) The rotation loop that issues calls to fileservers that can be found in each function that wants to issue an RPC (such as afs_mkdir()) is extracted out into common code, in a new file called fs_operation.c. (B) The rotation loops, such as the one in afs_mkdir(), are replaced with a much smaller piece of code that allocates an operation, sets the parameters and then calls out to the common code to do the actual work. (C) The code for handling the success and failure of an operation are moved into operation functions (as (5) above) and these are called from the core code at appropriate times. (D) The pseudo inode getting stuff used by the dynamic root code is moved over into dynroot.c. (E) struct afs_iget_data is absorbed into the operation struct and afs_iget() expects to be given an op pointer and a vnode record. (F) Point (E) doesn't work for the root dir of a volume, but we know the FID in advance (it's always vnode 1, unique 1), so a separate inode getter, afs_root_iget(), is provided to special-case that. (G) The inode status init/update functions now also take an op and a vnode record. (H) The RPC marshalling functions now, for the most part, just take an afs_operation struct as their only argument. All the data they need is held there. The result delivery functions write their answers there as well. (I) The call is attached to the operation and then the operation core does the waiting. And then the new operation code is, for the moment, made to just initialise the operation, get the appropriate vnode I/O locks and do the same rotation loop as before. This lays the foundation for the following changes in the future: (*) Overhauling the rotation (again). (*) Support for asynchronous I/O, where the fileserver rotation must be done asynchronously also. Signed-off-by: David Howells <dhowells@redhat.com>
2020-04-10 20:51:51 +01:00
static atomic_t afs_autocell_ino;
/*
* iget5() comparator for inode created by autocell operations
*
* These pseudo inodes don't match anything.
*/
static int afs_iget5_pseudo_test(struct inode *inode, void *opaque)
{
return 0;
}
/*
* iget5() inode initialiser
*/
static int afs_iget5_pseudo_set(struct inode *inode, void *opaque)
{
struct afs_super_info *as = AFS_FS_S(inode->i_sb);
struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_fid *fid = opaque;
vnode->volume = as->volume;
vnode->fid = *fid;
inode->i_ino = fid->vnode;
inode->i_generation = fid->unique;
return 0;
}
/*
* Create an inode for a dynamic root directory or an autocell dynamic
* automount dir.
*/
struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
{
struct afs_super_info *as = AFS_FS_S(sb);
struct afs_vnode *vnode;
struct inode *inode;
struct afs_fid fid = {};
_enter("");
if (as->volume)
fid.vid = as->volume->vid;
if (root) {
fid.vnode = 1;
fid.unique = 1;
} else {
fid.vnode = atomic_inc_return(&afs_autocell_ino);
fid.unique = 0;
}
inode = iget5_locked(sb, fid.vnode,
afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
if (!inode) {
_leave(" = -ENOMEM");
return ERR_PTR(-ENOMEM);
}
_debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }",
inode, inode->i_ino, fid.vid, fid.vnode, fid.unique);
vnode = AFS_FS_I(inode);
/* there shouldn't be an existing inode */
BUG_ON(!(inode->i_state & I_NEW));
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
if (root) {
inode->i_op = &afs_dynroot_inode_operations;
inode->i_fop = &simple_dir_operations;
} else {
inode->i_op = &afs_autocell_inode_operations;
}
set_nlink(inode, 2);
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode);
inode->i_blocks = 0;
inode->i_generation = 0;
set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
if (!root) {
set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
inode->i_flags |= S_AUTOMOUNT;
}
inode->i_flags |= S_NOATIME;
unlock_new_inode(inode);
_leave(" = %p", inode);
return inode;
}
/*
* Probe to see if a cell may exist. This prevents positive dentries from
* being created unnecessarily.
*/
static int afs_probe_cell_name(struct dentry *dentry)
{
struct afs_cell *cell;
struct afs_net *net = afs_d2net(dentry);
const char *name = dentry->d_name.name;
size_t len = dentry->d_name.len;
int ret;
/* Names prefixed with a dot are R/W mounts. */
if (name[0] == '.') {
if (len == 1)
return -EINVAL;
name++;
len--;
}
afs: Fix rapid cell addition/removal by not using RCU on cells tree There are a number of problems that are being seen by the rapidly mounting and unmounting an afs dynamic root with an explicit cell and volume specified (which should probably be rejected, but that's a separate issue): What the tests are doing is to look up/create a cell record for the name given and then tear it down again without actually using it to try to talk to a server. This is repeated endlessly, very fast, and the new cell collides with the old one if it's not quick enough to reuse it. It appears (as suggested by Hillf Danton) that the search through the RB tree under a read_seqbegin_or_lock() under RCU conditions isn't safe and that it's not blocking the write_seqlock(), despite taking two passes at it. He suggested that the code should take a ref on the cell it's attempting to look at - but this shouldn't be necessary until we've compared the cell names. It's possible that I'm missing a barrier somewhere. However, using an RCU search for this is overkill, really - we only need to access the cell name in a few places, and they're places where we're may end up sleeping anyway. Fix this by switching to an R/W semaphore instead. Additionally, draw the down_read() call inside the function (renamed to afs_find_cell()) since all the callers were taking the RCU read lock (or should've been[*]). [*] afs_probe_cell_name() should have been, but that doesn't appear to be involved in the bug reports. The symptoms of this look like: general protection fault, probably for non-canonical address 0xf27d208691691fdb: 0000 [#1] PREEMPT SMP KASAN KASAN: maybe wild-memory-access in range [0x93e924348b48fed8-0x93e924348b48fedf] ... RIP: 0010:strncasecmp lib/string.c:52 [inline] RIP: 0010:strncasecmp+0x5f/0x240 lib/string.c:43 afs_lookup_cell_rcu+0x313/0x720 fs/afs/cell.c:88 afs_lookup_cell+0x2ee/0x1440 fs/afs/cell.c:249 afs_parse_source fs/afs/super.c:290 [inline] ... Fixes: 989782dcdc91 ("afs: Overhaul cell database management") Reported-by: syzbot+459a5dce0b4cb70fd076@syzkaller.appspotmail.com Signed-off-by: David Howells <dhowells@redhat.com> cc: Hillf Danton <hdanton@sina.com> cc: syzkaller-bugs@googlegroups.com
2020-10-09 14:11:58 +01:00
cell = afs_find_cell(net, name, len);
if (!IS_ERR(cell)) {
afs: Fix cell refcounting by splitting the usage counter Management of the lifetime of afs_cell struct has some problems due to the usage counter being used to determine whether objects of that type are in use in addition to whether anyone might be interested in the structure. This is made trickier by cell objects being cached for a period of time in case they're quickly reused as they hold the result of a setup process that may be slow (DNS lookups, AFS RPC ops). Problems include the cached root volume from alias resolution pinning its parent cell record, rmmod occasionally hanging and occasionally producing assertion failures. Fix this by splitting the count of active users from the struct reference count. Things then work as follows: (1) The cell cache keeps +1 on the cell's activity count and this has to be dropped before the cell can be removed. afs_manage_cell() tries to exchange the 1 to a 0 with the cells_lock write-locked, and if successful, the record is removed from the net->cells. (2) One struct ref is 'owned' by the activity count. That is put when the active count is reduced to 0 (final_destruction label). (3) A ref can be held on a cell whilst it is queued for management on a work queue without confusing the active count. afs_queue_cell() is added to wrap this. (4) The queue's ref is dropped at the end of the management. This is split out into a separate function, afs_manage_cell_work(). (5) The root volume record is put after a cell is removed (at the final_destruction label) rather then in the RCU destruction routine. (6) Volumes hold struct refs, but aren't active users. (7) Both counts are displayed in /proc/net/afs/cells. There are some management function changes: (*) afs_put_cell() now just decrements the refcount and triggers the RCU destruction if it becomes 0. It no longer sets a timer to have the manager do this. (*) afs_use_cell() and afs_unuse_cell() are added to increase and decrease the active count. afs_unuse_cell() sets the management timer. (*) afs_queue_cell() is added to queue a cell with approprate refs. There are also some other fixes: (*) Don't let /proc/net/afs/cells access a cell's vllist if it's NULL. (*) Make sure that candidate cells in lookups are properly destroyed rather than being simply kfree'd. This ensures the bits it points to are destroyed also. (*) afs_dec_cells_outstanding() is now called in cell destruction rather than at "final_destruction". This ensures that cell->net is still valid to the end of the destructor. (*) As a consequence of the previous two changes, move the increment of net->cells_outstanding that was at the point of insertion into the tree to the allocation routine to correctly balance things. Fixes: 989782dcdc91 ("afs: Overhaul cell database management") Signed-off-by: David Howells <dhowells@redhat.com>
2019-07-23 11:24:59 +01:00
afs_unuse_cell(net, cell);
return 0;
}
ret = dns_query(net->net, "afsdb", name, len, "srv=1",
NULL, NULL, false);
if (ret == -ENODATA)
ret = -EDESTADDRREQ;
return ret;
}
/*
* Try to auto mount the mountpoint with pseudo directory, if the autocell
* operation is setted.
*/
struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir)
{
struct afs_vnode *vnode = AFS_FS_I(dir);
struct inode *inode;
int ret = -ENOENT;
_enter("%p{%pd}, {%llx:%llu}",
dentry, dentry, vnode->fid.vid, vnode->fid.vnode);
if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
goto out;
ret = afs_probe_cell_name(dentry);
if (ret < 0)
goto out;
inode = afs_iget_pseudo_dir(dir->i_sb, false);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto out;
}
_leave("= %p", inode);
return inode;
out:
_leave("= %d", ret);
return ret == -ENOENT ? NULL : ERR_PTR(ret);
}
/*
* Look up @cell in a dynroot directory. This is a substitution for the
* local cell name for the net namespace.
*/
static struct dentry *afs_lookup_atcell(struct dentry *dentry)
{
struct afs_cell *cell;
struct afs_net *net = afs_d2net(dentry);
struct dentry *ret;
char *name;
int len;
if (!net->ws_cell)
return ERR_PTR(-ENOENT);
ret = ERR_PTR(-ENOMEM);
name = kmalloc(AFS_MAXCELLNAME + 1, GFP_KERNEL);
if (!name)
goto out_p;
afs: Fix rapid cell addition/removal by not using RCU on cells tree There are a number of problems that are being seen by the rapidly mounting and unmounting an afs dynamic root with an explicit cell and volume specified (which should probably be rejected, but that's a separate issue): What the tests are doing is to look up/create a cell record for the name given and then tear it down again without actually using it to try to talk to a server. This is repeated endlessly, very fast, and the new cell collides with the old one if it's not quick enough to reuse it. It appears (as suggested by Hillf Danton) that the search through the RB tree under a read_seqbegin_or_lock() under RCU conditions isn't safe and that it's not blocking the write_seqlock(), despite taking two passes at it. He suggested that the code should take a ref on the cell it's attempting to look at - but this shouldn't be necessary until we've compared the cell names. It's possible that I'm missing a barrier somewhere. However, using an RCU search for this is overkill, really - we only need to access the cell name in a few places, and they're places where we're may end up sleeping anyway. Fix this by switching to an R/W semaphore instead. Additionally, draw the down_read() call inside the function (renamed to afs_find_cell()) since all the callers were taking the RCU read lock (or should've been[*]). [*] afs_probe_cell_name() should have been, but that doesn't appear to be involved in the bug reports. The symptoms of this look like: general protection fault, probably for non-canonical address 0xf27d208691691fdb: 0000 [#1] PREEMPT SMP KASAN KASAN: maybe wild-memory-access in range [0x93e924348b48fed8-0x93e924348b48fedf] ... RIP: 0010:strncasecmp lib/string.c:52 [inline] RIP: 0010:strncasecmp+0x5f/0x240 lib/string.c:43 afs_lookup_cell_rcu+0x313/0x720 fs/afs/cell.c:88 afs_lookup_cell+0x2ee/0x1440 fs/afs/cell.c:249 afs_parse_source fs/afs/super.c:290 [inline] ... Fixes: 989782dcdc91 ("afs: Overhaul cell database management") Reported-by: syzbot+459a5dce0b4cb70fd076@syzkaller.appspotmail.com Signed-off-by: David Howells <dhowells@redhat.com> cc: Hillf Danton <hdanton@sina.com> cc: syzkaller-bugs@googlegroups.com
2020-10-09 14:11:58 +01:00
down_read(&net->cells_lock);
cell = net->ws_cell;
if (cell) {
len = cell->name_len;
memcpy(name, cell->name, len + 1);
}
up_read(&net->cells_lock);
ret = ERR_PTR(-ENOENT);
if (!cell)
goto out_n;
ret = lookup_one_len(name, dentry->d_parent, len);
/* We don't want to d_add() the @cell dentry here as we don't want to
* the cached dentry to hide changes to the local cell name.
*/
out_n:
kfree(name);
out_p:
return ret;
}
/*
* Look up an entry in a dynroot directory.
*/
static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
_enter("%pd", dentry);
ASSERTCMP(d_inode(dentry), ==, NULL);
if (flags & LOOKUP_CREATE)
return ERR_PTR(-EOPNOTSUPP);
if (dentry->d_name.len >= AFSNAMEMAX) {
_leave(" = -ENAMETOOLONG");
return ERR_PTR(-ENAMETOOLONG);
}
if (dentry->d_name.len == 5 &&
memcmp(dentry->d_name.name, "@cell", 5) == 0)
return afs_lookup_atcell(dentry);
return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry);
}
const struct inode_operations afs_dynroot_inode_operations = {
.lookup = afs_dynroot_lookup,
};
/*
* Dirs in the dynamic root don't need revalidation.
*/
static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags)
{
return 1;
}
/*
* Allow the VFS to enquire as to whether a dentry should be unhashed (mustn't
* sleep)
* - called from dput() when d_count is going to 0.
* - return 1 to request dentry be unhashed, 0 otherwise
*/
static int afs_dynroot_d_delete(const struct dentry *dentry)
{
return d_really_is_positive(dentry);
}
const struct dentry_operations afs_dynroot_dentry_operations = {
.d_revalidate = afs_dynroot_d_revalidate,
.d_delete = afs_dynroot_d_delete,
.d_release = afs_d_release,
.d_automount = afs_d_automount,
};
/*
* Create a manually added cell mount directory.
* - The caller must hold net->proc_cells_lock
*/
int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell)
{
struct super_block *sb = net->dynroot_sb;
struct dentry *root, *subdir;
int ret;
if (!sb || atomic_read(&sb->s_active) == 0)
return 0;
/* Let the ->lookup op do the creation */
root = sb->s_root;
inode_lock(root->d_inode);
subdir = lookup_one_len(cell->name, root, cell->name_len);
if (IS_ERR(subdir)) {
ret = PTR_ERR(subdir);
goto unlock;
}
/* Note that we're retaining an extra ref on the dentry */
subdir->d_fsdata = (void *)1UL;
ret = 0;
unlock:
inode_unlock(root->d_inode);
return ret;
}
/*
* Remove a manually added cell mount directory.
* - The caller must hold net->proc_cells_lock
*/
void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell)
{
struct super_block *sb = net->dynroot_sb;
struct dentry *root, *subdir;
if (!sb || atomic_read(&sb->s_active) == 0)
return;
root = sb->s_root;
inode_lock(root->d_inode);
/* Don't want to trigger a lookup call, which will re-add the cell */
subdir = try_lookup_one_len(cell->name, root, cell->name_len);
if (IS_ERR_OR_NULL(subdir)) {
_debug("lookup %ld", PTR_ERR(subdir));
goto no_dentry;
}
_debug("rmdir %pd %u", subdir, d_count(subdir));
if (subdir->d_fsdata) {
_debug("unpin %u", d_count(subdir));
subdir->d_fsdata = NULL;
dput(subdir);
}
dput(subdir);
no_dentry:
inode_unlock(root->d_inode);
_leave("");
}
/*
* Populate a newly created dynamic root with cell names.
*/
int afs_dynroot_populate(struct super_block *sb)
{
struct afs_cell *cell;
struct afs_net *net = afs_sb2net(sb);
int ret;
mutex_lock(&net->proc_cells_lock);
net->dynroot_sb = sb;
hlist_for_each_entry(cell, &net->proc_cells, proc_link) {
ret = afs_dynroot_mkdir(net, cell);
if (ret < 0)
goto error;
}
ret = 0;
out:
mutex_unlock(&net->proc_cells_lock);
return ret;
error:
net->dynroot_sb = NULL;
goto out;
}
/*
* When a dynamic root that's in the process of being destroyed, depopulate it
* of pinned directories.
*/
void afs_dynroot_depopulate(struct super_block *sb)
{
struct afs_net *net = afs_sb2net(sb);
struct dentry *root = sb->s_root, *subdir, *tmp;
/* Prevent more subdirs from being created */
mutex_lock(&net->proc_cells_lock);
if (net->dynroot_sb == sb)
net->dynroot_sb = NULL;
mutex_unlock(&net->proc_cells_lock);
afs: Fix NULL deref in afs_dynroot_depopulate() If an error occurs during the construction of an afs superblock, it's possible that an error occurs after a superblock is created, but before we've created the root dentry. If the superblock has a dynamic root (ie. what's normally mounted on /afs), the afs_kill_super() will call afs_dynroot_depopulate() to unpin any created dentries - but this will oops if the root hasn't been created yet. Fix this by skipping that bit of code if there is no root dentry. This leads to an oops looking like: general protection fault, ... KASAN: null-ptr-deref in range [0x0000000000000068-0x000000000000006f] ... RIP: 0010:afs_dynroot_depopulate+0x25f/0x529 fs/afs/dynroot.c:385 ... Call Trace: afs_kill_super+0x13b/0x180 fs/afs/super.c:535 deactivate_locked_super+0x94/0x160 fs/super.c:335 afs_get_tree+0x1124/0x1460 fs/afs/super.c:598 vfs_get_tree+0x89/0x2f0 fs/super.c:1547 do_new_mount fs/namespace.c:2875 [inline] path_mount+0x1387/0x2070 fs/namespace.c:3192 do_mount fs/namespace.c:3205 [inline] __do_sys_mount fs/namespace.c:3413 [inline] __se_sys_mount fs/namespace.c:3390 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3390 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 which is oopsing on this line: inode_lock(root->d_inode); presumably because sb->s_root was NULL. Fixes: 0da0b7fd73e4 ("afs: Display manually added cells in dynamic root mount") Reported-by: syzbot+c1eff8205244ae7e11a6@syzkaller.appspotmail.com Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-21 10:15:12 +01:00
if (root) {
inode_lock(root->d_inode);
/* Remove all the pins for dirs created for manually added cells */
list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) {
if (subdir->d_fsdata) {
subdir->d_fsdata = NULL;
dput(subdir);
}
}
afs: Fix NULL deref in afs_dynroot_depopulate() If an error occurs during the construction of an afs superblock, it's possible that an error occurs after a superblock is created, but before we've created the root dentry. If the superblock has a dynamic root (ie. what's normally mounted on /afs), the afs_kill_super() will call afs_dynroot_depopulate() to unpin any created dentries - but this will oops if the root hasn't been created yet. Fix this by skipping that bit of code if there is no root dentry. This leads to an oops looking like: general protection fault, ... KASAN: null-ptr-deref in range [0x0000000000000068-0x000000000000006f] ... RIP: 0010:afs_dynroot_depopulate+0x25f/0x529 fs/afs/dynroot.c:385 ... Call Trace: afs_kill_super+0x13b/0x180 fs/afs/super.c:535 deactivate_locked_super+0x94/0x160 fs/super.c:335 afs_get_tree+0x1124/0x1460 fs/afs/super.c:598 vfs_get_tree+0x89/0x2f0 fs/super.c:1547 do_new_mount fs/namespace.c:2875 [inline] path_mount+0x1387/0x2070 fs/namespace.c:3192 do_mount fs/namespace.c:3205 [inline] __do_sys_mount fs/namespace.c:3413 [inline] __se_sys_mount fs/namespace.c:3390 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3390 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 which is oopsing on this line: inode_lock(root->d_inode); presumably because sb->s_root was NULL. Fixes: 0da0b7fd73e4 ("afs: Display manually added cells in dynamic root mount") Reported-by: syzbot+c1eff8205244ae7e11a6@syzkaller.appspotmail.com Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-21 10:15:12 +01:00
inode_unlock(root->d_inode);
}
}