eeba1e9cf3
The in-kernel afs filesystem client counts the number of server-level callback invalidation events (CB.InitCallBackState* RPC operations) that it receives from the server. This is stored in cb_s_break in various structures, including afs_server and afs_vnode. If an inode is examined by afs_validate(), say, the afs_server copy is compared, along with other break counters, to those in afs_vnode, and if one or more of the counters do not match, it is considered that the server's callback promise is broken. At points where this happens, AFS_VNODE_CB_PROMISED is cleared to indicate that the status must be refetched from the server. afs_validate() issues an FS.FetchStatus operation to get updated metadata - and based on the updated data_version may invalidate the pagecache too. However, the break counters are also used to determine whether to note a new callback in the vnode (which would set the AFS_VNODE_CB_PROMISED flag) and whether to cache the permit data included in the YFSFetchStatus record by the server. The problem comes when the server sends us a CB.InitCallBackState op. The first such instance doesn't cause cb_s_break to be incremented, but rather causes AFS_SERVER_FL_NEW to be cleared - but thereafter, say some hours after last use and all the volumes have been automatically unmounted and the server has forgotten about the client[*], this *will* likely cause an increment. [*] There are other circumstances too, such as the server restarting or needing to make space in its callback table. Note that the server won't send us a CB.InitCallBackState op until we talk to it again. So what happens is: (1) A mount for a new volume is attempted, an inode is created for the root vnode and vnode->cb_s_break and AFS_VNODE_CB_PROMISED aren't set immediately, as we don't have a nominated server to talk to yet - and we may iterate through a few to find one. 
(2) Before the operation happens, afs_fetch_status(), say, notes in the cursor (fc.cb_break) the break counter sum from the vnode, volume and server counters, but the server->cb_s_break is currently 0. (3) We send FS.FetchStatus to the server. The server sends us back CB.InitCallBackState. We increment server->cb_s_break. (4) Our FS.FetchStatus completes. The reply includes a callback record. (5) xdr_decode_AFSCallBack()/xdr_decode_YFSCallBack() check to see whether the callback promise was broken by checking the break counter sum from step (2) against the current sum. This fails because of step (3), so we don't set the callback record and, importantly, don't set AFS_VNODE_CB_PROMISED on the vnode. This does not preclude the syscall from progressing, and we don't loop here rechecking the status, but rather assume it's good enough for one round only and will need to be rechecked next time. (6) afs_validate() is triggered on the vnode, probably called from d_revalidate() checking the parent directory. (7) afs_validate() notes that AFS_VNODE_CB_PROMISED isn't set, so doesn't update vnode->cb_s_break and assumes the vnode to be invalid. (8) afs_validate() needs to call afs_fetch_status(). Go back to step (2) and repeat, every time the vnode is validated. This primarily affects volume root dir vnodes. Everything subsequent to those inherit an already incremented cb_s_break upon mounting. The issue is that we assume that the callback record and the cached permit information in a reply from the server can't be trusted after getting a server break - but this is wrong since the server makes sure things are done in the right order, holding up our ops if necessary[*]. [*] There is an extremely unlikely scenario where a reply from before the CB.InitCallBackState could get its delivery deferred till after - at which point we think we have a promise when we don't. This, however, requires unlucky mass packet loss to one call. 
AFS_SERVER_FL_NEW tries to paper over the cracks for the initial mount from a server we've never contacted before, but this should be unnecessary. It's also further insulated from the problem on an initial mount by querying the server first with FS.GetCapabilities, which triggers the CB.InitCallBackState. Fix this by (1) Remove AFS_SERVER_FL_NEW. (2) In afs_calc_vnode_cb_break(), don't include cb_s_break in the calculation. (3) In afs_cb_is_broken(), don't include cb_s_break in the check. Signed-off-by: David Howells <dhowells@redhat.com>
338 lines
7.9 KiB
C
338 lines
7.9 KiB
C
/*
|
|
* Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
|
|
*
|
|
* This software may be freely redistributed under the terms of the
|
|
* GNU General Public License.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
*
|
|
* Authors: David Woodhouse <dwmw2@infradead.org>
|
|
* David Howells <dhowells@redhat.com>
|
|
*
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/module.h>
|
|
#include <linux/init.h>
|
|
#include <linux/circ_buf.h>
|
|
#include <linux/sched.h>
|
|
#include "internal.h"
|
|
|
|
/*
 * Create volume and callback interests on a server.
 *
 * Allocate a new afs_cb_interest for @vnode's superblock and link it into
 * @server's list of per-volume interest records (afs_vol_interest), which is
 * kept sorted by ascending volume ID.  If the server already tracks this
 * volume, the existing afs_vol_interest is reused and its usage count bumped;
 * otherwise the freshly allocated one is spliced into the sorted hlist.
 *
 * Returns the new interest record (holding a ref on @server), or NULL on
 * allocation failure.
 */
static struct afs_cb_interest *afs_create_interest(struct afs_server *server,
						   struct afs_vnode *vnode)
{
	struct afs_vol_interest *new_vi, *vi;
	struct afs_cb_interest *new;
	struct hlist_node **pp;

	/* Allocate both records up front so that nothing needs allocating
	 * whilst cb_break_lock is held; new_vi may turn out to be unneeded.
	 */
	new_vi = kzalloc(sizeof(struct afs_vol_interest), GFP_KERNEL);
	if (!new_vi)
		return NULL;

	new = kzalloc(sizeof(struct afs_cb_interest), GFP_KERNEL);
	if (!new) {
		kfree(new_vi);
		return NULL;
	}

	new_vi->usage = 1;
	new_vi->vid = vnode->volume->vid;
	INIT_HLIST_NODE(&new_vi->srv_link);
	INIT_HLIST_HEAD(&new_vi->cb_interests);

	refcount_set(&new->usage, 1);
	new->sb = vnode->vfs_inode.i_sb;
	new->vid = vnode->volume->vid;
	new->server = afs_get_server(server);
	INIT_HLIST_NODE(&new->cb_vlink);

	write_lock(&server->cb_break_lock);

	/* Walk the VID-sorted list looking for this volume, or for the point
	 * at which a record for it should be inserted.
	 */
	for (pp = &server->cb_volumes.first; *pp; pp = &(*pp)->next) {
		vi = hlist_entry(*pp, struct afs_vol_interest, srv_link);
		if (vi->vid < new_vi->vid)
			continue;
		if (vi->vid > new_vi->vid)
			break;
		vi->usage++;
		goto found_vi;
	}

	/* Not found: splice new_vi in at *pp to keep the list sorted.  This
	 * is open-coded as hlist has no insert-before helper.
	 */
	new_vi->srv_link.pprev = pp;
	new_vi->srv_link.next = *pp;
	if (*pp)
		(*pp)->pprev = &new_vi->srv_link.next;
	*pp = &new_vi->srv_link;
	vi = new_vi;
	new_vi = NULL; /* Consumed; don't free below */
found_vi:

	new->vol_interest = vi;
	hlist_add_head(&new->cb_vlink, &vi->cb_interests);

	write_unlock(&server->cb_break_lock);
	kfree(new_vi); /* NULL if it was linked in above */
	return new;
}
|
|
|
|
/*
|
|
* Set up an interest-in-callbacks record for a volume on a server and
|
|
* register it with the server.
|
|
* - Called with vnode->io_lock held.
|
|
*/
|
|
int afs_register_server_cb_interest(struct afs_vnode *vnode,
|
|
struct afs_server_list *slist,
|
|
unsigned int index)
|
|
{
|
|
struct afs_server_entry *entry = &slist->servers[index];
|
|
struct afs_cb_interest *cbi, *vcbi, *new, *old;
|
|
struct afs_server *server = entry->server;
|
|
|
|
again:
|
|
if (vnode->cb_interest &&
|
|
likely(vnode->cb_interest == entry->cb_interest))
|
|
return 0;
|
|
|
|
read_lock(&slist->lock);
|
|
cbi = afs_get_cb_interest(entry->cb_interest);
|
|
read_unlock(&slist->lock);
|
|
|
|
vcbi = vnode->cb_interest;
|
|
if (vcbi) {
|
|
if (vcbi == cbi) {
|
|
afs_put_cb_interest(afs_v2net(vnode), cbi);
|
|
return 0;
|
|
}
|
|
|
|
/* Use a new interest in the server list for the same server
|
|
* rather than an old one that's still attached to a vnode.
|
|
*/
|
|
if (cbi && vcbi->server == cbi->server) {
|
|
write_seqlock(&vnode->cb_lock);
|
|
old = vnode->cb_interest;
|
|
vnode->cb_interest = cbi;
|
|
write_sequnlock(&vnode->cb_lock);
|
|
afs_put_cb_interest(afs_v2net(vnode), old);
|
|
return 0;
|
|
}
|
|
|
|
/* Re-use the one attached to the vnode. */
|
|
if (!cbi && vcbi->server == server) {
|
|
write_lock(&slist->lock);
|
|
if (entry->cb_interest) {
|
|
write_unlock(&slist->lock);
|
|
afs_put_cb_interest(afs_v2net(vnode), cbi);
|
|
goto again;
|
|
}
|
|
|
|
entry->cb_interest = cbi;
|
|
write_unlock(&slist->lock);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
if (!cbi) {
|
|
new = afs_create_interest(server, vnode);
|
|
if (!new)
|
|
return -ENOMEM;
|
|
|
|
write_lock(&slist->lock);
|
|
if (!entry->cb_interest) {
|
|
entry->cb_interest = afs_get_cb_interest(new);
|
|
cbi = new;
|
|
new = NULL;
|
|
} else {
|
|
cbi = afs_get_cb_interest(entry->cb_interest);
|
|
}
|
|
write_unlock(&slist->lock);
|
|
afs_put_cb_interest(afs_v2net(vnode), new);
|
|
}
|
|
|
|
ASSERT(cbi);
|
|
|
|
/* Change the server the vnode is using. This entails scrubbing any
|
|
* interest the vnode had in the previous server it was using.
|
|
*/
|
|
write_seqlock(&vnode->cb_lock);
|
|
|
|
old = vnode->cb_interest;
|
|
vnode->cb_interest = cbi;
|
|
vnode->cb_s_break = cbi->server->cb_s_break;
|
|
vnode->cb_v_break = vnode->volume->cb_v_break;
|
|
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
|
|
|
|
write_sequnlock(&vnode->cb_lock);
|
|
afs_put_cb_interest(afs_v2net(vnode), old);
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Remove an interest on a server.
 *
 * Drop a reference on @cbi; on the final put, unlink it from its volume
 * interest list under the server's cb_break_lock, dropping the volume
 * interest record too if this was its last user, then release the server
 * reference and free the record.  A NULL @cbi is ignored.
 */
void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi)
{
	struct afs_vol_interest *vi;

	if (cbi && refcount_dec_and_test(&cbi->usage)) {
		if (!hlist_unhashed(&cbi->cb_vlink)) {
			write_lock(&cbi->server->cb_break_lock);

			hlist_del_init(&cbi->cb_vlink);
			vi = cbi->vol_interest;
			cbi->vol_interest = NULL;
			/* If this was the last interest in the volume, unlink
			 * the vol_interest too and free it outside the lock;
			 * otherwise clear vi so the kfree below is a no-op.
			 */
			if (--vi->usage == 0)
				hlist_del(&vi->srv_link);
			else
				vi = NULL;

			write_unlock(&cbi->server->cb_break_lock);
			kfree(vi);
			afs_put_server(net, cbi->server);
		}
		kfree(cbi);
	}
}
|
|
|
|
/*
|
|
* allow the fileserver to request callback state (re-)initialisation
|
|
*/
|
|
void afs_init_callback_state(struct afs_server *server)
|
|
{
|
|
server->cb_s_break++;
|
|
}
|
|
|
|
/*
 * Actually break a callback.
 *
 * Caller must hold vnode->cb_lock for writing (see afs_break_callback()).
 * If the vnode held a callback promise, revoke it: bump the vnode-level
 * break counter, discard the cached permits and give any queued lock
 * requests another chance to run.
 */
void __afs_break_callback(struct afs_vnode *vnode)
{
	_enter("");

	clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
	if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
		vnode->cb_break++;
		afs_clear_permits(vnode);

		/* vnode->lock guards the lock lists; it nests inside
		 * cb_lock here.
		 */
		spin_lock(&vnode->lock);

		_debug("break callback");

		/* The break may mean a lock held elsewhere was released, so
		 * poke any pending lock requests if none are granted.
		 */
		if (list_empty(&vnode->granted_locks) &&
		    !list_empty(&vnode->pending_locks))
			afs_lock_may_be_available(vnode);
		spin_unlock(&vnode->lock);
	}
}
|
|
|
|
/*
 * Break a vnode's callback promise, taking the vnode's cb_lock to serialise
 * against other users of the callback state.
 */
void afs_break_callback(struct afs_vnode *vnode)
{
	write_seqlock(&vnode->cb_lock);
	__afs_break_callback(vnode);
	write_sequnlock(&vnode->cb_lock);
}
|
|
|
|
/*
 * allow the fileserver to explicitly break one callback
 * - happens when
 *   - the backing file is changed
 *   - a lock is released
 *
 * A FID with vnode == 0 and unique == 0 denotes a whole-volume break; any
 * other FID breaks just that vnode, if we currently have it in the icache.
 */
static void afs_break_one_callback(struct afs_server *server,
				   struct afs_fid *fid)
{
	struct afs_vol_interest *vi;
	struct afs_cb_interest *cbi;
	struct afs_iget_data data;
	struct afs_vnode *vnode;
	struct inode *inode;

	read_lock(&server->cb_break_lock);

	/* Find the interest record for the volume the FID belongs to; the
	 * list is sorted by ascending volume ID.  vi ends up NULL if the
	 * list is exhausted without a match.
	 */
	hlist_for_each_entry(vi, &server->cb_volumes, srv_link) {
		if (vi->vid < fid->vid)
			continue;
		if (vi->vid > fid->vid) {
			vi = NULL; /* walked past it - not present */
			break;
		}
		//atomic_inc(&vi->usage);
		break;
	}

	/* TODO: Find all matching volumes if we couldn't match the server and
	 * break them anyway.
	 */
	if (!vi)
		goto out;

	/* Step through all interested superblocks.  There may be more than one
	 * because of cell aliasing.
	 */
	hlist_for_each_entry(cbi, &vi->cb_interests, cb_vlink) {
		if (fid->vnode == 0 && fid->unique == 0) {
			/* The callback break applies to an entire volume. */
			struct afs_super_info *as = AFS_FS_S(cbi->sb);
			struct afs_volume *volume = as->volume;

			write_lock(&volume->cb_break_lock);
			volume->cb_v_break++;
			write_unlock(&volume->cb_break_lock);
		} else {
			/* Break the specific vnode if we have it cached;
			 * ilookup5_nowait doesn't instantiate or wait for an
			 * inode that's still being set up.
			 */
			data.volume = NULL;
			data.fid = *fid;
			inode = ilookup5_nowait(cbi->sb, fid->vnode,
						afs_iget5_test, &data);
			if (inode) {
				vnode = AFS_FS_I(inode);
				afs_break_callback(vnode);
				iput(inode);
			}
		}
	}

out:
	read_unlock(&server->cb_break_lock);
}
|
|
|
|
/*
|
|
* allow the fileserver to break callback promises
|
|
*/
|
|
void afs_break_callbacks(struct afs_server *server, size_t count,
|
|
struct afs_callback_break *callbacks)
|
|
{
|
|
_enter("%p,%zu,", server, count);
|
|
|
|
ASSERT(server != NULL);
|
|
ASSERTCMP(count, <=, AFSCBMAX);
|
|
|
|
/* TODO: Sort the callback break list by volume ID */
|
|
|
|
for (; count > 0; callbacks++, count--) {
|
|
_debug("- Fid { vl=%08llx n=%llu u=%u }",
|
|
callbacks->fid.vid,
|
|
callbacks->fid.vnode,
|
|
callbacks->fid.unique);
|
|
afs_break_one_callback(server, &callbacks->fid);
|
|
}
|
|
|
|
_leave("");
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Clear the callback interests in a server list.
|
|
*/
|
|
void afs_clear_callback_interests(struct afs_net *net, struct afs_server_list *slist)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < slist->nr_servers; i++) {
|
|
afs_put_cb_interest(net, slist->servers[i].cb_interest);
|
|
slist->servers[i].cb_interest = NULL;
|
|
}
|
|
}
|