495f2ae9e3
Fix the fileserver rotation so that it doesn't use RTT as the basis for deciding which server and address to use as this doesn't necessarily give a good indication of the best path. Instead, use the configurable preference list in conjunction with whatever probes have succeeded at the time of looking. To this end, make the following changes: (1) Keep an array of "server states" to track what addresses we've tried on each server and move the waitqueue entries there that we'll need for probing. (2) Each afs_server_state struct is made to pin the corresponding server's endpoint state rather than the afs_operation struct carrying a pin on the server we're currently looking at. (3) Drop the server list preference; we now always rescan the server list. (4) afs_wait_for_probes() now uses the server state list to guide it in what it waits for (and to provide the waitqueue entries) and returns an indication of whether we'd got a response, run out of responsive addresses or the endpoint state had been superseded and we need to restart the iteration. (5) Call afs_get_address_preferences*() occasionally to refresh the preference values. (6) When picking a server, scan the addresses of the servers for which we have as-yet untested communications, looking for the highest priority one and use that instead of trying all the addresses for a particular server in ascending-RTT order. (7) When a Busy or Offline state is seen across all available servers, do a short sleep. (8) If we detect that we accessed a future RO volume version whilst it is undergoing replication, reissue the op against the older version until at least half of the servers are replicated. (9) Whilst RO replication is ongoing, increase the frequency of Volume Location server checks for that volume to every ten minutes instead of hourly. Also add a tracepoint to track progress through the rotation algorithm. 
Signed-off-by: David Howells <dhowells@redhat.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: linux-afs@lists.infradead.org
279 lines
6.4 KiB
C
279 lines
6.4 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/* Fileserver-directed operation handling.
|
|
*
|
|
* Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
|
|
* Written by David Howells (dhowells@redhat.com)
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/fs.h>
|
|
#include "internal.h"
|
|
|
|
static atomic_t afs_operation_debug_counter;
|
|
|
|
/*
|
|
* Create an operation against a volume.
|
|
*/
|
|
struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *volume)
|
|
{
|
|
struct afs_operation *op;
|
|
|
|
_enter("");
|
|
|
|
op = kzalloc(sizeof(*op), GFP_KERNEL);
|
|
if (!op)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
if (!key) {
|
|
key = afs_request_key(volume->cell);
|
|
if (IS_ERR(key)) {
|
|
kfree(op);
|
|
return ERR_CAST(key);
|
|
}
|
|
} else {
|
|
key_get(key);
|
|
}
|
|
|
|
op->key = key;
|
|
op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op);
|
|
op->net = volume->cell->net;
|
|
op->cb_v_break = atomic_read(&volume->cb_v_break);
|
|
op->pre_volsync.creation = volume->creation_time;
|
|
op->pre_volsync.update = volume->update_time;
|
|
op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
|
|
op->nr_iterations = -1;
|
|
afs_op_set_error(op, -EDESTADDRREQ);
|
|
|
|
_leave(" = [op=%08x]", op->debug_id);
|
|
return op;
|
|
}
|
|
|
|
/*
|
|
* Lock the vnode(s) being operated upon.
|
|
*/
|
|
static bool afs_get_io_locks(struct afs_operation *op)
|
|
{
|
|
struct afs_vnode *vnode = op->file[0].vnode;
|
|
struct afs_vnode *vnode2 = op->file[1].vnode;
|
|
|
|
_enter("");
|
|
|
|
if (op->flags & AFS_OPERATION_UNINTR) {
|
|
mutex_lock(&vnode->io_lock);
|
|
op->flags |= AFS_OPERATION_LOCK_0;
|
|
_leave(" = t [1]");
|
|
return true;
|
|
}
|
|
|
|
if (!vnode2 || !op->file[1].need_io_lock || vnode == vnode2)
|
|
vnode2 = NULL;
|
|
|
|
if (vnode2 > vnode)
|
|
swap(vnode, vnode2);
|
|
|
|
if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
|
|
afs_op_set_error(op, -ERESTARTSYS);
|
|
op->flags |= AFS_OPERATION_STOP;
|
|
_leave(" = f [I 0]");
|
|
return false;
|
|
}
|
|
op->flags |= AFS_OPERATION_LOCK_0;
|
|
|
|
if (vnode2) {
|
|
if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) {
|
|
afs_op_set_error(op, -ERESTARTSYS);
|
|
op->flags |= AFS_OPERATION_STOP;
|
|
mutex_unlock(&vnode->io_lock);
|
|
op->flags &= ~AFS_OPERATION_LOCK_0;
|
|
_leave(" = f [I 1]");
|
|
return false;
|
|
}
|
|
op->flags |= AFS_OPERATION_LOCK_1;
|
|
}
|
|
|
|
_leave(" = t [2]");
|
|
return true;
|
|
}
|
|
|
|
static void afs_drop_io_locks(struct afs_operation *op)
|
|
{
|
|
struct afs_vnode *vnode = op->file[0].vnode;
|
|
struct afs_vnode *vnode2 = op->file[1].vnode;
|
|
|
|
_enter("");
|
|
|
|
if (op->flags & AFS_OPERATION_LOCK_1)
|
|
mutex_unlock(&vnode2->io_lock);
|
|
if (op->flags & AFS_OPERATION_LOCK_0)
|
|
mutex_unlock(&vnode->io_lock);
|
|
}
|
|
|
|
static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *vp,
|
|
unsigned int index)
|
|
{
|
|
struct afs_vnode *vnode = vp->vnode;
|
|
|
|
if (vnode) {
|
|
vp->fid = vnode->fid;
|
|
vp->dv_before = vnode->status.data_version;
|
|
vp->cb_break_before = afs_calc_vnode_cb_break(vnode);
|
|
if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
|
|
op->flags |= AFS_OPERATION_CUR_ONLY;
|
|
if (vp->modification)
|
|
set_bit(AFS_VNODE_MODIFYING, &vnode->flags);
|
|
}
|
|
|
|
if (vp->fid.vnode)
|
|
_debug("PREP[%u] {%llx:%llu.%u}",
|
|
index, vp->fid.vid, vp->fid.vnode, vp->fid.unique);
|
|
}
|
|
|
|
/*
|
|
* Begin an operation on the fileserver.
|
|
*
|
|
* Fileserver operations are serialised on the server by vnode, so we serialise
|
|
* them here also using the io_lock.
|
|
*/
|
|
bool afs_begin_vnode_operation(struct afs_operation *op)
|
|
{
|
|
struct afs_vnode *vnode = op->file[0].vnode;
|
|
|
|
ASSERT(vnode);
|
|
|
|
_enter("");
|
|
|
|
if (op->file[0].need_io_lock)
|
|
if (!afs_get_io_locks(op))
|
|
return false;
|
|
|
|
afs_prepare_vnode(op, &op->file[0], 0);
|
|
afs_prepare_vnode(op, &op->file[1], 1);
|
|
op->cb_v_break = atomic_read(&op->volume->cb_v_break);
|
|
_leave(" = true");
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* Tidy up a filesystem cursor and unlock the vnode.
|
|
*/
|
|
static void afs_end_vnode_operation(struct afs_operation *op)
|
|
{
|
|
_enter("");
|
|
|
|
switch (afs_op_error(op)) {
|
|
case -EDESTADDRREQ:
|
|
case -EADDRNOTAVAIL:
|
|
case -ENETUNREACH:
|
|
case -EHOSTUNREACH:
|
|
afs_dump_edestaddrreq(op);
|
|
break;
|
|
}
|
|
|
|
afs_drop_io_locks(op);
|
|
}
|
|
|
|
/*
|
|
* Wait for an in-progress operation to complete.
|
|
*/
|
|
void afs_wait_for_operation(struct afs_operation *op)
|
|
{
|
|
_enter("");
|
|
|
|
while (afs_select_fileserver(op)) {
|
|
op->call_responded = false;
|
|
op->call_error = 0;
|
|
op->call_abort_code = 0;
|
|
if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) &&
|
|
op->ops->issue_yfs_rpc)
|
|
op->ops->issue_yfs_rpc(op);
|
|
else if (op->ops->issue_afs_rpc)
|
|
op->ops->issue_afs_rpc(op);
|
|
else
|
|
op->call_error = -ENOTSUPP;
|
|
|
|
if (op->call) {
|
|
afs_wait_for_call_to_complete(op->call);
|
|
op->call_abort_code = op->call->abort_code;
|
|
op->call_error = op->call->error;
|
|
op->call_responded = op->call->responded;
|
|
afs_put_call(op->call);
|
|
}
|
|
}
|
|
|
|
if (op->call_responded)
|
|
set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags);
|
|
|
|
if (!afs_op_error(op)) {
|
|
_debug("success");
|
|
op->ops->success(op);
|
|
} else if (op->cumul_error.aborted) {
|
|
if (op->ops->aborted)
|
|
op->ops->aborted(op);
|
|
} else {
|
|
if (op->ops->failed)
|
|
op->ops->failed(op);
|
|
}
|
|
|
|
afs_end_vnode_operation(op);
|
|
|
|
if (!afs_op_error(op) && op->ops->edit_dir) {
|
|
_debug("edit_dir");
|
|
op->ops->edit_dir(op);
|
|
}
|
|
_leave("");
|
|
}
|
|
|
|
/*
|
|
* Dispose of an operation.
|
|
*/
|
|
int afs_put_operation(struct afs_operation *op)
|
|
{
|
|
struct afs_addr_list *alist;
|
|
int i, ret = afs_op_error(op);
|
|
|
|
_enter("op=%08x,%d", op->debug_id, ret);
|
|
|
|
if (op->ops && op->ops->put)
|
|
op->ops->put(op);
|
|
if (op->file[0].modification)
|
|
clear_bit(AFS_VNODE_MODIFYING, &op->file[0].vnode->flags);
|
|
if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode)
|
|
clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags);
|
|
if (op->file[0].put_vnode)
|
|
iput(&op->file[0].vnode->netfs.inode);
|
|
if (op->file[1].put_vnode)
|
|
iput(&op->file[1].vnode->netfs.inode);
|
|
|
|
if (op->more_files) {
|
|
for (i = 0; i < op->nr_files - 2; i++)
|
|
if (op->more_files[i].put_vnode)
|
|
iput(&op->more_files[i].vnode->netfs.inode);
|
|
kfree(op->more_files);
|
|
}
|
|
|
|
if (op->estate) {
|
|
alist = op->estate->addresses;
|
|
if (alist) {
|
|
if (op->call_responded &&
|
|
op->addr_index != alist->preferred &&
|
|
test_bit(alist->preferred, &op->addr_tried))
|
|
WRITE_ONCE(alist->preferred, op->addr_index);
|
|
}
|
|
}
|
|
|
|
afs_clear_server_states(op);
|
|
afs_put_serverlist(op->net, op->server_list);
|
|
afs_put_volume(op->volume, afs_volume_trace_put_put_op);
|
|
key_put(op->key);
|
|
kfree(op);
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Begin, run and dispose of an operation in one go.
 *
 * The result of afs_begin_vnode_operation() is not checked here; the
 * operation is always passed on to afs_wait_for_operation() and then put.
 * The value returned is whatever afs_put_operation() reports, and @op must
 * not be touched by the caller afterwards as it has been freed.
 */
int afs_do_sync_operation(struct afs_operation *op)
{
	int ret;

	afs_begin_vnode_operation(op);
	afs_wait_for_operation(op);
	ret = afs_put_operation(op);

	return ret;
}
|