diff --git a/fs/afs/Makefile b/fs/afs/Makefile index e8956b65d7ff..dcdc0f1bb76f 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -5,6 +5,7 @@ kafs-y := \ addr_list.o \ + addr_prefs.o \ callback.o \ cell.o \ cmservice.o \ @@ -27,6 +28,7 @@ kafs-y := \ server.o \ server_list.o \ super.o \ + validation.o \ vlclient.o \ vl_alias.o \ vl_list.o \ diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index de1ae0bead3b..6d42f85c6be5 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -13,26 +13,55 @@ #include "internal.h" #include "afs_fs.h" +static void afs_free_addrlist(struct rcu_head *rcu) +{ + struct afs_addr_list *alist = container_of(rcu, struct afs_addr_list, rcu); + unsigned int i; + + for (i = 0; i < alist->nr_addrs; i++) + rxrpc_kernel_put_peer(alist->addrs[i].peer); + trace_afs_alist(alist->debug_id, refcount_read(&alist->usage), afs_alist_trace_free); + kfree(alist); +} + /* * Release an address list. */ -void afs_put_addrlist(struct afs_addr_list *alist) +void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason) { - if (alist && refcount_dec_and_test(&alist->usage)) - kfree_rcu(alist, rcu); + unsigned int debug_id; + bool dead; + int r; + + if (!alist) + return; + debug_id = alist->debug_id; + dead = __refcount_dec_and_test(&alist->usage, &r); + trace_afs_alist(debug_id, r - 1, reason); + if (dead) + call_rcu(&alist->rcu, afs_free_addrlist); +} + +struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason) +{ + int r; + + if (alist) { + __refcount_inc(&alist->usage, &r); + trace_afs_alist(alist->debug_id, r + 1, reason); + } + return alist; } /* * Allocate an address list. */ -struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, - unsigned short service, - unsigned short port) +struct afs_addr_list *afs_alloc_addrlist(unsigned int nr) { struct afs_addr_list *alist; - unsigned int i; + static atomic_t debug_id; - _enter("%u,%u,%u", nr, service, port); + _enter("%u", nr); if (nr > AFS_MAX_ADDRESSES) nr = AFS_MAX_ADDRESSES; @@ -43,17 +72,8 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, refcount_set(&alist->usage, 1); alist->max_addrs = nr; - - for (i = 0; i < nr; i++) { - struct sockaddr_rxrpc *srx = &alist->addrs[i]; - srx->srx_family = AF_RXRPC; - srx->srx_service = service; - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin6); - srx->transport.sin6.sin6_family = AF_INET6; - srx->transport.sin6.sin6_port = htons(port); - } - + alist->debug_id = atomic_inc_return(&debug_id); + trace_afs_alist(alist->debug_id, 1, afs_alist_trace_alloc); return alist; } @@ -126,7 +146,7 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net, if (!vllist->servers[0].server) goto error_vl; - alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT); + alist = afs_alloc_addrlist(nr); if (!alist) goto error; @@ -197,9 +217,11 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net, } if (family == AF_INET) - afs_merge_fs_addr4(alist, x[0], xport); + ret = afs_merge_fs_addr4(net, alist, x[0], xport); else - afs_merge_fs_addr6(alist, x, xport); + ret = afs_merge_fs_addr6(net, alist, x, xport); + if (ret < 0) + goto error; } while (p < end); @@ -216,25 +238,12 @@ bad_address: problem, p - text, (int)len, (int)len, text); ret = -EINVAL; error: - afs_put_addrlist(alist); + afs_put_addrlist(alist, afs_alist_trace_put_parse_error); error_vl: afs_put_vlserverlist(net, vllist); return ERR_PTR(ret); } -/* - * Compare old and new address lists to see if there's been any change. - * - How to do this in better than O(Nlog(N)) time? - * - We don't really want to sort the address list, but would rather take the - * list as we got it so as not to undo record rotation by the DNS server. - */ -#if 0 -static int afs_cmp_addr_list(const struct afs_addr_list *a1, - const struct afs_addr_list *a2) -{ -} -#endif - /* * Perform a DNS query for VL servers and build a up an address list. */ @@ -271,25 +280,33 @@ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry /* * Merge an IPv4 entry into a fileserver address list. */ -void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) +int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *alist, + __be32 xdr, u16 port) { - struct sockaddr_rxrpc *srx; - u32 addr = ntohl(xdr); + struct sockaddr_rxrpc srx; + struct rxrpc_peer *peer; int i; if (alist->nr_addrs >= alist->max_addrs) - return; + return 0; + + srx.srx_family = AF_RXRPC; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin); + srx.transport.sin.sin_family = AF_INET; + srx.transport.sin.sin_port = htons(port); + srx.transport.sin.sin_addr.s_addr = xdr; + + peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL); + if (!peer) + return -ENOMEM; for (i = 0; i < alist->nr_ipv4; i++) { - struct sockaddr_in *a = &alist->addrs[i].transport.sin; - u32 a_addr = ntohl(a->sin_addr.s_addr); - u16 a_port = ntohs(a->sin_port); - - if (addr == a_addr && port == a_port) - return; - if (addr == a_addr && port < a_port) - break; - if (addr < a_addr) + if (peer == alist->addrs[i].peer) { + rxrpc_kernel_put_peer(peer); + return 0; + } + if (peer <= alist->addrs[i].peer) break; } @@ -298,38 +315,42 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) alist->addrs + i, sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); - srx = &alist->addrs[i]; - srx->srx_family = AF_RXRPC; - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin); - srx->transport.sin.sin_family = AF_INET; - srx->transport.sin.sin_port = htons(port); - srx->transport.sin.sin_addr.s_addr = xdr; + alist->addrs[i].peer = peer; alist->nr_ipv4++; alist->nr_addrs++; + return 0; } /* * Merge an IPv6 entry into a fileserver address list. */ -void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) +int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist, + __be32 *xdr, u16 port) { - struct sockaddr_rxrpc *srx; - int i, diff; + struct sockaddr_rxrpc srx; + struct rxrpc_peer *peer; + int i; if (alist->nr_addrs >= alist->max_addrs) - return; + return 0; + + srx.srx_family = AF_RXRPC; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin6); + srx.transport.sin6.sin6_family = AF_INET6; + srx.transport.sin6.sin6_port = htons(port); + memcpy(&srx.transport.sin6.sin6_addr, xdr, 16); + + peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL); + if (!peer) + return -ENOMEM; for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { - struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6; - u16 a_port = ntohs(a->sin6_port); - - diff = memcmp(xdr, &a->sin6_addr, 16); - if (diff == 0 && port == a_port) - return; - if (diff == 0 && port < a_port) - break; - if (diff < 0) + if (peer == alist->addrs[i].peer) { + rxrpc_kernel_put_peer(peer); + return 0; + } + if (peer <= alist->addrs[i].peer) break; } @@ -337,68 +358,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) memmove(alist->addrs + i + 1, alist->addrs + i, sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); - - srx = &alist->addrs[i]; - srx->srx_family = AF_RXRPC; - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin6); - srx->transport.sin6.sin6_family = AF_INET6; - srx->transport.sin6.sin6_port = htons(port); - memcpy(&srx->transport.sin6.sin6_addr, xdr, 16); + alist->addrs[i].peer = peer; alist->nr_addrs++; -} - -/* - * Get an address to try. - */ -bool afs_iterate_addresses(struct afs_addr_cursor *ac) -{ - unsigned long set, failed; - int index; - - if (!ac->alist) - return false; - - set = ac->alist->responded; - failed = ac->alist->failed; - _enter("%lx-%lx-%lx,%d", set, failed, ac->tried, ac->index); - - ac->nr_iterations++; - - set &= ~(failed | ac->tried); - - if (!set) - return false; - - index = READ_ONCE(ac->alist->preferred); - if (test_bit(index, &set)) - goto selected; - - index = __ffs(set); - -selected: - ac->index = index; - set_bit(index, &ac->tried); - ac->responded = false; - return true; -} - -/* - * Release an address list cursor. - */ -int afs_end_cursor(struct afs_addr_cursor *ac) -{ - struct afs_addr_list *alist; - - alist = ac->alist; - if (alist) { - if (ac->responded && - ac->index != alist->preferred && - test_bit(ac->alist->preferred, &ac->tried)) - WRITE_ONCE(alist->preferred, ac->index); - afs_put_addrlist(alist); - ac->alist = NULL; - } - - return ac->error; + return 0; } diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c new file mode 100644 index 000000000000..a189ff8a5034 --- /dev/null +++ b/fs/afs/addr_prefs.c @@ -0,0 +1,531 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Address preferences management + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": addr_prefs: " fmt +#include +#include +#include +#include +#include +#include "internal.h" + +static inline struct afs_net *afs_seq2net_single(struct seq_file *m) +{ + return afs_net(seq_file_single_net(m)); +} + +/* + * Split a NUL-terminated string up to the first newline around spaces. The + * source string will be modified to have NUL-terminations inserted. + */ +static int afs_split_string(char **pbuf, char *strv[], unsigned int maxstrv) +{ + unsigned int count = 0; + char *p = *pbuf; + + maxstrv--; /* Allow for terminal NULL */ + for (;;) { + /* Skip over spaces */ + while (isspace(*p)) { + if (*p == '\n') { + p++; + break; + } + p++; + } + if (!*p) + break; + + /* Mark start of word */ + if (count >= maxstrv) { + pr_warn("Too many elements in string\n"); + return -EINVAL; + } + strv[count++] = p; + + /* Skip over word */ + while (!isspace(*p)) + p++; + if (!*p) + break; + + /* Mark end of word */ + if (*p == '\n') { + *p++ = 0; + break; + } + *p++ = 0; + } + + *pbuf = p; + strv[count] = NULL; + return count; +} + +/* + * Parse an address with an optional subnet mask. + */ +static int afs_parse_address(char *p, struct afs_addr_preference *pref) +{ + const char *stop; + unsigned long mask, tmp; + char *end = p + strlen(p); + bool bracket = false; + + if (*p == '[') { + p++; + bracket = true; + } + +#if 0 + if (*p == '[') { + p++; + q = memchr(p, ']', end - p); + if (!q) { + pr_warn("Can't find closing ']'\n"); + return -EINVAL; + } + } else { + for (q = p; q < end; q++) + if (*q == '/') + break; + } +#endif + + if (in4_pton(p, end - p, (u8 *)&pref->ipv4_addr, -1, &stop)) { + pref->family = AF_INET; + mask = 32; + } else if (in6_pton(p, end - p, (u8 *)&pref->ipv6_addr, -1, &stop)) { + pref->family = AF_INET6; + mask = 128; + } else { + pr_warn("Can't determine address family\n"); + return -EINVAL; + } + + p = (char *)stop; + if (bracket) { + if (*p != ']') { + pr_warn("Can't find closing ']'\n"); + return -EINVAL; + } + p++; + } + + if (*p == '/') { + p++; + tmp = simple_strtoul(p, &p, 10); + if (tmp > mask) { + pr_warn("Subnet mask too large\n"); + return -EINVAL; + } + if (tmp == 0) { + pr_warn("Subnet mask too small\n"); + return -EINVAL; + } + mask = tmp; + } + + if (*p) { + pr_warn("Invalid address\n"); + return -EINVAL; + } + + pref->subnet_mask = mask; + return 0; +} + +enum cmp_ret { + CONTINUE_SEARCH, + INSERT_HERE, + EXACT_MATCH, + SUBNET_MATCH, +}; + +/* + * See if a candidate address matches a listed address. + */ +static enum cmp_ret afs_cmp_address_pref(const struct afs_addr_preference *a, + const struct afs_addr_preference *b) +{ + int subnet = min(a->subnet_mask, b->subnet_mask); + const __be32 *pa, *pb; + u32 mask, na, nb; + int diff; + + if (a->family != b->family) + return INSERT_HERE; + + switch (a->family) { + case AF_INET6: + pa = a->ipv6_addr.s6_addr32; + pb = b->ipv6_addr.s6_addr32; + break; + case AF_INET: + pa = &a->ipv4_addr.s_addr; + pb = &b->ipv4_addr.s_addr; + break; + } + + while (subnet > 32) { + diff = ntohl(*pa++) - ntohl(*pb++); + if (diff < 0) + return INSERT_HERE; /* a 0) + return CONTINUE_SEARCH; /* a>b */ + subnet -= 32; + } + + if (subnet == 0) + return EXACT_MATCH; + + mask = 0xffffffffU << (32 - subnet); + na = ntohl(*pa); + nb = ntohl(*pb); + diff = (na & mask) - (nb & mask); + //kdebug("diff %08x %08x %08x %d", na, nb, mask, diff); + if (diff < 0) + return INSERT_HERE; /* a 0) + return CONTINUE_SEARCH; /* a>b */ + if (a->subnet_mask == b->subnet_mask) + return EXACT_MATCH; + if (a->subnet_mask > b->subnet_mask) + return SUBNET_MATCH; /* a binds tighter than b */ + return CONTINUE_SEARCH; /* b binds tighter than a */ +} + +/* + * Insert an address preference. + */ +static int afs_insert_address_pref(struct afs_addr_preference_list **_preflist, + struct afs_addr_preference *pref, + int index) +{ + struct afs_addr_preference_list *preflist = *_preflist, *old = preflist; + size_t size, max_prefs; + + _enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index); + + if (preflist->nr == 255) + return -ENOSPC; + if (preflist->nr >= preflist->max_prefs) { + max_prefs = preflist->max_prefs + 1; + size = struct_size(preflist, prefs, max_prefs); + size = roundup_pow_of_two(size); + max_prefs = min_t(size_t, (size - sizeof(*preflist)) / sizeof(*pref), 255); + preflist = kmalloc(size, GFP_KERNEL); + if (!preflist) + return -ENOMEM; + *preflist = **_preflist; + preflist->max_prefs = max_prefs; + *_preflist = preflist; + + if (index < preflist->nr) + memcpy(preflist->prefs + index + 1, old->prefs + index, + sizeof(*pref) * (preflist->nr - index)); + if (index > 0) + memcpy(preflist->prefs, old->prefs, sizeof(*pref) * index); + } else { + if (index < preflist->nr) + memmove(preflist->prefs + index + 1, preflist->prefs + index, + sizeof(*pref) * (preflist->nr - index)); + } + + preflist->prefs[index] = *pref; + preflist->nr++; + if (pref->family == AF_INET) + preflist->ipv6_off++; + return 0; +} + +/* + * Add an address preference. + * echo "add [/] " >/proc/fs/afs/addr_prefs + */ +static int afs_add_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist, + int argc, char **argv) +{ + struct afs_addr_preference_list *preflist = *_preflist; + struct afs_addr_preference pref; + enum cmp_ret cmp; + int ret, i, stop; + + if (argc != 3) { + pr_warn("Wrong number of params\n"); + return -EINVAL; + } + + if (strcmp(argv[0], "udp") != 0) { + pr_warn("Unsupported protocol\n"); + return -EINVAL; + } + + ret = afs_parse_address(argv[1], &pref); + if (ret < 0) + return ret; + + ret = kstrtou16(argv[2], 10, &pref.prio); + if (ret < 0) { + pr_warn("Invalid priority\n"); + return ret; + } + + if (pref.family == AF_INET) { + i = 0; + stop = preflist->ipv6_off; + } else { + i = preflist->ipv6_off; + stop = preflist->nr; + } + + for (; i < stop; i++) { + cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + case SUBNET_MATCH: + return afs_insert_address_pref(_preflist, &pref, i); + case EXACT_MATCH: + preflist->prefs[i].prio = pref.prio; + return 0; + } + } + + return afs_insert_address_pref(_preflist, &pref, i); +} + +/* + * Delete an address preference. + */ +static int afs_delete_address_pref(struct afs_addr_preference_list **_preflist, + int index) +{ + struct afs_addr_preference_list *preflist = *_preflist; + + _enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index); + + if (preflist->nr == 0) + return -ENOENT; + + if (index < preflist->nr - 1) + memmove(preflist->prefs + index, preflist->prefs + index + 1, + sizeof(preflist->prefs[0]) * (preflist->nr - index - 1)); + + if (index < preflist->ipv6_off) + preflist->ipv6_off--; + preflist->nr--; + return 0; +} + +/* + * Delete an address preference. + * echo "del [/]" >/proc/fs/afs/addr_prefs + */ +static int afs_del_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist, + int argc, char **argv) +{ + struct afs_addr_preference_list *preflist = *_preflist; + struct afs_addr_preference pref; + enum cmp_ret cmp; + int ret, i, stop; + + if (argc != 2) { + pr_warn("Wrong number of params\n"); + return -EINVAL; + } + + if (strcmp(argv[0], "udp") != 0) { + pr_warn("Unsupported protocol\n"); + return -EINVAL; + } + + ret = afs_parse_address(argv[1], &pref); + if (ret < 0) + return ret; + + if (pref.family == AF_INET) { + i = 0; + stop = preflist->ipv6_off; + } else { + i = preflist->ipv6_off; + stop = preflist->nr; + } + + for (; i < stop; i++) { + cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + case SUBNET_MATCH: + return 0; + case EXACT_MATCH: + return afs_delete_address_pref(_preflist, i); + } + } + + return -ENOANO; +} + +/* + * Handle writes to /proc/fs/afs/addr_prefs + */ +int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size) +{ + struct afs_addr_preference_list *preflist, *old; + struct seq_file *m = file->private_data; + struct afs_net *net = afs_seq2net_single(m); + size_t psize; + char *argv[5]; + int ret, argc, max_prefs; + + inode_lock(file_inode(file)); + + /* Allocate a candidate new list and initialise it from the old. */ + old = rcu_dereference_protected(net->address_prefs, + lockdep_is_held(&file_inode(file)->i_rwsem)); + + if (old) + max_prefs = old->nr + 1; + else + max_prefs = 1; + + psize = struct_size(old, prefs, max_prefs); + psize = roundup_pow_of_two(psize); + max_prefs = min_t(size_t, (psize - sizeof(*old)) / sizeof(old->prefs[0]), 255); + + ret = -ENOMEM; + preflist = kmalloc(struct_size(preflist, prefs, max_prefs), GFP_KERNEL); + if (!preflist) + goto done; + + if (old) + memcpy(preflist, old, struct_size(preflist, prefs, old->nr)); + else + memset(preflist, 0, sizeof(*preflist)); + preflist->max_prefs = max_prefs; + + do { + argc = afs_split_string(&buf, argv, ARRAY_SIZE(argv)); + if (argc < 0) + return argc; + if (argc < 2) + goto inval; + + if (strcmp(argv[0], "add") == 0) + ret = afs_add_address_pref(net, &preflist, argc - 1, argv + 1); + else if (strcmp(argv[0], "del") == 0) + ret = afs_del_address_pref(net, &preflist, argc - 1, argv + 1); + else + goto inval; + if (ret < 0) + goto done; + } while (*buf); + + preflist->version++; + rcu_assign_pointer(net->address_prefs, preflist); + /* Store prefs before version */ + smp_store_release(&net->address_pref_version, preflist->version); + kfree_rcu(old, rcu); + preflist = NULL; + ret = 0; + +done: + kfree(preflist); + inode_unlock(file_inode(file)); + _leave(" = %d", ret); + return ret; + +inval: + pr_warn("Invalid Command\n"); + ret = -EINVAL; + goto done; +} + +/* + * Mark the priorities on an address list if the address preferences table has + * changed. The caller must hold the RCU read lock. + */ +void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist) +{ + const struct afs_addr_preference_list *preflist = + rcu_dereference(net->address_prefs); + const struct sockaddr_in6 *sin6; + const struct sockaddr_in *sin; + const struct sockaddr *sa; + struct afs_addr_preference test; + enum cmp_ret cmp; + int i, j; + + if (!preflist || !preflist->nr || !alist->nr_addrs || + smp_load_acquire(&alist->addr_pref_version) == preflist->version) + return; + + test.family = AF_INET; + test.subnet_mask = 32; + test.prio = 0; + for (i = 0; i < alist->nr_ipv4; i++) { + sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer); + sin = (const struct sockaddr_in *)sa; + test.ipv4_addr = sin->sin_addr; + for (j = 0; j < preflist->ipv6_off; j++) { + cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + break; + case EXACT_MATCH: + case SUBNET_MATCH: + WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio); + break; + } + } + } + + test.family = AF_INET6; + test.subnet_mask = 128; + test.prio = 0; + for (; i < alist->nr_addrs; i++) { + sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer); + sin6 = (const struct sockaddr_in6 *)sa; + test.ipv6_addr = sin6->sin6_addr; + for (j = preflist->ipv6_off; j < preflist->nr; j++) { + cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + break; + case EXACT_MATCH: + case SUBNET_MATCH: + WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio); + break; + } + } + } + + smp_store_release(&alist->addr_pref_version, preflist->version); +} + +/* + * Mark the priorities on an address list if the address preferences table has + * changed. Avoid taking the RCU read lock if we can. + */ +void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist) +{ + if (!net->address_prefs || + /* Load version before prefs */ + smp_load_acquire(&net->address_pref_version) == alist->addr_pref_version) + return; + + rcu_read_lock(); + afs_get_address_preferences_rcu(net, alist); + rcu_read_unlock(); +} diff --git a/fs/afs/afs.h b/fs/afs/afs.h index 81815724db6c..b488072aee87 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -165,7 +165,8 @@ struct afs_status_cb { * AFS volume synchronisation information */ struct afs_volsync { - time64_t creation; /* volume creation time */ + time64_t creation; /* Volume creation time (or TIME64_MIN) */ + time64_t update; /* Volume update time (or TIME64_MIN) */ }; /* diff --git a/fs/afs/callback.c b/fs/afs/callback.c index a484fa642808..99b2c8172021 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -33,22 +33,20 @@ void afs_invalidate_mmap_work(struct work_struct *work) unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); } -void afs_server_init_callback_work(struct work_struct *work) +static void afs_volume_init_callback(struct afs_volume *volume) { - struct afs_server *server = container_of(work, struct afs_server, initcb_work); struct afs_vnode *vnode; - struct afs_cell *cell = server->cell; - down_read(&cell->fs_open_mmaps_lock); + down_read(&volume->open_mmaps_lock); - list_for_each_entry(vnode, &cell->fs_open_mmaps, cb_mmap_link) { - if (vnode->cb_server == server) { - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) { + if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) { + atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); queue_work(system_unbound_wq, &vnode->cb_work); } } - up_read(&cell->fs_open_mmaps_lock); + up_read(&volume->open_mmaps_lock); } /* @@ -57,15 +55,20 @@ void afs_server_init_callback_work(struct work_struct *work) */ void afs_init_callback_state(struct afs_server *server) { - rcu_read_lock(); - do { - server->cb_s_break++; - atomic_inc(&server->cell->fs_s_break); - if (!list_empty(&server->cell->fs_open_mmaps)) - queue_work(system_unbound_wq, &server->initcb_work); + struct afs_server_entry *se; - } while ((server = rcu_dereference(server->uuid_next))); - rcu_read_unlock(); + down_read(&server->cell->vs_lock); + + list_for_each_entry(se, &server->volumes, slink) { + se->cb_expires_at = AFS_NO_CB_PROMISE; + se->volume->cb_expires_at = AFS_NO_CB_PROMISE; + trace_afs_cb_v_break(se->volume->vid, atomic_read(&se->volume->cb_v_break), + afs_cb_break_for_s_reinit); + if (!list_empty(&se->volume->open_mmaps)) + afs_volume_init_callback(se->volume); + } + + up_read(&server->cell->vs_lock); } /* @@ -76,9 +79,9 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas _enter(""); clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) { vnode->cb_break++; - vnode->cb_v_break = vnode->volume->cb_v_break; + vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); afs_clear_permits(vnode); if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB) @@ -110,13 +113,14 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell, { struct afs_volume *volume = NULL; struct rb_node *p; - int seq = 0; + int seq = 1; - do { + for (;;) { /* Unfortunately, rbtree walking doesn't give reliable results * under just the RCU read lock, so we have to check for * changes. */ + seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&cell->volume_lock, &seq); p = rcu_dereference_raw(cell->volumes.rb_node); @@ -132,35 +136,63 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell, volume = NULL; } - } while (need_seqretry(&cell->volume_lock, seq)); + if (volume && afs_try_get_volume(volume, afs_volume_trace_get_callback)) + break; + if (!need_seqretry(&cell->volume_lock, seq)) + break; + seq |= 1; /* Want a lock next time */ + } done_seqretry(&cell->volume_lock, seq); return volume; } +/* + * Allow the fileserver to break callbacks at the volume-level. This is + * typically done when, for example, a R/W volume is snapshotted to a R/O + * volume (the only way to change an R/O volume). It may also, however, happen + * when a volserver takes control of a volume (offlining it, moving it, etc.). + * + * Every file in that volume will need to be reevaluated. + */ +static void afs_break_volume_callback(struct afs_server *server, + struct afs_volume *volume) + __releases(RCU) +{ + struct afs_server_list *slist = rcu_dereference(volume->servers); + unsigned int i, cb_v_break; + + write_lock(&volume->cb_v_break_lock); + + for (i = 0; i < slist->nr_servers; i++) + if (slist->servers[i].server == server) + slist->servers[i].cb_expires_at = AFS_NO_CB_PROMISE; + volume->cb_expires_at = AFS_NO_CB_PROMISE; + + cb_v_break = atomic_inc_return_release(&volume->cb_v_break); + trace_afs_cb_v_break(volume->vid, cb_v_break, afs_cb_break_for_volume_callback); + + write_unlock(&volume->cb_v_break_lock); + rcu_read_unlock(); + + if (!list_empty(&volume->open_mmaps)) + afs_volume_init_callback(volume); +} + /* * allow the fileserver to explicitly break one callback * - happens when * - the backing file is changed * - a lock is released */ -static void afs_break_one_callback(struct afs_volume *volume, +static void afs_break_one_callback(struct afs_server *server, + struct afs_volume *volume, struct afs_fid *fid) { struct super_block *sb; struct afs_vnode *vnode; struct inode *inode; - if (fid->vnode == 0 && fid->unique == 0) { - /* The callback break applies to an entire volume. */ - write_lock(&volume->cb_v_break_lock); - volume->cb_v_break++; - trace_afs_cb_break(fid, volume->cb_v_break, - afs_cb_break_for_volume_callback, false); - write_unlock(&volume->cb_v_break_lock); - return; - } - /* See if we can find a matching inode - even an I_NEW inode needs to * be marked as it can have its callback broken before we finish * setting up the local inode. @@ -187,25 +219,35 @@ static void afs_break_some_callbacks(struct afs_server *server, afs_volid_t vid = cbb->fid.vid; size_t i; + rcu_read_lock(); volume = afs_lookup_volume_rcu(server->cell, vid); + if (cbb->fid.vnode == 0 && cbb->fid.unique == 0) { + afs_break_volume_callback(server, volume); + *_count -= 1; + if (*_count) + memmove(cbb, cbb + 1, sizeof(*cbb) * *_count); + } else { + /* TODO: Find all matching volumes if we couldn't match the server and + * break them anyway. + */ - /* TODO: Find all matching volumes if we couldn't match the server and - * break them anyway. - */ - - for (i = *_count; i > 0; cbb++, i--) { - if (cbb->fid.vid == vid) { - _debug("- Fid { vl=%08llx n=%llu u=%u }", - cbb->fid.vid, - cbb->fid.vnode, - cbb->fid.unique); - --*_count; - if (volume) - afs_break_one_callback(volume, &cbb->fid); - } else { - *residue++ = *cbb; + for (i = *_count; i > 0; cbb++, i--) { + if (cbb->fid.vid == vid) { + _debug("- Fid { vl=%08llx n=%llu u=%u }", + cbb->fid.vid, + cbb->fid.vnode, + cbb->fid.unique); + --*_count; + if (volume) + afs_break_one_callback(server, volume, &cbb->fid); + } else { + *residue++ = *cbb; + } } + rcu_read_unlock(); } + + afs_put_volume(volume, afs_volume_trace_put_callback); } /* @@ -218,11 +260,6 @@ void afs_break_callbacks(struct afs_server *server, size_t count, ASSERT(server != NULL); - rcu_read_lock(); - while (count > 0) afs_break_some_callbacks(server, callbacks, &count); - - rcu_read_unlock(); - return; } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 926cb1188eba..caa09875f520 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -161,13 +161,12 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, refcount_set(&cell->ref, 1); atomic_set(&cell->active, 0); INIT_WORK(&cell->manager, afs_manage_cell_work); + init_rwsem(&cell->vs_lock); cell->volumes = RB_ROOT; INIT_HLIST_HEAD(&cell->proc_volumes); seqlock_init(&cell->volume_lock); cell->fs_servers = RB_ROOT; seqlock_init(&cell->fs_lock); - INIT_LIST_HEAD(&cell->fs_open_mmaps); - init_rwsem(&cell->fs_open_mmaps_lock); rwlock_init(&cell->vl_servers_lock); cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS); @@ -817,7 +816,7 @@ done: final_destruction: /* The root volume is pinning the cell */ - afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root); + afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root); cell->root_volume = NULL; afs_put_cell(cell, afs_cell_trace_put_destroy); } diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index d4ddb20d6732..99a3f20bc786 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -146,10 +146,11 @@ static int afs_find_cm_server_by_peer(struct afs_call *call) { struct sockaddr_rxrpc srx; struct afs_server *server; + struct rxrpc_peer *peer; - rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); + peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall); - server = afs_find_server(call->net, &srx); + server = afs_find_server(call->net, peer); if (!server) { trace_afs_cm_no_server(call, &srx); return 0; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 5219182e52e1..c14533ef108f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -693,8 +693,9 @@ static void afs_do_lookup_success(struct afs_operation *op) vp = &op->file[0]; abort_code = vp->scb.status.abort_code; if (abort_code != 0) { - op->ac.abort_code = abort_code; - op->error = afs_abort_to_error(abort_code); + op->call_abort_code = abort_code; + afs_op_set_error(op, afs_abort_to_error(abort_code)); + op->cumul_error.abort_code = abort_code; } break; @@ -806,8 +807,8 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, cookie->fids[i].vid = dvnode->fid.vid; cookie->ctx.actor = afs_lookup_filldir; cookie->name = dentry->d_name; - cookie->nr_fids = 2; /* slot 0 is saved for the fid we actually want - * and slot 1 for the directory */ + cookie->nr_fids = 2; /* slot 1 is saved for the fid we actually want + * and slot 0 for the directory */ if (!afs_server_supports_ibulk(dvnode)) cookie->one_only = true; @@ -846,13 +847,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, _debug("nr_files %u", op->nr_files); /* Need space for examining all the selected files */ - op->error = -ENOMEM; if (op->nr_files > 2) { op->more_files = kvcalloc(op->nr_files - 2, sizeof(struct afs_vnode_param), GFP_KERNEL); - if (!op->more_files) + if (!op->more_files) { + afs_op_nomem(op); goto out_op; + } for (i = 2; i < op->nr_files; i++) { vp = &op->more_files[i - 2]; @@ -878,14 +880,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, * lookups contained therein are stored in the reply without aborting * the whole operation. */ - op->error = -ENOTSUPP; + afs_op_set_error(op, -ENOTSUPP); if (!cookie->one_only) { op->ops = &afs_inline_bulk_status_operation; afs_begin_vnode_operation(op); afs_wait_for_operation(op); } - if (op->error == -ENOTSUPP) { + if (afs_op_error(op) == -ENOTSUPP) { /* We could try FS.BulkStatus next, but this aborts the entire * op if any of the lookups fails - so, for the moment, revert * to FS.FetchStatus for op->file[1]. @@ -895,10 +897,10 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, afs_begin_vnode_operation(op); afs_wait_for_operation(op); } - inode = ERR_PTR(op->error); + inode = ERR_PTR(afs_op_error(op)); out_op: - if (op->error == 0) { + if (!afs_op_error(op)) { inode = &op->file[1].vnode->netfs.inode; op->file[1].vnode = NULL; } @@ -1116,7 +1118,12 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) dir = AFS_FS_I(d_inode(parent)); /* validate the parent directory */ - afs_validate(dir, key); + ret = afs_validate(dir, key); + if (ret == -ERESTARTSYS) { + dput(parent); + key_put(key); + return ret; + } if (test_bit(AFS_VNODE_DELETED, &dir->flags)) { _debug("%pd: parent dir deleted", dentry); @@ -1255,9 +1262,10 @@ void afs_check_for_remote_deletion(struct afs_operation *op) { struct afs_vnode *vnode = op->file[0].vnode; - switch (op->ac.abort_code) { + switch (afs_op_abort_code(op)) { case VNOVNODE: set_bit(AFS_VNODE_DELETED, &vnode->flags); + clear_nlink(&vnode->netfs.inode); afs_break_callback(vnode, afs_cb_break_for_deleted); } } @@ -1273,20 +1281,20 @@ static void afs_vnode_new_inode(struct afs_operation *op) _enter(""); - ASSERTCMP(op->error, ==, 0); + ASSERTCMP(afs_op_error(op), ==, 0); inode = afs_iget(op, vp); if (IS_ERR(inode)) { /* ENOMEM or EINTR at a really inconvenient time - just abandon * the new directory on the server. */ - op->error = PTR_ERR(inode); + afs_op_accumulate_error(op, PTR_ERR(inode), 0); return; } vnode = AFS_FS_I(inode); set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - if (!op->error) + if (!afs_op_error(op)) afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb); d_instantiate(op->dentry, inode); } @@ -1320,7 +1328,7 @@ static void afs_create_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); - if (op->error) + if (afs_op_error(op)) d_drop(op->dentry); } @@ -1373,7 +1381,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry) clear_nlink(&vnode->netfs.inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); } } @@ -1480,7 +1488,7 @@ static void afs_dir_remove_link(struct afs_operation *op) struct dentry *dentry = op->dentry; int ret; - if (op->error != 0 || + if (afs_op_error(op) || (op->file[1].scb.have_status && op->file[1].scb.have_error)) return; if (d_really_is_positive(dentry)) @@ -1504,10 +1512,10 @@ static void afs_dir_remove_link(struct afs_operation *op) ret = afs_validate(vnode, op->key); if (ret != -ESTALE) - op->error = ret; + afs_op_set_error(op, ret); } - _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error); + _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, afs_op_error(op)); } static void afs_unlink_success(struct afs_operation *op) @@ -1538,7 +1546,7 @@ static void afs_unlink_edit_dir(struct afs_operation *op) static void afs_unlink_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); - if (op->unlink.need_rehash && op->error < 0 && op->error != -ENOENT) + if (op->unlink.need_rehash && afs_op_error(op) < 0 && afs_op_error(op) != -ENOENT) d_rehash(op->dentry); } @@ -1579,7 +1587,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) /* Try to make sure we have a callback promise on the victim. */ ret = afs_validate(vnode, op->key); if (ret < 0) { - op->error = ret; + afs_op_set_error(op, ret); goto error; } @@ -1588,7 +1596,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(d_inode(dentry), 0); - op->error = afs_sillyrename(dvnode, vnode, dentry, op->key); + afs_op_set_error(op, afs_sillyrename(dvnode, vnode, dentry, op->key)); goto error; } if (!d_unhashed(dentry)) { @@ -1609,7 +1617,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) /* If there was a conflict with a third party, check the status of the * unlinked vnode. */ - if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + if (afs_op_error(op) == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { op->file[1].update_ctime = false; op->fetch_status.which = 1; op->ops = &afs_fetch_status_operation; @@ -1691,7 +1699,7 @@ static void afs_link_success(struct afs_operation *op) static void afs_link_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); - if (op->error) + if (afs_op_error(op)) d_drop(op->dentry); } @@ -1889,7 +1897,7 @@ static void afs_rename_put(struct afs_operation *op) if (op->rename.rehash) d_rehash(op->rename.rehash); dput(op->rename.tmp); - if (op->error) + if (afs_op_error(op)) d_rehash(op->dentry); } @@ -1934,7 +1942,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, return PTR_ERR(op); ret = afs_validate(vnode, op->key); - op->error = ret; + afs_op_set_error(op, ret); if (ret < 0) goto error; @@ -1971,7 +1979,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, op->rename.tmp = d_alloc(new_dentry->d_parent, &new_dentry->d_name); if (!op->rename.tmp) { - op->error = -ENOMEM; + afs_op_nomem(op); goto error; } @@ -1979,7 +1987,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, AFS_FS_I(d_inode(new_dentry)), new_dentry, op->key); if (ret) { - op->error = ret; + afs_op_set_error(op, ret); goto error; } diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index bb5807e87fa4..a1e581946b93 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -218,7 +218,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode /* If there was a conflict with a third party, check the status of the * unlinked vnode. */ - if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + if (op->cumul_error.error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { op->file[1].update_ctime = false; op->fetch_status.which = 1; op->ops = &afs_fetch_status_operation; diff --git a/fs/afs/file.c b/fs/afs/file.c index d37dd201752b..30914e0d9cb2 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -243,12 +243,9 @@ static void afs_fetch_data_notify(struct afs_operation *op) { struct afs_read *req = op->fetch.req; struct netfs_io_subrequest *subreq = req->subreq; - int error = op->error; + int error = afs_op_error(op); - if (error == -ECONNABORTED) - error = afs_abort_to_error(op->ac.abort_code); req->error = error; - if (subreq) { __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); netfs_subreq_terminated(subreq, error ?: req->actual_len, false); @@ -271,7 +268,7 @@ static void afs_fetch_data_success(struct afs_operation *op) static void afs_fetch_data_put(struct afs_operation *op) { - op->fetch.req->error = op->error; + op->fetch.req->error = afs_op_error(op); afs_put_read(op->fetch.req); } @@ -517,13 +514,12 @@ static bool afs_release_folio(struct folio *folio, gfp_t gfp) static void afs_add_open_mmap(struct afs_vnode *vnode) { if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) { - down_write(&vnode->volume->cell->fs_open_mmaps_lock); + down_write(&vnode->volume->open_mmaps_lock); if (list_empty(&vnode->cb_mmap_link)) - list_add_tail(&vnode->cb_mmap_link, - &vnode->volume->cell->fs_open_mmaps); + list_add_tail(&vnode->cb_mmap_link, &vnode->volume->open_mmaps); - up_write(&vnode->volume->cell->fs_open_mmaps_lock); + up_write(&vnode->volume->open_mmaps_lock); } } @@ -532,12 +528,12 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode) if (!atomic_dec_and_test(&vnode->cb_nr_mmap)) return; - down_write(&vnode->volume->cell->fs_open_mmaps_lock); + down_write(&vnode->volume->open_mmaps_lock); if (atomic_read(&vnode->cb_nr_mmap) == 0) list_del_init(&vnode->cb_mmap_link); - up_write(&vnode->volume->cell->fs_open_mmaps_lock); + up_write(&vnode->volume->open_mmaps_lock); flush_work(&vnode->cb_work); } @@ -573,7 +569,7 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg { struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file)); - if (afs_pagecache_valid(vnode)) + if (afs_check_validity(vnode)) return filemap_map_pages(vmf, start_pgoff, end_pgoff); return 0; } diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 7a3803ce3a22..3546b087e791 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -35,13 +35,15 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo key_get(key); } - op->key = key; - op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op); - op->net = volume->cell->net; - op->cb_v_break = volume->cb_v_break; - op->debug_id = atomic_inc_return(&afs_operation_debug_counter); - op->error = -EDESTADDRREQ; - op->ac.error = SHRT_MAX; + op->key = key; + op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op); + op->net = volume->cell->net; + op->cb_v_break = atomic_read(&volume->cb_v_break); + op->pre_volsync.creation = volume->creation_time; + op->pre_volsync.update = volume->update_time; + op->debug_id = atomic_inc_return(&afs_operation_debug_counter); + op->nr_iterations = -1; + afs_op_set_error(op, -EDESTADDRREQ); _leave(" = [op=%08x]", op->debug_id); return op; @@ -71,7 +73,7 @@ static bool afs_get_io_locks(struct afs_operation *op) swap(vnode, vnode2); if (mutex_lock_interruptible(&vnode->io_lock) < 0) { - op->error = -ERESTARTSYS; + afs_op_set_error(op, -ERESTARTSYS); op->flags |= AFS_OPERATION_STOP; _leave(" = f [I 0]"); return false; @@ -80,7 +82,7 @@ static bool afs_get_io_locks(struct afs_operation *op) if (vnode2) { if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) { - op->error = -ERESTARTSYS; + afs_op_set_error(op, -ERESTARTSYS); op->flags |= AFS_OPERATION_STOP; mutex_unlock(&vnode->io_lock); op->flags &= ~AFS_OPERATION_LOCK_0; @@ -147,7 +149,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op) afs_prepare_vnode(op, &op->file[0], 0); afs_prepare_vnode(op, &op->file[1], 1); - op->cb_v_break = op->volume->cb_v_break; + op->cb_v_break = atomic_read(&op->volume->cb_v_break); _leave(" = true"); return true; } @@ -159,16 +161,16 @@ static void afs_end_vnode_operation(struct afs_operation *op) { _enter(""); - if (op->error == -EDESTADDRREQ || - op->error == -EADDRNOTAVAIL || - op->error == -ENETUNREACH || - op->error == -EHOSTUNREACH) + switch (afs_op_error(op)) { + case -EDESTADDRREQ: + case -EADDRNOTAVAIL: + case -ENETUNREACH: + case -EHOSTUNREACH: afs_dump_edestaddrreq(op); + break; + } afs_drop_io_locks(op); - - if (op->error == -ECONNABORTED) - op->error = afs_abort_to_error(op->ac.abort_code); } /* @@ -179,37 +181,43 @@ void afs_wait_for_operation(struct afs_operation *op) _enter(""); while (afs_select_fileserver(op)) { - op->cb_s_break = op->server->cb_s_break; + op->call_responded = false; + op->call_error = 0; + op->call_abort_code = 0; if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) && op->ops->issue_yfs_rpc) op->ops->issue_yfs_rpc(op); else if (op->ops->issue_afs_rpc) op->ops->issue_afs_rpc(op); else - op->ac.error = -ENOTSUPP; + op->call_error = -ENOTSUPP; - if (op->call) - op->error = afs_wait_for_call_to_complete(op->call, &op->ac); + if (op->call) { + afs_wait_for_call_to_complete(op->call); + op->call_abort_code = op->call->abort_code; + op->call_error = op->call->error; + op->call_responded = op->call->responded; + afs_put_call(op->call); + } } - switch (op->error) { - case 0: + if (op->call_responded) + set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags); + + if (!afs_op_error(op)) { _debug("success"); op->ops->success(op); - break; - case -ECONNABORTED: + } else if (op->cumul_error.aborted) { if (op->ops->aborted) op->ops->aborted(op); - fallthrough; - default: + } else { if (op->ops->failed) op->ops->failed(op); - break; } afs_end_vnode_operation(op); - if (op->error == 0 && op->ops->edit_dir) { + if (!afs_op_error(op) && op->ops->edit_dir) { _debug("edit_dir"); op->ops->edit_dir(op); } @@ -221,7 +229,8 @@ void afs_wait_for_operation(struct afs_operation *op) */ int afs_put_operation(struct afs_operation *op) { - int i, ret = op->error; + struct afs_addr_list *alist; + int i, ret = afs_op_error(op); _enter("op=%08x,%d", op->debug_id, ret); @@ -243,9 +252,19 @@ int afs_put_operation(struct afs_operation *op) kfree(op->more_files); } - afs_end_cursor(&op->ac); + if (op->estate) { + alist = op->estate->addresses; + if (alist) { + if (op->call_responded && + op->addr_index != alist->preferred && + test_bit(alist->preferred, &op->addr_tried)) + WRITE_ONCE(alist->preferred, op->addr_index); + } + } + + afs_clear_server_states(op); afs_put_serverlist(op->net, op->server_list); - afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op); + afs_put_volume(op->volume, afs_volume_trace_put_put_op); key_put(op->key); kfree(op); return ret; diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index daaf3810cc92..580de4adaaf6 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -15,6 +15,42 @@ static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ; static unsigned int afs_fs_probe_slow_poll_interval = 5 * 60 * HZ; +struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate, + enum afs_estate_trace where) +{ + if (estate) { + int r; + + __refcount_inc(&estate->ref, &r); + trace_afs_estate(estate->server_id, estate->probe_seq, r, where); + } + return estate; +} + +static void afs_endpoint_state_rcu(struct rcu_head *rcu) +{ + struct afs_endpoint_state *estate = container_of(rcu, struct afs_endpoint_state, rcu); + + trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), + afs_estate_trace_free); + afs_put_addrlist(estate->addresses, afs_alist_trace_put_estate); + kfree(estate); +} + +void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where) +{ + if (estate) { + unsigned int server_id = estate->server_id, probe_seq = estate->probe_seq; + bool dead; + int r; + + dead = __refcount_dec_and_test(&estate->ref, &r); + trace_afs_estate(server_id, probe_seq, r, where); + if (dead) + call_rcu(&estate->rcu, afs_endpoint_state_rcu); + } +} + /* * Start the probe polling timer. We have to supply it with an inc on the * outstanding server count. @@ -38,9 +74,10 @@ static void afs_schedule_fs_probe(struct afs_net *net, /* * Handle the completion of a set of probes. */ -static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server) +static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate) { - bool responded = server->probe.responded; + bool responded = test_bit(AFS_ESTATE_RESPONDED, &estate->flags); write_seqlock(&net->fs_lock); if (responded) { @@ -50,6 +87,7 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server clear_bit(AFS_SERVER_FL_RESPONDING, &server->flags); list_add_tail(&server->probe_link, &net->fs_probe_fast); } + write_sequnlock(&net->fs_lock); afs_schedule_fs_probe(net, server, !responded); @@ -58,12 +96,13 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server /* * Handle the completion of a probe. */ -static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server) +static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate) { _enter(""); - if (atomic_dec_and_test(&server->probe_outstanding)) - afs_finished_fs_probe(net, server); + if (atomic_dec_and_test(&estate->nr_probing)) + afs_finished_fs_probe(net, server, estate); wake_up_all(&server->probe_wq); } @@ -74,24 +113,22 @@ static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server */ static void afs_fs_probe_not_done(struct afs_net *net, struct afs_server *server, - struct afs_addr_cursor *ac) + struct afs_endpoint_state *estate, + int index) { - struct afs_addr_list *alist = ac->alist; - unsigned int index = ac->index; - _enter(""); trace_afs_io_error(0, -ENOMEM, afs_io_error_fs_probe_fail); spin_lock(&server->probe_lock); - server->probe.local_failure = true; - if (server->probe.error == 0) - server->probe.error = -ENOMEM; + set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags); + if (estate->error == 0) + estate->error = -ENOMEM; - set_bit(index, &alist->failed); + set_bit(index, &estate->failed_set); spin_unlock(&server->probe_lock); - return afs_done_one_fs_probe(net, server); + return afs_done_one_fs_probe(net, server, estate); } /* @@ -100,30 +137,34 @@ static void afs_fs_probe_not_done(struct afs_net *net, */ void afs_fileserver_probe_result(struct afs_call *call) { - struct afs_addr_list *alist = call->alist; + struct afs_endpoint_state *estate = call->probe; + struct afs_addr_list *alist = estate->addresses; + struct afs_address *addr = &alist->addrs[call->probe_index]; struct afs_server *server = call->server; - unsigned int index = call->addr_ix; - unsigned int rtt_us = 0, cap0; + unsigned int index = call->probe_index; + unsigned int rtt_us = -1, cap0; int ret = call->error; _enter("%pU,%u", &server->uuid, index); + WRITE_ONCE(addr->last_error, ret); + spin_lock(&server->probe_lock); switch (ret) { case 0: - server->probe.error = 0; + estate->error = 0; goto responded; case -ECONNABORTED: - if (!server->probe.responded) { - server->probe.abort_code = call->abort_code; - server->probe.error = ret; + if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags)) { + estate->abort_code = call->abort_code; + estate->error = ret; } goto responded; case -ENOMEM: case -ENONET: - clear_bit(index, &alist->responded); - server->probe.local_failure = true; + clear_bit(index, &estate->responsive_set); + set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags); trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail); goto out; case -ECONNRESET: /* Responded, but call expired. */ @@ -136,29 +177,29 @@ void afs_fileserver_probe_result(struct afs_call *call) case -ETIMEDOUT: case -ETIME: default: - clear_bit(index, &alist->responded); - set_bit(index, &alist->failed); - if (!server->probe.responded && - (server->probe.error == 0 || - server->probe.error == -ETIMEDOUT || - server->probe.error == -ETIME)) - server->probe.error = ret; + clear_bit(index, &estate->responsive_set); + set_bit(index, &estate->failed_set); + if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags) && + (estate->error == 0 || + estate->error == -ETIMEDOUT || + estate->error == -ETIME)) + estate->error = ret; trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail); goto out; } responded: - clear_bit(index, &alist->failed); + clear_bit(index, &estate->failed_set); if (call->service_id == YFS_FS_SERVICE) { - server->probe.is_yfs = true; + set_bit(AFS_ESTATE_IS_YFS, &estate->flags); set_bit(AFS_SERVER_FL_IS_YFS, &server->flags); - alist->addrs[index].srx_service = call->service_id; + server->service_id = call->service_id; } else { - server->probe.not_yfs = true; - if (!server->probe.is_yfs) { + set_bit(AFS_ESTATE_NOT_YFS, &estate->flags); + if (!test_bit(AFS_ESTATE_IS_YFS, &estate->flags)) { clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags); - alist->addrs[index].srx_service = call->service_id; + server->service_id = call->service_id; } cap0 = ntohl(call->tmp); if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES) @@ -167,116 +208,136 @@ responded: clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags); } - rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us); - if (rtt_us < server->probe.rtt) { - server->probe.rtt = rtt_us; + rtt_us = rxrpc_kernel_get_srtt(addr->peer); + if (rtt_us < estate->rtt) { + estate->rtt = rtt_us; server->rtt = rtt_us; alist->preferred = index; } smp_wmb(); /* Set rtt before responded. */ - server->probe.responded = true; - set_bit(index, &alist->responded); + set_bit(AFS_ESTATE_RESPONDED, &estate->flags); + set_bit(index, &estate->responsive_set); set_bit(AFS_SERVER_FL_RESPONDING, &server->flags); out: spin_unlock(&server->probe_lock); - _debug("probe %pU [%u] %pISpc rtt=%u ret=%d", - &server->uuid, index, &alist->addrs[index].transport, + trace_afs_fs_probe(server, false, estate, index, call->error, call->abort_code, rtt_us); + _debug("probe[%x] %pU [%u] %pISpc rtt=%d ret=%d", + estate->probe_seq, &server->uuid, index, + rxrpc_kernel_remote_addr(alist->addrs[index].peer), rtt_us, ret); - return afs_done_one_fs_probe(call->net, server); + return afs_done_one_fs_probe(call->net, server, estate); } /* - * Probe one or all of a fileserver's addresses to find out the best route and - * to query its capabilities. + * Probe all of a fileserver's addresses to find out the best route and to + * query its capabilities. */ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, - struct key *key, bool all) + struct afs_addr_list *new_alist, struct key *key) { - struct afs_addr_cursor ac = { - .index = 0, - }; + struct afs_endpoint_state *estate, *old; + struct afs_addr_list *alist; + unsigned long unprobed; _enter("%pU", &server->uuid); - read_lock(&server->fs_lock); - ac.alist = rcu_dereference_protected(server->addresses, - lockdep_is_held(&server->fs_lock)); - afs_get_addrlist(ac.alist); - read_unlock(&server->fs_lock); + estate = kzalloc(sizeof(*estate), GFP_KERNEL); + if (!estate) + return; + + refcount_set(&estate->ref, 1); + estate->server_id = server->debug_id; + estate->rtt = UINT_MAX; + + write_lock(&server->fs_lock); + + old = rcu_dereference_protected(server->endpoint_state, + lockdep_is_held(&server->fs_lock)); + estate->responsive_set = old->responsive_set; + estate->addresses = afs_get_addrlist(new_alist ?: old->addresses, + afs_alist_trace_get_estate); + alist = estate->addresses; + estate->probe_seq = ++server->probe_counter; + atomic_set(&estate->nr_probing, alist->nr_addrs); + + rcu_assign_pointer(server->endpoint_state, estate); + set_bit(AFS_ESTATE_SUPERSEDED, &old->flags); + write_unlock(&server->fs_lock); + + trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), + afs_estate_trace_alloc_probe); + + afs_get_address_preferences(net, alist); server->probed_at = jiffies; - atomic_set(&server->probe_outstanding, all ? ac.alist->nr_addrs : 1); - memset(&server->probe, 0, sizeof(server->probe)); - server->probe.rtt = UINT_MAX; + unprobed = (1UL << alist->nr_addrs) - 1; + while (unprobed) { + unsigned int index = 0, i; + int best_prio = -1; - ac.index = ac.alist->preferred; - if (ac.index < 0 || ac.index >= ac.alist->nr_addrs) - all = true; + for (i = 0; i < alist->nr_addrs; i++) { + if (test_bit(i, &unprobed) && + alist->addrs[i].prio > best_prio) { + index = i; + best_prio = alist->addrs[i].prio; + } + } + __clear_bit(index, &unprobed); - if (all) { - for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) - if (!afs_fs_get_capabilities(net, server, &ac, key)) - afs_fs_probe_not_done(net, server, &ac); - } else { - if (!afs_fs_get_capabilities(net, server, &ac, key)) - afs_fs_probe_not_done(net, server, &ac); + trace_afs_fs_probe(server, true, estate, index, 0, 0, 0); + if (!afs_fs_get_capabilities(net, server, estate, index, key)) + afs_fs_probe_not_done(net, server, estate, index); } - afs_put_addrlist(ac.alist); + afs_put_endpoint_state(old, afs_estate_trace_put_probe); } /* - * Wait for the first as-yet untried fileserver to respond. + * Wait for the first as-yet untried fileserver to respond, for the probe state + * to be superseded or for all probes to finish. */ -int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) +int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr) { - struct wait_queue_entry *waits; - struct afs_server *server; - unsigned int rtt = UINT_MAX, rtt_s; - bool have_responders = false; - int pref = -1, i; + struct afs_endpoint_state *estate; + struct afs_server_list *slist = op->server_list; + bool still_probing = true; + int ret = 0, i; - _enter("%u,%lx", slist->nr_servers, untried); + _enter("%u", slist->nr_servers); - /* Only wait for servers that have a probe outstanding. */ for (i = 0; i < slist->nr_servers; i++) { - if (test_bit(i, &untried)) { - server = slist->servers[i].server; - if (!atomic_read(&server->probe_outstanding)) - __clear_bit(i, &untried); - if (server->probe.responded) - have_responders = true; - } + estate = states[i].endpoint_state; + if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) + return 2; + if (atomic_read(&estate->nr_probing)) + still_probing = true; + if (estate->responsive_set & states[i].untried_addrs) + return 1; } - if (have_responders || !untried) + if (!still_probing) return 0; - waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL); - if (!waits) - return -ENOMEM; - - for (i = 0; i < slist->nr_servers; i++) { - if (test_bit(i, &untried)) { - server = slist->servers[i].server; - init_waitqueue_entry(&waits[i], current); - add_wait_queue(&server->probe_wq, &waits[i]); - } - } + for (i = 0; i < slist->nr_servers; i++) + add_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter); for (;;) { - bool still_probing = false; + still_probing = false; - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); for (i = 0; i < slist->nr_servers; i++) { - if (test_bit(i, &untried)) { - server = slist->servers[i].server; - if (server->probe.responded) - goto stop; - if (atomic_read(&server->probe_outstanding)) - still_probing = true; + estate = states[i].endpoint_state; + if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) { + ret = 2; + goto stop; + } + if (atomic_read(&estate->nr_probing)) + still_probing = true; + if (estate->responsive_set & states[i].untried_addrs) { + ret = 1; + goto stop; } } @@ -288,28 +349,12 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) stop: set_current_state(TASK_RUNNING); - for (i = 0; i < slist->nr_servers; i++) { - if (test_bit(i, &untried)) { - server = slist->servers[i].server; - rtt_s = READ_ONCE(server->rtt); - if (test_bit(AFS_SERVER_FL_RESPONDING, &server->flags) && - rtt_s < rtt) { - pref = i; - rtt = rtt_s; - } + for (i = 0; i < slist->nr_servers; i++) + remove_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter); - remove_wait_queue(&server->probe_wq, &waits[i]); - } - } - - kfree(waits); - - if (pref == -1 && signal_pending(current)) - return -ERESTARTSYS; - - if (pref >= 0) - slist->preferred = pref; - return 0; + if (!ret && signal_pending(current)) + ret = -ERESTARTSYS; + return ret; } /* @@ -327,7 +372,7 @@ void afs_fs_probe_timer(struct timer_list *timer) /* * Dispatch a probe to a server. */ -static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server, bool all) +static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server) __releases(&net->fs_lock) { struct key *key = NULL; @@ -340,7 +385,7 @@ static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server afs_get_server(server, afs_server_trace_get_probe); write_sequnlock(&net->fs_lock); - afs_fs_probe_fileserver(net, server, key, all); + afs_fs_probe_fileserver(net, server, NULL, key); afs_put_server(net, server, afs_server_trace_put_probe); } @@ -352,7 +397,7 @@ void afs_probe_fileserver(struct afs_net *net, struct afs_server *server) { write_seqlock(&net->fs_lock); if (!list_empty(&server->probe_link)) - return afs_dispatch_fs_probe(net, server, true); + return afs_dispatch_fs_probe(net, server); write_sequnlock(&net->fs_lock); } @@ -412,7 +457,7 @@ again: _debug("probe %pU", &server->uuid); if (server && (first_pass || !need_resched())) { - afs_dispatch_fs_probe(net, server, server == fast); + afs_dispatch_fs_probe(net, server); first_pass = false; goto again; } @@ -436,12 +481,13 @@ again: /* * Wait for a probe on a particular fileserver to complete for 2s. */ -int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr) +int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, + unsigned long exclude, bool is_intr) { struct wait_queue_entry wait; unsigned long timo = 2 * HZ; - if (atomic_read(&server->probe_outstanding) == 0) + if (atomic_read(&estate->nr_probing) == 0) goto dont_wait; init_wait_entry(&wait, 0); @@ -449,8 +495,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr) prepare_to_wait_event(&server->probe_wq, &wait, is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (timo == 0 || - server->probe.responded || - atomic_read(&server->probe_outstanding) == 0 || + test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags) || + (estate->responsive_set & ~exclude) || + atomic_read(&estate->nr_probing) == 0 || (is_intr && signal_pending(current))) break; timo = schedule_timeout(timo); @@ -459,7 +506,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr) finish_wait(&server->probe_wq, &wait); dont_wait: - if (server->probe.responded) + if (estate->responsive_set & ~exclude) + return 1; + if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) return 0; if (is_intr && signal_pending(current)) return -ERESTARTSYS; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 7d37f63ef0f0..79cd30775b7a 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -290,6 +290,7 @@ void afs_fs_fetch_status(struct afs_operation *op) bp[2] = htonl(vp->fid.vnode); bp[3] = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -442,6 +443,7 @@ static void afs_fs_fetch_data64(struct afs_operation *op) bp[6] = 0; bp[7] = htonl(lower_32_bits(req->len)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -476,6 +478,7 @@ void afs_fs_fetch_data(struct afs_operation *op) bp[4] = htonl(lower_32_bits(req->pos)); bp[5] = htonl(lower_32_bits(req->len)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -559,6 +562,7 @@ void afs_fs_create_file(struct afs_operation *op) *bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */ *bp++ = 0; /* segment size */ + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -612,6 +616,7 @@ void afs_fs_make_dir(struct afs_operation *op) *bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */ *bp++ = 0; /* segment size */ + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -685,6 +690,7 @@ void afs_fs_remove_file(struct afs_operation *op) bp = (void *) bp + padsz; } + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -732,6 +738,7 @@ void afs_fs_remove_dir(struct afs_operation *op) bp = (void *) bp + padsz; } + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -812,6 +819,7 @@ void afs_fs_link(struct afs_operation *op) *bp++ = htonl(vp->fid.vnode); *bp++ = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call1(call, &vp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -907,6 +915,7 @@ void afs_fs_symlink(struct afs_operation *op) *bp++ = htonl(S_IRWXUGO); /* unix mode */ *bp++ = 0; /* segment size */ + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -1003,6 +1012,7 @@ void afs_fs_rename(struct afs_operation *op) bp = (void *) bp + n_padsz; } + call->fid = orig_dvp->fid; trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); afs_make_op_call(op, call, GFP_NOFS); } @@ -1090,6 +1100,7 @@ static void afs_fs_store_data64(struct afs_operation *op) *bp++ = htonl(upper_32_bits(op->store.i_size)); *bp++ = htonl(lower_32_bits(op->store.i_size)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1140,6 +1151,7 @@ void afs_fs_store_data(struct afs_operation *op) *bp++ = htonl(lower_32_bits(op->store.size)); *bp++ = htonl(lower_32_bits(op->store.i_size)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1206,6 +1218,7 @@ static void afs_fs_setattr_size64(struct afs_operation *op) *bp++ = htonl(upper_32_bits(attr->ia_size)); /* new file length */ *bp++ = htonl(lower_32_bits(attr->ia_size)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1247,6 +1260,7 @@ static void afs_fs_setattr_size(struct afs_operation *op) *bp++ = 0; /* size of write */ *bp++ = htonl(attr->ia_size); /* new file length */ + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1283,6 +1297,7 @@ void afs_fs_setattr(struct afs_operation *op) xdr_encode_AFS_StoreStatus(&bp, op->setattr.attr); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1446,6 +1461,7 @@ void afs_fs_get_volume_status(struct afs_operation *op) bp[0] = htonl(FSGETVOLUMESTATUS); bp[1] = htonl(vp->fid.vid); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1528,6 +1544,7 @@ void afs_fs_set_lock(struct afs_operation *op) *bp++ = htonl(vp->fid.unique); *bp++ = htonl(op->lock.type); + call->fid = vp->fid; trace_afs_make_fs_calli(call, &vp->fid, op->lock.type); afs_make_op_call(op, call, GFP_NOFS); } @@ -1554,6 +1571,7 @@ void afs_fs_extend_lock(struct afs_operation *op) *bp++ = htonl(vp->fid.vnode); *bp++ = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1580,6 +1598,7 @@ void afs_fs_release_lock(struct afs_operation *op) *bp++ = htonl(vp->fid.vnode); *bp++ = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1605,13 +1624,12 @@ static const struct afs_call_type afs_RXFSGiveUpAllCallBacks = { /* * Flush all the callbacks we have on a server. */ -int afs_fs_give_up_all_callbacks(struct afs_net *net, - struct afs_server *server, - struct afs_addr_cursor *ac, - struct key *key) +int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, + struct afs_address *addr, struct key *key) { struct afs_call *call; __be32 *bp; + int ret; _enter(""); @@ -1619,15 +1637,22 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, if (!call) return -ENOMEM; - call->key = key; + call->key = key; + call->peer = rxrpc_kernel_get_peer(addr->peer); + call->service_id = server->service_id; /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSGIVEUPALLCALLBACKS); call->server = afs_use_server(server, afs_server_trace_give_up_cb); - afs_make_call(ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, ac); + afs_make_call(call, GFP_NOFS); + afs_wait_for_call_to_complete(call); + ret = call->error; + if (call->responded) + set_bit(AFS_SERVER_FL_RESPONDING, &server->flags); + afs_put_call(call); + return ret; } /* @@ -1689,6 +1714,12 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) return 0; } +static void afs_fs_get_capabilities_destructor(struct afs_call *call) +{ + afs_put_endpoint_state(call->probe, afs_estate_trace_put_getcaps); + afs_flat_call_destructor(call); +} + /* * FS.GetCapabilities operation type */ @@ -1697,7 +1728,7 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { .op = afs_FS_GetCapabilities, .deliver = afs_deliver_fs_get_capabilities, .done = afs_fileserver_probe_result, - .destructor = afs_flat_call_destructor, + .destructor = afs_fs_get_capabilities_destructor, }; /* @@ -1707,7 +1738,8 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { * ->done() - otherwise we return false to indicate we didn't even try. */ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, - struct afs_addr_cursor *ac, struct key *key) + struct afs_endpoint_state *estate, unsigned int addr_index, + struct key *key) { struct afs_call *call; __be32 *bp; @@ -1718,10 +1750,14 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, if (!call) return false; - call->key = key; - call->server = afs_use_server(server, afs_server_trace_get_caps); - call->upgrade = true; - call->async = true; + call->key = key; + call->server = afs_use_server(server, afs_server_trace_get_caps); + call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer); + call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps); + call->probe_index = addr_index; + call->service_id = server->service_id; + call->upgrade = true; + call->async = true; call->max_lifespan = AFS_PROBE_MAX_LIFESPAN; /* marshall the parameters */ @@ -1729,7 +1765,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, *bp++ = htonl(FSGETCAPABILITIES); trace_afs_make_fs_call(call, NULL); - afs_make_call(ac, call, GFP_NOFS); + afs_make_call(call, GFP_NOFS); afs_put_call(call); return true; } @@ -1853,7 +1889,10 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_AFSVolSync(&bp, &op->volsync); + /* Unfortunately, prior to OpenAFS-1.6, volsync here is filled + * with rubbish. + */ + xdr_decode_AFSVolSync(&bp, NULL); call->unmarshall++; fallthrough; @@ -1899,7 +1938,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op) int i; if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->server->flags)) { - op->error = -ENOTSUPP; + afs_op_set_error(op, -ENOTSUPP); return; } @@ -1928,6 +1967,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op) *bp++ = htonl(op->more_files[i].fid.unique); } + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -2033,6 +2073,7 @@ void afs_fs_fetch_acl(struct afs_operation *op) bp[2] = htonl(vp->fid.vnode); bp[3] = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_KERNEL); } @@ -2078,6 +2119,7 @@ void afs_fs_store_acl(struct afs_operation *op) if (acl->size != size) memset((void *)&bp[5] + acl->size, 0, size - acl->size); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_KERNEL); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 78efc9719349..4f04f6f33f46 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -85,8 +85,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, write_seqlock(&vnode->cb_lock); - vnode->cb_v_break = op->cb_v_break; - vnode->cb_s_break = op->cb_s_break; + vnode->cb_v_check = op->cb_v_break; vnode->status = *status; t = status->mtime_client; @@ -146,11 +145,10 @@ static int afs_inode_init_from_status(struct afs_operation *op, if (!vp->scb.have_cb) { /* it's a symlink we just created (the fileserver * didn't give us a callback) */ - vnode->cb_expires_at = ktime_get_real_seconds(); + atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); } else { - vnode->cb_expires_at = vp->scb.callback.expires_at; vnode->cb_server = op->server; - set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + atomic64_set(&vnode->cb_expires_at, vp->scb.callback.expires_at); } write_sequnlock(&vnode->cb_lock); @@ -214,7 +212,8 @@ static void afs_apply_status(struct afs_operation *op, vnode->status = *status; if (vp->dv_before + vp->dv_delta != status->data_version) { - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) + if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) && + atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE) pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n", vnode->fid.vid, vnode->fid.vnode, (unsigned long long)vp->dv_before + vp->dv_delta, @@ -268,9 +267,9 @@ static void afs_apply_callback(struct afs_operation *op, struct afs_vnode *vnode = vp->vnode; if (!afs_cb_is_broken(vp->cb_break_before, vnode)) { - vnode->cb_expires_at = cb->expires_at; - vnode->cb_server = op->server; - set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + if (op->volume->type == AFSVL_RWVOL) + vnode->cb_server = op->server; + atomic64_set(&vnode->cb_expires_at, cb->expires_at); } } @@ -331,7 +330,7 @@ static void afs_fetch_status_success(struct afs_operation *op) if (vnode->netfs.inode.i_state & I_NEW) { ret = afs_inode_init_from_status(op, vp, vnode); - op->error = ret; + afs_op_set_error(op, ret); if (ret == 0) afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb); } else { @@ -542,7 +541,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) BUG_ON(!(inode->i_state & I_NEW)); vnode = AFS_FS_I(inode); - vnode->cb_v_break = as->volume->cb_v_break, + vnode->cb_v_check = atomic_read(&as->volume->cb_v_break), afs_set_netfs_context(vnode); op = afs_alloc_operation(key, as->volume); @@ -572,180 +571,6 @@ error: return ERR_PTR(ret); } -/* - * mark the data attached to an inode as obsolete due to a write on the server - * - might also want to ditch all the outstanding writes and dirty pages - */ -static void afs_zap_data(struct afs_vnode *vnode) -{ - _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); - - afs_invalidate_cache(vnode, 0); - - /* nuke all the non-dirty pages that aren't locked, mapped or being - * written back in a regular file and completely discard the pages in a - * directory or symlink */ - if (S_ISREG(vnode->netfs.inode.i_mode)) - invalidate_remote_inode(&vnode->netfs.inode); - else - invalidate_inode_pages2(vnode->netfs.inode.i_mapping); -} - -/* - * Check to see if we have a server currently serving this volume and that it - * hasn't been reinitialised or dropped from the list. - */ -static bool afs_check_server_good(struct afs_vnode *vnode) -{ - struct afs_server_list *slist; - struct afs_server *server; - bool good; - int i; - - if (vnode->cb_fs_s_break == atomic_read(&vnode->volume->cell->fs_s_break)) - return true; - - rcu_read_lock(); - - slist = rcu_dereference(vnode->volume->servers); - for (i = 0; i < slist->nr_servers; i++) { - server = slist->servers[i].server; - if (server == vnode->cb_server) { - good = (vnode->cb_s_break == server->cb_s_break); - rcu_read_unlock(); - return good; - } - } - - rcu_read_unlock(); - return false; -} - -/* - * Check the validity of a vnode/inode. - */ -bool afs_check_validity(struct afs_vnode *vnode) -{ - enum afs_cb_break_reason need_clear = afs_cb_break_no_break; - time64_t now = ktime_get_real_seconds(); - unsigned int cb_break; - int seq = 0; - - do { - read_seqbegin_or_lock(&vnode->cb_lock, &seq); - cb_break = vnode->cb_break; - - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { - if (vnode->cb_v_break != vnode->volume->cb_v_break) - need_clear = afs_cb_break_for_v_break; - else if (!afs_check_server_good(vnode)) - need_clear = afs_cb_break_for_s_reinit; - else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) - need_clear = afs_cb_break_for_zap; - else if (vnode->cb_expires_at - 10 <= now) - need_clear = afs_cb_break_for_lapsed; - } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - ; - } else { - need_clear = afs_cb_break_no_promise; - } - - } while (need_seqretry(&vnode->cb_lock, seq)); - - done_seqretry(&vnode->cb_lock, seq); - - if (need_clear == afs_cb_break_no_break) - return true; - - write_seqlock(&vnode->cb_lock); - if (need_clear == afs_cb_break_no_promise) - vnode->cb_v_break = vnode->volume->cb_v_break; - else if (cb_break == vnode->cb_break) - __afs_break_callback(vnode, need_clear); - else - trace_afs_cb_miss(&vnode->fid, need_clear); - write_sequnlock(&vnode->cb_lock); - return false; -} - -/* - * Returns true if the pagecache is still valid. Does not sleep. - */ -bool afs_pagecache_valid(struct afs_vnode *vnode) -{ - if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { - if (vnode->netfs.inode.i_nlink) - clear_nlink(&vnode->netfs.inode); - return true; - } - - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) && - afs_check_validity(vnode)) - return true; - - return false; -} - -/* - * validate a vnode/inode - * - there are several things we need to check - * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, - * symlink) - * - parent dir metadata changed (security changes) - * - dentry data changed (write, truncate) - * - dentry metadata changed (security changes) - */ -int afs_validate(struct afs_vnode *vnode, struct key *key) -{ - int ret; - - _enter("{v={%llx:%llu} fl=%lx},%x", - vnode->fid.vid, vnode->fid.vnode, vnode->flags, - key_serial(key)); - - if (afs_pagecache_valid(vnode)) - goto valid; - - down_write(&vnode->validate_lock); - - /* if the promise has expired, we need to check the server again to get - * a new promise - note that if the (parent) directory's metadata was - * changed then the security may be different and we may no longer have - * access */ - if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { - _debug("not promised"); - ret = afs_fetch_status(vnode, key, false, NULL); - if (ret < 0) { - if (ret == -ENOENT) { - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } - goto error_unlock; - } - _debug("new promise [fl=%lx]", vnode->flags); - } - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - _debug("file already deleted"); - ret = -ESTALE; - goto error_unlock; - } - - /* if the vnode's data version number changed then its contents are - * different */ - if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) - afs_zap_data(vnode); - up_write(&vnode->validate_lock); -valid: - _leave(" = 0"); - return 0; - -error_unlock: - up_write(&vnode->validate_lock); - _leave(" = %d", ret); - return ret; -} - /* * read the attributes of an inode */ @@ -755,13 +580,13 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, struct inode *inode = d_inode(path->dentry); struct afs_vnode *vnode = AFS_FS_I(inode); struct key *key; - int ret, seq = 0; + int ret, seq; _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); if (vnode->volume && !(query_flags & AT_STATX_DONT_SYNC) && - !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + atomic64_read(&vnode->cb_expires_at) == AFS_NO_CB_PROMISE) { key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) return PTR_ERR(key); @@ -772,7 +597,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, } do { - read_seqbegin_or_lock(&vnode->cb_lock, &seq); + seq = read_seqbegin(&vnode->cb_lock); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) @@ -784,9 +609,8 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, */ if (S_ISDIR(inode->i_mode)) stat->size = vnode->netfs.remote_i_size; - } while (need_seqretry(&vnode->cb_lock, seq)); + } while (read_seqretry(&vnode->cb_lock, seq)); - done_seqretry(&vnode->cb_lock, seq); return 0; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 7385d62c8cf5..e33ace259cc6 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -33,6 +33,7 @@ struct pagevec; struct afs_call; struct afs_vnode; +struct afs_server_probe; /* * Partial file-locking emulation mode. (The problem being that AFS3 only @@ -72,6 +73,34 @@ enum afs_call_state { AFS_CALL_COMPLETE, /* Completed or failed */ }; +/* + * Address preferences. + */ +struct afs_addr_preference { + union { + struct in_addr ipv4_addr; /* AF_INET address to compare against */ + struct in6_addr ipv6_addr; /* AF_INET6 address to compare against */ + }; + sa_family_t family; /* Which address to use */ + u16 prio; /* Priority */ + u8 subnet_mask; /* How many bits to compare */ +}; + +struct afs_addr_preference_list { + struct rcu_head rcu; + u16 version; /* Incremented when prefs list changes */ + u8 ipv6_off; /* Offset of IPv6 addresses */ + u8 nr; /* Number of addresses in total */ + u8 max_prefs; /* Number of prefs allocated */ + struct afs_addr_preference prefs[] __counted_by(max_prefs); +}; + +struct afs_address { + struct rxrpc_peer *peer; + short last_error; /* Last error from this address */ + u16 prio; /* Address priority */ +}; + /* * List of server addresses. */ @@ -79,15 +108,17 @@ struct afs_addr_list { struct rcu_head rcu; refcount_t usage; u32 version; /* Version */ + unsigned int debug_id; + unsigned int addr_pref_version; /* Version of address preference list */ unsigned char max_addrs; unsigned char nr_addrs; unsigned char preferred; /* Preferred address */ unsigned char nr_ipv4; /* Number of IPv4 addresses */ enum dns_record_source source:8; enum dns_lookup_status status:8; - unsigned long failed; /* Mask of addrs that failed locally/ICMP */ + unsigned long probe_failed; /* Mask of addrs that failed locally/ICMP */ unsigned long responded; /* Mask of addrs that responded */ - struct sockaddr_rxrpc addrs[] __counted_by(max_addrs); + struct afs_address addrs[] __counted_by(max_addrs); #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) }; @@ -96,11 +127,11 @@ struct afs_addr_list { */ struct afs_call { const struct afs_call_type *type; /* type of call */ - struct afs_addr_list *alist; /* Address is alist[addr_ix] */ wait_queue_head_t waitq; /* processes awaiting completion */ struct work_struct async_work; /* async I/O processor */ struct work_struct work; /* actual work processor */ struct rxrpc_call *rxcall; /* RxRPC call handle */ + struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* security for this call */ struct afs_net *net; /* The network namespace */ struct afs_server *server; /* The fileserver record if fs op (pins ref) */ @@ -116,11 +147,14 @@ struct afs_call { }; void *buffer; /* reply receive buffer */ union { - long ret0; /* Value to reply with instead of 0 */ + struct afs_endpoint_state *probe; + struct afs_addr_list *vl_probe; struct afs_addr_list *ret_alist; struct afs_vldb_entry *ret_vldb; char *ret_str; }; + struct afs_fid fid; /* Primary vnode ID (or all zeroes) */ + unsigned char probe_index; /* Address in ->probe_alist */ struct afs_operation *op; unsigned int server_index; refcount_t ref; @@ -133,13 +167,13 @@ struct afs_call { unsigned reply_max; /* maximum size of reply */ unsigned count2; /* count used in unmarshalling */ unsigned char unmarshall; /* unmarshalling phase */ - unsigned char addr_ix; /* Address in ->alist */ bool drop_ref; /* T if need to drop ref for incoming call */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool upgrade; /* T to request service upgrade */ bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ + bool responded; /* Got a response from the call (may be abort) */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ u32 operation_ID; /* operation ID for an incoming call */ @@ -306,6 +340,8 @@ struct afs_net { struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ struct afs_sysnames *sysnames; rwlock_t sysnames_lock; + struct afs_addr_preference_list __rcu *address_prefs; + u16 address_pref_version; /* Statistics counters */ atomic_t n_lookup; /* Number of lookups done */ @@ -379,6 +415,7 @@ struct afs_cell { unsigned int debug_id; /* The volumes belonging to this cell */ + struct rw_semaphore vs_lock; /* Lock for server->volumes */ struct rb_root volumes; /* Tree of volumes on this server */ struct hlist_head proc_volumes; /* procfs volume list */ seqlock_t volume_lock; /* For volumes */ @@ -386,9 +423,6 @@ struct afs_cell { /* Active fileserver interaction state. */ struct rb_root fs_servers; /* afs_server (by server UUID) */ seqlock_t fs_lock; /* For fs_servers */ - struct rw_semaphore fs_open_mmaps_lock; - struct list_head fs_open_mmaps; /* List of vnodes that are mmapped */ - atomic_t fs_s_break; /* Counter of CB.InitCallBackState messages */ /* VL server list. */ rwlock_t vl_servers_lock; /* Lock on vl_servers */ @@ -412,13 +446,14 @@ struct afs_vlserver { rwlock_t lock; /* Lock on addresses */ refcount_t ref; unsigned int rtt; /* Server's current RTT in uS */ + unsigned int debug_id; /* Probe state */ wait_queue_head_t probe_wq; atomic_t probe_outstanding; spinlock_t probe_lock; struct { - unsigned int rtt; /* RTT in uS */ + unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ u32 abort_code; short error; unsigned short flags; @@ -428,6 +463,7 @@ struct afs_vlserver { #define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */ } probe; + u16 service_id; /* Service ID we're using */ u16 port; u16 name_len; /* Length of name */ char name[]; /* Server name, case-flattened */ @@ -477,12 +513,39 @@ struct afs_vldb_entry { #define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ #define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ #define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */ + u8 vlsf_flags[AFS_NMAXNSERVERS]; short error; u8 nr_servers; /* Number of server records */ u8 name_len; u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; +/* + * Fileserver endpoint state. The records the addresses of a fileserver's + * endpoints and the state and result of a round of probing on them. This + * allows the rotation algorithm to access those results without them being + * erased by a subsequent round of probing. + */ +struct afs_endpoint_state { + struct rcu_head rcu; + struct afs_addr_list *addresses; /* The addresses being probed */ + unsigned long responsive_set; /* Bitset of responsive endpoints */ + unsigned long failed_set; /* Bitset of endpoints we failed to probe */ + refcount_t ref; + unsigned int server_id; /* Debug ID of server */ + unsigned int probe_seq; /* Probe sequence (from server::probe_counter) */ + atomic_t nr_probing; /* Number of outstanding probes */ + unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ + s32 abort_code; + short error; + unsigned long flags; +#define AFS_ESTATE_RESPONDED 0 /* Set if the server responded */ +#define AFS_ESTATE_SUPERSEDED 1 /* Set if this record has been superseded */ +#define AFS_ESTATE_IS_YFS 2 /* Set if probe upgraded to YFS */ +#define AFS_ESTATE_NOT_YFS 3 /* Set if probe didn't upgrade to YFS */ +#define AFS_ESTATE_LOCAL_FAILURE 4 /* Set if there was a local failure (eg. ENOMEM) */ +}; + /* * Record of fileserver with which we're actively communicating. */ @@ -493,7 +556,6 @@ struct afs_server { struct afs_uuid _uuid; }; - struct afs_addr_list __rcu *addresses; struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node uuid_rb; /* Link in net->fs_servers */ struct afs_server __rcu *uuid_next; /* Next server with same UUID */ @@ -502,7 +564,7 @@ struct afs_server { struct hlist_node addr4_link; /* Link in net->fs_addresses4 */ struct hlist_node addr6_link; /* Link in net->fs_addresses6 */ struct hlist_node proc_link; /* Link in net->fs_proc */ - struct work_struct initcb_work; /* Work for CB.InitCallBackState* */ + struct list_head volumes; /* RCU list of afs_server_entry objects */ struct afs_server *gc_next; /* Next server in manager's list */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; @@ -520,44 +582,47 @@ struct afs_server { refcount_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ + u16 service_id; /* Service ID we're using. */ unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Debugging ID for traces */ /* file service access */ rwlock_t fs_lock; /* access lock */ - /* callback promise management */ - unsigned cb_s_break; /* Break-everything counter. */ - /* Probe state */ + struct afs_endpoint_state __rcu *endpoint_state; /* Latest endpoint/probe state */ unsigned long probed_at; /* Time last probe was dispatched (jiffies) */ wait_queue_head_t probe_wq; - atomic_t probe_outstanding; + unsigned int probe_counter; /* Number of probes issued */ spinlock_t probe_lock; - struct { - unsigned int rtt; /* RTT in uS */ - u32 abort_code; - short error; - bool responded:1; - bool is_yfs:1; - bool not_yfs:1; - bool local_failure:1; - } probe; }; +enum afs_ro_replicating { + AFS_RO_NOT_REPLICATING, /* Not doing replication */ + AFS_RO_REPLICATING_USE_OLD, /* Replicating; use old version */ + AFS_RO_REPLICATING_USE_NEW, /* Replicating; switch to new version */ +} __mode(byte); + /* * Replaceable volume server list. */ struct afs_server_entry { struct afs_server *server; + struct afs_volume *volume; + struct list_head slink; /* Link in server->volumes */ + time64_t cb_expires_at; /* Time at which volume-level callback expires */ + unsigned long flags; +#define AFS_SE_EXCLUDED 0 /* Set if server is to be excluded in rotation */ +#define AFS_SE_VOLUME_OFFLINE 1 /* Set if volume offline notice given */ +#define AFS_SE_VOLUME_BUSY 2 /* Set if volume busy notice given */ }; struct afs_server_list { struct rcu_head rcu; - afs_volid_t vids[AFS_MAXTYPES]; /* Volume IDs */ refcount_t usage; + bool attached; /* T if attached to servers */ + enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */ unsigned char nr_servers; - unsigned char preferred; /* Preferred server */ unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ unsigned int seq; /* Set to ->servers_seq when installed */ rwlock_t lock; @@ -568,25 +633,23 @@ struct afs_server_list { * Live AFS volume management. */ struct afs_volume { - union { - struct rcu_head rcu; - afs_volid_t vid; /* volume ID */ - }; + struct rcu_head rcu; + afs_volid_t vid; /* The volume ID of this volume */ + afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */ refcount_t ref; time64_t update_at; /* Time at which to next update */ struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node cell_node; /* Link in cell->volumes */ struct hlist_node proc_link; /* Link in cell->proc_volumes */ struct super_block __rcu *sb; /* Superblock on which inodes reside */ + struct work_struct destructor; /* Deferred destructor */ unsigned long flags; #define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */ #define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */ #define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */ #define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */ -#define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */ -#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */ -#define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */ -#define AFS_VOLUME_RM_TREE 7 /* - Set if volume removed from cell->volumes */ +#define AFS_VOLUME_MAYBE_NO_IBULK 4 /* - T if some servers don't have InlineBulkStatus */ +#define AFS_VOLUME_RM_TREE 5 /* - Set if volume removed from cell->volumes */ #ifdef CONFIG_AFS_FSCACHE struct fscache_volume *cache; /* Caching cookie */ #endif @@ -594,8 +657,21 @@ struct afs_volume { rwlock_t servers_lock; /* Lock for ->servers */ unsigned int servers_seq; /* Incremented each time ->servers changes */ - unsigned cb_v_break; /* Break-everything counter. */ + /* RO release tracking */ + struct mutex volsync_lock; /* Time/state evaluation lock */ + time64_t creation_time; /* Volume creation time (or TIME64_MIN) */ + time64_t update_time; /* Volume update time (or TIME64_MIN) */ + + /* Callback management */ + struct mutex cb_check_lock; /* Lock to control race to check after v_break */ + time64_t cb_expires_at; /* Earliest volume callback expiry time */ + atomic_t cb_ro_snapshot; /* RO volume update-from-snapshot counter */ + atomic_t cb_v_break; /* Volume-break event counter. */ + atomic_t cb_v_check; /* Volume-break has-been-checked counter. */ + atomic_t cb_scrub; /* Scrub-all-data event counter. */ rwlock_t cb_v_break_lock; + struct rw_semaphore open_mmaps_lock; + struct list_head open_mmaps; /* List of vnodes that are mmapped */ afs_voltype_t type; /* type of volume */ char type_force; /* force volume type (suppress R/O -> R/W) */ @@ -634,7 +710,6 @@ struct afs_vnode { spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; -#define AFS_VNODE_CB_PROMISED 0 /* Set if vnode has a callback promise */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ #define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ @@ -660,13 +735,14 @@ struct afs_vnode { struct list_head cb_mmap_link; /* Link in cell->fs_open_mmaps */ void *cb_server; /* Server with callback/filelock */ atomic_t cb_nr_mmap; /* Number of mmaps */ - unsigned int cb_fs_s_break; /* Mass server break counter (cell->fs_s_break) */ - unsigned int cb_s_break; /* Mass break counter on ->server */ - unsigned int cb_v_break; /* Mass break counter on ->volume */ + unsigned int cb_ro_snapshot; /* RO volume release counter on ->volume */ + unsigned int cb_scrub; /* Scrub counter on ->volume */ unsigned int cb_break; /* Break counter on vnode */ + unsigned int cb_v_check; /* Break check counter on ->volume */ seqlock_t cb_lock; /* Lock for ->cb_server, ->status, ->cb_*break */ - time64_t cb_expires_at; /* time at which callback expires */ + atomic64_t cb_expires_at; /* time at which callback expires */ +#define AFS_NO_CB_PROMISE TIME64_MIN }; static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) @@ -714,40 +790,49 @@ struct afs_permits { * Error prioritisation and accumulation. */ struct afs_error { - short error; /* Accumulated error */ + s32 abort_code; /* Cumulative abort code */ + short error; /* Cumulative error */ bool responded; /* T if server responded */ -}; - -/* - * Cursor for iterating over a server's address list. - */ -struct afs_addr_cursor { - struct afs_addr_list *alist; /* Current address list (pins ref) */ - unsigned long tried; /* Tried addresses */ - signed char index; /* Current address */ - bool responded; /* T if the current address responded */ - unsigned short nr_iterations; /* Number of address iterations */ - short error; - u32 abort_code; + bool aborted; /* T if ->error is from an abort */ }; /* * Cursor for iterating over a set of volume location servers. */ struct afs_vl_cursor { - struct afs_addr_cursor ac; struct afs_cell *cell; /* The cell we're querying */ struct afs_vlserver_list *server_list; /* Current server list (pins ref) */ struct afs_vlserver *server; /* Server on which this resides */ + struct afs_addr_list *alist; /* Current address list (pins ref) */ struct key *key; /* Key for the server */ - unsigned long untried; /* Bitmask of untried servers */ - short index; /* Current server */ - short error; + unsigned long untried_servers; /* Bitmask of untried servers */ + unsigned long addr_tried; /* Tried addresses */ + struct afs_error cumul_error; /* Cumulative error */ + unsigned int debug_id; + s32 call_abort_code; + short call_error; /* Error from single call */ + short server_index; /* Current server */ + signed char addr_index; /* Current address */ unsigned short flags; #define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */ #define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */ #define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */ - unsigned short nr_iterations; /* Number of server iterations */ + short nr_iterations; /* Number of server iterations */ + bool call_responded; /* T if the current address responded */ +}; + +/* + * Fileserver state tracking for an operation. An array of these is kept, + * indexed by server index. + */ +struct afs_server_state { + /* Tracking of fileserver probe state. Other operations may interfere + * by probing a fileserver when accessing other volumes. + */ + unsigned int probe_seq; + unsigned long untried_addrs; /* Addresses we haven't tried yet */ + struct wait_queue_entry probe_waiter; + struct afs_endpoint_state *endpoint_state; /* Endpoint state being monitored */ }; /* @@ -768,7 +853,7 @@ struct afs_vnode_param { struct afs_fid fid; /* Fid to access */ struct afs_status_cb scb; /* Returned status and callback promise */ afs_dataversion_t dv_before; /* Data version before the call */ - unsigned int cb_break_before; /* cb_break + cb_s_break before the call */ + unsigned int cb_break_before; /* cb_break before the call */ u8 dv_delta; /* Expected change in data version */ bool put_vnode:1; /* T if we have a ref on the vnode */ bool need_io_lock:1; /* T if we need the I/O lock on this */ @@ -793,17 +878,17 @@ struct afs_operation { struct afs_volume *volume; /* Volume being accessed */ struct afs_vnode_param file[2]; struct afs_vnode_param *more_files; - struct afs_volsync volsync; + struct afs_volsync pre_volsync; /* Volsync before op */ + struct afs_volsync volsync; /* Volsync returned by op */ struct dentry *dentry; /* Dentry to be altered */ struct dentry *dentry_2; /* Second dentry to be altered */ struct timespec64 mtime; /* Modification time to record */ struct timespec64 ctime; /* Change time to set */ + struct afs_error cumul_error; /* Cumulative error */ short nr_files; /* Number of entries in file[], more_files */ - short error; unsigned int debug_id; unsigned int cb_v_break; /* Volume break counter before op */ - unsigned int cb_s_break; /* Server break counter before op */ union { struct { @@ -848,13 +933,19 @@ struct afs_operation { }; /* Fileserver iteration state */ - struct afs_addr_cursor ac; struct afs_server_list *server_list; /* Current server list (pins ref) */ struct afs_server *server; /* Server we're using (ref pinned by server_list) */ + struct afs_endpoint_state *estate; /* Current endpoint state (doesn't pin ref) */ + struct afs_server_state *server_states; /* States of the servers involved */ struct afs_call *call; - unsigned long untried; /* Bitmask of untried servers */ - short index; /* Current server */ - unsigned short nr_iterations; /* Number of server iterations */ + unsigned long untried_servers; /* Bitmask of untried servers */ + unsigned long addr_tried; /* Tried addresses */ + s32 call_abort_code; /* Abort code from single call */ + short call_error; /* Error from single call */ + short server_index; /* Current server */ + short nr_iterations; /* Number of server iterations */ + signed char addr_index; /* Current address */ + bool call_responded; /* T if the current address responded */ unsigned int flags; #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ @@ -956,31 +1047,32 @@ static inline bool afs_is_folio_dirty_mmapped(unsigned long priv) /* * addr_list.c */ -static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist) -{ - if (alist) - refcount_inc(&alist->usage); - return alist; -} -extern struct afs_addr_list *afs_alloc_addrlist(unsigned int, - unsigned short, - unsigned short); -extern void afs_put_addrlist(struct afs_addr_list *); +struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); +extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr); +extern void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *, const char *, size_t, char, unsigned short, unsigned short); +bool afs_addr_list_same(const struct afs_addr_list *a, + const struct afs_addr_list *b); extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *); -extern bool afs_iterate_addresses(struct afs_addr_cursor *); -extern int afs_end_cursor(struct afs_addr_cursor *); -extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16); -extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16); +extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr, + __be32 xdr, u16 port); +extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr, + __be32 *xdr, u16 port); + +/* + * addr_prefs.c + */ +int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size); +void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist); +void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist); /* * callback.c */ extern void afs_invalidate_mmap_work(struct work_struct *); -extern void afs_server_init_callback_work(struct work_struct *work); extern void afs_init_callback_state(struct afs_server *); extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); @@ -988,13 +1080,15 @@ extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) { - return vnode->cb_break + vnode->cb_v_break; + return vnode->cb_break + vnode->cb_ro_snapshot + vnode->cb_scrub; } static inline bool afs_cb_is_broken(unsigned int cb_break, const struct afs_vnode *vnode) { - return cb_break != (vnode->cb_break + vnode->volume->cb_v_break); + return cb_break != (vnode->cb_break + + atomic_read(&vnode->volume->cb_ro_snapshot) + + atomic_read(&vnode->volume->cb_scrub)); } /* @@ -1110,10 +1204,11 @@ extern void afs_fs_get_volume_status(struct afs_operation *); extern void afs_fs_set_lock(struct afs_operation *); extern void afs_fs_extend_lock(struct afs_operation *); extern void afs_fs_release_lock(struct afs_operation *); -extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, - struct afs_addr_cursor *, struct key *); -extern bool afs_fs_get_capabilities(struct afs_net *, struct afs_server *, - struct afs_addr_cursor *, struct key *); +int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, + struct afs_address *addr, struct key *key); +bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate, unsigned int addr_index, + struct key *key); extern void afs_fs_inline_bulk_status(struct afs_operation *); struct afs_acl { @@ -1133,11 +1228,6 @@ extern bool afs_begin_vnode_operation(struct afs_operation *); extern void afs_wait_for_operation(struct afs_operation *); extern int afs_do_sync_operation(struct afs_operation *); -static inline void afs_op_nomem(struct afs_operation *op) -{ - op->error = -ENOMEM; -} - static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n, struct afs_vnode *vnode) { @@ -1154,12 +1244,17 @@ static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n, /* * fs_probe.c */ +struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate, + enum afs_estate_trace where); +void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where); extern void afs_fileserver_probe_result(struct afs_call *); -extern void afs_fs_probe_fileserver(struct afs_net *, struct afs_server *, struct key *, bool); -extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long); +void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct afs_addr_list *new_addrs, struct key *key); +int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr); extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_fs_probe_dispatcher(struct work_struct *); -extern int afs_wait_for_one_fs_probe(struct afs_server *, bool); +int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, + unsigned long exclude, bool is_intr); extern void afs_fs_probe_cleanup(struct afs_net *); /* @@ -1173,9 +1268,6 @@ extern int afs_ilookup5_test_by_fid(struct inode *, void *); extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); -extern bool afs_check_validity(struct afs_vnode *); -extern int afs_validate(struct afs_vnode *, struct key *); -bool afs_pagecache_valid(struct afs_vnode *); extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, struct kstat *, u32, unsigned int); extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); @@ -1231,6 +1323,31 @@ static inline void __afs_stat(atomic_t *s) extern int afs_abort_to_error(u32); extern void afs_prioritise_error(struct afs_error *, int, u32); +static inline void afs_op_nomem(struct afs_operation *op) +{ + op->cumul_error.error = -ENOMEM; +} + +static inline int afs_op_error(const struct afs_operation *op) +{ + return op->cumul_error.error; +} + +static inline s32 afs_op_abort_code(const struct afs_operation *op) +{ + return op->cumul_error.abort_code; +} + +static inline int afs_op_set_error(struct afs_operation *op, int error) +{ + return op->cumul_error.error = error; +} + +static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code) +{ + afs_prioritise_error(&op->cumul_error, error, abort_code); +} + /* * mntpt.c */ @@ -1261,6 +1378,7 @@ static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {} /* * rotate.c */ +void afs_clear_server_states(struct afs_operation *op); extern bool afs_select_fileserver(struct afs_operation *); extern void afs_dump_edestaddrreq(const struct afs_operation *); @@ -1273,8 +1391,8 @@ extern int __net_init afs_open_socket(struct afs_net *); extern void __net_exit afs_close_socket(struct afs_net *); extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); -extern void afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t); -extern long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *); +void afs_make_call(struct afs_call *call, gfp_t gfp); +void afs_wait_for_call_to_complete(struct afs_call *call); extern struct afs_call *afs_alloc_flat_call(struct afs_net *, const struct afs_call_type *, size_t, size_t); @@ -1287,12 +1405,16 @@ extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call, gfp_t gfp) { - op->call = call; - op->type = call->type; - call->op = op; - call->key = op->key; - call->intr = !(op->flags & AFS_OPERATION_UNINTR); - afs_make_call(&op->ac, call, gfp); + struct afs_addr_list *alist = op->estate->addresses; + + op->call = call; + op->type = call->type; + call->op = op; + call->key = op->key; + call->intr = !(op->flags & AFS_OPERATION_UNINTR); + call->peer = rxrpc_kernel_get_peer(alist->addrs[op->addr_index].peer); + call->service_id = op->server->service_id; + afs_make_call(call, gfp); } static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) @@ -1401,8 +1523,7 @@ extern void __exit afs_clean_up_permit_cache(void); */ extern spinlock_t afs_server_peer_lock; -extern struct afs_server *afs_find_server(struct afs_net *, - const struct sockaddr_rxrpc *); +extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *); extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); @@ -1414,7 +1535,7 @@ extern void afs_manage_servers(struct work_struct *); extern void afs_servers_timer(struct timer_list *); extern void afs_fs_probe_timer(struct timer_list *); extern void __net_exit afs_purge_servers(struct afs_net *); -extern bool afs_check_server_record(struct afs_operation *, struct afs_server *); +bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key); static inline void afs_inc_servers_outstanding(struct afs_net *net) { @@ -1442,10 +1563,14 @@ static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list } extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *); -extern struct afs_server_list *afs_alloc_server_list(struct afs_cell *, struct key *, - struct afs_vldb_entry *, - u8); +struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, + struct key *key, + struct afs_vldb_entry *vldb); extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *); +void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist); +void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist, + struct afs_server_list *old); +void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist); /* * super.c @@ -1453,14 +1578,25 @@ extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server extern int __init afs_fs_init(void); extern void afs_fs_exit(void); +/* + * validation.c + */ +bool afs_check_validity(const struct afs_vnode *vnode); +int afs_update_volume_state(struct afs_operation *op); +int afs_validate(struct afs_vnode *vnode, struct key *key); + /* * vlclient.c */ extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *, const char *, int); extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *); -extern struct afs_call *afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, - struct key *, struct afs_vlserver *, unsigned int); +struct afs_call *afs_vl_get_capabilities(struct afs_net *net, + struct afs_addr_list *alist, + unsigned int addr_index, + struct key *key, + struct afs_vlserver *server, + unsigned int server_index); extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *); extern char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *); @@ -1516,7 +1652,7 @@ extern int afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); -extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace); +void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* @@ -1603,7 +1739,7 @@ static inline void afs_update_dentry_version(struct afs_operation *op, struct afs_vnode_param *dir_vp, struct dentry *dentry) { - if (!op->error) + if (!op->cumul_error.error) dentry->d_fsdata = (void *)(unsigned long)dir_vp->scb.status.data_version; } diff --git a/fs/afs/main.c b/fs/afs/main.c index 6425c81d07de..1b3bd21c168a 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -156,6 +156,7 @@ static void __net_exit afs_net_exit(struct net *net_ns) afs_close_socket(net); afs_proc_cleanup(net); afs_put_sysnames(net->sysnames); + kfree_rcu(rcu_access_pointer(net->address_prefs), rcu); } static struct pernet_operations afs_net_ops = { diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 805328ca5428..b8180bf2281f 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -116,6 +116,8 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) { switch (error) { case 0: + e->aborted = false; + e->error = 0; return; default: if (e->error == -ETIMEDOUT || @@ -161,12 +163,16 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) if (e->responded) return; e->error = error; + e->aborted = false; return; case -ECONNABORTED: - error = afs_abort_to_error(abort_code); - fallthrough; + e->error = afs_abort_to_error(abort_code); + e->aborted = true; + e->responded = true; + return; case -ENETRESET: /* Responded, but we seem to have changed address */ + e->aborted = false; e->responded = true; e->error = error; return; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 2a0c83d71565..3bd02571f30d 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -146,6 +146,55 @@ inval: goto done; } +/* + * Display the list of addr_prefs known to the namespace. + */ +static int afs_proc_addr_prefs_show(struct seq_file *m, void *v) +{ + struct afs_addr_preference_list *preflist; + struct afs_addr_preference *pref; + struct afs_net *net = afs_seq2net_single(m); + union { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } addr; + unsigned int i; + char buf[44]; /* Maximum ipv6 + max subnet is 43 */ + + rcu_read_lock(); + preflist = rcu_dereference(net->address_prefs); + + if (!preflist) { + seq_puts(m, "NO PREFS\n"); + return 0; + } + + seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n", + preflist->version, preflist->ipv6_off, preflist->nr, preflist->max_prefs); + + memset(&addr, 0, sizeof(addr)); + + for (i = 0; i < preflist->nr; i++) { + pref = &preflist->prefs[i]; + + addr.sin.sin_family = pref->family; + if (pref->family == AF_INET) { + memcpy(&addr.sin.sin_addr, &pref->ipv4_addr, + sizeof(addr.sin.sin_addr)); + snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin, pref->subnet_mask); + seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio); + } else { + memcpy(&addr.sin6.sin6_addr, &pref->ipv6_addr, + sizeof(addr.sin6.sin6_addr)); + snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin6, pref->subnet_mask); + seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio); + } + } + + rcu_read_lock(); + return 0; +} + /* * Display the name of the current workstation cell. */ @@ -307,7 +356,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) for (i = 0; i < alist->nr_addrs; i++) seq_printf(m, " %c %pISpc\n", alist->preferred == i ? '>' : '-', - &alist->addrs[i].transport); + rxrpc_kernel_remote_addr(alist->addrs[i].peer)); } seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt); seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n", @@ -375,32 +424,45 @@ static const struct seq_operations afs_proc_cell_vlservers_ops = { */ static int afs_proc_servers_show(struct seq_file *m, void *v) { - struct afs_server *server; + struct afs_endpoint_state *estate; struct afs_addr_list *alist; + struct afs_server *server; + unsigned long failed; int i; if (v == SEQ_START_TOKEN) { - seq_puts(m, "UUID REF ACT\n"); + seq_puts(m, "UUID REF ACT CELL\n"); return 0; } server = list_entry(v, struct afs_server, proc_link); - alist = rcu_dereference(server->addresses); - seq_printf(m, "%pU %3d %3d\n", + estate = rcu_dereference(server->endpoint_state); + alist = estate->addresses; + seq_printf(m, "%pU %3d %3d %s\n", &server->uuid, refcount_read(&server->ref), - atomic_read(&server->active)); - seq_printf(m, " - info: fl=%lx rtt=%u brk=%x\n", - server->flags, server->rtt, server->cb_s_break); - seq_printf(m, " - probe: last=%d out=%d\n", - (int)(jiffies - server->probed_at) / HZ, - atomic_read(&server->probe_outstanding)); - seq_printf(m, " - ALIST v=%u rsp=%lx f=%lx\n", - alist->version, alist->responded, alist->failed); - for (i = 0; i < alist->nr_addrs; i++) - seq_printf(m, " [%x] %pISpc%s\n", - i, &alist->addrs[i].transport, - alist->preferred == i ? "*" : ""); + atomic_read(&server->active), + server->cell->name); + seq_printf(m, " - info: fl=%lx rtt=%u\n", + server->flags, server->rtt); + seq_printf(m, " - probe: last=%d\n", + (int)(jiffies - server->probed_at) / HZ); + failed = estate->failed_set; + seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n", + estate->probe_seq, atomic_read(&estate->nr_probing), + estate->responsive_set, estate->failed_set); + seq_printf(m, " - ALIST v=%u ap=%u\n", + alist->version, alist->addr_pref_version); + for (i = 0; i < alist->nr_addrs; i++) { + const struct afs_address *addr = &alist->addrs[i]; + + seq_printf(m, " [%x] %pISpc%s rtt=%d err=%d p=%u\n", + i, rxrpc_kernel_remote_addr(addr->peer), + alist->preferred == i ? "*" : + test_bit(i, &failed) ? "!" : "", + rxrpc_kernel_get_srtt(addr->peer), + addr->last_error, addr->prio); + } return 0; } @@ -681,7 +743,11 @@ int afs_proc_init(struct afs_net *net) &afs_proc_sysname_ops, afs_proc_sysname_write, sizeof(struct seq_net_private), - NULL)) + NULL) || + !proc_create_net_single_write("addr_prefs", 0644, p, + afs_proc_addr_prefs_show, + afs_proc_addr_prefs_write, + NULL)) goto error_tree; net->proc_afs = p; diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index a840c3588ebb..700a27bc8c25 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -13,6 +13,19 @@ #include #include "internal.h" #include "afs_fs.h" +#include "protocol_uae.h" + +void afs_clear_server_states(struct afs_operation *op) +{ + unsigned int i; + + if (op->server_states) { + for (i = 0; i < op->server_list->nr_servers; i++) + afs_put_endpoint_state(op->server_states[i].endpoint_state, + afs_estate_trace_put_server_state); + kfree(op->server_states); + } +} /* * Begin iteration through a server list, starting with the vnode's last used @@ -25,14 +38,41 @@ static bool afs_start_fs_iteration(struct afs_operation *op, void *cb_server; int i; + trace_afs_rotate(op, afs_rotate_trace_start, 0); + read_lock(&op->volume->servers_lock); op->server_list = afs_get_serverlist( rcu_dereference_protected(op->volume->servers, lockdep_is_held(&op->volume->servers_lock))); read_unlock(&op->volume->servers_lock); - op->untried = (1UL << op->server_list->nr_servers) - 1; - op->index = READ_ONCE(op->server_list->preferred); + op->server_states = kcalloc(op->server_list->nr_servers, sizeof(op->server_states[0]), + GFP_KERNEL); + if (!op->server_states) { + afs_op_nomem(op); + trace_afs_rotate(op, afs_rotate_trace_nomem, 0); + return false; + } + + rcu_read_lock(); + for (i = 0; i < op->server_list->nr_servers; i++) { + struct afs_endpoint_state *estate; + struct afs_server_state *s = &op->server_states[i]; + + server = op->server_list->servers[i].server; + estate = rcu_dereference(server->endpoint_state); + s->endpoint_state = afs_get_endpoint_state(estate, + afs_estate_trace_get_server_state); + s->probe_seq = estate->probe_seq; + s->untried_addrs = (1UL << estate->addresses->nr_addrs) - 1; + init_waitqueue_entry(&s->probe_waiter, current); + afs_get_address_preferences(op->net, estate->addresses); + } + rcu_read_unlock(); + + + op->untried_servers = (1UL << op->server_list->nr_servers) - 1; + op->server_index = -1; cb_server = vnode->cb_server; if (cb_server) { @@ -40,7 +80,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op, for (i = 0; i < op->server_list->nr_servers; i++) { server = op->server_list->servers[i].server; if (server == cb_server) { - op->index = i; + op->server_index = i; goto found_interest; } } @@ -50,7 +90,8 @@ static bool afs_start_fs_iteration(struct afs_operation *op, * and have to return an error. */ if (op->flags & AFS_OPERATION_CUR_ONLY) { - op->error = -ESTALE; + afs_op_set_error(op, -ESTALE); + trace_afs_rotate(op, afs_rotate_trace_stale_lock, 0); return false; } @@ -58,7 +99,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op, write_seqlock(&vnode->cb_lock); ASSERTCMP(cb_server, ==, vnode->cb_server); vnode->cb_server = NULL; - if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) + if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) vnode->cb_break++; write_sequnlock(&vnode->cb_lock); } @@ -70,7 +111,7 @@ found_interest: /* * Post volume busy note. */ -static void afs_busy(struct afs_volume *volume, u32 abort_code) +static void afs_busy(struct afs_operation *op, u32 abort_code) { const char *m; @@ -81,7 +122,8 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code) default: m = "busy"; break; } - pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m); + pr_notice("kAFS: Volume %llu '%s' on server %pU is %s\n", + op->volume->vid, op->volume->name, &op->server->uuid, m); } /* @@ -89,10 +131,11 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code) */ static bool afs_sleep_and_retry(struct afs_operation *op) { + trace_afs_rotate(op, afs_rotate_trace_busy_sleep, 0); if (!(op->flags & AFS_OPERATION_UNINTR)) { msleep_interruptible(1000); if (signal_pending(current)) { - op->error = -ERESTARTSYS; + afs_op_set_error(op, -ERESTARTSYS); return false; } } else { @@ -111,62 +154,105 @@ bool afs_select_fileserver(struct afs_operation *op) struct afs_addr_list *alist; struct afs_server *server; struct afs_vnode *vnode = op->file[0].vnode; - struct afs_error e; - u32 rtt; - int error = op->ac.error, i; + unsigned long set, failed; + s32 abort_code = op->call_abort_code; + int best_prio = 0; + int error = op->call_error, addr_index, i, j; - _enter("%lx[%d],%lx[%d],%d,%d", - op->untried, op->index, - op->ac.tried, op->ac.index, - error, op->ac.abort_code); + op->nr_iterations++; + + _enter("OP=%x+%x,%llx,%u{%lx},%u{%lx},%d,%d", + op->debug_id, op->nr_iterations, op->volume->vid, + op->server_index, op->untried_servers, + op->addr_index, op->addr_tried, + error, abort_code); if (op->flags & AFS_OPERATION_STOP) { + trace_afs_rotate(op, afs_rotate_trace_stopped, 0); _leave(" = f [stopped]"); return false; } - op->nr_iterations++; - - /* Evaluate the result of the previous operation, if there was one. */ - switch (error) { - case SHRT_MAX: + if (op->nr_iterations == 0) goto start; + WRITE_ONCE(op->estate->addresses->addrs[op->addr_index].last_error, error); + trace_afs_rotate(op, afs_rotate_trace_iter, op->call_error); + + /* Evaluate the result of the previous operation, if there was one. */ + switch (op->call_error) { case 0: + clear_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags); + clear_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags); + op->cumul_error.responded = true; + + /* We succeeded, but we may need to redo the op from another + * server if we're looking at a set of RO volumes where some of + * the servers have not yet been brought up to date lest we + * regress the data. We only switch to the new version once + * >=50% of the servers are updated. + */ + error = afs_update_volume_state(op); + if (error != 0) { + if (error == 1) { + afs_sleep_and_retry(op); + goto restart_from_beginning; + } + afs_op_set_error(op, error); + goto failed; + } + fallthrough; default: /* Success or local failure. Stop. */ - op->error = error; + afs_op_set_error(op, error); op->flags |= AFS_OPERATION_STOP; + trace_afs_rotate(op, afs_rotate_trace_stop, error); _leave(" = f [okay/local %d]", error); return false; case -ECONNABORTED: /* The far side rejected the operation on some grounds. This * might involve the server being busy or the volume having been moved. + * + * Note that various V* errors should not be sent to a cache manager + * by a fileserver as they should be translated to more modern UAE* + * errors instead. IBM AFS and OpenAFS fileservers, however, do leak + * these abort codes. */ - switch (op->ac.abort_code) { + trace_afs_rotate(op, afs_rotate_trace_aborted, abort_code); + op->cumul_error.responded = true; + switch (abort_code) { case VNOVOL: /* This fileserver doesn't know about the volume. * - May indicate that the VL is wrong - retry once and compare * the results. * - May indicate that the fileserver couldn't attach to the vol. + * - The volume might have been temporarily removed so that it can + * be replaced by a volume restore. "vos" might have ended one + * transaction and has yet to create the next. + * - The volume might not be blessed or might not be in-service + * (administrative action). */ if (op->flags & AFS_OPERATION_VNOVOL) { - op->error = -EREMOTEIO; + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); goto next_server; } write_lock(&op->volume->servers_lock); - op->server_list->vnovol_mask |= 1 << op->index; + op->server_list->vnovol_mask |= 1 << op->server_index; write_unlock(&op->volume->servers_lock); set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags); error = afs_check_volume_status(op->volume, op); - if (error < 0) - goto failed_set_error; + if (error < 0) { + afs_op_set_error(op, error); + goto failed; + } if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) { - op->error = -ENOMEDIUM; + afs_op_set_error(op, -ENOMEDIUM); goto failed; } @@ -174,7 +260,7 @@ bool afs_select_fileserver(struct afs_operation *op) * it's the fileserver having trouble. */ if (rcu_access_pointer(op->volume->servers) == op->server_list) { - op->error = -EREMOTEIO; + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); goto next_server; } @@ -183,50 +269,99 @@ bool afs_select_fileserver(struct afs_operation *op) _leave(" = t [vnovol]"); return true; - case VSALVAGE: /* TODO: Should this return an error or iterate? */ case VVOLEXISTS: - case VNOSERVICE: case VONLINE: - case VDISKFULL: - case VOVERQUOTA: - op->error = afs_abort_to_error(op->ac.abort_code); + /* These should not be returned from the fileserver. */ + pr_warn("Fileserver returned unexpected abort %d\n", + abort_code); + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); goto next_server; + case VNOSERVICE: + /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver + * if the volume was neither in-service nor administratively + * blessed. All usage was replaced by VNOVOL because AFS 3.1 and + * earlier cache managers did not handle VNOSERVICE and assumed + * it was the client OSes errno 105. + * + * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the + * fileserver idle dead time error which was sent in place of + * RX_CALL_TIMEOUT (-3). The error was intended to be sent if the + * fileserver took too long to send a reply to the client. + * RX_CALL_TIMEOUT would have caused the cache manager to mark the + * server down whereas VNOSERVICE since AFS 3.2 would cause cache + * manager to temporarily (up to 15 minutes) mark the volume + * instance as unusable. + * + * The idle dead logic resulted in cache inconsistency since a + * state changing call that the cache manager assumed was dead + * could still be processed to completion by the fileserver. This + * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer + * returned. However, many 1.4.8 through 1.6.24 fileservers are + * still in existence. + * + * AuriStorFS fileservers have never returned VNOSERVICE. + * + * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT. + */ + case RX_CALL_TIMEOUT: + afs_op_accumulate_error(op, -ETIMEDOUT, abort_code); + goto next_server; + + case VSALVAGING: /* This error should not be leaked to cache managers + * but is from OpenAFS demand attach fileservers. + * It should be treated as an alias for VOFFLINE. + */ + case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */ case VOFFLINE: - if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) { - afs_busy(op->volume, op->ac.abort_code); - clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); + /* The volume is in use by the volserver or another volume utility + * for an operation that might alter the contents. The volume is + * expected to come back but it might take a long time (could be + * days). + */ + if (!test_and_set_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags)) { + afs_busy(op, abort_code); + clear_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags); } if (op->flags & AFS_OPERATION_NO_VSLEEP) { - op->error = -EADV; - goto failed; - } - if (op->flags & AFS_OPERATION_CUR_ONLY) { - op->error = -ESTALE; + afs_op_set_error(op, -EADV); goto failed; } goto busy; - case VSALVAGING: - case VRESTARTING: + case VRESTARTING: /* The fileserver is either shutting down or starting up. */ case VBUSY: - /* Retry after going round all the servers unless we - * have a file lock we need to maintain. + /* The volume is in use by the volserver or another volume + * utility for an operation that is not expected to alter the + * contents of the volume. VBUSY does not need to be returned + * for a ROVOL or BACKVOL bound to an ITBusy volserver + * transaction. The fileserver is permitted to continue serving + * content from ROVOLs and BACKVOLs during an ITBusy transaction + * because the content will not change. However, many fileserver + * releases do return VBUSY for ROVOL and BACKVOL instances under + * many circumstances. + * + * Retry after going round all the servers unless we have a file + * lock we need to maintain. */ if (op->flags & AFS_OPERATION_NO_VSLEEP) { - op->error = -EBUSY; + afs_op_set_error(op, -EBUSY); goto failed; } - if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) { - afs_busy(op->volume, op->ac.abort_code); - clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags); + if (!test_and_set_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags)) { + afs_busy(op, abort_code); + clear_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags); } busy: if (op->flags & AFS_OPERATION_CUR_ONLY) { if (!afs_sleep_and_retry(op)) goto failed; - /* Retry with same server & address */ + /* Retry with same server & address */ _leave(" = t [vbusy]"); return true; } @@ -243,7 +378,7 @@ bool afs_select_fileserver(struct afs_operation *op) * honour, just in case someone sets up a loop. */ if (op->flags & AFS_OPERATION_VMOVED) { - op->error = -EREMOTEIO; + afs_op_set_error(op, -EREMOTEIO); goto failed; } op->flags |= AFS_OPERATION_VMOVED; @@ -251,8 +386,10 @@ bool afs_select_fileserver(struct afs_operation *op) set_bit(AFS_VOLUME_WAIT, &op->volume->flags); set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags); error = afs_check_volume_status(op->volume, op); - if (error < 0) - goto failed_set_error; + if (error < 0) { + afs_op_set_error(op, error); + goto failed; + } /* If the server list didn't change, then the VLDB is * out of sync with the fileservers. This is hopefully @@ -264,22 +401,50 @@ bool afs_select_fileserver(struct afs_operation *op) * TODO: Retry a few times with sleeps. */ if (rcu_access_pointer(op->volume->servers) == op->server_list) { - op->error = -ENOMEDIUM; + afs_op_accumulate_error(op, -ENOMEDIUM, abort_code); goto failed; } goto restart_from_beginning; + case UAEIO: + case VIO: + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); + if (op->volume->type != AFSVL_RWVOL) + goto next_server; + goto failed; + + case VDISKFULL: + case UAENOSPC: + /* The partition is full. Only applies to RWVOLs. + * Translate locally and return ENOSPC. + * No replicas to failover to. + */ + afs_op_set_error(op, -ENOSPC); + goto failed_but_online; + + case VOVERQUOTA: + case UAEDQUOT: + /* Volume is full. Only applies to RWVOLs. + * Translate locally and return EDQUOT. + * No replicas to failover to. + */ + afs_op_set_error(op, -EDQUOT); + goto failed_but_online; + default: - clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags); - clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); - op->error = afs_abort_to_error(op->ac.abort_code); + afs_op_accumulate_error(op, error, abort_code); + failed_but_online: + clear_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags); + clear_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags); goto failed; } case -ETIMEDOUT: case -ETIME: - if (op->error != -EDESTADDRREQ) + if (afs_op_error(op) != -EDESTADDRREQ) goto iterate_address; fallthrough; case -ERFKILL: @@ -289,7 +454,7 @@ bool afs_select_fileserver(struct afs_operation *op) case -EHOSTDOWN: case -ECONNREFUSED: _debug("no conn"); - op->error = error; + afs_op_accumulate_error(op, error, 0); goto iterate_address; case -ENETRESET: @@ -298,24 +463,31 @@ bool afs_select_fileserver(struct afs_operation *op) fallthrough; case -ECONNRESET: _debug("call reset"); - op->error = error; + afs_op_set_error(op, error); goto failed; } restart_from_beginning: + trace_afs_rotate(op, afs_rotate_trace_restart, 0); _debug("restart"); - afs_end_cursor(&op->ac); + op->estate = NULL; op->server = NULL; + afs_clear_server_states(op); + op->server_states = NULL; afs_put_serverlist(op->net, op->server_list); op->server_list = NULL; start: _debug("start"); + ASSERTCMP(op->estate, ==, NULL); /* See if we need to do an update of the volume record. Note that the * volume may have moved or even have been deleted. */ error = afs_check_volume_status(op->volume, op); - if (error < 0) - goto failed_set_error; + trace_afs_rotate(op, afs_rotate_trace_check_vol_status, error); + if (error < 0) { + afs_op_set_error(op, error); + goto failed; + } if (!afs_start_fs_iteration(op, vnode)) goto failed; @@ -323,52 +495,83 @@ start: _debug("__ VOL %llx __", op->volume->vid); pick_server: - _debug("pick [%lx]", op->untried); + _debug("pick [%lx]", op->untried_servers); + ASSERTCMP(op->estate, ==, NULL); - error = afs_wait_for_fs_probes(op->server_list, op->untried); - if (error < 0) - goto failed_set_error; + error = afs_wait_for_fs_probes(op, op->server_states, + !(op->flags & AFS_OPERATION_UNINTR)); + switch (error) { + case 0: /* No untried responsive servers and no outstanding probes */ + trace_afs_rotate(op, afs_rotate_trace_probe_none, 0); + goto no_more_servers; + case 1: /* Got a response */ + trace_afs_rotate(op, afs_rotate_trace_probe_response, 0); + break; + case 2: /* Probe data superseded */ + trace_afs_rotate(op, afs_rotate_trace_probe_superseded, 0); + goto restart_from_beginning; + default: + trace_afs_rotate(op, afs_rotate_trace_probe_error, error); + afs_op_set_error(op, error); + goto failed; + } - /* Pick the untried server with the lowest RTT. If we have outstanding - * callbacks, we stick with the server we're already using if we can. + /* Pick the untried server with the highest priority untried endpoint. + * If we have outstanding callbacks, we stick with the server we're + * already using if we can. */ if (op->server) { - _debug("server %u", op->index); - if (test_bit(op->index, &op->untried)) + _debug("server %u", op->server_index); + if (test_bit(op->server_index, &op->untried_servers)) goto selected_server; op->server = NULL; _debug("no server"); } - op->index = -1; - rtt = U32_MAX; + rcu_read_lock(); + op->server_index = -1; + best_prio = -1; for (i = 0; i < op->server_list->nr_servers; i++) { - struct afs_server *s = op->server_list->servers[i].server; + struct afs_endpoint_state *es; + struct afs_server_entry *se = &op->server_list->servers[i]; + struct afs_addr_list *sal; + struct afs_server *s = se->server; - if (!test_bit(i, &op->untried) || + if (!test_bit(i, &op->untried_servers) || + test_bit(AFS_SE_EXCLUDED, &se->flags) || !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags)) continue; - if (s->probe.rtt < rtt) { - op->index = i; - rtt = s->probe.rtt; + es = op->server_states->endpoint_state; + sal = es->addresses; + + afs_get_address_preferences_rcu(op->net, sal); + for (j = 0; j < sal->nr_addrs; j++) { + if (!sal->addrs[j].peer) + continue; + if (sal->addrs[j].prio > best_prio) { + op->server_index = i; + best_prio = sal->addrs[j].prio; + } } } + rcu_read_unlock(); - if (op->index == -1) + if (op->server_index == -1) goto no_more_servers; selected_server: - _debug("use %d", op->index); - __clear_bit(op->index, &op->untried); + trace_afs_rotate(op, afs_rotate_trace_selected_server, best_prio); + _debug("use %d prio %u", op->server_index, best_prio); + __clear_bit(op->server_index, &op->untried_servers); /* We're starting on a different fileserver from the list. We need to * check it, create a callback intercept, find its address list and * probe its capabilities before we use it. */ - ASSERTCMP(op->ac.alist, ==, NULL); - server = op->server_list->servers[op->index].server; + ASSERTCMP(op->estate, ==, NULL); + server = op->server_list->servers[op->server_index].server; - if (!afs_check_server_record(op, server)) + if (!afs_check_server_record(op, server, op->key)) goto failed; _debug("USING SERVER: %pU", &server->uuid); @@ -377,58 +580,73 @@ selected_server: op->server = server; if (vnode->cb_server != server) { vnode->cb_server = server; - vnode->cb_s_break = server->cb_s_break; - vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break); - vnode->cb_v_break = vnode->volume->cb_v_break; - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); + atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); } - read_lock(&server->fs_lock); - alist = rcu_dereference_protected(server->addresses, - lockdep_is_held(&server->fs_lock)); - afs_get_addrlist(alist); - read_unlock(&server->fs_lock); - retry_server: - memset(&op->ac, 0, sizeof(op->ac)); - - if (!op->ac.alist) - op->ac.alist = alist; - else - afs_put_addrlist(alist); - - op->ac.index = -1; + op->addr_tried = 0; + op->addr_index = -1; iterate_address: - ASSERT(op->ac.alist); /* Iterate over the current server's address list to try and find an * address on which it will respond to us. */ - if (!afs_iterate_addresses(&op->ac)) - goto out_of_addresses; + op->estate = op->server_states[op->server_index].endpoint_state; + set = READ_ONCE(op->estate->responsive_set); + failed = READ_ONCE(op->estate->failed_set); + _debug("iterate ES=%x rs=%lx fs=%lx", op->estate->probe_seq, set, failed); + set &= ~(failed | op->addr_tried); + trace_afs_rotate(op, afs_rotate_trace_iterate_addr, set); + if (!set) + goto wait_for_more_probe_results; + alist = op->estate->addresses; + for (i = 0; i < alist->nr_addrs; i++) { + if (alist->addrs[i].prio > best_prio) { + addr_index = i; + best_prio = alist->addrs[i].prio; + } + } + + addr_index = READ_ONCE(alist->preferred); + if (!test_bit(addr_index, &set)) + addr_index = __ffs(set); + + op->addr_index = addr_index; + set_bit(addr_index, &op->addr_tried); + + op->volsync.creation = TIME64_MIN; + op->volsync.update = TIME64_MIN; + op->call_responded = false; _debug("address [%u] %u/%u %pISp", - op->index, op->ac.index, op->ac.alist->nr_addrs, - &op->ac.alist->addrs[op->ac.index].transport); - + op->server_index, addr_index, alist->nr_addrs, + rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer)); _leave(" = t"); return true; -out_of_addresses: +wait_for_more_probe_results: + error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried, + !(op->flags & AFS_OPERATION_UNINTR)); + if (!error) + goto iterate_address; + /* We've now had a failure to respond on all of a server's addresses - * immediately probe them again and consider retrying the server. */ + trace_afs_rotate(op, afs_rotate_trace_probe_fileserver, 0); afs_probe_fileserver(op->net, op->server); if (op->flags & AFS_OPERATION_RETRY_SERVER) { - alist = op->ac.alist; - error = afs_wait_for_one_fs_probe( - op->server, !(op->flags & AFS_OPERATION_UNINTR)); + error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried, + !(op->flags & AFS_OPERATION_UNINTR)); switch (error) { case 0: op->flags &= ~AFS_OPERATION_RETRY_SERVER; + trace_afs_rotate(op, afs_rotate_trace_retry_server, 0); goto retry_server; case -ERESTARTSYS: - goto failed_set_error; + afs_op_set_error(op, error); + goto failed; case -ETIME: case -EDESTADDRREQ: goto next_server; @@ -436,34 +654,51 @@ out_of_addresses: } next_server: + trace_afs_rotate(op, afs_rotate_trace_next_server, 0); _debug("next"); - afs_end_cursor(&op->ac); + ASSERT(op->estate); + alist = op->estate->addresses; + if (op->call_responded && + op->addr_index != READ_ONCE(alist->preferred) && + test_bit(alist->preferred, &op->addr_tried)) + WRITE_ONCE(alist->preferred, op->addr_index); + op->estate = NULL; goto pick_server; no_more_servers: /* That's all the servers poked to no good effect. Try again if some * of them were busy. */ - if (op->flags & AFS_OPERATION_VBUSY) + trace_afs_rotate(op, afs_rotate_trace_no_more_servers, 0); + if (op->flags & AFS_OPERATION_VBUSY) { + afs_sleep_and_retry(op); + op->flags &= ~AFS_OPERATION_VBUSY; goto restart_from_beginning; - - e.error = -EDESTADDRREQ; - e.responded = false; - for (i = 0; i < op->server_list->nr_servers; i++) { - struct afs_server *s = op->server_list->servers[i].server; - - afs_prioritise_error(&e, READ_ONCE(s->probe.error), - s->probe.abort_code); } - error = e.error; + rcu_read_lock(); + for (i = 0; i < op->server_list->nr_servers; i++) { + struct afs_endpoint_state *estate; + + estate = op->server_states->endpoint_state; + error = READ_ONCE(estate->error); + if (error < 0) + afs_op_accumulate_error(op, error, estate->abort_code); + } + rcu_read_unlock(); -failed_set_error: - op->error = error; failed: + trace_afs_rotate(op, afs_rotate_trace_failed, 0); op->flags |= AFS_OPERATION_STOP; - afs_end_cursor(&op->ac); - _leave(" = f [failed %d]", op->error); + if (op->estate) { + alist = op->estate->addresses; + if (op->call_responded && + op->addr_index != READ_ONCE(alist->preferred) && + test_bit(alist->preferred, &op->addr_tried)) + WRITE_ONCE(alist->preferred, op->addr_index); + op->estate = NULL; + } + _leave(" = f [failed %d]", afs_op_error(op)); return false; } @@ -482,37 +717,40 @@ void afs_dump_edestaddrreq(const struct afs_operation *op) rcu_read_lock(); pr_notice("EDESTADDR occurred\n"); - pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n", + pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n", op->file[0].cb_break_before, - op->file[1].cb_break_before, op->flags, op->error); - pr_notice("FC: ut=%lx ix=%d ni=%u\n", - op->untried, op->index, op->nr_iterations); + op->file[1].cb_break_before, op->flags, op->cumul_error.error); + pr_notice("OP: ut=%lx ix=%d ni=%u\n", + op->untried_servers, op->server_index, op->nr_iterations); + pr_notice("OP: call er=%d ac=%d r=%u\n", + op->call_error, op->call_abort_code, op->call_responded); if (op->server_list) { const struct afs_server_list *sl = op->server_list; - pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n", - sl->nr_servers, sl->preferred, sl->vnovol_mask); + + pr_notice("FC: SL nr=%u vnov=%hx\n", + sl->nr_servers, sl->vnovol_mask); for (i = 0; i < sl->nr_servers; i++) { const struct afs_server *s = sl->servers[i].server; + const struct afs_endpoint_state *e = + rcu_dereference(s->endpoint_state); + const struct afs_addr_list *a = e->addresses; + pr_notice("FC: server fl=%lx av=%u %pU\n", s->flags, s->addr_version, &s->uuid); - if (s->addresses) { - const struct afs_addr_list *a = - rcu_dereference(s->addresses); + pr_notice("FC: - pq=%x R=%lx F=%lx\n", + e->probe_seq, e->responsive_set, e->failed_set); + if (a) { pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n", a->version, a->nr_ipv4, a->nr_addrs, a->max_addrs, a->preferred); - pr_notice("FC: - R=%lx F=%lx\n", - a->responded, a->failed); - if (a == op->ac.alist) + if (a == e->addresses) pr_notice("FC: - current\n"); } } } - pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", - op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error, - op->ac.responded, op->ac.nr_iterations); + pr_notice("AC: t=%lx ax=%d\n", op->addr_tried, op->addr_index); rcu_read_unlock(); } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index d642d06a453b..c453428f3c8b 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -178,6 +178,8 @@ void afs_put_call(struct afs_call *call) ASSERT(!work_pending(&call->async_work)); ASSERT(call->type->name != NULL); + rxrpc_kernel_put_peer(call->peer); + if (call->rxcall) { rxrpc_kernel_shutdown_call(net->socket, call->rxcall); rxrpc_kernel_put_call(net->socket, call->rxcall); @@ -187,7 +189,6 @@ void afs_put_call(struct afs_call *call) call->type->destructor(call); afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); - afs_put_addrlist(call->alist); kfree(call->request); trace_afs_call(call->debug_id, afs_call_trace_free, 0, o, @@ -294,9 +295,8 @@ static void afs_notify_end_request_tx(struct sock *sock, * Initiate a call and synchronously queue up the parameters for dispatch. Any * error is stored into the call struct, which the caller must check for. */ -void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) +void afs_make_call(struct afs_call *call, gfp_t gfp) { - struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index]; struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; @@ -304,7 +304,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) s64 tx_total_len; int ret; - _enter(",{%pISp},", &srx->transport); + _enter(",{%pISp+%u},", rxrpc_kernel_remote_addr(call->peer), call->service_id); ASSERT(call->type != NULL); ASSERT(call->type->name != NULL); @@ -313,8 +313,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) call, call->type->name, key_serial(call->key), atomic_read(&call->net->nr_outstanding_calls)); - call->addr_ix = ac->index; - call->alist = afs_get_addrlist(ac->alist); + trace_afs_make_call(call); /* Work out the length we're going to transmit. This is awkward for * calls such as FS.StoreData where there's an extra injection of data @@ -333,7 +332,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) } /* create a call */ - rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key, + rxcall = rxrpc_kernel_begin_call(call->net->socket, call->peer, call->key, (unsigned long)call, tx_total_len, call->max_lifespan, @@ -341,6 +340,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) (call->async ? afs_wake_up_async_call : afs_wake_up_call_waiter), + call->service_id, call->upgrade, (call->intr ? RXRPC_PREINTERRUPTIBLE : RXRPC_UNINTERRUPTIBLE), @@ -390,7 +390,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) /* Note that at this point, we may have received the reply or an abort * - and an asynchronous call may already have completed. * - * afs_wait_for_call_to_complete(call, ac) + * afs_wait_for_call_to_complete(call) * must be called to synchronously clean up. */ return; @@ -406,8 +406,7 @@ error_do_abort: rxrpc_kernel_recv_data(call->net->socket, rxcall, &msg.msg_iter, &len, false, &call->abort_code, &call->service_id); - ac->abort_code = call->abort_code; - ac->responded = true; + call->responded = true; } call->error = ret; trace_afs_call_done(call); @@ -427,7 +426,7 @@ error_kill_call: afs_set_call_complete(call, ret, 0); } - ac->error = ret; + call->error = ret; call->state = AFS_CALL_COMPLETE; _leave(" = %d", ret); } @@ -461,7 +460,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort) max = m + 1; pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n", msg, call->type->name, - &call->alist->addrs[call->addr_ix].transport); + rxrpc_kernel_remote_addr(call->peer)); } } @@ -508,6 +507,7 @@ static void afs_deliver_to_call(struct afs_call *call) ret = -EBADMSG; switch (ret) { case 0: + call->responded = true; afs_queue_call_work(call); if (state == AFS_CALL_CL_PROC_REPLY) { if (call->op) @@ -522,9 +522,11 @@ static void afs_deliver_to_call(struct afs_call *call) goto out; case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); + call->responded = true; afs_log_error(call, call->abort_code); goto done; case -ENOTSUPP: + call->responded = true; abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, @@ -571,50 +573,46 @@ call_complete: } /* - * Wait synchronously for a call to complete and clean up the call struct. + * Wait synchronously for a call to complete. */ -long afs_wait_for_call_to_complete(struct afs_call *call, - struct afs_addr_cursor *ac) +void afs_wait_for_call_to_complete(struct afs_call *call) { - long ret; bool rxrpc_complete = false; - DECLARE_WAITQUEUE(myself, current); - _enter(""); - ret = call->error; - if (ret < 0) - goto out; + if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { + DECLARE_WAITQUEUE(myself, current); - add_wait_queue(&call->waitq, &myself); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&call->waitq, &myself); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); - /* deliver any messages that are in the queue */ - if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && - call->need_attention) { - call->need_attention = false; - __set_current_state(TASK_RUNNING); - afs_deliver_to_call(call); - continue; + /* deliver any messages that are in the queue */ + if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && + call->need_attention) { + call->need_attention = false; + __set_current_state(TASK_RUNNING); + afs_deliver_to_call(call); + continue; + } + + if (afs_check_call_state(call, AFS_CALL_COMPLETE)) + break; + + if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { + /* rxrpc terminated the call. */ + rxrpc_complete = true; + break; + } + + schedule(); } - if (afs_check_call_state(call, AFS_CALL_COMPLETE)) - break; - - if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { - /* rxrpc terminated the call. */ - rxrpc_complete = true; - break; - } - - schedule(); + remove_wait_queue(&call->waitq, &myself); + __set_current_state(TASK_RUNNING); } - remove_wait_queue(&call->waitq, &myself); - __set_current_state(TASK_RUNNING); - if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { if (rxrpc_complete) { afs_set_call_complete(call, call->error, call->abort_code); @@ -627,29 +625,6 @@ long afs_wait_for_call_to_complete(struct afs_call *call, afs_set_call_complete(call, -EINTR, 0); } } - - spin_lock_bh(&call->state_lock); - ac->abort_code = call->abort_code; - ac->error = call->error; - spin_unlock_bh(&call->state_lock); - - ret = ac->error; - switch (ret) { - case 0: - ret = call->ret0; - call->ret0 = 0; - - fallthrough; - case -ECONNABORTED: - ac->responded = true; - break; - } - -out: - _debug("call complete"); - afs_put_call(call); - _leave(" = %p", (void *)ret); - return ret; } /* diff --git a/fs/afs/server.c b/fs/afs/server.c index b5237206eac3..e169121f603e 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -21,13 +21,13 @@ static void __afs_put_server(struct afs_net *, struct afs_server *); /* * Find a server by one of its addresses. */ -struct afs_server *afs_find_server(struct afs_net *net, - const struct sockaddr_rxrpc *srx) +struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer) { + const struct afs_endpoint_state *estate; const struct afs_addr_list *alist; struct afs_server *server = NULL; unsigned int i; - int seq = 0, diff; + int seq = 1; rcu_read_lock(); @@ -35,39 +35,15 @@ struct afs_server *afs_find_server(struct afs_net *net, if (server) afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq); server = NULL; + seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&net->fs_addr_lock, &seq); - if (srx->transport.family == AF_INET6) { - const struct sockaddr_in6 *a = &srx->transport.sin6, *b; - hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { - alist = rcu_dereference(server->addresses); - for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { - b = &alist->addrs[i].transport.sin6; - diff = ((u16 __force)a->sin6_port - - (u16 __force)b->sin6_port); - if (diff == 0) - diff = memcmp(&a->sin6_addr, - &b->sin6_addr, - sizeof(struct in6_addr)); - if (diff == 0) - goto found; - } - } - } else { - const struct sockaddr_in *a = &srx->transport.sin, *b; - hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) { - alist = rcu_dereference(server->addresses); - for (i = 0; i < alist->nr_ipv4; i++) { - b = &alist->addrs[i].transport.sin; - diff = ((u16 __force)a->sin_port - - (u16 __force)b->sin_port); - if (diff == 0) - diff = ((u32 __force)a->sin_addr.s_addr - - (u32 __force)b->sin_addr.s_addr); - if (diff == 0) - goto found; - } - } + hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { + estate = rcu_dereference(server->endpoint_state); + alist = estate->addresses; + for (i = 0; i < alist->nr_addrs; i++) + if (alist->addrs[i].peer == peer) + goto found; } server = NULL; @@ -90,7 +66,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu { struct afs_server *server = NULL; struct rb_node *p; - int diff, seq = 0; + int diff, seq = 1; _enter("%pU", uuid); @@ -102,7 +78,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu if (server) afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq); server = NULL; - + seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&net->fs_lock, &seq); p = net->fs_servers.rb_node; @@ -137,6 +113,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu static struct afs_server *afs_install_server(struct afs_cell *cell, struct afs_server *candidate) { + const struct afs_endpoint_state *estate; const struct afs_addr_list *alist; struct afs_server *server, *next; struct afs_net *net = cell->net; @@ -188,8 +165,9 @@ static struct afs_server *afs_install_server(struct afs_cell *cell, added_dup: write_seqlock(&net->fs_addr_lock); - alist = rcu_dereference_protected(server->addresses, - lockdep_is_held(&net->fs_addr_lock.lock)); + estate = rcu_dereference_protected(server->endpoint_state, + lockdep_is_held(&net->fs_addr_lock.lock)); + alist = estate->addresses; /* Secondly, if the server has any IPv4 and/or IPv6 addresses, install * it in the IPv4 and/or IPv6 reverse-map lists. @@ -219,6 +197,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *uuid, struct afs_addr_list *alist) { + struct afs_endpoint_state *estate; struct afs_server *server; struct afs_net *net = cell->net; @@ -228,25 +207,41 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, if (!server) goto enomem; + estate = kzalloc(sizeof(struct afs_endpoint_state), GFP_KERNEL); + if (!estate) + goto enomem_server; + refcount_set(&server->ref, 1); atomic_set(&server->active, 1); server->debug_id = atomic_inc_return(&afs_server_debug_id); - RCU_INIT_POINTER(server->addresses, alist); server->addr_version = alist->version; server->uuid = *uuid; rwlock_init(&server->fs_lock); - INIT_WORK(&server->initcb_work, afs_server_init_callback_work); + INIT_LIST_HEAD(&server->volumes); init_waitqueue_head(&server->probe_wq); INIT_LIST_HEAD(&server->probe_link); spin_lock_init(&server->probe_lock); server->cell = cell; server->rtt = UINT_MAX; + server->service_id = FS_SERVICE; + + server->probe_counter = 1; + server->probed_at = jiffies - LONG_MAX / 2; + refcount_set(&estate->ref, 1); + estate->addresses = alist; + estate->server_id = server->debug_id; + estate->probe_seq = 1; + rcu_assign_pointer(server->endpoint_state, estate); afs_inc_servers_outstanding(net); trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc); + trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), + afs_estate_trace_alloc_server); _leave(" = %p", server); return server; +enomem_server: + kfree(server); enomem: _leave(" = NULL [nomem]"); return NULL; @@ -301,20 +296,20 @@ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, candidate = afs_alloc_server(cell, uuid, alist); if (!candidate) { - afs_put_addrlist(alist); + afs_put_addrlist(alist, afs_alist_trace_put_server_oom); return ERR_PTR(-ENOMEM); } server = afs_install_server(cell, candidate); if (server != candidate) { - afs_put_addrlist(alist); + afs_put_addrlist(alist, afs_alist_trace_put_server_dup); kfree(candidate); } else { /* Immediately dispatch an asynchronous probe to each interface * on the fileserver. This will make sure the repeat-probing * service is started. */ - afs_fs_probe_fileserver(cell->net, server, key, true); + afs_fs_probe_fileserver(cell->net, server, alist, key); } return server; @@ -447,7 +442,8 @@ static void afs_server_rcu(struct rcu_head *rcu) trace_afs_server(server->debug_id, refcount_read(&server->ref), atomic_read(&server->active), afs_server_trace_free); - afs_put_addrlist(rcu_access_pointer(server->addresses)); + afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state), + afs_estate_trace_put_server); kfree(server); } @@ -459,14 +455,10 @@ static void __afs_put_server(struct afs_net *net, struct afs_server *server) static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server) { - struct afs_addr_list *alist = rcu_access_pointer(server->addresses); - struct afs_addr_cursor ac = { - .alist = alist, - .index = alist->preferred, - .error = 0, - }; + struct afs_endpoint_state *estate = rcu_access_pointer(server->endpoint_state); + struct afs_addr_list *alist = estate->addresses; - afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + afs_fs_give_up_all_callbacks(net, server, &alist->addrs[alist->preferred], NULL); } /* @@ -477,7 +469,6 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) afs_give_up_callbacks(net, server); - flush_work(&server->initcb_work); afs_put_server(net, server, afs_server_trace_destroy); } @@ -636,9 +627,12 @@ void afs_purge_servers(struct afs_net *net) * Get an update for a server's address list. */ static noinline bool afs_update_server_record(struct afs_operation *op, - struct afs_server *server) + struct afs_server *server, + struct key *key) { - struct afs_addr_list *alist, *discard; + struct afs_endpoint_state *estate; + struct afs_addr_list *alist; + bool has_addrs; _enter(""); @@ -648,29 +642,27 @@ static noinline bool afs_update_server_record(struct afs_operation *op, alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid); if (IS_ERR(alist)) { + rcu_read_lock(); + estate = rcu_dereference(server->endpoint_state); + has_addrs = estate->addresses; + rcu_read_unlock(); + if ((PTR_ERR(alist) == -ERESTARTSYS || PTR_ERR(alist) == -EINTR) && (op->flags & AFS_OPERATION_UNINTR) && - server->addresses) { + has_addrs) { _leave(" = t [intr]"); return true; } - op->error = PTR_ERR(alist); - _leave(" = f [%d]", op->error); + afs_op_set_error(op, PTR_ERR(alist)); + _leave(" = f [%d]", afs_op_error(op)); return false; } - discard = alist; - if (server->addr_version != alist->version) { - write_lock(&server->fs_lock); - discard = rcu_dereference_protected(server->addresses, - lockdep_is_held(&server->fs_lock)); - rcu_assign_pointer(server->addresses, alist); - server->addr_version = alist->version; - write_unlock(&server->fs_lock); - } + if (server->addr_version != alist->version) + afs_fs_probe_fileserver(op->net, server, alist, key); - afs_put_addrlist(discard); + afs_put_addrlist(alist, afs_alist_trace_put_server_update); _leave(" = t"); return true; } @@ -678,7 +670,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op, /* * See if a server's address list needs updating. */ -bool afs_check_server_record(struct afs_operation *op, struct afs_server *server) +bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, + struct key *key) { bool success; int ret, retries = 0; @@ -698,7 +691,7 @@ retry: update: if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) { clear_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags); - success = afs_update_server_record(op, server); + success = afs_update_server_record(op, server, key); clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags); wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING); _leave(" = %d", success); @@ -710,7 +703,7 @@ wait: (op->flags & AFS_OPERATION_UNINTR) ? TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE); if (ret == -ERESTARTSYS) { - op->error = ret; + afs_op_set_error(op, ret); _leave(" = f [intr]"); return false; } diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index b59896b1de0a..7e7e567a7f8a 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -24,35 +24,62 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) /* * Build a server list from a VLDB record. */ -struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, +struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, struct key *key, - struct afs_vldb_entry *vldb, - u8 type_mask) + struct afs_vldb_entry *vldb) { struct afs_server_list *slist; struct afs_server *server; - int ret = -ENOMEM, nr_servers = 0, i, j; + unsigned int type_mask = 1 << volume->type; + bool use_newrepsites = false; + int ret = -ENOMEM, nr_servers = 0, newrep = 0, i, j, usable = 0; - for (i = 0; i < vldb->nr_servers; i++) - if (vldb->fs_mask[i] & type_mask) - nr_servers++; + /* Work out if we're going to restrict to NEWREPSITE-marked servers or + * not. If at least one site is marked as NEWREPSITE, then it's likely + * that "vos release" is busy updating RO sites. We cut over from one + * to the other when >=50% of the sites have been updated. Sites that + * are in the process of being updated are marked DONTUSE. + */ + for (i = 0; i < vldb->nr_servers; i++) { + if (!(vldb->fs_mask[i] & type_mask)) + continue; + nr_servers++; + if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE) + continue; + usable++; + if (vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE) + newrep++; + } slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL); if (!slist) goto error; + if (newrep) { + if (newrep < usable / 2) { + slist->ro_replicating = AFS_RO_REPLICATING_USE_OLD; + } else { + slist->ro_replicating = AFS_RO_REPLICATING_USE_NEW; + use_newrepsites = true; + } + } + refcount_set(&slist->usage, 1); rwlock_init(&slist->lock); - for (i = 0; i < AFS_MAXTYPES; i++) - slist->vids[i] = vldb->vid[i]; - /* Make sure a records exists for each server in the list. */ for (i = 0; i < vldb->nr_servers; i++) { + unsigned long se_flags = 0; + bool newrepsite = vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE; + if (!(vldb->fs_mask[i] & type_mask)) continue; + if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE) + __set_bit(AFS_SE_EXCLUDED, &se_flags); + if (newrep && (newrepsite ^ use_newrepsites)) + __set_bit(AFS_SE_EXCLUDED, &se_flags); - server = afs_lookup_server(cell, key, &vldb->fs_server[i], + server = afs_lookup_server(volume->cell, key, &vldb->fs_server[i], vldb->addr_version[i]); if (IS_ERR(server)) { ret = PTR_ERR(server); @@ -70,7 +97,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, break; if (j < slist->nr_servers) { if (slist->servers[j].server == server) { - afs_put_server(cell->net, server, + afs_put_server(volume->cell->net, server, afs_server_trace_put_slist_isort); continue; } @@ -81,6 +108,9 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, } slist->servers[j].server = server; + slist->servers[j].volume = volume; + slist->servers[j].flags = se_flags; + slist->servers[j].cb_expires_at = AFS_NO_CB_PROMISE; slist->nr_servers++; } @@ -92,7 +122,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, return slist; error_2: - afs_put_serverlist(cell->net, slist); + afs_put_serverlist(volume->cell->net, slist); error: return ERR_PTR(ret); } @@ -103,27 +133,117 @@ error: bool afs_annotate_server_list(struct afs_server_list *new, struct afs_server_list *old) { - struct afs_server *cur; - int i, j; + unsigned long mask = 1UL << AFS_SE_EXCLUDED; + int i; - if (old->nr_servers != new->nr_servers) + if (old->nr_servers != new->nr_servers || + old->ro_replicating != new->ro_replicating) goto changed; - for (i = 0; i < old->nr_servers; i++) + for (i = 0; i < old->nr_servers; i++) { if (old->servers[i].server != new->servers[i].server) goto changed; - + if ((old->servers[i].flags & mask) != (new->servers[i].flags & mask)) + goto changed; + } return false; - changed: - /* Maintain the same preferred server as before if possible. */ - cur = old->servers[old->preferred].server; - for (j = 0; j < new->nr_servers; j++) { - if (new->servers[j].server == cur) { - new->preferred = j; - break; + return true; +} + +/* + * Attach a volume to the servers it is going to use. + */ +void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist) +{ + struct afs_server_entry *se, *pe; + struct afs_server *server; + struct list_head *p; + unsigned int i; + + down_write(&volume->cell->vs_lock); + + for (i = 0; i < slist->nr_servers; i++) { + se = &slist->servers[i]; + server = se->server; + + list_for_each(p, &server->volumes) { + pe = list_entry(p, struct afs_server_entry, slink); + if (volume->vid <= pe->volume->vid) + break; + } + list_add_tail(&se->slink, p); + } + + slist->attached = true; + up_write(&volume->cell->vs_lock); +} + +/* + * Reattach a volume to the servers it is going to use when server list is + * replaced. We try to switch the attachment points to avoid rewalking the + * lists. + */ +void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *new, + struct afs_server_list *old) +{ + unsigned int n = 0, o = 0; + + down_write(&volume->cell->vs_lock); + + while (n < new->nr_servers || o < old->nr_servers) { + struct afs_server_entry *pn = n < new->nr_servers ? &new->servers[n] : NULL; + struct afs_server_entry *po = o < old->nr_servers ? &old->servers[o] : NULL; + struct afs_server_entry *s; + struct list_head *p; + int diff; + + if (pn && po && pn->server == po->server) { + pn->cb_expires_at = po->cb_expires_at; + list_replace(&po->slink, &pn->slink); + n++; + o++; + continue; + } + + if (pn && po) + diff = memcmp(&pn->server->uuid, &po->server->uuid, + sizeof(pn->server->uuid)); + else + diff = pn ? -1 : 1; + + if (diff < 0) { + list_for_each(p, &pn->server->volumes) { + s = list_entry(p, struct afs_server_entry, slink); + if (volume->vid <= s->volume->vid) + break; + } + list_add_tail(&pn->slink, p); + n++; + } else { + list_del(&po->slink); + o++; } } - return true; + up_write(&volume->cell->vs_lock); +} + +/* + * Detach a volume from the servers it has been using. + */ +void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist) +{ + unsigned int i; + + if (!slist->attached) + return; + + down_write(&volume->cell->vs_lock); + + for (i = 0; i < slist->nr_servers; i++) + list_del(&slist->servers[i].slink); + + slist->attached = false; + up_write(&volume->cell->vs_lock); } diff --git a/fs/afs/super.c b/fs/afs/super.c index a01a0fb2cdbb..ae2d66a52add 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -381,8 +381,7 @@ static int afs_validate_fc(struct fs_context *fc) ctx->key = key; if (ctx->volume) { - afs_put_volume(ctx->net, ctx->volume, - afs_volume_trace_put_validate_fc); + afs_put_volume(ctx->volume, afs_volume_trace_put_validate_fc); ctx->volume = NULL; } @@ -529,7 +528,7 @@ static void afs_destroy_sbi(struct afs_super_info *as) { if (as) { struct afs_net *net = afs_net(as->net_ns); - afs_put_volume(net, as->volume, afs_volume_trace_put_destroy_sbi); + afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi); afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi); put_net(as->net_ns); kfree(as); @@ -615,7 +614,7 @@ static void afs_free_fc(struct fs_context *fc) struct afs_fs_context *ctx = fc->fs_private; afs_destroy_sbi(fc->s_fs_info); - afs_put_volume(ctx->net, ctx->volume, afs_volume_trace_put_free_fc); + afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc); afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); key_put(ctx->key); kfree(ctx); diff --git a/fs/afs/validation.c b/fs/afs/validation.c new file mode 100644 index 000000000000..46b37f2cce7d --- /dev/null +++ b/fs/afs/validation.c @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* vnode and volume validity verification. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include "internal.h" + +/* + * Data validation is managed through a number of mechanisms from the server: + * + * (1) On first contact with a server (such as if it has just been rebooted), + * the server sends us a CB.InitCallBackState* request. + * + * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC + * calls, the server maintains a time-limited per-vnode promise that it + * will send us a CB.CallBack request if a third party alters the vnodes + * accessed. + * + * Note that a vnode-level callbacks may also be sent for other reasons, + * such as filelock release. + * + * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC + * calls, each server maintains a time-limited per-volume promise that it + * will send us a CB.CallBack request if the RO volume is updated to a + * snapshot of the RW volume ("vos release"). This is an atomic event + * that cuts over all instances of the RO volume across multiple servers + * simultaneously. + * + * Note that a volume-level callbacks may also be sent for other reasons, + * such as the volumeserver taking over control of the volume from the + * fileserver. + * + * Note also that each server maintains an independent time limit on an + * independent callback. + * + * (4) Certain RPC calls include a volume information record "VolSync" in + * their reply. This contains a creation date for the volume that should + * remain unchanged for a RW volume (but will be changed if the volume is + * restored from backup) or will be bumped to the time of snapshotting + * when a RO volume is released. + * + * In order to track this events, the following are provided: + * + * ->cb_v_break. A counter of events that might mean that the contents of + * a volume have been altered since we last checked a vnode. + * + * ->cb_v_check. A counter of the number of events that we've sent a + * query to the server for. Everything's up to date if this equals + * cb_v_break. + * + * ->cb_scrub. A counter of the number of regression events for which we + * have to completely wipe the cache. + * + * ->cb_ro_snapshot. A counter of the number of times that we've + * recognised that a RO volume has been updated. + * + * ->cb_break. A counter of events that might mean that the contents of a + * vnode have been altered. + * + * ->cb_expires_at. The time at which the callback promise expires or + * AFS_NO_CB_PROMISE if we have no promise. + * + * The way we manage things is: + * + * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on + * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the + * volume and volume's server record. + * + * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level + * callback break on all the volumes that have been using that volume + * (ie. increment ->cb_v_break and reset ->cb_expires_at). + * + * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the + * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also + * dispatch a work item to unmap all PTEs to the vnode's pagecache to + * force reentry to the filesystem for revalidation. + * + * (4) When entering the filesystem, we call afs_validate() to check the + * validity of a vnode. This first checks to see if ->cb_v_check and + * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock + * exclusively and perform an FS.FetchStatus on the vnode. + * + * After checking the volume, we check the vnode. If there's a mismatch + * between the volume counters and the vnode's mirrors of those counters, + * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode. + * + * (5) When the reply from FS.FetchStatus arrives, the VolSync record is + * parsed: + * + * (A) If the Creation timestamp has changed on a RW volume or regressed + * on a RO volume, we try to increment ->cb_scrub; if it advances on a + * RO volume, we assume "vos release" happened and try to increment + * ->cb_ro_snapshot. + * + * (B) If the Update timestamp has regressed, we try to increment + * ->cb_scrub. + * + * Note that in both of these cases, we only do the increment if we can + * cmpxchg the value of the timestamp from the value we noted before the + * op. This tries to prevent parallel ops from fighting one another. + * + * volume->cb_v_check is then set to ->cb_v_break. + * + * (6) The AFSCallBack record included in the FS.FetchStatus reply is also + * parsed and used to set the promise in ->cb_expires_at for the vnode, + * the volume and the volume's server record. + * + * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for + * the vnode. + */ + +/* + * Check the validity of a vnode/inode and its parent volume. + */ +bool afs_check_validity(const struct afs_vnode *vnode) +{ + const struct afs_volume *volume = vnode->volume; + time64_t deadline = ktime_get_real_seconds() + 10; + + if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || + atomic64_read(&vnode->cb_expires_at) <= deadline || + volume->cb_expires_at <= deadline || + vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) || + vnode->cb_scrub != atomic_read(&volume->cb_scrub) || + test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { + _debug("inval"); + return false; + } + + return true; +} + +/* + * See if the server we've just talked to is currently excluded. + */ +static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) +{ + const struct afs_server_entry *se; + const struct afs_server_list *slist; + bool is_excluded = true; + int i; + + rcu_read_lock(); + + slist = rcu_dereference(volume->servers); + for (i = 0; i < slist->nr_servers; i++) { + se = &slist->servers[i]; + if (op->server == se->server) { + is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags); + break; + } + } + + rcu_read_unlock(); + return is_excluded; +} + +/* + * Update the volume's server list when the creation time changes and see if + * the server we've just talked to is currently excluded. + */ +static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) +{ + int ret; + + if (__afs_is_server_excluded(op, volume)) + return 1; + + set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); + ret = afs_check_volume_status(op->volume, op); + if (ret < 0) + return ret; + + return __afs_is_server_excluded(op, volume); +} + +/* + * Handle a change to the volume creation time in the VolSync record. + */ +static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume) +{ + unsigned int snap; + time64_t cur = volume->creation_time; + time64_t old = op->pre_volsync.creation; + time64_t new = op->volsync.creation; + int ret; + + _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); + + if (cur == TIME64_MIN) { + volume->creation_time = new; + return 0; + } + + if (new == cur) + return 0; + + /* Try to advance the creation timestamp from what we had before the + * operation to what we got back from the server. This should + * hopefully ensure that in a race between multiple operations only one + * of them will do this. + */ + if (cur != old) + return 0; + + /* If the creation time changes in an unexpected way, we need to scrub + * our caches. For a RW vol, this will only change if the volume is + * restored from a backup; for a RO/Backup vol, this will advance when + * the volume is updated to a new snapshot (eg. "vos release"). + */ + if (volume->type == AFSVL_RWVOL) + goto regressed; + if (volume->type == AFSVL_BACKVOL) { + if (new < old) + goto regressed; + goto advance; + } + + /* We have an RO volume, we need to query the VL server and look at the + * server flags to see if RW->RO replication is in progress. + */ + ret = afs_is_server_excluded(op, volume); + if (ret < 0) + return ret; + if (ret > 0) { + snap = atomic_read(&volume->cb_ro_snapshot); + trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded); + return ret; + } + +advance: + snap = atomic_inc_return(&volume->cb_ro_snapshot); + trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release); + volume->creation_time = new; + return 0; + +regressed: + atomic_inc(&volume->cb_scrub); + trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress); + volume->creation_time = new; + return 0; +} + +/* + * Handle a change to the volume update time in the VolSync record. + */ +static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume) +{ + enum afs_cb_break_reason reason = afs_cb_break_no_break; + time64_t cur = volume->update_time; + time64_t old = op->pre_volsync.update; + time64_t new = op->volsync.update; + + _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); + + if (cur == TIME64_MIN) { + volume->update_time = new; + return; + } + + if (new == cur) + return; + + /* If the volume update time changes in an unexpected way, we need to + * scrub our caches. For a RW vol, this will advance on every + * modification op; for a RO/Backup vol, this will advance when the + * volume is updated to a new snapshot (eg. "vos release"). + */ + if (new < old) + reason = afs_cb_break_for_update_regress; + + /* Try to advance the update timestamp from what we had before the + * operation to what we got back from the server. This should + * hopefully ensure that in a race between multiple operations only one + * of them will do this. + */ + if (cur == old) { + if (reason == afs_cb_break_for_update_regress) { + atomic_inc(&volume->cb_scrub); + trace_afs_cb_v_break(volume->vid, 0, reason); + } + volume->update_time = new; + } +} + +static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume) +{ + int ret = 0; + + if (likely(op->volsync.creation == volume->creation_time && + op->volsync.update == volume->update_time)) + return 0; + + mutex_lock(&volume->volsync_lock); + if (op->volsync.creation != volume->creation_time) { + ret = afs_update_volume_creation_time(op, volume); + if (ret < 0) + goto out; + } + if (op->volsync.update != volume->update_time) + afs_update_volume_update_time(op, volume); +out: + mutex_unlock(&volume->volsync_lock); + return ret; +} + +/* + * Update the state of a volume, including recording the expiration time of the + * callback promise. Returns 1 to redo the operation from the start. + */ +int afs_update_volume_state(struct afs_operation *op) +{ + struct afs_server_list *slist = op->server_list; + struct afs_server_entry *se = &slist->servers[op->server_index]; + struct afs_callback *cb = &op->file[0].scb.callback; + struct afs_volume *volume = op->volume; + unsigned int cb_v_break = atomic_read(&volume->cb_v_break); + unsigned int cb_v_check = atomic_read(&volume->cb_v_check); + int ret; + + _enter("%llx", op->volume->vid); + + if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) { + ret = afs_update_volume_times(op, volume); + if (ret != 0) { + _leave(" = %d", ret); + return ret; + } + } + + if (op->cb_v_break == cb_v_break && + (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) { + time64_t expires_at = cb->expires_at; + + if (!op->file[0].scb.have_cb) + expires_at = op->file[1].scb.callback.expires_at; + + se->cb_expires_at = expires_at; + volume->cb_expires_at = expires_at; + } + if (cb_v_check < op->cb_v_break) + atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break); + return 0; +} + +/* + * mark the data attached to an inode as obsolete due to a write on the server + * - might also want to ditch all the outstanding writes and dirty pages + */ +static void afs_zap_data(struct afs_vnode *vnode) +{ + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); + + afs_invalidate_cache(vnode, 0); + + /* nuke all the non-dirty pages that aren't locked, mapped or being + * written back in a regular file and completely discard the pages in a + * directory or symlink */ + if (S_ISREG(vnode->netfs.inode.i_mode)) + invalidate_remote_inode(&vnode->netfs.inode); + else + invalidate_inode_pages2(vnode->netfs.inode.i_mapping); +} + +/* + * validate a vnode/inode + * - there are several things we need to check + * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, + * symlink) + * - parent dir metadata changed (security changes) + * - dentry data changed (write, truncate) + * - dentry metadata changed (security changes) + */ +int afs_validate(struct afs_vnode *vnode, struct key *key) +{ + struct afs_volume *volume = vnode->volume; + unsigned int cb_ro_snapshot, cb_scrub; + time64_t deadline = ktime_get_real_seconds() + 10; + bool zap = false, locked_vol = false; + int ret; + + _enter("{v={%llx:%llu} fl=%lx},%x", + vnode->fid.vid, vnode->fid.vnode, vnode->flags, + key_serial(key)); + + if (afs_check_validity(vnode)) + return 0; + + ret = down_write_killable(&vnode->validate_lock); + if (ret < 0) + goto error; + + /* Validate a volume after the v_break has changed or the volume + * callback expired. We only want to do this once per volume per + * v_break change. The actual work will be done when parsing the + * status fetch reply. + */ + if (volume->cb_expires_at <= deadline || + atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) { + ret = mutex_lock_interruptible(&volume->cb_check_lock); + if (ret < 0) + goto error_unlock; + locked_vol = true; + } + + cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot); + cb_scrub = atomic_read(&volume->cb_scrub); + if (vnode->cb_ro_snapshot != cb_ro_snapshot || + vnode->cb_scrub != cb_scrub) + unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); + + if (vnode->cb_ro_snapshot != cb_ro_snapshot || + vnode->cb_scrub != cb_scrub || + volume->cb_expires_at <= deadline || + atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || + atomic64_read(&vnode->cb_expires_at) <= deadline + ) { + ret = afs_fetch_status(vnode, key, false, NULL); + if (ret < 0) { + if (ret == -ENOENT) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } + goto error_unlock; + } + + _debug("new promise [fl=%lx]", vnode->flags); + } + + /* We can drop the volume lock now as. */ + if (locked_vol) { + mutex_unlock(&volume->cb_check_lock); + locked_vol = false; + } + + cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot); + cb_scrub = atomic_read(&volume->cb_scrub); + _debug("vnode inval %x==%x %x==%x", + vnode->cb_ro_snapshot, cb_ro_snapshot, + vnode->cb_scrub, cb_scrub); + if (vnode->cb_scrub != cb_scrub) + zap = true; + vnode->cb_ro_snapshot = cb_ro_snapshot; + vnode->cb_scrub = cb_scrub; + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + _debug("file already deleted"); + ret = -ESTALE; + goto error_unlock; + } + + /* if the vnode's data version number changed then its contents are + * different */ + zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); + if (zap) + afs_zap_data(vnode); + up_write(&vnode->validate_lock); + _leave(" = 0"); + return 0; + +error_unlock: + if (locked_vol) + mutex_unlock(&volume->cb_check_lock); + up_write(&vnode->validate_lock); +error: + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index f04a80e4f5c3..9f36e14f1c2d 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -32,55 +32,6 @@ static struct afs_volume *afs_sample_volume(struct afs_cell *cell, struct key *k return volume; } -/* - * Compare two addresses. - */ -static int afs_compare_addrs(const struct sockaddr_rxrpc *srx_a, - const struct sockaddr_rxrpc *srx_b) -{ - short port_a, port_b; - int addr_a, addr_b, diff; - - diff = (short)srx_a->transport_type - (short)srx_b->transport_type; - if (diff) - goto out; - - switch (srx_a->transport_type) { - case AF_INET: { - const struct sockaddr_in *a = &srx_a->transport.sin; - const struct sockaddr_in *b = &srx_b->transport.sin; - addr_a = ntohl(a->sin_addr.s_addr); - addr_b = ntohl(b->sin_addr.s_addr); - diff = addr_a - addr_b; - if (diff == 0) { - port_a = ntohs(a->sin_port); - port_b = ntohs(b->sin_port); - diff = port_a - port_b; - } - break; - } - - case AF_INET6: { - const struct sockaddr_in6 *a = &srx_a->transport.sin6; - const struct sockaddr_in6 *b = &srx_b->transport.sin6; - diff = memcmp(&a->sin6_addr, &b->sin6_addr, 16); - if (diff == 0) { - port_a = ntohs(a->sin6_port); - port_b = ntohs(b->sin6_port); - diff = port_a - port_b; - } - break; - } - - default: - WARN_ON(1); - diff = 1; - } - -out: - return diff; -} - /* * Compare the address lists of a pair of fileservers. */ @@ -90,13 +41,13 @@ static int afs_compare_fs_alists(const struct afs_server *server_a, const struct afs_addr_list *la, *lb; int a = 0, b = 0, addr_matches = 0; - la = rcu_dereference(server_a->addresses); - lb = rcu_dereference(server_b->addresses); + la = rcu_dereference(server_a->endpoint_state)->addresses; + lb = rcu_dereference(server_b->endpoint_state)->addresses; while (a < la->nr_addrs && b < lb->nr_addrs) { - const struct sockaddr_rxrpc *srx_a = &la->addrs[a]; - const struct sockaddr_rxrpc *srx_b = &lb->addrs[b]; - int diff = afs_compare_addrs(srx_a, srx_b); + unsigned long pa = (unsigned long)la->addrs[a].peer; + unsigned long pb = (unsigned long)lb->addrs[b].peer; + long diff = pa - pb; if (diff < 0) { a++; @@ -126,7 +77,7 @@ static int afs_compare_volume_slists(const struct afs_volume *vol_a, lb = rcu_dereference(vol_b->servers); for (i = 0; i < AFS_MAXTYPES; i++) - if (la->vids[i] != lb->vids[i]) + if (vol_a->vids[i] != vol_b->vids[i]) return 0; while (a < la->nr_servers && b < lb->nr_servers) { @@ -205,7 +156,7 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key, /* And see if it's in the new cell. */ volume = afs_sample_volume(cell, key, pvol->name, pvol->name_len); if (IS_ERR(volume)) { - afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias); + afs_put_volume(pvol, afs_volume_trace_put_query_alias); if (PTR_ERR(volume) != -ENOMEDIUM) return PTR_ERR(volume); /* That volume is not in the new cell, so not an alias */ @@ -223,8 +174,8 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key, rcu_read_unlock(); } - afs_put_volume(cell->net, volume, afs_volume_trace_put_query_alias); - afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias); + afs_put_volume(volume, afs_volume_trace_put_query_alias); + afs_put_volume(pvol, afs_volume_trace_put_query_alias); return ret; } @@ -285,7 +236,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key) while (afs_select_vlserver(&vc)) { if (!test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) { - vc.ac.error = -EOPNOTSUPP; + vc.call_error = -EOPNOTSUPP; skipped = true; continue; } diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c index acc48216136a..9b1c20daac53 100644 --- a/fs/afs/vl_list.c +++ b/fs/afs/vl_list.c @@ -13,6 +13,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len, unsigned short port) { struct afs_vlserver *vlserver; + static atomic_t debug_ids; vlserver = kzalloc(struct_size(vlserver, name, name_len + 1), GFP_KERNEL); @@ -21,8 +22,10 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len, rwlock_init(&vlserver->lock); init_waitqueue_head(&vlserver->probe_wq); spin_lock_init(&vlserver->probe_lock); + vlserver->debug_id = atomic_inc_return(&debug_ids); vlserver->rtt = UINT_MAX; vlserver->name_len = name_len; + vlserver->service_id = VL_SERVICE; vlserver->port = port; memcpy(vlserver->name, name, name_len); } @@ -33,7 +36,8 @@ static void afs_vlserver_rcu(struct rcu_head *rcu) { struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu); - afs_put_addrlist(rcu_access_pointer(vlserver->addresses)); + afs_put_addrlist(rcu_access_pointer(vlserver->addresses), + afs_alist_trace_put_vlserver); kfree_rcu(vlserver, rcu); } @@ -83,14 +87,15 @@ static u16 afs_extract_le16(const u8 **_b) /* * Build a VL server address list from a DNS queried server list. */ -static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, +static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net, + const u8 **_b, const u8 *end, u8 nr_addrs, u16 port) { struct afs_addr_list *alist; const u8 *b = *_b; int ret = -EINVAL; - alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port); + alist = afs_alloc_addrlist(nr_addrs); if (!alist) return ERR_PTR(-ENOMEM); if (nr_addrs == 0) @@ -109,7 +114,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, goto error; } memcpy(x, b, 4); - afs_merge_fs_addr4(alist, x[0], port); + ret = afs_merge_fs_addr4(net, alist, x[0], port); + if (ret < 0) + goto error; b += 4; break; @@ -119,7 +126,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, goto error; } memcpy(x, b, 16); - afs_merge_fs_addr6(alist, x, port); + ret = afs_merge_fs_addr6(net, alist, x, port); + if (ret < 0) + goto error; b += 16; break; @@ -140,7 +149,7 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, error: *_b = b; - afs_put_addrlist(alist); + afs_put_addrlist(alist, afs_alist_trace_put_parse_error); return ERR_PTR(ret); } @@ -247,7 +256,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, /* Extract the addresses - note that we can't skip this as we * have to advance the payload pointer. */ - addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port); + addrs = afs_extract_vl_addrs(cell->net, &b, end, bs.nr_addrs, bs.port); if (IS_ERR(addrs)) { ret = PTR_ERR(addrs); goto error_2; @@ -255,7 +264,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, if (vllist->nr_servers >= nr_servers) { _debug("skip %u >= %u", vllist->nr_servers, nr_servers); - afs_put_addrlist(addrs); + afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty); afs_put_vlserver(cell->net, server); continue; } @@ -264,7 +273,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, addrs->status = bs.status; if (addrs->nr_addrs == 0) { - afs_put_addrlist(addrs); + afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty); if (!rcu_access_pointer(server->addresses)) { afs_put_vlserver(cell->net, server); continue; @@ -276,7 +285,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, old = rcu_replace_pointer(server->addresses, old, lockdep_is_held(&server->lock)); write_unlock(&server->lock); - afs_put_addrlist(old); + afs_put_addrlist(old, afs_alist_trace_put_vlserver_old); } diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index 58452b86e672..3d2e0c925460 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -46,11 +46,12 @@ static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up) */ void afs_vlserver_probe_result(struct afs_call *call) { - struct afs_addr_list *alist = call->alist; + struct afs_addr_list *alist = call->vl_probe; struct afs_vlserver *server = call->vlserver; + struct afs_address *addr = &alist->addrs[call->probe_index]; unsigned int server_index = call->server_index; unsigned int rtt_us = 0; - unsigned int index = call->addr_ix; + unsigned int index = call->probe_index; bool have_result = false; int ret = call->error; @@ -89,7 +90,7 @@ void afs_vlserver_probe_result(struct afs_call *call) case -ETIME: default: clear_bit(index, &alist->responded); - set_bit(index, &alist->failed); + set_bit(index, &alist->probe_failed); if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) && (server->probe.error == 0 || server->probe.error == -ETIMEDOUT || @@ -101,21 +102,21 @@ void afs_vlserver_probe_result(struct afs_call *call) responded: set_bit(index, &alist->responded); - clear_bit(index, &alist->failed); + clear_bit(index, &alist->probe_failed); if (call->service_id == YFS_VL_SERVICE) { server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS; set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); - alist->addrs[index].srx_service = call->service_id; + server->service_id = call->service_id; } else { server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS; if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) { clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); - alist->addrs[index].srx_service = call->service_id; + server->service_id = call->service_id; } } - rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us); + rtt_us = rxrpc_kernel_get_srtt(addr->peer); if (rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; server->rtt = rtt_us; @@ -130,8 +131,10 @@ responded: out: spin_unlock(&server->probe_lock); - _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", - server_index, index, &alist->addrs[index].transport, rtt_us, ret); + trace_afs_vl_probe(server, false, alist, index, call->error, call->abort_code, rtt_us); + _debug("probe [%u][%u] %pISpc rtt=%d ret=%d", + server_index, index, rxrpc_kernel_remote_addr(addr->peer), + rtt_us, ret); afs_done_one_vl_probe(server, have_result); } @@ -146,35 +149,52 @@ static bool afs_do_probe_vlserver(struct afs_net *net, unsigned int server_index, struct afs_error *_e) { - struct afs_addr_cursor ac = { - .index = 0, - }; + struct afs_addr_list *alist; struct afs_call *call; + unsigned long unprobed; + unsigned int index, i; bool in_progress = false; + int best_prio; _enter("%s", server->name); read_lock(&server->lock); - ac.alist = rcu_dereference_protected(server->addresses, - lockdep_is_held(&server->lock)); + alist = rcu_dereference_protected(server->addresses, + lockdep_is_held(&server->lock)); + afs_get_addrlist(alist, afs_alist_trace_get_vlprobe); read_unlock(&server->lock); - atomic_set(&server->probe_outstanding, ac.alist->nr_addrs); + atomic_set(&server->probe_outstanding, alist->nr_addrs); memset(&server->probe, 0, sizeof(server->probe)); server->probe.rtt = UINT_MAX; - for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) { - call = afs_vl_get_capabilities(net, &ac, key, server, + unprobed = (1UL << alist->nr_addrs) - 1; + while (unprobed) { + best_prio = -1; + index = 0; + for (i = 0; i < alist->nr_addrs; i++) { + if (test_bit(i, &unprobed) && + alist->addrs[i].prio > best_prio) { + index = i; + best_prio = alist->addrs[i].prio; + } + } + __clear_bit(index, &unprobed); + + trace_afs_vl_probe(server, true, alist, index, 0, 0, 0); + call = afs_vl_get_capabilities(net, alist, index, key, server, server_index); if (!IS_ERR(call)) { + afs_prioritise_error(_e, call->error, call->abort_code); afs_put_call(call); in_progress = true; } else { - afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code); + afs_prioritise_error(_e, PTR_ERR(call), 0); afs_done_one_vl_probe(server, false); } } + afs_put_addrlist(alist, afs_alist_trace_put_vlprobe); return in_progress; } @@ -185,12 +205,10 @@ int afs_send_vl_probes(struct afs_net *net, struct key *key, struct afs_vlserver_list *vllist) { struct afs_vlserver *server; - struct afs_error e; + struct afs_error e = {}; bool in_progress = false; int i; - e.error = 0; - e.responded = false; for (i = 0; i < vllist->nr_servers; i++) { server = vllist->servers[i].server; if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags)) diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index eb415ce56360..d8f79f6ada3d 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -17,18 +17,21 @@ bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell, struct key *key) { + static atomic_t debug_ids; + memset(vc, 0, sizeof(*vc)); vc->cell = cell; vc->key = key; - vc->error = -EDESTADDRREQ; - vc->ac.error = SHRT_MAX; + vc->cumul_error.error = -EDESTADDRREQ; + vc->nr_iterations = -1; if (signal_pending(current)) { - vc->error = -EINTR; + vc->cumul_error.error = -EINTR; vc->flags |= AFS_VL_CURSOR_STOP; return false; } + vc->debug_id = atomic_inc_return(&debug_ids); return true; } @@ -52,7 +55,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) &cell->dns_lookup_count, smp_load_acquire(&cell->dns_lookup_count) != dns_lookup_count) < 0) { - vc->error = -ERESTARTSYS; + vc->cumul_error.error = -ERESTARTSYS; return false; } } @@ -60,12 +63,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) /* Status load is ordered after lookup counter load */ if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) { pr_warn("No record of cell %s\n", cell->name); - vc->error = -ENOENT; + vc->cumul_error.error = -ENOENT; return false; } if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { - vc->error = -EDESTADDRREQ; + vc->cumul_error.error = -EDESTADDRREQ; return false; } } @@ -78,8 +81,8 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) if (!vc->server_list->nr_servers) return false; - vc->untried = (1UL << vc->server_list->nr_servers) - 1; - vc->index = -1; + vc->untried_servers = (1UL << vc->server_list->nr_servers) - 1; + vc->server_index = -1; return true; } @@ -89,54 +92,57 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) */ bool afs_select_vlserver(struct afs_vl_cursor *vc) { - struct afs_addr_list *alist; + struct afs_addr_list *alist = vc->alist; struct afs_vlserver *vlserver; - struct afs_error e; - u32 rtt; - int error = vc->ac.error, i; + unsigned long set, failed; + unsigned int rtt; + s32 abort_code = vc->call_abort_code; + int error = vc->call_error, i; - _enter("%lx[%d],%lx[%d],%d,%d", - vc->untried, vc->index, - vc->ac.tried, vc->ac.index, - error, vc->ac.abort_code); + vc->nr_iterations++; + + _enter("VC=%x+%x,%d{%lx},%d{%lx},%d,%d", + vc->debug_id, vc->nr_iterations, vc->server_index, vc->untried_servers, + vc->addr_index, vc->addr_tried, + error, abort_code); if (vc->flags & AFS_VL_CURSOR_STOP) { _leave(" = f [stopped]"); return false; } - vc->nr_iterations++; + if (vc->nr_iterations == 0) + goto start; + + WRITE_ONCE(alist->addrs[vc->addr_index].last_error, error); /* Evaluate the result of the previous operation, if there was one. */ switch (error) { - case SHRT_MAX: - goto start; - default: case 0: /* Success or local failure. Stop. */ - vc->error = error; + vc->cumul_error.error = error; vc->flags |= AFS_VL_CURSOR_STOP; - _leave(" = f [okay/local %d]", vc->ac.error); + _leave(" = f [okay/local %d]", vc->cumul_error.error); return false; case -ECONNABORTED: /* The far side rejected the operation on some grounds. This * might involve the server being busy or the volume having been moved. */ - switch (vc->ac.abort_code) { + switch (abort_code) { case AFSVL_IO: case AFSVL_BADVOLOPER: case AFSVL_NOMEM: /* The server went weird. */ - vc->error = -EREMOTEIO; + afs_prioritise_error(&vc->cumul_error, -EREMOTEIO, abort_code); //write_lock(&vc->cell->vl_servers_lock); - //vc->server_list->weird_mask |= 1 << vc->index; + //vc->server_list->weird_mask |= 1 << vc->server_index; //write_unlock(&vc->cell->vl_servers_lock); goto next_server; default: - vc->error = afs_abort_to_error(vc->ac.abort_code); + afs_prioritise_error(&vc->cumul_error, error, abort_code); goto failed; } @@ -149,12 +155,12 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) case -ETIMEDOUT: case -ETIME: _debug("no conn %d", error); - vc->error = error; + afs_prioritise_error(&vc->cumul_error, error, 0); goto iterate_address; case -ECONNRESET: _debug("call reset"); - vc->error = error; + afs_prioritise_error(&vc->cumul_error, error, 0); vc->flags |= AFS_VL_CURSOR_RETRY; goto next_server; @@ -165,7 +171,13 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) restart_from_beginning: _debug("restart"); - afs_end_cursor(&vc->ac); + if (vc->call_responded && + vc->addr_index != vc->alist->preferred && + test_bit(alist->preferred, &vc->addr_tried)) + WRITE_ONCE(alist->preferred, vc->addr_index); + afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_restart); + alist = vc->alist = NULL; + afs_put_vlserverlist(vc->cell->net, vc->server_list); vc->server_list = NULL; if (vc->flags & AFS_VL_CURSOR_RETRIED) @@ -173,53 +185,58 @@ restart_from_beginning: vc->flags |= AFS_VL_CURSOR_RETRIED; start: _debug("start"); + ASSERTCMP(alist, ==, NULL); if (!afs_start_vl_iteration(vc)) goto failed; error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list); - if (error < 0) - goto failed_set_error; + if (error < 0) { + afs_prioritise_error(&vc->cumul_error, error, 0); + goto failed; + } pick_server: - _debug("pick [%lx]", vc->untried); + _debug("pick [%lx]", vc->untried_servers); + ASSERTCMP(alist, ==, NULL); - error = afs_wait_for_vl_probes(vc->server_list, vc->untried); - if (error < 0) - goto failed_set_error; + error = afs_wait_for_vl_probes(vc->server_list, vc->untried_servers); + if (error < 0) { + afs_prioritise_error(&vc->cumul_error, error, 0); + goto failed; + } /* Pick the untried server with the lowest RTT. */ - vc->index = vc->server_list->preferred; - if (test_bit(vc->index, &vc->untried)) + vc->server_index = vc->server_list->preferred; + if (test_bit(vc->server_index, &vc->untried_servers)) goto selected_server; - vc->index = -1; - rtt = U32_MAX; + vc->server_index = -1; + rtt = UINT_MAX; for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; - if (!test_bit(i, &vc->untried) || + if (!test_bit(i, &vc->untried_servers) || !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) continue; - if (s->probe.rtt < rtt) { - vc->index = i; + if (s->probe.rtt <= rtt) { + vc->server_index = i; rtt = s->probe.rtt; } } - if (vc->index == -1) + if (vc->server_index == -1) goto no_more_servers; selected_server: - _debug("use %d", vc->index); - __clear_bit(vc->index, &vc->untried); + _debug("use %d", vc->server_index); + __clear_bit(vc->server_index, &vc->untried_servers); /* We're starting on a different vlserver from the list. We need to * check it, find its address list and probe its capabilities before we * use it. */ - ASSERTCMP(vc->ac.alist, ==, NULL); - vlserver = vc->server_list->servers[vc->index].server; + vlserver = vc->server_list->servers[vc->server_index].server; vc->server = vlserver; _debug("USING VLSERVER: %s", vlserver->name); @@ -227,34 +244,48 @@ selected_server: read_lock(&vlserver->lock); alist = rcu_dereference_protected(vlserver->addresses, lockdep_is_held(&vlserver->lock)); - afs_get_addrlist(alist); + vc->alist = afs_get_addrlist(alist, afs_alist_trace_get_vlrotate_set); read_unlock(&vlserver->lock); - memset(&vc->ac, 0, sizeof(vc->ac)); - - if (!vc->ac.alist) - vc->ac.alist = alist; - else - afs_put_addrlist(alist); - - vc->ac.index = -1; + vc->addr_tried = 0; + vc->addr_index = -1; iterate_address: - ASSERT(vc->ac.alist); /* Iterate over the current server's address list to try and find an * address on which it will respond to us. */ - if (!afs_iterate_addresses(&vc->ac)) + set = READ_ONCE(alist->responded); + failed = READ_ONCE(alist->probe_failed); + vc->addr_index = READ_ONCE(alist->preferred); + + _debug("%lx-%lx-%lx,%d", set, failed, vc->addr_tried, vc->addr_index); + + set &= ~(failed | vc->addr_tried); + + if (!set) goto next_server; - _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs); + if (!test_bit(vc->addr_index, &set)) + vc->addr_index = __ffs(set); - _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport); + set_bit(vc->addr_index, &vc->addr_tried); + vc->alist = alist; + + _debug("VL address %d/%d", vc->addr_index, alist->nr_addrs); + + vc->call_responded = false; + _leave(" = t %pISpc", rxrpc_kernel_remote_addr(alist->addrs[vc->addr_index].peer)); return true; next_server: _debug("next"); - afs_end_cursor(&vc->ac); + ASSERT(alist); + if (vc->call_responded && + vc->addr_index != alist->preferred && + test_bit(alist->preferred, &vc->addr_tried)) + WRITE_ONCE(alist->preferred, vc->addr_index); + afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_next); + alist = vc->alist = NULL; goto pick_server; no_more_servers: @@ -264,25 +295,26 @@ no_more_servers: if (vc->flags & AFS_VL_CURSOR_RETRY) goto restart_from_beginning; - e.error = -EDESTADDRREQ; - e.responded = false; for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) - e.responded = true; - afs_prioritise_error(&e, READ_ONCE(s->probe.error), + vc->cumul_error.responded = true; + afs_prioritise_error(&vc->cumul_error, READ_ONCE(s->probe.error), s->probe.abort_code); } - error = e.error; - -failed_set_error: - vc->error = error; failed: + if (alist) { + if (vc->call_responded && + vc->addr_index != alist->preferred && + test_bit(alist->preferred, &vc->addr_tried)) + WRITE_ONCE(alist->preferred, vc->addr_index); + afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_fail); + alist = vc->alist = NULL; + } vc->flags |= AFS_VL_CURSOR_STOP; - afs_end_cursor(&vc->ac); - _leave(" = f [failed %d]", vc->error); + _leave(" = f [failed %d]", vc->cumul_error.error); return false; } @@ -305,7 +337,10 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) pr_notice("DNS: src=%u st=%u lc=%x\n", cell->dns_source, cell->dns_status, cell->dns_lookup_count); pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n", - vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error); + vc->untried_servers, vc->server_index, vc->nr_iterations, + vc->flags, vc->cumul_error.error); + pr_notice("VC: call er=%d ac=%d r=%u\n", + vc->call_error, vc->call_abort_code, vc->call_responded); if (vc->server_list) { const struct afs_vlserver_list *sl = vc->server_list; @@ -322,16 +357,14 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) a->nr_ipv4, a->nr_addrs, a->max_addrs, a->preferred); pr_notice("VC: - R=%lx F=%lx\n", - a->responded, a->failed); - if (a == vc->ac.alist) + a->responded, a->probe_failed); + if (a == vc->alist) pr_notice("VC: - current\n"); } } } - pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", - vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error, - vc->ac.responded, vc->ac.nr_iterations); + pr_notice("AC: t=%lx ax=%u\n", vc->addr_tried, vc->addr_index); rcu_read_unlock(); } @@ -342,17 +375,25 @@ int afs_end_vlserver_operation(struct afs_vl_cursor *vc) { struct afs_net *net = vc->cell->net; - if (vc->error == -EDESTADDRREQ || - vc->error == -EADDRNOTAVAIL || - vc->error == -ENETUNREACH || - vc->error == -EHOSTUNREACH) + _enter("VC=%x+%x", vc->debug_id, vc->nr_iterations); + + switch (vc->cumul_error.error) { + case -EDESTADDRREQ: + case -EADDRNOTAVAIL: + case -ENETUNREACH: + case -EHOSTUNREACH: afs_vl_dump_edestaddrreq(vc); + break; + } - afs_end_cursor(&vc->ac); + if (vc->alist) { + if (vc->call_responded && + vc->addr_index != vc->alist->preferred && + test_bit(vc->alist->preferred, &vc->addr_tried)) + WRITE_ONCE(vc->alist->preferred, vc->addr_index); + afs_put_addrlist(vc->alist, afs_alist_trace_put_vlrotate_end); + vc->alist = NULL; + } afs_put_vlserverlist(net, vc->server_list); - - if (vc->error == -ECONNABORTED) - vc->error = afs_abort_to_error(vc->ac.abort_code); - - return vc->error; + return vc->cumul_error.error; } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 00fca3c66ba6..cac75f89b64a 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -18,8 +18,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) { struct afs_uvldbentry__xdr *uvldb; struct afs_vldb_entry *entry; - bool new_only = false; - u32 tmp, nr_servers, vlflags; + u32 nr_servers, vlflags; int i, ret; _enter(""); @@ -41,27 +40,14 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) entry->name[i] = 0; entry->name_len = strlen(entry->name); - /* If there is a new replication site that we can use, ignore all the - * sites that aren't marked as new. - */ - for (i = 0; i < nr_servers; i++) { - tmp = ntohl(uvldb->serverFlags[i]); - if (!(tmp & AFS_VLSF_DONTUSE) && - (tmp & AFS_VLSF_NEWREPSITE)) - new_only = true; - } - vlflags = ntohl(uvldb->flags); for (i = 0; i < nr_servers; i++) { struct afs_uuid__xdr *xdr; struct afs_uuid *uuid; + u32 tmp = ntohl(uvldb->serverFlags[i]); int j; int n = entry->nr_servers; - tmp = ntohl(uvldb->serverFlags[i]); - if (tmp & AFS_VLSF_DONTUSE || - (new_only && !(tmp & AFS_VLSF_NEWREPSITE))) - continue; if (tmp & AFS_VLSF_RWVOL) { entry->fs_mask[n] |= AFS_VOL_VTM_RW; if (vlflags & AFS_VLF_BACKEXISTS) @@ -82,6 +68,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) for (j = 0; j < 6; j++) uuid->node[j] = (u8)ntohl(xdr->node[j]); + entry->vlsf_flags[n] = tmp; entry->addr_version[n] = ntohl(uvldb->serverUnique[i]); entry->nr_servers++; } @@ -106,12 +93,6 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) return 0; } -static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call) -{ - kfree(call->ret_vldb); - afs_flat_call_destructor(call); -} - /* * VL.GetEntryByNameU operation type. */ @@ -119,7 +100,7 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = { .name = "VL.GetEntryByNameU", .op = afs_VL_GetEntryByNameU, .deliver = afs_deliver_vl_get_entry_by_name_u, - .destructor = afs_destroy_vl_get_entry_by_name_u, + .destructor = afs_flat_call_destructor, }; /* @@ -155,6 +136,8 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc, call->key = vc->key; call->ret_vldb = entry; call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* Marshall the parameters */ bp = call->request; @@ -165,8 +148,17 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc, memset((void *)bp + volnamesz, 0, padsz); trace_afs_make_vl_call(call); - afs_make_call(&vc->ac, call, GFP_KERNEL); - return (struct afs_vldb_entry *)afs_wait_for_call_to_complete(call, &vc->ac); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + afs_put_call(call); + if (vc->call_error) { + kfree(entry); + return ERR_PTR(vc->call_error); + } + return entry; } /* @@ -208,7 +200,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) count = ntohl(*bp); nentries = min(nentries, count); - alist = afs_alloc_addrlist(nentries, FS_SERVICE, AFS_FS_PORT); + alist = afs_alloc_addrlist(nentries); if (!alist) return -ENOMEM; alist->version = uniquifier; @@ -230,9 +222,13 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) alist = call->ret_alist; bp = call->buffer; count = min(call->count, 4U); - for (i = 0; i < count; i++) - if (alist->nr_addrs < call->count2) - afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT); + for (i = 0; i < count; i++) { + if (alist->nr_addrs < call->count2) { + ret = afs_merge_fs_addr4(call->net, alist, *bp++, AFS_FS_PORT); + if (ret < 0) + return ret; + } + } call->count -= count; if (call->count > 0) @@ -245,12 +241,6 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) return 0; } -static void afs_vl_get_addrs_u_destructor(struct afs_call *call) -{ - afs_put_addrlist(call->ret_alist); - return afs_flat_call_destructor(call); -} - /* * VL.GetAddrsU operation type. */ @@ -258,7 +248,7 @@ static const struct afs_call_type afs_RXVLGetAddrsU = { .name = "VL.GetAddrsU", .op = afs_VL_GetAddrsU, .deliver = afs_deliver_vl_get_addrs_u, - .destructor = afs_vl_get_addrs_u_destructor, + .destructor = afs_flat_call_destructor, }; /* @@ -269,6 +259,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, const uuid_t *uuid) { struct afs_ListAddrByAttributes__xdr *r; + struct afs_addr_list *alist; const struct afs_uuid *u = (const struct afs_uuid *)uuid; struct afs_call *call; struct afs_net *net = vc->cell->net; @@ -286,6 +277,8 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, call->key = vc->key; call->ret_alist = NULL; call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* Marshall the parameters */ bp = call->request; @@ -304,8 +297,18 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, r->uuid.node[i] = htonl(u->node[i]); trace_afs_make_vl_call(call); - afs_make_call(&vc->ac, call, GFP_KERNEL); - return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + alist = call->ret_alist; + afs_put_call(call); + if (vc->call_error) { + afs_put_addrlist(alist, afs_alist_trace_put_getaddru); + return ERR_PTR(vc->call_error); + } + return alist; } /* @@ -355,6 +358,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call) static void afs_destroy_vl_get_capabilities(struct afs_call *call) { + afs_put_addrlist(call->vl_probe, afs_alist_trace_put_vlgetcaps); afs_put_vlserver(call->net, call->vlserver); afs_flat_call_destructor(call); } @@ -378,7 +382,8 @@ static const struct afs_call_type afs_RXVLGetCapabilities = { * other end supports. */ struct afs_call *afs_vl_get_capabilities(struct afs_net *net, - struct afs_addr_cursor *ac, + struct afs_addr_list *alist, + unsigned int addr_index, struct key *key, struct afs_vlserver *server, unsigned int server_index) @@ -395,6 +400,10 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net, call->key = key; call->vlserver = afs_get_vlserver(server); call->server_index = server_index; + call->peer = rxrpc_kernel_get_peer(alist->addrs[addr_index].peer); + call->vl_probe = afs_get_addrlist(alist, afs_alist_trace_get_vlgetcaps); + call->probe_index = addr_index; + call->service_id = server->service_id; call->upgrade = true; call->async = true; call->max_lifespan = AFS_PROBE_MAX_LIFESPAN; @@ -405,7 +414,7 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net, /* Can't take a ref on server */ trace_afs_make_vl_call(call); - afs_make_call(ac, call, GFP_KERNEL); + afs_make_call(call, GFP_KERNEL); return call; } @@ -450,7 +459,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) if (call->count > YFS_MAXENDPOINTS) return afs_protocol_error(call, afs_eproto_yvl_fsendpt_num); - alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT); + alist = afs_alloc_addrlist(call->count); if (!alist) return -ENOMEM; alist->version = uniquifier; @@ -488,14 +497,18 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) if (ntohl(bp[0]) != sizeof(__be32) * 2) return afs_protocol_error( call, afs_eproto_yvl_fsendpt4_len); - afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2])); + ret = afs_merge_fs_addr4(call->net, alist, bp[1], ntohl(bp[2])); + if (ret < 0) + return ret; bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) return afs_protocol_error( call, afs_eproto_yvl_fsendpt6_len); - afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5])); + ret = afs_merge_fs_addr6(call->net, alist, bp + 1, ntohl(bp[5])); + if (ret < 0) + return ret; bp += 6; break; default: @@ -610,7 +623,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = { .name = "YFSVL.GetEndpoints", .op = afs_YFSVL_GetEndpoints, .deliver = afs_deliver_yfsvl_get_endpoints, - .destructor = afs_vl_get_addrs_u_destructor, + .destructor = afs_flat_call_destructor, }; /* @@ -620,6 +633,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = { struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, const uuid_t *uuid) { + struct afs_addr_list *alist; struct afs_call *call; struct afs_net *net = vc->cell->net; __be32 *bp; @@ -635,6 +649,8 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, call->key = vc->key; call->ret_alist = NULL; call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* Marshall the parameters */ bp = call->request; @@ -643,8 +659,18 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */ trace_afs_make_vl_call(call); - afs_make_call(&vc->ac, call, GFP_KERNEL); - return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + alist = call->ret_alist; + afs_put_call(call); + if (vc->call_error) { + afs_put_addrlist(alist, afs_alist_trace_put_getaddru); + return ERR_PTR(vc->call_error); + } + return alist; } /* @@ -709,12 +735,6 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) return 0; } -static void afs_destroy_yfsvl_get_cell_name(struct afs_call *call) -{ - kfree(call->ret_str); - afs_flat_call_destructor(call); -} - /* * VL.GetCapabilities operation type */ @@ -722,7 +742,7 @@ static const struct afs_call_type afs_YFSVLGetCellName = { .name = "YFSVL.GetCellName", .op = afs_YFSVL_GetCellName, .deliver = afs_deliver_yfsvl_get_cell_name, - .destructor = afs_destroy_yfsvl_get_cell_name, + .destructor = afs_flat_call_destructor, }; /* @@ -737,6 +757,7 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) struct afs_call *call; struct afs_net *net = vc->cell->net; __be32 *bp; + char *cellname; _enter(""); @@ -747,6 +768,8 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) call->key = vc->key; call->ret_str = NULL; call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* marshall the parameters */ bp = call->request; @@ -754,6 +777,16 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) /* Can't take a ref on server */ trace_afs_make_vl_call(call); - afs_make_call(&vc->ac, call, GFP_KERNEL); - return (char *)afs_wait_for_call_to_complete(call, &vc->ac); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + cellname = call->ret_str; + afs_put_call(call); + if (vc->call_error) { + kfree(cellname); + return ERR_PTR(vc->call_error); + } + return cellname; } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 115c081a8e2c..020ecd45e476 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -11,6 +11,8 @@ static unsigned __read_mostly afs_volume_record_life = 60 * 60; +static void afs_destroy_volume(struct work_struct *work); + /* * Insert a volume into a cell. If there's an existing volume record, that is * returned instead with a ref held. @@ -72,11 +74,11 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume) */ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, struct afs_vldb_entry *vldb, - unsigned long type_mask) + struct afs_server_list **_slist) { struct afs_server_list *slist; struct afs_volume *volume; - int ret = -ENOMEM; + int ret = -ENOMEM, i; volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); if (!volume) @@ -88,20 +90,30 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, volume->type = params->type; volume->type_force = params->force; volume->name_len = vldb->name_len; + volume->creation_time = TIME64_MIN; + volume->update_time = TIME64_MIN; refcount_set(&volume->ref, 1); INIT_HLIST_NODE(&volume->proc_link); + INIT_WORK(&volume->destructor, afs_destroy_volume); rwlock_init(&volume->servers_lock); + mutex_init(&volume->volsync_lock); + mutex_init(&volume->cb_check_lock); rwlock_init(&volume->cb_v_break_lock); + INIT_LIST_HEAD(&volume->open_mmaps); + init_rwsem(&volume->open_mmaps_lock); memcpy(volume->name, vldb->name, vldb->name_len + 1); - slist = afs_alloc_server_list(params->cell, params->key, vldb, type_mask); + for (i = 0; i < AFS_MAXTYPES; i++) + volume->vids[i] = vldb->vid[i]; + + slist = afs_alloc_server_list(volume, params->key, vldb); if (IS_ERR(slist)) { ret = PTR_ERR(slist); goto error_1; } - refcount_set(&slist->usage, 1); + *_slist = slist; rcu_assign_pointer(volume->servers, slist); trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc); return volume; @@ -117,18 +129,20 @@ error_0: * Look up or allocate a volume record. */ static struct afs_volume *afs_lookup_volume(struct afs_fs_context *params, - struct afs_vldb_entry *vldb, - unsigned long type_mask) + struct afs_vldb_entry *vldb) { + struct afs_server_list *slist; struct afs_volume *candidate, *volume; - candidate = afs_alloc_volume(params, vldb, type_mask); + candidate = afs_alloc_volume(params, vldb, &slist); if (IS_ERR(candidate)) return candidate; volume = afs_insert_volume_into_cell(params->cell, candidate); - if (volume != candidate) - afs_put_volume(params->net, candidate, afs_volume_trace_put_cell_dup); + if (volume == candidate) + afs_attach_volume_to_servers(volume, slist); + else + afs_put_volume(candidate, afs_volume_trace_put_cell_dup); return volume; } @@ -208,8 +222,7 @@ struct afs_volume *afs_create_volume(struct afs_fs_context *params) goto error; } - type_mask = 1UL << params->type; - volume = afs_lookup_volume(params, vldb, type_mask); + volume = afs_lookup_volume(params, vldb); error: kfree(vldb); @@ -219,16 +232,20 @@ error: /* * Destroy a volume record */ -static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) +static void afs_destroy_volume(struct work_struct *work) { + struct afs_volume *volume = container_of(work, struct afs_volume, destructor); + struct afs_server_list *slist = rcu_access_pointer(volume->servers); + _enter("%p", volume); #ifdef CONFIG_AFS_FSCACHE ASSERTCMP(volume->cache, ==, NULL); #endif + afs_detach_volume_from_servers(volume, slist); afs_remove_volume_from_cell(volume); - afs_put_serverlist(net, rcu_access_pointer(volume->servers)); + afs_put_serverlist(volume->cell->net, slist); afs_put_cell(volume->cell, afs_cell_trace_put_vol); trace_afs_volume(volume->vid, refcount_read(&volume->ref), afs_volume_trace_free); @@ -270,8 +287,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume, /* * Drop a reference on a volume record. */ -void afs_put_volume(struct afs_net *net, struct afs_volume *volume, - enum afs_volume_trace reason) +void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason) { if (volume) { afs_volid_t vid = volume->vid; @@ -281,7 +297,7 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume, zero = __refcount_dec_and_test(&volume->ref, &r); trace_afs_volume(vid, r - 1, reason); if (zero) - afs_destroy_volume(net, volume); + schedule_work(&volume->destructor); } } @@ -362,8 +378,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) } /* See if the volume's server list got updated. */ - new = afs_alloc_server_list(volume->cell, key, - vldb, (1 << volume->type)); + new = afs_alloc_server_list(volume, key, vldb); if (IS_ERR(new)) { ret = PTR_ERR(new); goto error_vldb; @@ -382,11 +397,17 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) discard = old; } - volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; + /* Check more often if replication is ongoing. */ + if (new->ro_replicating) + volume->update_at = ktime_get_real_seconds() + 10 * 60; + else + volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; write_unlock(&volume->servers_lock); - ret = 0; + if (discard == old) + afs_reattach_volume_to_servers(volume, new, old); afs_put_serverlist(volume->cell->net, discard); + ret = 0; error_vldb: kfree(vldb); error: diff --git a/fs/afs/write.c b/fs/afs/write.c index e87b52b1f34c..61d34ad2ca7d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -366,7 +366,7 @@ static void afs_store_data_success(struct afs_operation *op) op->ctime = op->file[0].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[0]); - if (op->error == 0) { + if (!afs_op_error(op)) { if (!op->store.laundering) afs_pages_written_back(vnode, op->store.pos, op->store.size); afs_stat_v(vnode, n_stores); @@ -428,7 +428,7 @@ try_next_key: afs_wait_for_operation(op); - switch (op->error) { + switch (afs_op_error(op)) { case -EACCES: case -EPERM: case -ENOKEY: @@ -447,7 +447,7 @@ try_next_key: } afs_put_wb_key(wbk); - _leave(" = %d", op->error); + _leave(" = %d", afs_op_error(op)); return afs_put_operation(op); } diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 11571cca86c1..f521e66d3bf6 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -245,12 +245,15 @@ static void xdr_decode_YFSVolSync(const __be32 **_bp, struct afs_volsync *volsync) { struct yfs_xdr_YFSVolSync *x = (void *)*_bp; - u64 creation; + u64 creation, update; if (volsync) { creation = xdr_to_u64(x->vol_creation_date); do_div(creation, 10 * 1000 * 1000); volsync->creation = creation; + update = xdr_to_u64(x->vol_update_date); + do_div(update, 10 * 1000 * 1000); + volsync->update = update; } *_bp += xdr_size(x); @@ -490,6 +493,7 @@ void yfs_fs_fetch_data(struct afs_operation *op) bp = xdr_encode_u64(bp, req->len); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -572,6 +576,7 @@ void yfs_fs_create_file(struct afs_operation *op) bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */ yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -620,6 +625,7 @@ void yfs_fs_make_dir(struct afs_operation *op) bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -704,6 +710,7 @@ void yfs_fs_remove_file2(struct afs_operation *op) bp = xdr_encode_name(bp, name); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -773,6 +780,7 @@ void yfs_fs_remove_file(struct afs_operation *op) bp = xdr_encode_name(bp, name); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -814,6 +822,7 @@ void yfs_fs_remove_dir(struct afs_operation *op) bp = xdr_encode_name(bp, name); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -887,6 +896,7 @@ void yfs_fs_link(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call1(call, &vp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -968,6 +978,7 @@ void yfs_fs_symlink(struct afs_operation *op) bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -1047,6 +1058,7 @@ void yfs_fs_rename(struct afs_operation *op) bp = xdr_encode_name(bp, new_name); yfs_check_req(call, bp); + call->fid = orig_dvp->fid; trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); afs_make_op_call(op, call, GFP_NOFS); } @@ -1102,6 +1114,7 @@ void yfs_fs_store_data(struct afs_operation *op) bp = xdr_encode_u64(bp, op->store.i_size); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1158,6 +1171,7 @@ static void yfs_fs_setattr_size(struct afs_operation *op) bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */ yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1196,6 +1210,7 @@ void yfs_fs_setattr(struct afs_operation *op) bp = xdr_encode_YFS_StoreStatus(bp, attr); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1366,6 +1381,7 @@ void yfs_fs_get_volume_status(struct afs_operation *op) bp = xdr_encode_u64(bp, vp->fid.vid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1430,6 +1446,7 @@ void yfs_fs_set_lock(struct afs_operation *op) bp = xdr_encode_u32(bp, op->lock.type); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_calli(call, &vp->fid, op->lock.type); afs_make_op_call(op, call, GFP_NOFS); } @@ -1460,6 +1477,7 @@ void yfs_fs_extend_lock(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1490,6 +1508,7 @@ void yfs_fs_release_lock(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1556,6 +1575,7 @@ void yfs_fs_fetch_status(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1736,6 +1756,7 @@ void yfs_fs_inline_bulk_status(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &op->more_files[i].fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1898,6 +1919,7 @@ void yfs_fs_fetch_opaque_acl(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_KERNEL); } @@ -1948,6 +1970,7 @@ void yfs_fs_store_opaque_acl2(struct afs_operation *op) bp += size / sizeof(__be32); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_KERNEL); } diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 5531dd08061e..0754c463224a 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -15,6 +15,7 @@ struct key; struct sock; struct socket; struct rxrpc_call; +struct rxrpc_peer; enum rxrpc_abort_reason; enum rxrpc_interruptibility { @@ -41,13 +42,14 @@ void rxrpc_kernel_new_call_notification(struct socket *, rxrpc_notify_new_call_t, rxrpc_discard_new_call_t); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, - struct sockaddr_rxrpc *srx, + struct rxrpc_peer *peer, struct key *key, unsigned long user_call_ID, s64 tx_total_len, u32 hard_timeout, gfp_t gfp, rxrpc_notify_rx_t notify_rx, + u16 service_id, bool upgrade, enum rxrpc_interruptibility interruptibility, unsigned int debug_id); @@ -60,9 +62,14 @@ bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, enum rxrpc_abort_reason); void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call); void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call); -void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, - struct sockaddr_rxrpc *); -bool rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *, u32 *); +struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, + struct sockaddr_rxrpc *srx, gfp_t gfp); +void rxrpc_kernel_put_peer(struct rxrpc_peer *peer); +struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer); +struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call); +const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer); +const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer); +unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t, unsigned int); diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index e9d412d19dbb..5194b7e6dc8d 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -18,97 +18,6 @@ #ifndef __AFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __AFS_DECLARE_TRACE_ENUMS_ONCE_ONLY -enum afs_call_trace { - afs_call_trace_alloc, - afs_call_trace_free, - afs_call_trace_get, - afs_call_trace_put, - afs_call_trace_wake, - afs_call_trace_work, -}; - -enum afs_server_trace { - afs_server_trace_alloc, - afs_server_trace_callback, - afs_server_trace_destroy, - afs_server_trace_free, - afs_server_trace_gc, - afs_server_trace_get_by_addr, - afs_server_trace_get_by_uuid, - afs_server_trace_get_caps, - afs_server_trace_get_install, - afs_server_trace_get_new_cbi, - afs_server_trace_get_probe, - afs_server_trace_give_up_cb, - afs_server_trace_purging, - afs_server_trace_put_call, - afs_server_trace_put_cbi, - afs_server_trace_put_find_rsq, - afs_server_trace_put_probe, - afs_server_trace_put_slist, - afs_server_trace_put_slist_isort, - afs_server_trace_put_uuid_rsq, - afs_server_trace_update, -}; - - -enum afs_volume_trace { - afs_volume_trace_alloc, - afs_volume_trace_free, - afs_volume_trace_get_alloc_sbi, - afs_volume_trace_get_cell_insert, - afs_volume_trace_get_new_op, - afs_volume_trace_get_query_alias, - afs_volume_trace_put_cell_dup, - afs_volume_trace_put_cell_root, - afs_volume_trace_put_destroy_sbi, - afs_volume_trace_put_free_fc, - afs_volume_trace_put_put_op, - afs_volume_trace_put_query_alias, - afs_volume_trace_put_validate_fc, - afs_volume_trace_remove, -}; - -enum afs_cell_trace { - afs_cell_trace_alloc, - afs_cell_trace_free, - afs_cell_trace_get_queue_dns, - afs_cell_trace_get_queue_manage, - afs_cell_trace_get_queue_new, - afs_cell_trace_get_vol, - afs_cell_trace_insert, - afs_cell_trace_manage, - afs_cell_trace_put_candidate, - afs_cell_trace_put_destroy, - afs_cell_trace_put_queue_fail, - afs_cell_trace_put_queue_work, - afs_cell_trace_put_vol, - afs_cell_trace_see_source, - afs_cell_trace_see_ws, - afs_cell_trace_unuse_alias, - afs_cell_trace_unuse_check_alias, - afs_cell_trace_unuse_delete, - afs_cell_trace_unuse_fc, - afs_cell_trace_unuse_lookup, - afs_cell_trace_unuse_mntpt, - afs_cell_trace_unuse_no_pin, - afs_cell_trace_unuse_parse, - afs_cell_trace_unuse_pin, - afs_cell_trace_unuse_probe, - afs_cell_trace_unuse_sbi, - afs_cell_trace_unuse_ws, - afs_cell_trace_use_alias, - afs_cell_trace_use_check_alias, - afs_cell_trace_use_fc, - afs_cell_trace_use_fc_alias, - afs_cell_trace_use_lookup, - afs_cell_trace_use_mntpt, - afs_cell_trace_use_pin, - afs_cell_trace_use_probe, - afs_cell_trace_use_sbi, - afs_cell_trace_wait, -}; - enum afs_fs_operation { afs_FS_FetchData = 130, /* AFS Fetch file data */ afs_FS_FetchACL = 131, /* AFS Fetch file ACL */ @@ -202,121 +111,6 @@ enum yfs_cm_operation { yfs_CB_CallBack = 64204, }; -enum afs_edit_dir_op { - afs_edit_dir_create, - afs_edit_dir_create_error, - afs_edit_dir_create_inval, - afs_edit_dir_create_nospc, - afs_edit_dir_delete, - afs_edit_dir_delete_error, - afs_edit_dir_delete_inval, - afs_edit_dir_delete_noent, -}; - -enum afs_edit_dir_reason { - afs_edit_dir_for_create, - afs_edit_dir_for_link, - afs_edit_dir_for_mkdir, - afs_edit_dir_for_rename_0, - afs_edit_dir_for_rename_1, - afs_edit_dir_for_rename_2, - afs_edit_dir_for_rmdir, - afs_edit_dir_for_silly_0, - afs_edit_dir_for_silly_1, - afs_edit_dir_for_symlink, - afs_edit_dir_for_unlink, -}; - -enum afs_eproto_cause { - afs_eproto_bad_status, - afs_eproto_cb_count, - afs_eproto_cb_fid_count, - afs_eproto_cellname_len, - afs_eproto_file_type, - afs_eproto_ibulkst_cb_count, - afs_eproto_ibulkst_count, - afs_eproto_motd_len, - afs_eproto_offline_msg_len, - afs_eproto_volname_len, - afs_eproto_yvl_fsendpt4_len, - afs_eproto_yvl_fsendpt6_len, - afs_eproto_yvl_fsendpt_num, - afs_eproto_yvl_fsendpt_type, - afs_eproto_yvl_vlendpt4_len, - afs_eproto_yvl_vlendpt6_len, - afs_eproto_yvl_vlendpt_type, -}; - -enum afs_io_error { - afs_io_error_cm_reply, - afs_io_error_extract, - afs_io_error_fs_probe_fail, - afs_io_error_vl_lookup_fail, - afs_io_error_vl_probe_fail, -}; - -enum afs_file_error { - afs_file_error_dir_bad_magic, - afs_file_error_dir_big, - afs_file_error_dir_missing_page, - afs_file_error_dir_name_too_long, - afs_file_error_dir_over_end, - afs_file_error_dir_small, - afs_file_error_dir_unmarked_ext, - afs_file_error_mntpt, - afs_file_error_writeback_fail, -}; - -enum afs_flock_event { - afs_flock_acquired, - afs_flock_callback_break, - afs_flock_defer_unlock, - afs_flock_extend_fail, - afs_flock_fail_other, - afs_flock_fail_perm, - afs_flock_no_lockers, - afs_flock_release_fail, - afs_flock_silly_delete, - afs_flock_timestamp, - afs_flock_try_to_lock, - afs_flock_vfs_lock, - afs_flock_vfs_locking, - afs_flock_waited, - afs_flock_waiting, - afs_flock_work_extending, - afs_flock_work_retry, - afs_flock_work_unlocking, - afs_flock_would_block, -}; - -enum afs_flock_operation { - afs_flock_op_copy_lock, - afs_flock_op_flock, - afs_flock_op_grant, - afs_flock_op_lock, - afs_flock_op_release_lock, - afs_flock_op_return_ok, - afs_flock_op_return_eagain, - afs_flock_op_return_edeadlk, - afs_flock_op_return_error, - afs_flock_op_set_lock, - afs_flock_op_unlock, - afs_flock_op_wake, -}; - -enum afs_cb_break_reason { - afs_cb_break_no_break, - afs_cb_break_no_promise, - afs_cb_break_for_callback, - afs_cb_break_for_deleted, - afs_cb_break_for_lapsed, - afs_cb_break_for_s_reinit, - afs_cb_break_for_unlink, - afs_cb_break_for_v_break, - afs_cb_break_for_volume_callback, - afs_cb_break_for_zap, -}; - #endif /* end __AFS_DECLARE_TRACE_ENUMS_ONCE_ONLY */ /* @@ -357,9 +151,11 @@ enum afs_cb_break_reason { EM(afs_volume_trace_alloc, "ALLOC ") \ EM(afs_volume_trace_free, "FREE ") \ EM(afs_volume_trace_get_alloc_sbi, "GET sbi-alloc ") \ + EM(afs_volume_trace_get_callback, "GET callback ") \ EM(afs_volume_trace_get_cell_insert, "GET cell-insrt") \ EM(afs_volume_trace_get_new_op, "GET op-new ") \ EM(afs_volume_trace_get_query_alias, "GET cell-alias") \ + EM(afs_volume_trace_put_callback, "PUT callback ") \ EM(afs_volume_trace_put_cell_dup, "PUT cell-dup ") \ EM(afs_volume_trace_put_cell_root, "PUT cell-root ") \ EM(afs_volume_trace_put_destroy_sbi, "PUT sbi-destry") \ @@ -391,6 +187,7 @@ enum afs_cb_break_reason { EM(afs_cell_trace_unuse_fc, "UNU fc ") \ EM(afs_cell_trace_unuse_lookup, "UNU lookup") \ EM(afs_cell_trace_unuse_mntpt, "UNU mntpt ") \ + EM(afs_cell_trace_unuse_no_pin, "UNU no-pin") \ EM(afs_cell_trace_unuse_parse, "UNU parse ") \ EM(afs_cell_trace_unuse_pin, "UNU pin ") \ EM(afs_cell_trace_unuse_probe, "UNU probe ") \ @@ -407,6 +204,40 @@ enum afs_cb_break_reason { EM(afs_cell_trace_use_sbi, "USE sbi ") \ E_(afs_cell_trace_wait, "WAIT ") +#define afs_alist_traces \ + EM(afs_alist_trace_alloc, "ALLOC ") \ + EM(afs_alist_trace_get_estate, "GET estate") \ + EM(afs_alist_trace_get_vlgetcaps, "GET vgtcap") \ + EM(afs_alist_trace_get_vlprobe, "GET vprobe") \ + EM(afs_alist_trace_get_vlrotate_set, "GET vl-rot") \ + EM(afs_alist_trace_put_estate, "PUT estate") \ + EM(afs_alist_trace_put_getaddru, "PUT GtAdrU") \ + EM(afs_alist_trace_put_parse_empty, "PUT p-empt") \ + EM(afs_alist_trace_put_parse_error, "PUT p-err ") \ + EM(afs_alist_trace_put_server_dup, "PUT sv-dup") \ + EM(afs_alist_trace_put_server_oom, "PUT sv-oom") \ + EM(afs_alist_trace_put_server_update, "PUT sv-upd") \ + EM(afs_alist_trace_put_vlgetcaps, "PUT vgtcap") \ + EM(afs_alist_trace_put_vlprobe, "PUT vprobe") \ + EM(afs_alist_trace_put_vlrotate_end, "PUT vr-end") \ + EM(afs_alist_trace_put_vlrotate_fail, "PUT vr-fai") \ + EM(afs_alist_trace_put_vlrotate_next, "PUT vr-nxt") \ + EM(afs_alist_trace_put_vlrotate_restart,"PUT vr-rst") \ + EM(afs_alist_trace_put_vlserver, "PUT vlsrvr") \ + EM(afs_alist_trace_put_vlserver_old, "PUT vs-old") \ + E_(afs_alist_trace_free, "FREE ") + +#define afs_estate_traces \ + EM(afs_estate_trace_alloc_probe, "ALLOC prob") \ + EM(afs_estate_trace_alloc_server, "ALLOC srvr") \ + EM(afs_estate_trace_get_server_state, "GET srv-st") \ + EM(afs_estate_trace_get_getcaps, "GET getcap") \ + EM(afs_estate_trace_put_getcaps, "PUT getcap") \ + EM(afs_estate_trace_put_probe, "PUT probe ") \ + EM(afs_estate_trace_put_server, "PUT server") \ + EM(afs_estate_trace_put_server_state, "PUT srv-st") \ + E_(afs_estate_trace_free, "FREE ") + #define afs_fs_operations \ EM(afs_FS_FetchData, "FS.FetchData") \ EM(afs_FS_FetchStatus, "FS.FetchStatus") \ @@ -604,15 +435,67 @@ enum afs_cb_break_reason { #define afs_cb_break_reasons \ EM(afs_cb_break_no_break, "no-break") \ - EM(afs_cb_break_no_promise, "no-promise") \ EM(afs_cb_break_for_callback, "break-cb") \ + EM(afs_cb_break_for_creation_regress, "creation-regress") \ EM(afs_cb_break_for_deleted, "break-del") \ - EM(afs_cb_break_for_lapsed, "break-lapsed") \ EM(afs_cb_break_for_s_reinit, "s-reinit") \ EM(afs_cb_break_for_unlink, "break-unlink") \ - EM(afs_cb_break_for_v_break, "break-v") \ + EM(afs_cb_break_for_update_regress, "update-regress") \ EM(afs_cb_break_for_volume_callback, "break-v-cb") \ - E_(afs_cb_break_for_zap, "break-zap") + EM(afs_cb_break_for_vos_release, "break-vos-release") \ + E_(afs_cb_break_volume_excluded, "vol-excluded") + +#define afs_rotate_traces \ + EM(afs_rotate_trace_aborted, "Abortd") \ + EM(afs_rotate_trace_busy_sleep, "BsySlp") \ + EM(afs_rotate_trace_check_vol_status, "VolStt") \ + EM(afs_rotate_trace_failed, "Failed") \ + EM(afs_rotate_trace_iter, "Iter ") \ + EM(afs_rotate_trace_iterate_addr, "ItAddr") \ + EM(afs_rotate_trace_next_server, "NextSv") \ + EM(afs_rotate_trace_no_more_servers, "NoMore") \ + EM(afs_rotate_trace_nomem, "Nomem ") \ + EM(afs_rotate_trace_probe_error, "PrbErr") \ + EM(afs_rotate_trace_probe_fileserver, "PrbFsv") \ + EM(afs_rotate_trace_probe_none, "PrbNon") \ + EM(afs_rotate_trace_probe_response, "PrbRsp") \ + EM(afs_rotate_trace_probe_superseded, "PrbSup") \ + EM(afs_rotate_trace_restart, "Rstart") \ + EM(afs_rotate_trace_retry_server, "RtrySv") \ + EM(afs_rotate_trace_selected_server, "SlctSv") \ + EM(afs_rotate_trace_stale_lock, "StlLck") \ + EM(afs_rotate_trace_start, "Start ") \ + EM(afs_rotate_trace_stop, "Stop ") \ + E_(afs_rotate_trace_stopped, "Stoppd") + +/* + * Generate enums for tracing information. + */ +#ifndef __AFS_GENERATE_TRACE_ENUMS_ONCE_ONLY +#define __AFS_GENERATE_TRACE_ENUMS_ONCE_ONLY + +#undef EM +#undef E_ +#define EM(a, b) a, +#define E_(a, b) a + +enum afs_alist_trace { afs_alist_traces } __mode(byte); +enum afs_call_trace { afs_call_traces } __mode(byte); +enum afs_cb_break_reason { afs_cb_break_reasons } __mode(byte); +enum afs_cell_trace { afs_cell_traces } __mode(byte); +enum afs_edit_dir_op { afs_edit_dir_ops } __mode(byte); +enum afs_edit_dir_reason { afs_edit_dir_reasons } __mode(byte); +enum afs_eproto_cause { afs_eproto_causes } __mode(byte); +enum afs_estate_trace { afs_estate_traces } __mode(byte); +enum afs_file_error { afs_file_errors } __mode(byte); +enum afs_flock_event { afs_flock_events } __mode(byte); +enum afs_flock_operation { afs_flock_operations } __mode(byte); +enum afs_io_error { afs_io_errors } __mode(byte); +enum afs_rotate_trace { afs_rotate_traces } __mode(byte); +enum afs_server_trace { afs_server_traces } __mode(byte); +enum afs_volume_trace { afs_volume_traces } __mode(byte); + +#endif /* end __AFS_GENERATE_TRACE_ENUMS_ONCE_ONLY */ /* * Export enum symbols via userspace. @@ -622,21 +505,24 @@ enum afs_cb_break_reason { #define EM(a, b) TRACE_DEFINE_ENUM(a); #define E_(a, b) TRACE_DEFINE_ENUM(a); +afs_alist_traces; afs_call_traces; -afs_server_traces; +afs_cb_break_reasons; afs_cell_traces; -afs_fs_operations; -afs_vl_operations; afs_cm_operations; -yfs_cm_operations; afs_edit_dir_ops; afs_edit_dir_reasons; afs_eproto_causes; -afs_io_errors; +afs_estate_traces; afs_file_errors; -afs_flock_types; afs_flock_operations; -afs_cb_break_reasons; +afs_flock_types; +afs_fs_operations; +afs_io_errors; +afs_rotate_traces; +afs_server_traces; +afs_vl_operations; +yfs_cm_operations; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -654,12 +540,12 @@ TRACE_EVENT(afs_receive_data, TP_ARGS(call, iter, want_more, ret), TP_STRUCT__entry( - __field(loff_t, remain ) - __field(unsigned int, call ) - __field(enum afs_call_state, state ) - __field(unsigned short, unmarshall ) - __field(bool, want_more ) - __field(int, ret ) + __field(loff_t, remain) + __field(unsigned int, call) + __field(enum afs_call_state, state) + __field(unsigned short, unmarshall) + __field(bool, want_more) + __field(int, ret) ), TP_fast_assign( @@ -686,9 +572,9 @@ TRACE_EVENT(afs_notify_call, TP_ARGS(rxcall, call), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(enum afs_call_state, state ) - __field(unsigned short, unmarshall ) + __field(unsigned int, call) + __field(enum afs_call_state, state) + __field(unsigned short, unmarshall) ), TP_fast_assign( @@ -708,9 +594,9 @@ TRACE_EVENT(afs_cb_call, TP_ARGS(call), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(u32, op ) - __field(u16, service_id ) + __field(unsigned int, call) + __field(u32, op) + __field(u16, service_id) ), TP_fast_assign( @@ -733,11 +619,11 @@ TRACE_EVENT(afs_call, TP_ARGS(call_debug_id, op, ref, outstanding, where), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(int, op ) - __field(int, ref ) - __field(int, outstanding ) - __field(const void *, where ) + __field(unsigned int, call) + __field(int, op) + __field(int, ref) + __field(int, outstanding) + __field(const void *, where) ), TP_fast_assign( @@ -762,9 +648,9 @@ TRACE_EVENT(afs_make_fs_call, TP_ARGS(call, fid), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(enum afs_fs_operation, op ) - __field_struct(struct afs_fid, fid ) + __field(unsigned int, call) + __field(enum afs_fs_operation, op) + __field_struct(struct afs_fid, fid) ), TP_fast_assign( @@ -794,10 +680,10 @@ TRACE_EVENT(afs_make_fs_calli, TP_ARGS(call, fid, i), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(unsigned int, i ) - __field(enum afs_fs_operation, op ) - __field_struct(struct afs_fid, fid ) + __field(unsigned int, call) + __field(unsigned int, i) + __field(enum afs_fs_operation, op) + __field_struct(struct afs_fid, fid) ), TP_fast_assign( @@ -829,10 +715,10 @@ TRACE_EVENT(afs_make_fs_call1, TP_ARGS(call, fid, name), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(enum afs_fs_operation, op ) - __field_struct(struct afs_fid, fid ) - __array(char, name, 24 ) + __field(unsigned int, call) + __field(enum afs_fs_operation, op) + __field_struct(struct afs_fid, fid) + __array(char, name, 24) ), TP_fast_assign( @@ -866,11 +752,11 @@ TRACE_EVENT(afs_make_fs_call2, TP_ARGS(call, fid, name, name2), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(enum afs_fs_operation, op ) - __field_struct(struct afs_fid, fid ) - __array(char, name, 24 ) - __array(char, name2, 24 ) + __field(unsigned int, call) + __field(enum afs_fs_operation, op) + __field_struct(struct afs_fid, fid) + __array(char, name, 24) + __array(char, name2, 24) ), TP_fast_assign( @@ -907,8 +793,8 @@ TRACE_EVENT(afs_make_vl_call, TP_ARGS(call), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(enum afs_vl_operation, op ) + __field(unsigned int, call) + __field(enum afs_vl_operation, op) ), TP_fast_assign( @@ -927,10 +813,10 @@ TRACE_EVENT(afs_call_done, TP_ARGS(call), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(struct rxrpc_call *, rx_call ) - __field(int, ret ) - __field(u32, abort_code ) + __field(unsigned int, call) + __field(struct rxrpc_call *, rx_call) + __field(int, ret) + __field(u32, abort_code) ), TP_fast_assign( @@ -953,10 +839,10 @@ TRACE_EVENT(afs_send_data, TP_ARGS(call, msg), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(unsigned int, flags ) - __field(loff_t, offset ) - __field(loff_t, count ) + __field(unsigned int, call) + __field(unsigned int, flags) + __field(loff_t, offset) + __field(loff_t, count) ), TP_fast_assign( @@ -977,10 +863,10 @@ TRACE_EVENT(afs_sent_data, TP_ARGS(call, msg, ret), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(int, ret ) - __field(loff_t, offset ) - __field(loff_t, count ) + __field(unsigned int, call) + __field(int, ret) + __field(loff_t, offset) + __field(loff_t, count) ), TP_fast_assign( @@ -1001,9 +887,9 @@ TRACE_EVENT(afs_dir_check_failed, TP_ARGS(vnode, off, i_size), TP_STRUCT__entry( - __field(struct afs_vnode *, vnode ) - __field(loff_t, off ) - __field(loff_t, i_size ) + __field(struct afs_vnode *, vnode) + __field(loff_t, off) + __field(loff_t, i_size) ), TP_fast_assign( @@ -1022,11 +908,11 @@ TRACE_EVENT(afs_folio_dirty, TP_ARGS(vnode, where, folio), TP_STRUCT__entry( - __field(struct afs_vnode *, vnode ) - __field(const char *, where ) - __field(pgoff_t, index ) - __field(unsigned long, from ) - __field(unsigned long, to ) + __field(struct afs_vnode *, vnode) + __field(const char *, where) + __field(pgoff_t, index) + __field(unsigned long, from) + __field(unsigned long, to) ), TP_fast_assign( @@ -1056,11 +942,11 @@ TRACE_EVENT(afs_call_state, TP_ARGS(call, from, to, ret, remote_abort), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(enum afs_call_state, from ) - __field(enum afs_call_state, to ) - __field(int, ret ) - __field(u32, abort ) + __field(unsigned int, call) + __field(enum afs_call_state, from) + __field(enum afs_call_state, to) + __field(int, ret) + __field(u32, abort) ), TP_fast_assign( @@ -1084,9 +970,9 @@ TRACE_EVENT(afs_lookup, TP_ARGS(dvnode, name, fid), TP_STRUCT__entry( - __field_struct(struct afs_fid, dfid ) - __field_struct(struct afs_fid, fid ) - __array(char, name, 24 ) + __field_struct(struct afs_fid, dfid) + __field_struct(struct afs_fid, fid) + __array(char, name, 24) ), TP_fast_assign( @@ -1116,15 +1002,15 @@ TRACE_EVENT(afs_edit_dir, TP_ARGS(dvnode, why, op, block, slot, f_vnode, f_unique, name), TP_STRUCT__entry( - __field(unsigned int, vnode ) - __field(unsigned int, unique ) - __field(enum afs_edit_dir_reason, why ) - __field(enum afs_edit_dir_op, op ) - __field(unsigned int, block ) - __field(unsigned short, slot ) - __field(unsigned int, f_vnode ) - __field(unsigned int, f_unique ) - __array(char, name, 24 ) + __field(unsigned int, vnode) + __field(unsigned int, unique) + __field(enum afs_edit_dir_reason, why) + __field(enum afs_edit_dir_op, op) + __field(unsigned int, block) + __field(unsigned short, slot) + __field(unsigned int, f_vnode) + __field(unsigned int, f_unique) + __array(char, name, 24) ), TP_fast_assign( @@ -1157,8 +1043,8 @@ TRACE_EVENT(afs_protocol_error, TP_ARGS(call, cause), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(enum afs_eproto_cause, cause ) + __field(unsigned int, call) + __field(enum afs_eproto_cause, cause) ), TP_fast_assign( @@ -1177,9 +1063,9 @@ TRACE_EVENT(afs_io_error, TP_ARGS(call, error, where), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(int, error ) - __field(enum afs_io_error, where ) + __field(unsigned int, call) + __field(int, error) + __field(enum afs_io_error, where) ), TP_fast_assign( @@ -1199,9 +1085,9 @@ TRACE_EVENT(afs_file_error, TP_ARGS(vnode, error, where), TP_STRUCT__entry( - __field_struct(struct afs_fid, fid ) - __field(int, error ) - __field(enum afs_file_error, where ) + __field_struct(struct afs_fid, fid) + __field(int, error) + __field(enum afs_file_error, where) ), TP_fast_assign( @@ -1222,9 +1108,9 @@ TRACE_EVENT(afs_cm_no_server, TP_ARGS(call, srx), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(unsigned int, op_id ) - __field_struct(struct sockaddr_rxrpc, srx ) + __field(unsigned int, call) + __field(unsigned int, op_id) + __field_struct(struct sockaddr_rxrpc, srx) ), TP_fast_assign( @@ -1243,9 +1129,9 @@ TRACE_EVENT(afs_cm_no_server_u, TP_ARGS(call, uuid), TP_STRUCT__entry( - __field(unsigned int, call ) - __field(unsigned int, op_id ) - __field_struct(uuid_t, uuid ) + __field(unsigned int, call) + __field(unsigned int, op_id) + __field_struct(uuid_t, uuid) ), TP_fast_assign( @@ -1265,11 +1151,11 @@ TRACE_EVENT(afs_flock_ev, TP_ARGS(vnode, fl, event, error), TP_STRUCT__entry( - __field_struct(struct afs_fid, fid ) - __field(enum afs_flock_event, event ) - __field(enum afs_lock_state, state ) - __field(int, error ) - __field(unsigned int, debug_id ) + __field_struct(struct afs_fid, fid) + __field(enum afs_flock_event, event) + __field(enum afs_lock_state, state) + __field(int, error) + __field(unsigned int, debug_id) ), TP_fast_assign( @@ -1295,13 +1181,13 @@ TRACE_EVENT(afs_flock_op, TP_ARGS(vnode, fl, op), TP_STRUCT__entry( - __field_struct(struct afs_fid, fid ) - __field(loff_t, from ) - __field(loff_t, len ) - __field(enum afs_flock_operation, op ) - __field(unsigned char, type ) - __field(unsigned int, flags ) - __field(unsigned int, debug_id ) + __field_struct(struct afs_fid, fid) + __field(loff_t, from) + __field(loff_t, len) + __field(enum afs_flock_operation, op) + __field(unsigned char, type) + __field(unsigned int, flags) + __field(unsigned int, debug_id) ), TP_fast_assign( @@ -1328,7 +1214,7 @@ TRACE_EVENT(afs_reload_dir, TP_ARGS(vnode), TP_STRUCT__entry( - __field_struct(struct afs_fid, fid ) + __field_struct(struct afs_fid, fid) ), TP_fast_assign( @@ -1345,8 +1231,8 @@ TRACE_EVENT(afs_silly_rename, TP_ARGS(vnode, done), TP_STRUCT__entry( - __field_struct(struct afs_fid, fid ) - __field(bool, done ) + __field_struct(struct afs_fid, fid) + __field(bool, done) ), TP_fast_assign( @@ -1365,9 +1251,9 @@ TRACE_EVENT(afs_get_tree, TP_ARGS(cell, volume), TP_STRUCT__entry( - __field(u64, vid ) - __array(char, cell, 24 ) - __array(char, volume, 24 ) + __field(u64, vid) + __array(char, cell, 24) + __array(char, volume, 24) ), TP_fast_assign( @@ -1385,6 +1271,30 @@ TRACE_EVENT(afs_get_tree, __entry->cell, __entry->volume, __entry->vid) ); +TRACE_EVENT(afs_cb_v_break, + TP_PROTO(afs_volid_t vid, unsigned int cb_v_break, + enum afs_cb_break_reason reason), + + TP_ARGS(vid, cb_v_break, reason), + + TP_STRUCT__entry( + __field(afs_volid_t, vid) + __field(unsigned int, cb_v_break) + __field(enum afs_cb_break_reason, reason) + ), + + TP_fast_assign( + __entry->vid = vid; + __entry->cb_v_break = cb_v_break; + __entry->reason = reason; + ), + + TP_printk("%llx vb=%x %s", + __entry->vid, + __entry->cb_v_break, + __print_symbolic(__entry->reason, afs_cb_break_reasons)) + ); + TRACE_EVENT(afs_cb_break, TP_PROTO(struct afs_fid *fid, unsigned int cb_break, enum afs_cb_break_reason reason, bool skipped), @@ -1392,10 +1302,10 @@ TRACE_EVENT(afs_cb_break, TP_ARGS(fid, cb_break, reason, skipped), TP_STRUCT__entry( - __field_struct(struct afs_fid, fid ) - __field(unsigned int, cb_break ) - __field(enum afs_cb_break_reason, reason ) - __field(bool, skipped ) + __field_struct(struct afs_fid, fid) + __field(unsigned int, cb_break) + __field(enum afs_cb_break_reason, reason) + __field(bool, skipped) ), TP_fast_assign( @@ -1418,8 +1328,8 @@ TRACE_EVENT(afs_cb_miss, TP_ARGS(fid, reason), TP_STRUCT__entry( - __field_struct(struct afs_fid, fid ) - __field(enum afs_cb_break_reason, reason ) + __field_struct(struct afs_fid, fid) + __field(enum afs_cb_break_reason, reason) ), TP_fast_assign( @@ -1439,10 +1349,10 @@ TRACE_EVENT(afs_server, TP_ARGS(server_debug_id, ref, active, reason), TP_STRUCT__entry( - __field(unsigned int, server ) - __field(int, ref ) - __field(int, active ) - __field(int, reason ) + __field(unsigned int, server) + __field(int, ref) + __field(int, active) + __field(int, reason) ), TP_fast_assign( @@ -1465,9 +1375,9 @@ TRACE_EVENT(afs_volume, TP_ARGS(vid, ref, reason), TP_STRUCT__entry( - __field(afs_volid_t, vid ) - __field(int, ref ) - __field(enum afs_volume_trace, reason ) + __field(afs_volid_t, vid) + __field(int, ref) + __field(enum afs_volume_trace, reason) ), TP_fast_assign( @@ -1489,10 +1399,10 @@ TRACE_EVENT(afs_cell, TP_ARGS(cell_debug_id, ref, active, reason), TP_STRUCT__entry( - __field(unsigned int, cell ) - __field(int, ref ) - __field(int, active ) - __field(int, reason ) + __field(unsigned int, cell) + __field(int, ref) + __field(int, active) + __field(int, reason) ), TP_fast_assign( @@ -1509,6 +1419,199 @@ TRACE_EVENT(afs_cell, __entry->active) ); +TRACE_EVENT(afs_alist, + TP_PROTO(unsigned int alist_debug_id, int ref, enum afs_alist_trace reason), + + TP_ARGS(alist_debug_id, ref, reason), + + TP_STRUCT__entry( + __field(unsigned int, alist) + __field(int, ref) + __field(int, active) + __field(int, reason) + ), + + TP_fast_assign( + __entry->alist = alist_debug_id; + __entry->ref = ref; + __entry->reason = reason; + ), + + TP_printk("AL=%08x %s r=%d", + __entry->alist, + __print_symbolic(__entry->reason, afs_alist_traces), + __entry->ref) + ); + +TRACE_EVENT(afs_estate, + TP_PROTO(unsigned int server_debug_id, unsigned int estate_debug_id, + int ref, enum afs_estate_trace reason), + + TP_ARGS(server_debug_id, estate_debug_id, ref, reason), + + TP_STRUCT__entry( + __field(unsigned int, server) + __field(unsigned int, estate) + __field(int, ref) + __field(int, active) + __field(int, reason) + ), + + TP_fast_assign( + __entry->server = server_debug_id; + __entry->estate = estate_debug_id; + __entry->ref = ref; + __entry->reason = reason; + ), + + TP_printk("ES=%08x[%x] %s r=%d", + __entry->server, + __entry->estate, + __print_symbolic(__entry->reason, afs_estate_traces), + __entry->ref) + ); + +TRACE_EVENT(afs_fs_probe, + TP_PROTO(struct afs_server *server, bool tx, struct afs_endpoint_state *estate, + unsigned int addr_index, int error, s32 abort_code, unsigned int rtt_us), + + TP_ARGS(server, tx, estate, addr_index, error, abort_code, rtt_us), + + TP_STRUCT__entry( + __field(unsigned int, server) + __field(unsigned int, estate) + __field(bool, tx) + __field(u16, addr_index) + __field(short, error) + __field(s32, abort_code) + __field(unsigned int, rtt_us) + __field_struct(struct sockaddr_rxrpc, srx) + ), + + TP_fast_assign( + struct afs_addr_list *alist = estate->addresses; + __entry->server = server->debug_id; + __entry->estate = estate->probe_seq; + __entry->tx = tx; + __entry->addr_index = addr_index; + __entry->error = error; + __entry->abort_code = abort_code; + __entry->rtt_us = rtt_us; + memcpy(&__entry->srx, rxrpc_kernel_remote_srx(alist->addrs[addr_index].peer), + sizeof(__entry->srx)); + ), + + TP_printk("s=%08x %s pq=%x ax=%u e=%d ac=%d rtt=%d %pISpc", + __entry->server, __entry->tx ? "tx" : "rx", __entry->estate, + __entry->addr_index, __entry->error, __entry->abort_code, __entry->rtt_us, + &__entry->srx.transport) + ); + +TRACE_EVENT(afs_vl_probe, + TP_PROTO(struct afs_vlserver *server, bool tx, struct afs_addr_list *alist, + unsigned int addr_index, int error, s32 abort_code, unsigned int rtt_us), + + TP_ARGS(server, tx, alist, addr_index, error, abort_code, rtt_us), + + TP_STRUCT__entry( + __field(unsigned int, server) + __field(bool, tx) + __field(unsigned short, flags) + __field(u16, addr_index) + __field(short, error) + __field(s32, abort_code) + __field(unsigned int, rtt_us) + __field_struct(struct sockaddr_rxrpc, srx) + ), + + TP_fast_assign( + __entry->server = server->debug_id; + __entry->tx = tx; + __entry->addr_index = addr_index; + __entry->error = error; + __entry->abort_code = abort_code; + __entry->rtt_us = rtt_us; + memcpy(&__entry->srx, rxrpc_kernel_remote_srx(alist->addrs[addr_index].peer), + sizeof(__entry->srx)); + ), + + TP_printk("vl=%08x %s ax=%u e=%d ac=%d rtt=%d %pISpc", + __entry->server, __entry->tx ? "tx" : "rx", __entry->addr_index, + __entry->error, __entry->abort_code, __entry->rtt_us, + &__entry->srx.transport) + ); + +TRACE_EVENT(afs_rotate, + TP_PROTO(struct afs_operation *op, enum afs_rotate_trace reason, unsigned int extra), + + TP_ARGS(op, reason, extra), + + TP_STRUCT__entry( + __field(unsigned int, op) + __field(unsigned int, flags) + __field(unsigned int, extra) + __field(unsigned short, iteration) + __field(short, server_index) + __field(short, addr_index) + __field(enum afs_rotate_trace, reason) + ), + + TP_fast_assign( + __entry->op = op->debug_id; + __entry->flags = op->flags; + __entry->iteration = op->nr_iterations; + __entry->server_index = op->server_index; + __entry->addr_index = op->addr_index; + __entry->reason = reason; + __entry->extra = extra; + ), + + TP_printk("OP=%08x it=%02x %s fl=%x sx=%d ax=%d ext=%d", + __entry->op, + __entry->iteration, + __print_symbolic(__entry->reason, afs_rotate_traces), + __entry->flags, + __entry->server_index, + __entry->addr_index, + __entry->extra) + ); + +TRACE_EVENT(afs_make_call, + TP_PROTO(struct afs_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(bool, is_vl) + __field(enum afs_fs_operation, op) + __field_struct(struct afs_fid, fid) + __field_struct(struct sockaddr_rxrpc, srx) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->op = call->operation_ID; + __entry->fid = call->fid; + memcpy(&__entry->srx, rxrpc_kernel_remote_srx(call->peer), + sizeof(__entry->srx)); + __entry->srx.srx_service = call->service_id; + __entry->is_vl = (__entry->srx.srx_service == VL_SERVICE || + __entry->srx.srx_service == YFS_VL_SERVICE); + ), + + TP_printk("c=%08x %pISpc+%u %s %llx:%llx:%x", + __entry->call, + &__entry->srx.transport, + __entry->srx.srx_service, + __entry->is_vl ? + __print_symbolic(__entry->op, afs_vl_operations) : + __print_symbolic(__entry->op, afs_fs_operations), + __entry->fid.vid, + __entry->fid.vnode, + __entry->fid.unique) + ); + #endif /* _TRACE_AFS_H */ /* This part must be outside protection */ diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index f7e537f64db4..4c1ef7b3705c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -178,7 +178,9 @@ #define rxrpc_peer_traces \ EM(rxrpc_peer_free, "FREE ") \ EM(rxrpc_peer_get_accept, "GET accept ") \ + EM(rxrpc_peer_get_application, "GET app ") \ EM(rxrpc_peer_get_bundle, "GET bundle ") \ + EM(rxrpc_peer_get_call, "GET call ") \ EM(rxrpc_peer_get_client_conn, "GET cln-conn") \ EM(rxrpc_peer_get_input, "GET input ") \ EM(rxrpc_peer_get_input_error, "GET inpt-err") \ @@ -187,6 +189,7 @@ EM(rxrpc_peer_get_service_conn, "GET srv-conn") \ EM(rxrpc_peer_new_client, "NEW client ") \ EM(rxrpc_peer_new_prealloc, "NEW prealloc") \ + EM(rxrpc_peer_put_application, "PUT app ") \ EM(rxrpc_peer_put_bundle, "PUT bundle ") \ EM(rxrpc_peer_put_call, "PUT call ") \ EM(rxrpc_peer_put_conn, "PUT conn ") \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index fa8aec78f63d..465bfe5eb061 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -258,16 +258,62 @@ static int rxrpc_listen(struct socket *sock, int backlog) return ret; } +/** + * rxrpc_kernel_lookup_peer - Obtain remote transport endpoint for an address + * @sock: The socket through which it will be accessed + * @srx: The network address + * @gfp: Allocation flags + * + * Lookup or create a remote transport endpoint record for the specified + * address and return it with a ref held. + */ +struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, + struct sockaddr_rxrpc *srx, gfp_t gfp) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + int ret; + + ret = rxrpc_validate_address(rx, srx, sizeof(*srx)); + if (ret < 0) + return ERR_PTR(ret); + + return rxrpc_lookup_peer(rx->local, srx, gfp); +} +EXPORT_SYMBOL(rxrpc_kernel_lookup_peer); + +/** + * rxrpc_kernel_get_peer - Get a reference on a peer + * @peer: The peer to get a reference on. + * + * Get a record for the remote peer in a call. + */ +struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer) +{ + return peer ? rxrpc_get_peer(peer, rxrpc_peer_get_application) : NULL; +} +EXPORT_SYMBOL(rxrpc_kernel_get_peer); + +/** + * rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference + * @peer: The peer to drop a ref on + */ +void rxrpc_kernel_put_peer(struct rxrpc_peer *peer) +{ + rxrpc_put_peer(peer, rxrpc_peer_put_application); +} +EXPORT_SYMBOL(rxrpc_kernel_put_peer); + /** * rxrpc_kernel_begin_call - Allow a kernel service to begin a call * @sock: The socket on which to make the call - * @srx: The address of the peer to contact + * @peer: The peer to contact * @key: The security context to use (defaults to socket setting) * @user_call_ID: The ID to use * @tx_total_len: Total length of data to transmit during the call (or -1) * @hard_timeout: The maximum lifespan of the call in sec * @gfp: The allocation constraints * @notify_rx: Where to send notifications instead of socket queue + * @service_id: The ID of the service to contact * @upgrade: Request service upgrade for call * @interruptibility: The call is interruptible, or can be canceled. * @debug_id: The debug ID for tracing to be assigned to the call @@ -280,13 +326,14 @@ static int rxrpc_listen(struct socket *sock, int backlog) * supplying @srx and @key. */ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, - struct sockaddr_rxrpc *srx, + struct rxrpc_peer *peer, struct key *key, unsigned long user_call_ID, s64 tx_total_len, u32 hard_timeout, gfp_t gfp, rxrpc_notify_rx_t notify_rx, + u16 service_id, bool upgrade, enum rxrpc_interruptibility interruptibility, unsigned int debug_id) @@ -295,13 +342,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_call_params p; struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - int ret; _enter(",,%x,%lx", key_serial(key), user_call_ID); - ret = rxrpc_validate_address(rx, srx, sizeof(*srx)); - if (ret < 0) - return ERR_PTR(ret); + if (WARN_ON_ONCE(peer->local != rx->local)) + return ERR_PTR(-EIO); lock_sock(&rx->sk); @@ -319,12 +364,13 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, memset(&cp, 0, sizeof(cp)); cp.local = rx->local; + cp.peer = peer; cp.key = key; cp.security_level = rx->min_sec_level; cp.exclusive = false; cp.upgrade = upgrade; - cp.service_id = srx->srx_service; - call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp, debug_id); + cp.service_id = service_id; + call = rxrpc_new_client_call(rx, &cp, &p, gfp, debug_id); /* The socket has been unlocked. */ if (!IS_ERR(call)) { call->notify_rx = notify_rx; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e8e14c6f904d..2f8b39a614c3 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -68,6 +68,7 @@ struct rxrpc_net { atomic_t nr_calls; /* Count of allocated calls */ atomic_t nr_conns; + struct list_head bundle_proc_list; /* List of bundles for proc */ struct list_head conn_proc_list; /* List of conns in this namespace for proc */ struct list_head service_conns; /* Service conns in this namespace */ rwlock_t conn_lock; /* Lock for ->conn_proc_list, ->service_conns */ @@ -364,6 +365,7 @@ struct rxrpc_conn_proto { struct rxrpc_conn_parameters { struct rxrpc_local *local; /* Representation of local endpoint */ + struct rxrpc_peer *peer; /* Representation of remote endpoint */ struct key *key; /* Security details */ bool exclusive; /* T if conn is exclusive */ bool upgrade; /* T if service ID can be upgraded */ @@ -431,6 +433,7 @@ struct rxrpc_bundle { struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* Security details */ + struct list_head proc_link; /* Link in net->bundle_proc_list */ const struct rxrpc_security *security; /* applied security module */ refcount_t ref; atomic_t active; /* Number of active users */ @@ -444,6 +447,7 @@ struct rxrpc_bundle { struct rb_node local_node; /* Node in local->client_conns */ struct list_head waiting_calls; /* Calls waiting for channels */ unsigned long avail_chans; /* Mask of available channels */ + unsigned int conn_ids[4]; /* Connection IDs. */ struct rxrpc_connection *conns[4]; /* The connections in the bundle (max 4) */ }; @@ -867,7 +871,6 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, - struct sockaddr_rxrpc *, struct rxrpc_call_params *, gfp_t, unsigned int); void rxrpc_start_call_timer(struct rxrpc_call *call); @@ -1167,6 +1170,7 @@ void rxrpc_put_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); */ extern const struct seq_operations rxrpc_call_seq_ops; extern const struct seq_operations rxrpc_connection_seq_ops; +extern const struct seq_operations rxrpc_bundle_seq_ops; extern const struct seq_operations rxrpc_peer_seq_ops; extern const struct seq_operations rxrpc_local_seq_ops; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 773eecd1e979..beea25ac88f5 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -193,7 +193,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, * Allocate a new client call. */ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, - struct sockaddr_rxrpc *srx, struct rxrpc_conn_parameters *cp, struct rxrpc_call_params *p, gfp_t gfp, @@ -211,10 +210,12 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, now = ktime_get_real(); call->acks_latest_ts = now; call->cong_tstamp = now; - call->dest_srx = *srx; + call->dest_srx = cp->peer->srx; + call->dest_srx.srx_service = cp->service_id; call->interruptibility = p->interruptibility; call->tx_total_len = p->tx_total_len; call->key = key_get(cp->key); + call->peer = rxrpc_get_peer(cp->peer, rxrpc_peer_get_call); call->local = rxrpc_get_local(cp->local, rxrpc_local_get_call); call->security_level = cp->security_level; if (p->kernel) @@ -306,10 +307,6 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) _enter("{%d,%lx},", call->debug_id, call->user_call_ID); - call->peer = rxrpc_lookup_peer(local, &call->dest_srx, gfp); - if (!call->peer) - goto error; - ret = rxrpc_look_up_bundle(call, gfp); if (ret < 0) goto error; @@ -334,7 +331,6 @@ error: */ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) @@ -349,13 +345,18 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, _enter("%p,%lx", rx, p->user_call_ID); + if (WARN_ON_ONCE(!cp->peer)) { + release_sock(&rx->sk); + return ERR_PTR(-EIO); + } + limiter = rxrpc_get_call_slot(p, gfp); if (!limiter) { release_sock(&rx->sk); return ERR_PTR(-ERESTARTSYS); } - call = rxrpc_alloc_client_call(rx, srx, cp, p, gfp, debug_id); + call = rxrpc_alloc_client_call(rx, cp, p, gfp, debug_id); if (IS_ERR(call)) { release_sock(&rx->sk); up(limiter); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 1d95f8bc769f..3b9b267a4431 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -91,6 +91,10 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, atomic_set(&bundle->active, 1); INIT_LIST_HEAD(&bundle->waiting_calls); trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_new); + + write_lock(&bundle->local->rxnet->conn_lock); + list_add_tail(&bundle->proc_link, &bundle->local->rxnet->bundle_proc_list); + write_unlock(&bundle->local->rxnet->conn_lock); } return bundle; } @@ -109,6 +113,9 @@ static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) { trace_rxrpc_bundle(bundle->debug_id, refcount_read(&bundle->ref), rxrpc_bundle_free); + write_lock(&bundle->local->rxnet->conn_lock); + list_del(&bundle->proc_link); + write_unlock(&bundle->local->rxnet->conn_lock); rxrpc_put_peer(bundle->peer, rxrpc_peer_put_bundle); key_put(bundle->key); kfree(bundle); @@ -338,6 +345,7 @@ static bool rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, old = bundle->conns[slot]; if (old) { bundle->conns[slot] = NULL; + bundle->conn_ids[slot] = 0; trace_rxrpc_client(old, -1, rxrpc_client_replace); rxrpc_put_connection(old, rxrpc_conn_put_noreuse); } @@ -351,6 +359,7 @@ static bool rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, rxrpc_activate_bundle(bundle); conn->bundle_shift = shift; bundle->conns[slot] = conn; + bundle->conn_ids[slot] = conn->debug_id; for (i = 0; i < RXRPC_MAXCALLS; i++) set_bit(shift + i, &bundle->avail_chans); return true; @@ -671,6 +680,7 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) if (bundle->conns[bindex] == conn) { _debug("clear slot %u", bindex); bundle->conns[bindex] = NULL; + bundle->conn_ids[bindex] = 0; for (i = 0; i < RXRPC_MAXCALLS; i++) clear_bit(conn->bundle_shift + i, &bundle->avail_chans); rxrpc_put_client_connection_id(bundle->local, conn); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 89ac05a711a4..39c908a3ca6e 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -25,7 +25,7 @@ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *peer, struct rxrpc_conn_proto k; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rb_node *p; - unsigned int seq = 0; + unsigned int seq = 1; k.epoch = sp->hdr.epoch; k.cid = sp->hdr.cid & RXRPC_CIDMASK; @@ -35,6 +35,7 @@ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *peer, * under just the RCU read lock, so we have to check for * changes. */ + seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&peer->service_conn_lock, &seq); p = rcu_dereference_raw(peer->service_conns.rb_node); diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index a0319c040c25..a4c135d0fbcc 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -45,6 +45,7 @@ static __net_init int rxrpc_init_net(struct net *net) atomic_set(&rxnet->nr_calls, 1); atomic_set(&rxnet->nr_conns, 1); + INIT_LIST_HEAD(&rxnet->bundle_proc_list); INIT_LIST_HEAD(&rxnet->conn_proc_list); INIT_LIST_HEAD(&rxnet->service_conns); rwlock_init(&rxnet->conn_lock); @@ -78,6 +79,9 @@ static __net_init int rxrpc_init_net(struct net *net) proc_create_net("conns", 0444, rxnet->proc_net, &rxrpc_connection_seq_ops, sizeof(struct seq_net_private)); + proc_create_net("bundles", 0444, rxnet->proc_net, + &rxrpc_bundle_seq_ops, + sizeof(struct seq_net_private)); proc_create_net("peers", 0444, rxnet->proc_net, &rxrpc_peer_seq_ops, sizeof(struct seq_net_private)); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 8d7a715a0bb1..49dcda67a0d5 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -22,6 +22,8 @@ #include #include "ar-internal.h" +static const struct sockaddr_rxrpc rxrpc_null_addr; + /* * Hash a peer key. */ @@ -457,39 +459,53 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) } /** - * rxrpc_kernel_get_peer - Get the peer address of a call + * rxrpc_kernel_get_call_peer - Get the peer address of a call * @sock: The socket on which the call is in progress. * @call: The call to query - * @_srx: Where to place the result * - * Get the address of the remote peer in a call. + * Get a record for the remote peer in a call. */ -void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, - struct sockaddr_rxrpc *_srx) +struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call) { - *_srx = call->peer->srx; + return call->peer; } -EXPORT_SYMBOL(rxrpc_kernel_get_peer); +EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); /** * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT - * @sock: The socket on which the call is in progress. - * @call: The call to query - * @_srtt: Where to store the SRTT value. + * @peer: The peer to query * - * Get the call's peer smoothed RTT in uS. + * Get the call's peer smoothed RTT in uS or UINT_MAX if we have no samples. */ -bool rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call, - u32 *_srtt) +unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer) { - struct rxrpc_peer *peer = call->peer; - - if (peer->rtt_count == 0) { - *_srtt = 1000000; /* 1S */ - return false; - } - - *_srtt = call->peer->srtt_us >> 3; - return true; + return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX; } EXPORT_SYMBOL(rxrpc_kernel_get_srtt); + +/** + * rxrpc_kernel_remote_srx - Get the address of a peer + * @peer: The peer to query + * + * Get a pointer to the address from a peer record. The caller is responsible + * for making sure that the address is not deallocated. + */ +const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer) +{ + return peer ? &peer->srx : &rxrpc_null_addr; +} +EXPORT_SYMBOL(rxrpc_kernel_remote_srx); + +/** + * rxrpc_kernel_remote_addr - Get the peer transport address of a call + * @peer: The peer to query + * + * Get a pointer to the transport address from a peer record. The caller is + * responsible for making sure that the address is not deallocated. + */ +const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer) +{ + return (const struct sockaddr *) + (peer ? &peer->srx.transport : &rxrpc_null_addr.transport); +} +EXPORT_SYMBOL(rxrpc_kernel_remote_addr); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 682636d3b060..6c86cbb98d1d 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -198,6 +198,82 @@ const struct seq_operations rxrpc_connection_seq_ops = { .show = rxrpc_connection_seq_show, }; +/* + * generate a list of extant virtual bundles in /proc/net/rxrpc/bundles + */ +static void *rxrpc_bundle_seq_start(struct seq_file *seq, loff_t *_pos) + __acquires(rxnet->conn_lock) +{ + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + + read_lock(&rxnet->conn_lock); + return seq_list_start_head(&rxnet->bundle_proc_list, *_pos); +} + +static void *rxrpc_bundle_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + + return seq_list_next(v, &rxnet->bundle_proc_list, pos); +} + +static void rxrpc_bundle_seq_stop(struct seq_file *seq, void *v) + __releases(rxnet->conn_lock) +{ + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + + read_unlock(&rxnet->conn_lock); +} + +static int rxrpc_bundle_seq_show(struct seq_file *seq, void *v) +{ + struct rxrpc_bundle *bundle; + struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); + char lbuff[50], rbuff[50]; + + if (v == &rxnet->bundle_proc_list) { + seq_puts(seq, + "Proto Local " + " Remote " + " SvID Ref Act Flg Key |" + " Bundle Conn_0 Conn_1 Conn_2 Conn_3\n" + ); + return 0; + } + + bundle = list_entry(v, struct rxrpc_bundle, proc_link); + + sprintf(lbuff, "%pISpc", &bundle->local->srx.transport); + sprintf(rbuff, "%pISpc", &bundle->peer->srx.transport); + seq_printf(seq, + "UDP %-47.47s %-47.47s %4x %3u %3d" + " %c%c%c %08x | %08x %08x %08x %08x %08x\n", + lbuff, + rbuff, + bundle->service_id, + refcount_read(&bundle->ref), + atomic_read(&bundle->active), + bundle->try_upgrade ? 'U' : '-', + bundle->exclusive ? 'e' : '-', + bundle->upgrade ? 'u' : '-', + key_serial(bundle->key), + bundle->debug_id, + bundle->conn_ids[0], + bundle->conn_ids[1], + bundle->conn_ids[2], + bundle->conn_ids[3]); + + return 0; +} + +const struct seq_operations rxrpc_bundle_seq_ops = { + .start = rxrpc_bundle_seq_start, + .next = rxrpc_bundle_seq_next, + .stop = rxrpc_bundle_seq_stop, + .show = rxrpc_bundle_seq_show, +}; + /* * generate a list of extant virtual peers in /proc/net/rxrpc/peers */ diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 8e0b94714e84..5677d5690a02 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -572,6 +572,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, __acquires(&call->user_mutex) { struct rxrpc_conn_parameters cp; + struct rxrpc_peer *peer; struct rxrpc_call *call; struct key *key; @@ -584,21 +585,29 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, return ERR_PTR(-EDESTADDRREQ); } + peer = rxrpc_lookup_peer(rx->local, srx, GFP_KERNEL); + if (!peer) { + release_sock(&rx->sk); + return ERR_PTR(-ENOMEM); + } + key = rx->key; if (key && !rx->key->payload.data[0]) key = NULL; memset(&cp, 0, sizeof(cp)); cp.local = rx->local; + cp.peer = peer; cp.key = rx->key; cp.security_level = rx->min_sec_level; cp.exclusive = rx->exclusive | p->exclusive; cp.upgrade = p->upgrade; cp.service_id = srx->srx_service; - call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL, + call = rxrpc_new_client_call(rx, &cp, &p->call, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); /* The socket is now unlocked */ + rxrpc_put_peer(peer, rxrpc_peer_put_application); _leave(" = %p\n", call); return call; }