From a519fc7a70d1a918574bb826cc6905b87b482eb9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Sep 2012 16:49:15 -0400 Subject: [PATCH 001/113] SUNRPC: Ensure that the TCP socket is closed when in CLOSE_WAIT Instead of doing a shutdown() call, we need to do an actual close(). Ditto if/when the server is sending us junk RPC headers. Signed-off-by: Trond Myklebust Tested-by: Simon Kirby Cc: stable@vger.kernel.org --- net/sunrpc/xprtsock.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a35b8e52e551..d1988cf8bf33 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1025,6 +1025,16 @@ static void xs_udp_data_ready(struct sock *sk, int len) read_unlock_bh(&sk->sk_callback_lock); } +/* + * Helper function to force a TCP close if the server is sending + * junk and/or it has put us in CLOSE_WAIT + */ +static void xs_tcp_force_close(struct rpc_xprt *xprt) +{ + set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); + xprt_force_disconnect(xprt); +} + static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -1051,7 +1061,7 @@ static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_rea /* Sanity check of the record length */ if (unlikely(transport->tcp_reclen < 8)) { dprintk("RPC: invalid TCP record fragment length\n"); - xprt_force_disconnect(xprt); + xs_tcp_force_close(xprt); return; } dprintk("RPC: reading TCP record fragment of length %d\n", @@ -1132,7 +1142,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport, break; default: dprintk("RPC: invalid request message type\n"); - xprt_force_disconnect(&transport->xprt); + xs_tcp_force_close(&transport->xprt); } xs_tcp_check_fraghdr(transport); } @@ -1455,6 +1465,8 @@ static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) static void xs_sock_mark_closed(struct rpc_xprt *xprt) { smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); smp_mb__after_clear_bit(); @@ -1512,8 +1524,8 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ - xprt_force_disconnect(xprt); xprt->connect_cookie++; + xs_tcp_force_close(xprt); case TCP_CLOSING: /* * If the server closed down the connection, make sure that @@ -2199,8 +2211,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) /* We're probably in TIME_WAIT. Get rid of existing socket, * and retry */ - set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); - xprt_force_disconnect(xprt); + xs_tcp_force_close(xprt); break; case -ECONNREFUSED: case -ECONNRESET: From 84e28a307e376f271505af65a7b7e212dd6f61f4 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Mon, 24 Sep 2012 13:39:01 -0400 Subject: [PATCH 002/113] SUNRPC: Set alloc_slot for backchannel tcp ops f39c1bfb5a03e2d255451bff05be0d7255298fa4 (SUNRPC: Fix a UDP transport regression) introduced the "alloc_slot" function for xprt operations, but never created one for the backchannel operations. This patch fixes a null pointer dereference when mounting NFS over v4.1. Call Trace: [] ? xprt_reserve+0x47/0x50 [sunrpc] [] call_reserve+0x34/0x60 [sunrpc] [] __rpc_execute+0x90/0x400 [sunrpc] [] rpc_async_schedule+0x2a/0x40 [sunrpc] [] process_one_work+0x139/0x500 [] ? alloc_worker+0x70/0x70 [] ? __rpc_execute+0x400/0x400 [sunrpc] [] worker_thread+0x15e/0x460 [] ? preempt_schedule+0x49/0x70 [] ? rescuer_thread+0x230/0x230 [] kthread+0x93/0xa0 [] kernel_thread_helper+0x4/0x10 [] ? kthread_freezable_should_stop+0x70/0x70 [] ? gs_change+0x13/0x13 Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d1988cf8bf33..97f8918169ed 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2539,6 +2539,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { static struct rpc_xprt_ops bc_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, + .alloc_slot = xprt_alloc_slot, .rpcbind = xs_local_rpcbind, .buf_alloc = bc_malloc, .buf_free = bc_free, From e8d920c58ddb45126e1b306854f6e34b88446baf Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 21 Sep 2012 12:27:41 +0800 Subject: [PATCH 003/113] NFS: fix the return value check by using IS_ERR In case of error, the function rpcauth_create() returns ERR_PTR() and never returns NULL pointer. The NULL test in the return value check should be replaced with IS_ERR(). dpatch engine is used to auto generated this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 2 +- fs/nfs/nfs4proc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 017b4b01a69c..398d5fd74157 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -205,7 +205,7 @@ struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *ino return clone; auth = rpcauth_create(flavor, clone); - if (!auth) { + if (IS_ERR(auth)) { rpc_shutdown_client(clone); clone = ERR_PTR(-EIO); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1e50326d00dd..ddfebb128017 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2399,7 +2399,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl int ret; auth = rpcauth_create(flavor, server->client); - if (!auth) { + if (IS_ERR(auth)) { ret = -EIO; goto out; } From 62d98c935456ee121b03d6a68aa3091a04085b7e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 17 Sep 2012 16:46:34 +1000 Subject: [PATCH 004/113] NFS4: avoid underflow when converting error to pointer. In nfs4_create_sec_client, 'flavor' can hold a negative error code (returned from nfs4_negotiate_security), even though it is an 'enum' and hence unsigned. The code is careful to cast it to an (int) before testing if it is negative, however it doesn't cast to an (int) before calling ERR_PTR. On a machine where "void*" is larger than "int", this results in the unsigned equivalent of -1 (e.g. 0xffffffff) being converted to a pointer. Subsequent code determines that this is not negative, and so dereferences it with predictable results. So: cast 'flavor' to a (signed) int before passing to ERR_PTR. cc: Benny Halevy Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 398d5fd74157..4fdeb1b7042e 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -198,7 +198,7 @@ struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *ino flavor = nfs4_negotiate_security(inode, name); if ((int)flavor < 0) - return ERR_PTR(flavor); + return ERR_PTR((int)flavor); clone = rpc_clone_client(clnt); if (IS_ERR(clone)) From 8a9a8b8332b92b13316cf49685b5dc5257cfe115 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Aug 2012 14:32:13 -0400 Subject: [PATCH 005/113] SUNRPC: Fix the return value of xdr_align_pages() The callers of xdr_align_pages() expect it to return the number of bytes of actual XDR data remaining in the pages. Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 0afba1b4b656..fbbd1c475b43 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -742,6 +742,8 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) /* Truncate page data and move it into the tail */ if (buf->page_len > len) xdr_shrink_pagelen(buf, buf->page_len - len); + else + len = buf->page_len; xdr->nwords = XDR_QUADLEN(buf->len - cur); return len; } From 13fe4ba1b64c099843c75b4f0633ad30a4526637 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Aug 2012 14:21:12 -0400 Subject: [PATCH 006/113] NFSv4.1: decode_getdeviceinfo should check xdr_read_pages() return value Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 8dba6bd48557..a756349b0fa4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5642,7 +5642,8 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, * and places the remaining xdr data in xdr_buf->tail */ pdev->mincount = be32_to_cpup(p); - xdr_read_pages(xdr, pdev->mincount); /* include space for the length */ + if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount) + goto out_overflow; /* Parse notification bitmap, verifying that it is zero. */ p = xdr_inline_decode(xdr, 4); From 0e24d849c4ea777c59955b241fd3af14a1b84af5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 28 Sep 2012 12:03:09 -0400 Subject: [PATCH 007/113] NFSv4: Remove BUG_ON() and ACCESS_ONCE() calls in the idmapper The use of ACCESS_ONCE() is wrong, since the various routines that set/clear idmap->idmap_key_cons should be strictly ordered w.r.t. each other, and the idmap->idmap_mutex ensures that only one thread at a time may be in an upcall situation. Also replace the BUG_ON()s with WARN_ON_ONCE() where appropriate. Signed-off-by: Trond Myklebust --- fs/nfs/idmap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index a850079467d8..79f6424aa081 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -465,8 +465,6 @@ nfs_idmap_new(struct nfs_client *clp) struct rpc_pipe *pipe; int error; - BUG_ON(clp->cl_idmap != NULL); - idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); if (idmap == NULL) return -ENOMEM; @@ -510,7 +508,6 @@ static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event, switch (event) { case RPC_PIPEFS_MOUNT: - BUG_ON(clp->cl_rpcclient->cl_dentry == NULL); err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry, clp->cl_idmap, clp->cl_idmap->idmap_pipe); @@ -689,7 +686,11 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, if (ret < 0) goto out2; - BUG_ON(idmap->idmap_key_cons != NULL); + if (idmap->idmap_key_cons != NULL) { + WARN_ON_ONCE(1); + ret = -EAGAIN; + goto out2; + } idmap->idmap_key_cons = cons; ret = rpc_queue_upcall(idmap->idmap_pipe, msg); @@ -746,7 +747,7 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) * will have been woken up and someone else may now have used * idmap_key_cons - so after this point we may no longer touch it. */ - cons = ACCESS_ONCE(idmap->idmap_key_cons); + cons = idmap->idmap_key_cons; idmap->idmap_key_cons = NULL; if (mlen != sizeof(im)) { @@ -790,7 +791,7 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) struct idmap *idmap = data->idmap; struct key_construction *cons; if (msg->errno) { - cons = ACCESS_ONCE(idmap->idmap_key_cons); + cons = idmap->idmap_key_cons; idmap->idmap_key_cons = NULL; complete_request_key(cons, msg->errno); } From e9ab41b620e4b679ed069ab05cb85e67870b7c87 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Sep 2012 15:44:19 -0400 Subject: [PATCH 008/113] NFSv4: Clean up the legacy idmapper upcall Replace the BUG_ON(idmap->idmap_key_cons != NULL) with a WARN_ON_ONCE(). Then get rid of the ACCESS_ONCE(idmap->idmap_key_cons). Then add helper functions for starting, finishing and aborting the legacy upcall. Signed-off-by: Trond Myklebust Cc: Bryan Schumaker --- fs/nfs/idmap.c | 65 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 79f6424aa081..8222ad861456 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -330,7 +330,6 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, ret = nfs_idmap_request_key(&key_type_id_resolver_legacy, name, namelen, type, data, data_size, idmap); - idmap->idmap_key_cons = NULL; mutex_unlock(&idmap->idmap_mutex); } return ret; @@ -662,6 +661,34 @@ out: return ret; } +static bool +nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, + struct key_construction *cons) +{ + if (idmap->idmap_key_cons != NULL) { + WARN_ON_ONCE(1); + return false; + } + idmap->idmap_key_cons = cons; + return true; +} + +static void +nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) +{ + struct key_construction *cons = idmap->idmap_key_cons; + + idmap->idmap_key_cons = NULL; + complete_request_key(cons, ret); +} + +static void +nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) +{ + if (idmap->idmap_key_cons != NULL) + nfs_idmap_complete_pipe_upcall_locked(idmap, ret); +} + static int nfs_idmap_legacy_upcall(struct key_construction *cons, const char *op, void *aux) @@ -686,21 +713,17 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, if (ret < 0) goto out2; - if (idmap->idmap_key_cons != NULL) { - WARN_ON_ONCE(1); - ret = -EAGAIN; + ret = -EAGAIN; + if (!nfs_idmap_prepare_pipe_upcall(idmap, cons)) goto out2; - } - idmap->idmap_key_cons = cons; ret = rpc_queue_upcall(idmap->idmap_pipe, msg); - if (ret < 0) - goto out3; + if (ret < 0) { + nfs_idmap_abort_pipe_upcall(idmap, ret); + kfree(data); + } return ret; - -out3: - idmap->idmap_key_cons = NULL; out2: kfree(data); out1: @@ -741,14 +764,15 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) struct key_construction *cons; struct idmap_msg im; size_t namelen_in; - int ret; + int ret = -ENOKEY; /* If instantiation is successful, anyone waiting for key construction * will have been woken up and someone else may now have used * idmap_key_cons - so after this point we may no longer touch it. */ cons = idmap->idmap_key_cons; - idmap->idmap_key_cons = NULL; + if (cons == NULL) + goto out_noupcall; if (mlen != sizeof(im)) { ret = -ENOSPC; @@ -778,7 +802,8 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) } out: - complete_request_key(cons, ret); + nfs_idmap_complete_pipe_upcall_locked(idmap, ret); +out_noupcall: return ret; } @@ -789,12 +814,9 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) struct idmap_legacy_upcalldata, pipe_msg); struct idmap *idmap = data->idmap; - struct key_construction *cons; - if (msg->errno) { - cons = idmap->idmap_key_cons; - idmap->idmap_key_cons = NULL; - complete_request_key(cons, msg->errno); - } + + if (msg->errno) + nfs_idmap_abort_pipe_upcall(idmap, msg->errno); /* Free memory allocated in nfs_idmap_legacy_upcall() */ kfree(data); } @@ -804,7 +826,8 @@ idmap_release_pipe(struct inode *inode) { struct rpc_inode *rpci = RPC_I(inode); struct idmap *idmap = (struct idmap *)rpci->private; - idmap->idmap_key_cons = NULL; + + nfs_idmap_abort_pipe_upcall(idmap, -EPIPE); } int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) From 0cac120233305b614cfe3ad419f3655876066017 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Sep 2012 16:15:00 -0400 Subject: [PATCH 009/113] NFSv4: Ensure that idmap_pipe_downcall sanity-checks the downcall data Use the idmapper upcall data to verify that the legacy idmapper daemon is indeed responding to an upcall that we sent. Signed-off-by: Trond Myklebust Cc: Bryan Schumaker --- fs/nfs/idmap.c | 62 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 8222ad861456..7ac93e0dd4cf 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -55,18 +55,19 @@ static const struct cred *id_resolver_cache; static struct key_type key_type_id_resolver_legacy; -struct idmap { - struct rpc_pipe *idmap_pipe; - struct key_construction *idmap_key_cons; - struct mutex idmap_mutex; -}; - struct idmap_legacy_upcalldata { struct rpc_pipe_msg pipe_msg; struct idmap_msg idmap_msg; + struct key_construction *key_cons; struct idmap *idmap; }; +struct idmap { + struct rpc_pipe *idmap_pipe; + struct idmap_legacy_upcalldata *idmap_upcall_data; + struct mutex idmap_mutex; +}; + /** * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields * @fattr: fully initialised struct nfs_fattr @@ -663,29 +664,30 @@ out: static bool nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, - struct key_construction *cons) + struct idmap_legacy_upcalldata *data) { - if (idmap->idmap_key_cons != NULL) { + if (idmap->idmap_upcall_data != NULL) { WARN_ON_ONCE(1); return false; } - idmap->idmap_key_cons = cons; + idmap->idmap_upcall_data = data; return true; } static void nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) { - struct key_construction *cons = idmap->idmap_key_cons; + struct key_construction *cons = idmap->idmap_upcall_data->key_cons; - idmap->idmap_key_cons = NULL; + kfree(idmap->idmap_upcall_data); + idmap->idmap_upcall_data = NULL; complete_request_key(cons, ret); } static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) { - if (idmap->idmap_key_cons != NULL) + if (idmap->idmap_upcall_data != NULL) nfs_idmap_complete_pipe_upcall_locked(idmap, ret); } @@ -714,14 +716,12 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, goto out2; ret = -EAGAIN; - if (!nfs_idmap_prepare_pipe_upcall(idmap, cons)) + if (!nfs_idmap_prepare_pipe_upcall(idmap, data)) goto out2; ret = rpc_queue_upcall(idmap->idmap_pipe, msg); - if (ret < 0) { + if (ret < 0) nfs_idmap_abort_pipe_upcall(idmap, ret); - kfree(data); - } return ret; out2: @@ -738,21 +738,32 @@ static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *dat authkey); } -static int nfs_idmap_read_message(struct idmap_msg *im, struct key *key, struct key *authkey) +static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, + struct idmap_msg *upcall, + struct key *key, struct key *authkey) { char id_str[NFS_UINT_MAXLEN]; - int ret = -EINVAL; + int ret = -ENOKEY; + /* ret = -ENOKEY */ + if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv) + goto out; switch (im->im_conv) { case IDMAP_CONV_NAMETOID: + if (strcmp(upcall->im_name, im->im_name) != 0) + break; sprintf(id_str, "%d", im->im_id); ret = nfs_idmap_instantiate(key, authkey, id_str); break; case IDMAP_CONV_IDTONAME: + if (upcall->im_id != im->im_id) + break; ret = nfs_idmap_instantiate(key, authkey, im->im_name); break; + default: + ret = -EINVAL; } - +out: return ret; } @@ -770,10 +781,11 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) * will have been woken up and someone else may now have used * idmap_key_cons - so after this point we may no longer touch it. */ - cons = idmap->idmap_key_cons; - if (cons == NULL) + if (idmap->idmap_upcall_data == NULL) goto out_noupcall; + cons = idmap->idmap_upcall_data->key_cons; + if (mlen != sizeof(im)) { ret = -ENOSPC; goto out; @@ -793,9 +805,11 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { ret = -EINVAL; goto out; - } +} - ret = nfs_idmap_read_message(&im, cons->key, cons->authkey); + ret = nfs_idmap_read_and_verify_message(&im, + &idmap->idmap_upcall_data->idmap_msg, + cons->key, cons->authkey); if (ret >= 0) { key_set_timeout(cons->key, nfs_idmap_cache_timeout); ret = mlen; @@ -817,8 +831,6 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) if (msg->errno) nfs_idmap_abort_pipe_upcall(idmap, msg->errno); - /* Free memory allocated in nfs_idmap_legacy_upcall() */ - kfree(data); } static void From a11a2bf4de5679fa0b63474c7d39bea2dac7d061 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 2 Aug 2012 13:21:43 -0400 Subject: [PATCH 010/113] SUNRPC: Optimise away unnecessary data moves in xdr_align_pages We only have to call xdr_shrink_pagelen() if the remaining RPC message does not fit in the page buffer length that we supplied to xdr_align_pages(). Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index fbbd1c475b43..08f50afd5f2a 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -730,21 +730,24 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) if (xdr->nwords == 0) return 0; + /* Realign pages to current pointer position */ + iov = buf->head; + if (iov->iov_len > cur) { + xdr_shrink_bufhead(buf, iov->iov_len - cur); + xdr->nwords = XDR_QUADLEN(buf->len - cur); + } + if (nwords > xdr->nwords) { nwords = xdr->nwords; len = nwords << 2; } - /* Realign pages to current pointer position */ - iov = buf->head; - if (iov->iov_len > cur) - xdr_shrink_bufhead(buf, iov->iov_len - cur); - - /* Truncate page data and move it into the tail */ - if (buf->page_len > len) - xdr_shrink_pagelen(buf, buf->page_len - len); - else + if (buf->page_len <= len) len = buf->page_len; - xdr->nwords = XDR_QUADLEN(buf->len - cur); + else if (nwords < xdr->nwords) { + /* Truncate page data and move it into the tail */ + xdr_shrink_pagelen(buf, buf->page_len - len); + xdr->nwords = XDR_QUADLEN(buf->len - cur); + } return len; } From b3c54de6f82d01637796bcc1f667a45f3b32e814 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Aug 2012 17:15:50 -0400 Subject: [PATCH 011/113] NFS: Convert nfs_get_lock_context to return an ERR_PTR on failure We want to be able to distinguish between allocation failures, and the case where the lock context is not needed (because there are no locks). Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 16 ++++++++++++---- fs/nfs/inode.c | 2 +- fs/nfs/pagelist.c | 8 +++++--- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 1ba385b7c90d..22130df16218 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -450,6 +450,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; + struct nfs_lock_context *l_ctx; dreq = nfs_direct_req_alloc(); if (dreq == NULL) @@ -457,9 +458,12 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, dreq->inode = inode; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); - dreq->l_ctx = nfs_get_lock_context(dreq->ctx); - if (dreq->l_ctx == NULL) + l_ctx = nfs_get_lock_context(dreq->ctx); + if (IS_ERR(l_ctx)) { + result = PTR_ERR(l_ctx); goto out_release; + } + dreq->l_ctx = l_ctx; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; @@ -849,6 +853,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; + struct nfs_lock_context *l_ctx; dreq = nfs_direct_req_alloc(); if (!dreq) @@ -856,9 +861,12 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, dreq->inode = inode; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); - dreq->l_ctx = nfs_get_lock_context(dreq->ctx); - if (dreq->l_ctx == NULL) + l_ctx = nfs_get_lock_context(dreq->ctx); + if (IS_ERR(l_ctx)) { + result = PTR_ERR(l_ctx); goto out_release; + } + dreq->l_ctx = l_ctx; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 9b47610338f5..b5e2913dff2d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -578,7 +578,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) spin_unlock(&inode->i_lock); new = kmalloc(sizeof(*new), GFP_KERNEL); if (new == NULL) - return NULL; + return ERR_PTR(-ENOMEM); nfs_init_lock_context(new); spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 311a79681e2b..dfd764bd943d 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -102,6 +102,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, unsigned int offset, unsigned int count) { struct nfs_page *req; + struct nfs_lock_context *l_ctx; /* try to allocate the request struct */ req = nfs_page_alloc(); @@ -109,11 +110,12 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, return ERR_PTR(-ENOMEM); /* get lock context early so we can deal with alloc failures */ - req->wb_lock_context = nfs_get_lock_context(ctx); - if (req->wb_lock_context == NULL) { + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) { nfs_page_free(req); - return ERR_PTR(-ENOMEM); + return ERR_CAST(l_ctx); } + req->wb_lock_context = l_ctx; /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in From 2a369153c82e0c83621b3e71d8f0c53394705bda Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Aug 2012 18:54:45 -0400 Subject: [PATCH 012/113] NFS: Clean up helper function nfs4_select_rw_stateid() We want to be able to pass on the information that the page was not dirtied under a lock. Instead of adding a flag parameter, do this by passing a pointer to a 'struct nfs_lock_owner' that may be NULL. Also reuse this structure in struct nfs_lock_context to carry the fl_owner_t and pid_t. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 8 ++++---- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4proc.c | 6 +++++- fs/nfs/nfs4state.c | 14 +++++++++++--- fs/nfs/nfs4xdr.c | 6 +++++- fs/nfs/pagelist.c | 4 +++- fs/nfs/write.c | 10 +++++++--- include/linux/nfs_fs.h | 8 ++++++-- 8 files changed, 42 insertions(+), 16 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b5e2913dff2d..126a4cbbb98f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -547,8 +547,8 @@ EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { atomic_set(&l_ctx->count, 1); - l_ctx->lockowner = current->files; - l_ctx->pid = current->tgid; + l_ctx->lockowner.l_owner = current->files; + l_ctx->lockowner.l_pid = current->tgid; INIT_LIST_HEAD(&l_ctx->list); } @@ -557,9 +557,9 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context struct nfs_lock_context *pos; list_for_each_entry(pos, &ctx->lock_context.list, list) { - if (pos->lockowner != current->files) + if (pos->lockowner.l_owner != current->files) continue; - if (pos->pid != current->tgid) + if (pos->lockowner.l_pid != current->tgid) continue; atomic_inc(&pos->count); return pos; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index da0618aeeadb..d95e25ec3574 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -351,7 +351,7 @@ extern void nfs41_handle_server_scope(struct nfs_client *, extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, - fmode_t, fl_owner_t, pid_t); + fmode_t, const struct nfs_lockowner *); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ddfebb128017..f19ea4f0f0c5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2013,8 +2013,12 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, nfs_fattr_init(fattr); if (state != NULL) { + struct nfs_lockowner lockowner = { + .l_owner = current->files, + .l_pid = current->tgid, + }; nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, - current->files, current->tgid); + &lockowner); } else if (nfs4_copy_delegation_stateid(&arg.stateid, inode, FMODE_WRITE)) { /* Use that stateid */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 55148def5540..03a4e7825f3b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -911,14 +911,22 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) } static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, - fl_owner_t fl_owner, pid_t fl_pid) + const struct nfs_lockowner *lockowner) { struct nfs4_lock_state *lsp; + fl_owner_t fl_owner; + pid_t fl_pid; bool ret = false; + + if (lockowner == NULL) + goto out; + if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) goto out; + fl_owner = lockowner->l_owner; + fl_pid = lockowner->l_pid; spin_lock(&state->state_lock); lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) { @@ -946,11 +954,11 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) * requests. */ void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, - fmode_t fmode, fl_owner_t fl_owner, pid_t fl_pid) + fmode_t fmode, const struct nfs_lockowner *lockowner) { if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) return; - if (nfs4_copy_lock_stateid(dst, state, fl_owner, fl_pid)) + if (nfs4_copy_lock_stateid(dst, state, lockowner)) return; nfs4_copy_open_stateid(dst, state); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a756349b0fa4..7ab29abb3160 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1509,8 +1509,12 @@ static void encode_open_stateid(struct xdr_stream *xdr, nfs4_stateid stateid; if (ctx->state != NULL) { + const struct nfs_lockowner *lockowner = NULL; + + if (l_ctx != NULL) + lockowner = &l_ctx->lockowner; nfs4_select_rw_stateid(&stateid, ctx->state, - fmode, l_ctx->lockowner, l_ctx->pid); + fmode, lockowner); if (zero_seqid) stateid.seqid = 0; encode_nfs4_stateid(xdr, &stateid); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index dfd764bd943d..e56e846e9d2d 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -292,7 +292,9 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, { if (req->wb_context->cred != prev->wb_context->cred) return false; - if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) + if (req->wb_lock_context->lockowner.l_owner != prev->wb_lock_context->lockowner.l_owner) + return false; + if (req->wb_lock_context->lockowner.l_pid != prev->wb_lock_context->lockowner.l_pid) return false; if (req->wb_context->state != prev->wb_context->state) return false; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e3b55372726c..e1b5fe4d873a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -846,6 +846,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_lock_context *l_ctx; struct nfs_page *req; int do_flush, status; /* @@ -860,9 +861,12 @@ int nfs_flush_incompatible(struct file *file, struct page *page) req = nfs_page_find_request(page); if (req == NULL) return 0; - do_flush = req->wb_page != page || req->wb_context != ctx || - req->wb_lock_context->lockowner != current->files || - req->wb_lock_context->pid != current->tgid; + l_ctx = req->wb_lock_context; + do_flush = req->wb_page != page || req->wb_context != ctx; + if (l_ctx) { + do_flush |= l_ctx->lockowner.l_owner != current->files + || l_ctx->lockowner.l_pid != current->tgid; + } nfs_release_request(req); if (!do_flush) return 0; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4b03f56e280e..869eac0c2635 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -81,12 +81,16 @@ struct nfs_access_entry { int mask; }; +struct nfs_lockowner { + fl_owner_t l_owner; + pid_t l_pid; +}; + struct nfs_lock_context { atomic_t count; struct list_head list; struct nfs_open_context *open_context; - fl_owner_t lockowner; - pid_t pid; + struct nfs_lockowner lockowner; }; struct nfs4_state; From 795a88c968eef031f370973512b42124bacb2f17 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 10 Sep 2012 13:26:49 -0400 Subject: [PATCH 013/113] NFSv4: Convert the nfs4_lock_state->ls_flags to a bit field Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 4 ++-- fs/nfs/nfs4proc.c | 10 +++++----- fs/nfs/nfs4state.c | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index d95e25ec3574..71d407fd00a5 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -132,8 +132,8 @@ struct nfs4_lock_owner { struct nfs4_lock_state { struct list_head ls_locks; /* Other lock stateids */ struct nfs4_state * ls_state; /* Pointer to open state */ -#define NFS_LOCK_INITIALIZED 1 - int ls_flags; +#define NFS_LOCK_INITIALIZED 0 + unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; nfs4_stateid ls_stateid; atomic_t ls_count; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f19ea4f0f0c5..cf2fd5d0c1b3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4395,7 +4395,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) return; - if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) { + if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ task->tk_action = NULL; return; @@ -4589,7 +4589,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) } if (data->rpc_status == 0) { nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid); - data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags); renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); } out: @@ -4636,7 +4636,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_ case -NFS4ERR_BAD_STATEID: lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; if (new_lock_owner != 0 || - (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) nfs4_schedule_stateid_recovery(server, lsp->ls_state); break; case -NFS4ERR_STALE_STATEID: @@ -4760,7 +4760,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) struct nfs_server *server = NFS_SERVER(state->inode); list_for_each_entry(lsp, &state->lock_states, ls_locks) { - if (lsp->ls_flags & NFS_LOCK_INITIALIZED) { + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { status = nfs41_test_stateid(server, &lsp->ls_stateid); if (status != NFS_OK) { /* Free the stateid unless the server @@ -4768,7 +4768,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) if (status != -NFS4ERR_BAD_STATEID) nfs41_free_stateid(server, &lsp->ls_stateid); - lsp->ls_flags &= ~NFS_LOCK_INITIALIZED; + clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); ret = status; } } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 03a4e7825f3b..fc6cfe68ad1d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -865,7 +865,7 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); - if (lsp->ls_flags & NFS_LOCK_INITIALIZED) { + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { if (nfs4_release_lockowner(lsp) == 0) return; } @@ -929,7 +929,7 @@ static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_pid = lockowner->l_pid; spin_lock(&state->state_lock); lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); - if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) { + if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { nfs4_stateid_copy(dst, &lsp->ls_stateid); ret = true; } @@ -1297,7 +1297,7 @@ restart: if (status >= 0) { spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { - if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) + if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) pr_warn_ratelimited("NFS: " "%s: Lock reclaim " "failed!\n", __func__); @@ -1369,7 +1369,7 @@ static void nfs4_clear_open_state(struct nfs4_state *state) spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { lock->ls_seqid.flags = 0; - lock->ls_flags &= ~NFS_LOCK_INITIALIZED; + clear_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags); } spin_unlock(&state->state_lock); } From 05990d1bf2708b9e84d67074551f964d3738eedc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Sep 2012 16:01:22 -0400 Subject: [PATCH 014/113] NFS: Fix fdatasync/fsync() when confronted with a server reboot If the server reboots before it can commit the unstable writes to disk, then nfs_commit_release_pages() will detect this when it compares the verifier returned by COMMIT to the one returned by WRITE. When this happens, the client needs to resend those writes in order to guarantee that they make it to stable storage. This patch adds a signalling mechanism to notify fsync() that it needs to retry all writes before it can exit. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 34 ++++++++++++++++++++++------------ fs/nfs/nfs4file.c | 22 ++++++++++++---------- fs/nfs/write.c | 1 + include/linux/nfs_fs.h | 1 + 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6a7fcab7ecb3..cc9b56691bef 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -259,7 +259,7 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) struct dentry *dentry = file->f_path.dentry; struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; - int have_error, status; + int have_error, do_resend, status; int ret = 0; dprintk("NFS: fsync file(%s/%s) datasync %d\n", @@ -267,15 +267,23 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); + do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); status = nfs_commit_inode(inode, FLUSH_SYNC); - if (status >= 0 && ret < 0) - status = ret; have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - if (have_error) + if (have_error) { ret = xchg(&ctx->error, 0); - if (!ret && status < 0) + if (ret) + goto out; + } + if (status < 0) { ret = status; + goto out; + } + do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); + if (do_resend) + ret = -EAGAIN; +out: return ret; } EXPORT_SYMBOL_GPL(nfs_file_fsync_commit); @@ -286,13 +294,15 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ret; struct inode *inode = file->f_path.dentry->d_inode; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret != 0) - goto out; - mutex_lock(&inode->i_mutex); - ret = nfs_file_fsync_commit(file, start, end, datasync); - mutex_unlock(&inode->i_mutex); -out: + do { + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret != 0) + break; + mutex_lock(&inode->i_mutex); + ret = nfs_file_fsync_commit(file, start, end, datasync); + mutex_unlock(&inode->i_mutex); + } while (ret == -EAGAIN); + return ret; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index eb5eb8eef4d3..eef1b38a1b08 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -95,16 +95,18 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ret; struct inode *inode = file->f_path.dentry->d_inode; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret != 0) - goto out; - mutex_lock(&inode->i_mutex); - ret = nfs_file_fsync_commit(file, start, end, datasync); - if (!ret && !datasync) - /* application has asked for meta-data sync */ - ret = pnfs_layoutcommit_inode(inode, true); - mutex_unlock(&inode->i_mutex); -out: + do { + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret != 0) + break; + mutex_lock(&inode->i_mutex); + ret = nfs_file_fsync_commit(file, start, end, datasync); + if (!ret && !datasync) + /* application has asked for meta-data sync */ + ret = pnfs_layoutcommit_inode(inode, true); + mutex_unlock(&inode->i_mutex); + } while (ret == -EAGAIN); + return ret; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e1b5fe4d873a..9347ab7c9574 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1580,6 +1580,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a mismatch. Write the page again */ dprintk(" mismatch\n"); nfs_mark_request_dirty(req); + set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); next: nfs_unlock_and_release_request(req); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 869eac0c2635..383f3313f053 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -103,6 +103,7 @@ struct nfs_open_context { unsigned long flags; #define NFS_CONTEXT_ERROR_WRITE (0) +#define NFS_CONTEXT_RESEND_WRITES (1) int error; struct list_head list; From dcfc4f25461813e8a2dd43b052aa1e0be155742f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Sep 2012 16:19:38 -0400 Subject: [PATCH 015/113] NFS: Write the entire file if a server reboot occurs during fsync() This is to ensure that we don't clear the NFS_CONTEXT_RESEND_WRITES flag while there are still writes that haven't been resent. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 7 +++++++ fs/nfs/nfs4file.c | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index cc9b56691bef..c814666bbe7f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -301,6 +301,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) mutex_lock(&inode->i_mutex); ret = nfs_file_fsync_commit(file, start, end, datasync); mutex_unlock(&inode->i_mutex); + /* + * If nfs_file_fsync_commit detected a server reboot, then + * resend all dirty pages that might have been covered by + * the NFS_CONTEXT_RESEND_WRITES flag + */ + start = 0; + end = LLONG_MAX; } while (ret == -EAGAIN); return ret; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index eef1b38a1b08..afddd6639afb 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -105,6 +105,13 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) /* application has asked for meta-data sync */ ret = pnfs_layoutcommit_inode(inode, true); mutex_unlock(&inode->i_mutex); + /* + * If nfs_file_fsync_commit detected a server reboot, then + * resend all dirty pages that might have been covered by + * the NFS_CONTEXT_RESEND_WRITES flag + */ + start = 0; + end = LLONG_MAX; } while (ret == -EAGAIN); return ret; From d19751e7b9bd8a01d00372325439589886674f79 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Sep 2012 17:21:25 -0400 Subject: [PATCH 016/113] SUNRPC: Get rid of the redundant xprt->shutdown bit field It is only set after everyone has dereferenced the transport, and serves no useful purpose: setting it is racy, so all the socket code, etc still needs to be able to cope with the cases where they miss reading it. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 +-- net/sunrpc/xprt.c | 8 ++------ net/sunrpc/xprtrdma/transport.c | 20 +++++++------------- net/sunrpc/xprtsock.c | 18 ------------------ 4 files changed, 10 insertions(+), 39 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index bf8c49ff7530..951cb9b7d02b 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -173,8 +173,7 @@ struct rpc_xprt { unsigned int min_reqs; /* min number of slots */ atomic_t num_reqs; /* total slots */ unsigned long state; /* transport state */ - unsigned char shutdown : 1, /* being shut down */ - resvport : 1; /* use a reserved port */ + unsigned char resvport : 1; /* use a reserved port */ unsigned int swapper; /* we're swapping over this transport */ unsigned int bind_index; /* bind function index */ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 5d7f61d7559c..bd462a532acf 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -231,7 +231,7 @@ EXPORT_SYMBOL_GPL(xprt_reserve_xprt); static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; - if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state) || xprt->shutdown) { + if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { smp_mb__before_clear_bit(); clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); @@ -504,9 +504,6 @@ EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); */ void xprt_write_space(struct rpc_xprt *xprt) { - if (unlikely(xprt->shutdown)) - return; - spin_lock_bh(&xprt->transport_lock); if (xprt->snd_task) { dprintk("RPC: write space: waking waiting task on " @@ -679,7 +676,7 @@ xprt_init_autodisconnect(unsigned long data) struct rpc_xprt *xprt = (struct rpc_xprt *)data; spin_lock(&xprt->transport_lock); - if (!list_empty(&xprt->recv) || xprt->shutdown) + if (!list_empty(&xprt->recv)) goto out_abort; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; @@ -1262,7 +1259,6 @@ out: static void xprt_destroy(struct rpc_xprt *xprt) { dprintk("RPC: destroying transport %p\n", xprt); - xprt->shutdown = 1; del_timer_sync(&xprt->timer); rpc_destroy_wait_queue(&xprt->binding); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 5d9202dc7cb1..c9aa7a35f3bf 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -199,21 +199,15 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpc_xprt *xprt = &r_xprt->xprt; int rc = 0; - if (!xprt->shutdown) { - current->flags |= PF_FSTRANS; - xprt_clear_connected(xprt); + current->flags |= PF_FSTRANS; + xprt_clear_connected(xprt); - dprintk("RPC: %s: %sconnect\n", __func__, - r_xprt->rx_ep.rep_connected != 0 ? "re" : ""); - rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); - if (rc) - goto out; - } - goto out_clear; + dprintk("RPC: %s: %sconnect\n", __func__, + r_xprt->rx_ep.rep_connected != 0 ? "re" : ""); + rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + if (rc) + xprt_wake_pending_tasks(xprt, rc); -out: - xprt_wake_pending_tasks(xprt, rc); -out_clear: dprintk("RPC: %s: exit\n", __func__); xprt_clear_connecting(xprt); current->flags &= ~PF_FSTRANS; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 97f8918169ed..aaaadfbe36e9 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -917,9 +917,6 @@ static void xs_local_data_ready(struct sock *sk, int len) if (skb == NULL) goto out; - if (xprt->shutdown) - goto dropit; - repsize = skb->len - sizeof(rpc_fraghdr); if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d\n", repsize); @@ -981,9 +978,6 @@ static void xs_udp_data_ready(struct sock *sk, int len) if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) goto out; - if (xprt->shutdown) - goto dropit; - repsize = skb->len - sizeof(struct udphdr); if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d!\n", repsize); @@ -1412,9 +1406,6 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes) read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; - if (xprt->shutdown) - goto out; - /* Any data means we had a useful conversation, so * the we don't need to delay the next reconnect */ @@ -1901,9 +1892,6 @@ static void xs_local_setup_socket(struct work_struct *work) struct socket *sock; int status = -EIO; - if (xprt->shutdown) - goto out; - current->flags |= PF_FSTRANS; clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); @@ -2020,9 +2008,6 @@ static void xs_udp_setup_socket(struct work_struct *work) struct socket *sock = transport->sock; int status = -EIO; - if (xprt->shutdown) - goto out; - current->flags |= PF_FSTRANS; /* Start by resetting any existing state */ @@ -2168,9 +2153,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) struct rpc_xprt *xprt = &transport->xprt; int status = -EIO; - if (xprt->shutdown) - goto out; - current->flags |= PF_FSTRANS; if (!sock) { From a0b0a6e39bd1bb4a0922086feee73627cbd53ba4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Sep 2012 17:12:15 -0400 Subject: [PATCH 017/113] NFS: Clean up the pNFS layoutget interface Ensure that we do return errors from nfs4_proc_layoutget() and that we don't mark the layout as having failed if the error was due to a signal or resource problem on the client side. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 14 +++++++++----- fs/nfs/pnfs.c | 25 ++++++++++++++++--------- fs/nfs/pnfs.h | 4 ++-- include/linux/nfs_xdr.h | 1 - 4 files changed, 27 insertions(+), 17 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cf2fd5d0c1b3..1c8656f8745c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6286,7 +6286,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { .rpc_release = nfs4_layoutget_release, }; -void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) +struct pnfs_layout_segment * +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) { struct nfs_server *server = NFS_SERVER(lgp->args.inode); size_t max_pages = max_response_pages(server); @@ -6303,6 +6304,7 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) .callback_data = lgp, .flags = RPC_TASK_ASYNC, }; + struct pnfs_layout_segment *lseg = NULL; int status = 0; dprintk("--> %s\n", __func__); @@ -6310,7 +6312,7 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); if (!lgp->args.layout.pages) { nfs4_layoutget_release(lgp); - return; + return ERR_PTR(-ENOMEM); } lgp->args.layout.pglen = max_pages * PAGE_SIZE; @@ -6319,15 +6321,17 @@ void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) - return; + return ERR_CAST(task); status = nfs4_wait_for_completion_rpc_task(task); if (status == 0) status = task->tk_status; if (status == 0) - status = pnfs_layout_process(lgp); + lseg = pnfs_layout_process(lgp); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); - return; + if (status) + return ERR_PTR(status); + return lseg; } static void diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2e00feacd4be..3a7ac97020df 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -582,7 +582,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, struct inode *ino = lo->plh_inode; struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; - struct pnfs_layout_segment *lseg = NULL; + struct pnfs_layout_segment *lseg; dprintk("--> %s\n", __func__); @@ -599,16 +599,22 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.type = server->pnfs_curr_ld->id; lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); - lgp->lsegpp = &lseg; lgp->gfp_flags = gfp_flags; /* Synchronously retrieve layout information from server and * store in lseg. */ - nfs4_proc_layoutget(lgp, gfp_flags); - if (!lseg) { - /* remember that LAYOUTGET failed and suspend trying */ - set_bit(lo_fail_bit(range->iomode), &lo->plh_flags); + lseg = nfs4_proc_layoutget(lgp, gfp_flags); + if (IS_ERR(lseg)) { + switch (PTR_ERR(lseg)) { + case -ENOMEM: + case -ERESTARTSYS: + break; + default: + /* remember that LAYOUTGET failed and suspend trying */ + set_bit(lo_fail_bit(range->iomode), &lo->plh_flags); + } + return NULL; } return lseg; @@ -1096,7 +1102,7 @@ out_unlock: } EXPORT_SYMBOL_GPL(pnfs_update_layout); -int +struct pnfs_layout_segment * pnfs_layout_process(struct nfs4_layoutget *lgp) { struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; @@ -1129,7 +1135,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } init_lseg(lo, lseg); lseg->pls_range = res->range; - *lgp->lsegpp = get_lseg(lseg); + get_lseg(lseg); pnfs_insert_layout(lo, lseg); if (res->return_on_close) { @@ -1140,8 +1146,9 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) /* Done processing layoutget. Set the layout stateid */ pnfs_set_layout_stateid(lo, &res->stateid, false); spin_unlock(&ino->i_lock); + return lseg; out: - return status; + return ERR_PTR(status); out_forget_reply: spin_unlock(&ino->i_lock); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 745aa1b39e7c..d51ef888e71b 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -172,7 +172,7 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server, struct pnfs_devicelist *devlist); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev); -extern void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); +extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); /* pnfs.c */ @@ -192,7 +192,7 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); -int pnfs_layout_process(struct nfs4_layoutget *lgp); +struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index be9cf3c7e79e..5da789fdf25b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -251,7 +251,6 @@ struct nfs4_layoutget_res { struct nfs4_layoutget { struct nfs4_layoutget_args args; struct nfs4_layoutget_res res; - struct pnfs_layout_segment **lsegpp; gfp_t gfp_flags; }; From 49a85061b0bc9cb26361096482c81172c666c937 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Sep 2012 20:43:31 -0400 Subject: [PATCH 018/113] NFSv4.1: Cleanup add a "pnfs_" prefix to mark_matching_lsegs_invalid Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 4 ++-- fs/nfs/pnfs.c | 6 +++--- fs/nfs/pnfs.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 1b5d809a105e..57b8bda0f4e9 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -158,7 +158,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, ino = lo->plh_inode; spin_lock(&ino->i_lock); if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || - mark_matching_lsegs_invalid(lo, &free_me_list, + pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &args->cbl_range)) rv = NFS4ERR_DELAY; else @@ -211,7 +211,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, ino = lo->plh_inode; spin_lock(&ino->i_lock); set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - if (mark_matching_lsegs_invalid(lo, &free_me_list, &range)) + if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range)) rv = NFS4ERR_DELAY; list_del_init(&lo->plh_bulk_recall); spin_unlock(&ino->i_lock); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3a7ac97020df..aea2e5256fe4 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -390,7 +390,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, * after call. */ int -mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, +pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, struct pnfs_layout_range *recall_range) { @@ -458,7 +458,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) lo = nfsi->layout; if (lo) { lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ - mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); } spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); @@ -651,7 +651,7 @@ _pnfs_return_layout(struct inode *ino) /* Reference matched in nfs4_layoutreturn_release */ get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); - mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { spin_unlock(&ino->i_lock); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d51ef888e71b..6af518934e4c 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -203,7 +203,7 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, struct nfs4_state *open_state); -int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, +int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, struct pnfs_layout_range *recall_range); bool pnfs_roc(struct inode *ino); From 70c3bd2bdf9a3c7c9282c362a4ec9ec88c713e13 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Sep 2012 20:51:13 -0400 Subject: [PATCH 019/113] NFSv4.1: Cleanup; add "pnfs_" prefix to get_layout_hdr() and put_layout_hdr() Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 8 ++++---- fs/nfs/nfs4proc.c | 2 +- fs/nfs/pnfs.c | 30 +++++++++++++++--------------- fs/nfs/pnfs.h | 4 ++-- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 57b8bda0f4e9..24252fea2c9c 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -122,7 +122,7 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, ino = igrab(lo->plh_inode); if (!ino) continue; - get_layout_hdr(lo); + pnfs_get_layout_hdr(lo); return lo; } } @@ -166,7 +166,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); - put_layout_hdr(lo); + pnfs_put_layout_hdr(lo); iput(ino); return rv; } @@ -198,7 +198,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, list_for_each_entry(lo, &server->layouts, plh_layouts) { if (!igrab(lo->plh_inode)) continue; - get_layout_hdr(lo); + pnfs_get_layout_hdr(lo); BUG_ON(!list_empty(&lo->plh_bulk_recall)); list_add(&lo->plh_bulk_recall, &recall_list); } @@ -216,7 +216,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, list_del_init(&lo->plh_bulk_recall); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); - put_layout_hdr(lo); + pnfs_put_layout_hdr(lo); iput(ino); } return rv; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1c8656f8745c..bdacb8c21a33 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6375,7 +6375,7 @@ static void nfs4_layoutreturn_release(void *calldata) struct nfs4_layoutreturn *lrp = calldata; dprintk("--> %s\n", __func__); - put_layout_hdr(lrp->args.layout); + pnfs_put_layout_hdr(lrp->args.layout); kfree(calldata); dprintk("<-- %s\n", __func__); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index aea2e5256fe4..512c8632bf36 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -190,7 +190,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); /* Need to hold i_lock if caller does not already hold reference */ void -get_layout_hdr(struct pnfs_layout_hdr *lo) +pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo) { atomic_inc(&lo->plh_refcount); } @@ -221,14 +221,14 @@ destroy_layout_hdr(struct pnfs_layout_hdr *lo) } static void -put_layout_hdr_locked(struct pnfs_layout_hdr *lo) +pnfs_put_layout_hdr_locked(struct pnfs_layout_hdr *lo) { if (atomic_dec_and_test(&lo->plh_refcount)) destroy_layout_hdr(lo); } void -put_layout_hdr(struct pnfs_layout_hdr *lo) +pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { struct inode *inode = lo->plh_inode; @@ -254,8 +254,8 @@ static void free_lseg(struct pnfs_layout_segment *lseg) struct inode *ino = lseg->pls_layout->plh_inode; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - /* Matched by get_layout_hdr in pnfs_insert_layout */ - put_layout_hdr(NFS_I(ino)->layout); + /* Matched by pnfs_get_layout_hdr in pnfs_insert_layout */ + pnfs_put_layout_hdr(NFS_I(ino)->layout); } static void @@ -268,7 +268,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg) if (list_empty(&lseg->pls_layout->plh_segs)) { set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); /* Matched by initial refcount set in alloc_init_layout_hdr */ - put_layout_hdr_locked(lseg->pls_layout); + pnfs_put_layout_hdr_locked(lseg->pls_layout); } rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } @@ -404,7 +404,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, NFS_I(lo->plh_inode)->write_io = 0; NFS_I(lo->plh_inode)->read_io = 0; if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) - put_layout_hdr_locked(lo); + pnfs_put_layout_hdr_locked(lo); return 0; } list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) @@ -649,13 +649,13 @@ _pnfs_return_layout(struct inode *ino) } stateid = nfsi->layout->plh_stateid; /* Reference matched in nfs4_layoutreturn_release */ - get_layout_hdr(lo); + pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { spin_unlock(&ino->i_lock); - put_layout_hdr(lo); + pnfs_put_layout_hdr(lo); dprintk("NFS: %s no layout segments to return\n", __func__); goto out; } @@ -672,7 +672,7 @@ _pnfs_return_layout(struct inode *ino) set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags); set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags); pnfs_clear_layout_returned(lo); - put_layout_hdr(lo); + pnfs_put_layout_hdr(lo); goto out; } @@ -709,7 +709,7 @@ bool pnfs_roc(struct inode *ino) if (!found) goto out_nolayout; lo->plh_block_lgets++; - get_layout_hdr(lo); /* matched in pnfs_roc_release */ + pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); return true; @@ -726,7 +726,7 @@ void pnfs_roc_release(struct inode *ino) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; lo->plh_block_lgets--; - put_layout_hdr_locked(lo); + pnfs_put_layout_hdr_locked(lo); spin_unlock(&ino->i_lock); } @@ -819,7 +819,7 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo, __func__, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); out: - get_layout_hdr(lo); + pnfs_get_layout_hdr(lo); dprintk("%s:Return\n", __func__); } @@ -1058,7 +1058,7 @@ pnfs_update_layout(struct inode *ino, goto out_unlock; atomic_inc(&lo->plh_outstanding); - get_layout_hdr(lo); + pnfs_get_layout_hdr(lo); if (list_empty(&lo->plh_segs)) first = true; @@ -1091,7 +1091,7 @@ pnfs_update_layout(struct inode *ino, spin_unlock(&clp->cl_lock); } atomic_dec(&lo->plh_outstanding); - put_layout_hdr(lo); + pnfs_put_layout_hdr(lo); out: dprintk("%s end, state 0x%lx lseg %p\n", __func__, nfsi->layout ? nfsi->layout->plh_flags : -1, lseg); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 6af518934e4c..2af681f0a491 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -176,7 +176,7 @@ extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lg extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); /* pnfs.c */ -void get_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void put_lseg(struct pnfs_layout_segment *lseg); void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, @@ -196,7 +196,7 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); -void put_layout_hdr(struct pnfs_layout_hdr *lo); +void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier); From 9369a431bce1e985597eda32992960c969b27c5b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Sep 2012 20:57:08 -0400 Subject: [PATCH 020/113] NFSv4.1: Cleanup; add "pnfs_" prefix to put_lseg() and get_lseg() Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 16 ++++++++-------- fs/nfs/nfs4proc.c | 2 +- fs/nfs/pnfs.c | 36 ++++++++++++++++++------------------ fs/nfs/pnfs.h | 8 ++++---- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 53f94d915bd1..77cd1151ef0d 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -453,7 +453,7 @@ static void filelayout_commit_release(void *calldata) struct nfs_commit_data *data = calldata; data->completion_ops->completion(data); - put_lseg(data->lseg); + pnfs_put_lseg(data->lseg); nfs_put_client(data->ds_clp); nfs_commitdata_release(data); } @@ -931,7 +931,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); if (status < 0) { - put_lseg(pgio->pg_lseg); + pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; goto out_mds; } @@ -985,7 +985,7 @@ filelayout_clear_request_commit(struct nfs_page *req, out: nfs_request_remove_commit_list(req, cinfo); spin_unlock(cinfo->lock); - put_lseg(freeme); + pnfs_put_lseg(freeme); } static struct list_head * @@ -1018,7 +1018,7 @@ filelayout_choose_commit_list(struct nfs_page *req, * off due to a rewrite, in which case it will be done in * filelayout_clear_request_commit */ - buckets[i].wlseg = get_lseg(lseg); + buckets[i].wlseg = pnfs_get_lseg(lseg); } set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; @@ -1128,7 +1128,7 @@ filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, if (list_empty(src)) bucket->wlseg = NULL; else - get_lseg(bucket->clseg); + pnfs_get_lseg(bucket->clseg); } return ret; } @@ -1159,12 +1159,12 @@ static void filelayout_recover_commit_reqs(struct list_head *dst, /* NOTE cinfo->lock is NOT held, relying on fact that this is * only called on single thread per dreq. - * Can't take the lock because need to do put_lseg + * Can't take the lock because need to do pnfs_put_lseg */ for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (transfer_commit_list(&b->written, dst, cinfo, 0)) { BUG_ON(!list_empty(&b->written)); - put_lseg(b->wlseg); + pnfs_put_lseg(b->wlseg); b->wlseg = NULL; } } @@ -1200,7 +1200,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) if (list_empty(&bucket->committing)) continue; nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); - put_lseg(bucket->clseg); + pnfs_put_lseg(bucket->clseg); bucket->clseg = NULL; } /* Caller will clean up entries put on list */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bdacb8c21a33..e605d417a006 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6549,7 +6549,7 @@ static void nfs4_layoutcommit_release(void *calldata) list_del_init(&lseg->pls_lc_list); if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) - put_lseg(lseg); + pnfs_put_lseg(lseg); } clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 512c8632bf36..498af8779959 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -259,7 +259,7 @@ static void free_lseg(struct pnfs_layout_segment *lseg) } static void -put_lseg_common(struct pnfs_layout_segment *lseg) +pnfs_put_lseg_common(struct pnfs_layout_segment *lseg) { struct inode *inode = lseg->pls_layout->plh_inode; @@ -274,7 +274,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg) } void -put_lseg(struct pnfs_layout_segment *lseg) +pnfs_put_lseg(struct pnfs_layout_segment *lseg) { struct inode *inode; @@ -288,13 +288,13 @@ put_lseg(struct pnfs_layout_segment *lseg) if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { LIST_HEAD(free_me); - put_lseg_common(lseg); + pnfs_put_lseg_common(lseg); list_add(&lseg->pls_list, &free_me); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&free_me); } } -EXPORT_SYMBOL_GPL(put_lseg); +EXPORT_SYMBOL_GPL(pnfs_put_lseg); static inline u64 end_offset(u64 start, u64 len) @@ -378,7 +378,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, dprintk("%s: lseg %p ref %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount)); if (atomic_dec_and_test(&lseg->pls_refcount)) { - put_lseg_common(lseg); + pnfs_put_lseg_common(lseg); list_add(&lseg->pls_list, tmp_list); rv = 1; } @@ -914,7 +914,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && is_matching_lseg(&lseg->pls_range, range)) { - ret = get_lseg(lseg); + ret = pnfs_get_lseg(lseg); break; } if (lseg->pls_range.offset > range->offset) @@ -1135,7 +1135,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } init_lseg(lo, lseg); lseg->pls_range = res->range; - get_lseg(lseg); + pnfs_get_lseg(lseg); pnfs_insert_layout(lo, lseg); if (res->return_on_close) { @@ -1369,12 +1369,12 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he if (trypnfs == PNFS_NOT_ATTEMPTED) pnfs_write_through_mds(desc, data); } - put_lseg(lseg); + pnfs_put_lseg(lseg); } static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) { - put_lseg(hdr->lseg); + pnfs_put_lseg(hdr->lseg); nfs_writehdr_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_writehdr_free); @@ -1389,17 +1389,17 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) whdr = nfs_writehdr_alloc(); if (!whdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); - put_lseg(desc->pg_lseg); + pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; return -ENOMEM; } hdr = &whdr->header; nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); - hdr->lseg = get_lseg(desc->pg_lseg); + hdr->lseg = pnfs_get_lseg(desc->pg_lseg); atomic_inc(&hdr->refcnt); ret = nfs_generic_flush(desc, hdr); if (ret != 0) { - put_lseg(desc->pg_lseg); + pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags); @@ -1524,12 +1524,12 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea if (trypnfs == PNFS_NOT_ATTEMPTED) pnfs_read_through_mds(desc, data); } - put_lseg(lseg); + pnfs_put_lseg(lseg); } static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) { - put_lseg(hdr->lseg); + pnfs_put_lseg(hdr->lseg); nfs_readhdr_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_readhdr_free); @@ -1545,17 +1545,17 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) if (!rhdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); ret = -ENOMEM; - put_lseg(desc->pg_lseg); + pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; return ret; } hdr = &rhdr->header; nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); - hdr->lseg = get_lseg(desc->pg_lseg); + hdr->lseg = pnfs_get_lseg(desc->pg_lseg); atomic_inc(&hdr->refcnt); ret = nfs_generic_pagein(desc, hdr); if (ret != 0) { - put_lseg(desc->pg_lseg); + pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else pnfs_do_multiple_reads(desc, &hdr->rpc_list); @@ -1608,7 +1608,7 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata) } if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { /* references matched in nfs4_layoutcommit_release */ - get_lseg(hdr->lseg); + pnfs_get_lseg(hdr->lseg); } if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 2af681f0a491..04958797fad0 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -177,7 +177,7 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); /* pnfs.c */ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); -void put_lseg(struct pnfs_layout_segment *lseg); +void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, const struct nfs_pgio_completion_ops *); @@ -281,7 +281,7 @@ static inline int lo_fail_bit(u32 iomode) } static inline struct pnfs_layout_segment * -get_lseg(struct pnfs_layout_segment *lseg) +pnfs_get_lseg(struct pnfs_layout_segment *lseg) { if (lseg) { atomic_inc(&lseg->pls_refcount); @@ -406,12 +406,12 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) } static inline struct pnfs_layout_segment * -get_lseg(struct pnfs_layout_segment *lseg) +pnfs_get_lseg(struct pnfs_layout_segment *lseg) { return NULL; } -static inline void put_lseg(struct pnfs_layout_segment *lseg) +static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg) { } From 78e4e05c643768af170e5a4b21712d9a7a26cce5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Sep 2012 21:02:29 -0400 Subject: [PATCH 021/113] NFSv4.1: Replace get_device_info() with filelayout_get_device_info() Fix the namespace pollution issue. Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 2 +- fs/nfs/nfs4filelayout.c | 2 +- fs/nfs/nfs4filelayout.h | 2 +- fs/nfs/nfs4filelayoutdev.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index dd392ed5f2e2..329bfbfed37d 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -874,7 +874,7 @@ static void free_blk_mountid(struct block_mount_id *mid) } } -/* This is mostly copied from the filelayout's get_device_info function. +/* This is mostly copied from the filelayout_get_device_info function. * It seems much of this should be at the generic pnfs level. */ static struct pnfs_block_dev * diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 77cd1151ef0d..af6ee4ad3f15 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -608,7 +608,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, NFS_SERVER(lo->plh_inode)->nfs_client, id); if (d == NULL) { - dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); + dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags); if (dsaddr == NULL) goto out; } else diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 43fe802dd678..11053c425a61 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -158,7 +158,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); struct nfs4_file_layout_dsaddr * -get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); +filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); void nfs4_ds_disconnect(struct nfs_client *clp); #endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index f81231f30d94..b85a29df20ae 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -690,7 +690,7 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl * of available devices, and return it. */ struct nfs4_file_layout_dsaddr * -get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) +filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) { struct pnfs_device *pdev = NULL; u32 max_resp_sz; From f86bbcf85db32596a0484477d1b8042005709049 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 26 Sep 2012 11:21:40 -0400 Subject: [PATCH 022/113] NFSv4.1: Replace dprintk() in pnfs_update_layout with something less buggy Dereferencing nfsi->layout in order to read plh_flags without holding a spin lock is bug prone. Furthermore, the dprintk() tells you nothing about whether or not the call succeeded. Replace it with something that tells you about whether or not a valid layout segment was returned for the inode in question. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 498af8779959..df45acaf91f7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1027,17 +1027,15 @@ pnfs_update_layout(struct inode *ino, bool first = false; if (!pnfs_enabled_sb(NFS_SERVER(ino))) - return NULL; + goto out; if (pnfs_within_mdsthreshold(ctx, ino, iomode)) - return NULL; + goto out; spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); - if (lo == NULL) { - dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); + if (lo == NULL) goto out_unlock; - } /* Do we even need to bother with this? */ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { @@ -1093,8 +1091,14 @@ pnfs_update_layout(struct inode *ino, atomic_dec(&lo->plh_outstanding); pnfs_put_layout_hdr(lo); out: - dprintk("%s end, state 0x%lx lseg %p\n", __func__, - nfsi->layout ? nfsi->layout->plh_flags : -1, lseg); + dprintk("%s: inode %s/%llu pNFS layout segment %s for " + "(%s, offset: %llu, length: %llu)\n", + __func__, ino->i_sb->s_id, + (unsigned long long)NFS_FILEID(ino), + lseg == NULL ? "not found" : "found", + iomode==IOMODE_RW ? "read/write" : "read-only", + (unsigned long long)pos, + (unsigned long long)count); return lseg; out_unlock: spin_unlock(&ino->i_lock); From b9e028fd89d6834558aa2a5bb30e5cff5c6c1059 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Sep 2012 16:41:18 -0400 Subject: [PATCH 023/113] NFSv4.1: Add helpers for setting/reading the I/O fail bit ...and make them local to the pnfs.c file. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 38 ++++++++++++++++++++++++++------------ fs/nfs/pnfs.h | 6 ------ 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index df45acaf91f7..f46f9bc4f767 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -238,6 +238,27 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) } } +static int +pnfs_iomode_to_fail_bit(u32 iomode) +{ + return iomode == IOMODE_RW ? + NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; +} + +static void +pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) +{ + set_bit(pnfs_iomode_to_fail_bit(iomode), &lo->plh_flags); + dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, + iomode == IOMODE_RW ? "RW" : "READ"); +} + +static bool +pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) +{ + return test_bit(pnfs_iomode_to_fail_bit(iomode), &lo->plh_flags) != 0; +} + static void init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) { @@ -612,7 +633,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, break; default: /* remember that LAYOUTGET failed and suspend trying */ - set_bit(lo_fail_bit(range->iomode), &lo->plh_flags); + pnfs_layout_io_set_failed(lo, range->iomode); } return NULL; } @@ -669,8 +690,8 @@ _pnfs_return_layout(struct inode *ino) lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; - set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags); - set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags); + pnfs_layout_io_set_failed(lo, IOMODE_RW); + pnfs_layout_io_set_failed(lo, IOMODE_READ); pnfs_clear_layout_returned(lo); pnfs_put_layout_hdr(lo); goto out; @@ -1019,7 +1040,6 @@ pnfs_update_layout(struct inode *ino, .length = count, }; unsigned pg_offset; - struct nfs_inode *nfsi = NFS_I(ino); struct nfs_server *server = NFS_SERVER(ino); struct nfs_client *clp = server->nfs_client; struct pnfs_layout_hdr *lo; @@ -1044,7 +1064,7 @@ pnfs_update_layout(struct inode *ino, } /* if LAYOUTGET already failed once we don't try again */ - if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) + if (pnfs_layout_io_test_failed(lo, iomode)) goto out_unlock; /* Check to see if the layout for the given range already exists */ @@ -1585,13 +1605,7 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) { - if (lseg->pls_range.iomode == IOMODE_RW) { - dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - } else { - dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); - } + pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); } EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 04958797fad0..e3eb7d1b17a8 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -274,12 +274,6 @@ pnfs_test_layout_returned(struct pnfs_layout_hdr *lo) return test_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags); } -static inline int lo_fail_bit(u32 iomode) -{ - return iomode == IOMODE_RW ? - NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; -} - static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { From 25c7533357a4c4a9311d40cc92e9648c8a7e763e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Sep 2012 17:01:12 -0400 Subject: [PATCH 024/113] NFSv4.1: Retry pNFS after a 2 minute timeout If we had to fall back to read/write through MDS, then assume that we should retry pNFS after a suitable timeout period. The following patch sets a timeout of 2 minutes. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 15 ++++++++++++++- fs/nfs/pnfs.h | 1 + 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index f46f9bc4f767..2c59da5511db 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -35,6 +35,7 @@ #include "iostat.h" #define NFSDBG_FACILITY NFSDBG_PNFS +#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) /* Locking: * @@ -248,6 +249,7 @@ pnfs_iomode_to_fail_bit(u32 iomode) static void pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) { + lo->plh_retry_timestamp = jiffies; set_bit(pnfs_iomode_to_fail_bit(iomode), &lo->plh_flags); dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, iomode == IOMODE_RW ? "RW" : "READ"); @@ -256,7 +258,18 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) static bool pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) { - return test_bit(pnfs_iomode_to_fail_bit(iomode), &lo->plh_flags) != 0; + unsigned long start, end; + if (test_bit(pnfs_iomode_to_fail_bit(iomode), &lo->plh_flags) == 0) + return false; + end = jiffies; + start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT; + if (!time_in_range(lo->plh_retry_timestamp, start, end)) { + /* It is time to retry the failed layoutgets */ + clear_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags); + clear_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags); + return false; + } + return true; } static void diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index e3eb7d1b17a8..bc8e5001203d 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -140,6 +140,7 @@ struct pnfs_layout_hdr { atomic_t plh_outstanding; /* number of RPCs out */ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ u32 plh_barrier; /* ignore lower seqids */ + unsigned long plh_retry_timestamp; unsigned long plh_flags; loff_t plh_lwb; /* last write byte for layoutcommit */ struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ From 1dfed2737d8cfe2f2378fddfb3bed126ff5474e7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Sep 2012 19:51:12 -0400 Subject: [PATCH 025/113] NFSv4.1: pNFS data servers may be temporarily offline In cases where the pNFS data server is just temporarily out of service, we want to mark it as such, and then try again later. Typically that will be in cases of network connection errors etc. This patch allows us to mark the devices as being "unavailable" for such transient errors, and will make them available for retries after a 2 minute timeout period. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 22 +++++++++++++++++++--- fs/nfs/nfs4filelayout.h | 8 ++------ fs/nfs/nfs4filelayoutdev.c | 15 +++++++-------- fs/nfs/pnfs.h | 4 ++++ fs/nfs/pnfs_dev.c | 27 +++++++++++++++++++++++++++ 5 files changed, 59 insertions(+), 17 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index af6ee4ad3f15..dac2162c3ac4 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -205,7 +205,7 @@ static int filelayout_async_handle_error(struct rpc_task *task, case -EPIPE: dprintk("%s DS connection error %d\n", __func__, task->tk_status); - filelayout_mark_devid_invalid(devid); + nfs4_mark_deviceid_unavailable(devid); clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags); _pnfs_return_layout(inode); rpc_wake_up(&tbl->slot_tbl_waitq); @@ -269,6 +269,22 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata) (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } +bool +filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node) +{ + return filelayout_test_devid_invalid(node) || + nfs4_test_deviceid_unavailable(node); +} + +static bool +filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) +{ + struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg); + + return filelayout_test_layout_invalid(lseg->pls_layout) || + filelayout_test_devid_unavailable(node); +} + /* * Call ops for the async read/write cases * In the case of dense layouts, the offset needs to be reset to its @@ -613,8 +629,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out; } else dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); - /* Found deviceid is being reaped */ - if (test_bit(NFS_DEVICEID_INVALID, &dsaddr->id_node.flags)) + /* Found deviceid is unavailable */ + if (filelayout_test_devid_unavailable(&dsaddr->id_node)) goto out_put; fl->dsaddr = dsaddr; diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 11053c425a61..10b0f134400b 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -140,12 +140,8 @@ filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) return test_bit(NFS_DEVICEID_INVALID, &node->flags); } -static inline bool -filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) -{ - return filelayout_test_devid_invalid(FILELAYOUT_DEVID_NODE(lseg)) || - filelayout_test_layout_invalid(lseg->pls_layout); -} +extern bool +filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node); extern struct nfs_fh * nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index b85a29df20ae..3336d5eaf879 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -804,13 +804,14 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); - if (filelayout_test_devid_invalid(devid)) + if (filelayout_test_devid_unavailable(devid)) return NULL; if (ds == NULL) { printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", __func__, ds_idx); - goto mark_dev_invalid; + filelayout_mark_devid_invalid(devid); + return NULL; } if (!ds->ds_clp) { @@ -818,14 +819,12 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) int err; err = nfs4_ds_connect(s, ds); - if (err) - goto mark_dev_invalid; + if (err) { + nfs4_mark_deviceid_unavailable(devid); + return NULL; + } } return ds; - -mark_dev_invalid: - filelayout_mark_devid_invalid(devid); - return NULL; } module_param(dataserver_retrans, uint, 0644); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index bc8e5001203d..9735031e1e1a 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -234,6 +234,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); /* nfs4_deviceid_flags */ enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ + NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */ }; /* pnfs_dev.c */ @@ -243,6 +244,7 @@ struct nfs4_deviceid_node { const struct pnfs_layoutdriver_type *ld; const struct nfs_client *nfs_client; unsigned long flags; + unsigned long timestamp_unavailable; struct nfs4_deviceid deviceid; atomic_t ref; }; @@ -255,6 +257,8 @@ void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, const struct nfs4_deviceid *); struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); +void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); +bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); static inline void diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 73f701f1f4d3..d35b62e83ea6 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -40,6 +40,8 @@ #define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) #define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) +#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) + static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; static DEFINE_SPINLOCK(nfs4_deviceid_lock); @@ -218,6 +220,30 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) } EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); +void +nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node) +{ + node->timestamp_unavailable = jiffies; + set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); +} +EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable); + +bool +nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node) +{ + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { + unsigned long start, end; + + end = jiffies; + start = end - PNFS_DEVICE_RETRY_TIMEOUT; + if (time_in_range(node->timestamp_unavailable, start, end)) + return true; + clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + } + return false; +} +EXPORT_SYMBOL_GPL(nfs4_test_deviceid_unavailable); + static void _deviceid_purge_client(const struct nfs_client *clp, long hash) { @@ -276,3 +302,4 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) } rcu_read_unlock(); } + From 830ffb565760234eb984e4343ad82575e96728de Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 21:25:19 -0400 Subject: [PATCH 026/113] NFSv4.1: Fix a reference leak in pnfs_update_layout If we exit after the call to pnfs_find_alloc_layout(), we have to ensure that we put the struct pnfs_layout_hdr. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2c59da5511db..d7a8f03e7295 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1067,8 +1067,10 @@ pnfs_update_layout(struct inode *ino, spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); - if (lo == NULL) - goto out_unlock; + if (lo == NULL) { + spin_unlock(&ino->i_lock); + goto out; + } /* Do we even need to bother with this? */ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { @@ -1122,6 +1124,7 @@ pnfs_update_layout(struct inode *ino, spin_unlock(&clp->cl_lock); } atomic_dec(&lo->plh_outstanding); +out_put_layout_hdr: pnfs_put_layout_hdr(lo); out: dprintk("%s: inode %s/%llu pNFS layout segment %s for " @@ -1135,7 +1138,7 @@ out: return lseg; out_unlock: spin_unlock(&ino->i_lock); - goto out; + goto out_put_layout_hdr; } EXPORT_SYMBOL_GPL(pnfs_update_layout); From 3e6212149304eaf9289d5bc56e003068660f3476 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 24 Sep 2012 13:07:16 -0400 Subject: [PATCH 027/113] NFSv4.1: Don't drop the pnfs_layout_hdr after a layoutget failure We want to cache the pnfs_layout_hdr after a layoutget or i/o failure so that pnfs_update_layout() can find it and know when it is time to retry. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d7a8f03e7295..6834fa1be571 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -247,10 +247,28 @@ pnfs_iomode_to_fail_bit(u32 iomode) } static void -pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) +pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { lo->plh_retry_timestamp = jiffies; - set_bit(pnfs_iomode_to_fail_bit(iomode), &lo->plh_flags); + if (test_and_set_bit(fail_bit, &lo->plh_flags)) + atomic_inc(&lo->plh_refcount); +} + +static void +pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) +{ + if (test_and_clear_bit(fail_bit, &lo->plh_flags)) + atomic_dec(&lo->plh_refcount); +} + +static void +pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) +{ + struct inode *inode = lo->plh_inode; + + spin_lock(&inode->i_lock); + pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + spin_unlock(&inode->i_lock); dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, iomode == IOMODE_RW ? "RW" : "READ"); } @@ -259,14 +277,15 @@ static bool pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) { unsigned long start, end; - if (test_bit(pnfs_iomode_to_fail_bit(iomode), &lo->plh_flags) == 0) + int fail_bit = pnfs_iomode_to_fail_bit(iomode); + + if (test_bit(fail_bit, &lo->plh_flags) == 0) return false; end = jiffies; start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT; if (!time_in_range(lo->plh_retry_timestamp, start, end)) { /* It is time to retry the failed layoutgets */ - clear_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags); - clear_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags); + pnfs_layout_clear_fail_bit(lo, fail_bit); return false; } return true; @@ -493,9 +512,14 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) if (lo) { lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); - } - spin_unlock(&nfsi->vfs_inode.i_lock); - pnfs_free_lseg_list(&tmp_list); + pnfs_get_layout_hdr(lo); + pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); + pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); + spin_unlock(&nfsi->vfs_inode.i_lock); + pnfs_free_lseg_list(&tmp_list); + pnfs_put_layout_hdr(lo); + } else + spin_unlock(&nfsi->vfs_inode.i_lock); } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); From 115ce575cb10918514d053ef15f597a4e6ff60e9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 21:19:43 -0400 Subject: [PATCH 028/113] NFSv4.1: pnfs_layout_io_set_failed must clear invalid lsegs If pnfs_layout_io_test_failed() authorises a retry of the failed layoutgets, we should clear the existing layout segments so that we start afresh. Do this in pnfs_layout_io_set_failed(). Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6834fa1be571..9ee3bd705b94 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -265,10 +265,18 @@ static void pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) { struct inode *inode = lo->plh_inode; + struct pnfs_layout_range range = { + .iomode = iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(head); spin_lock(&inode->i_lock); pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + pnfs_mark_matching_lsegs_invalid(lo, &head, &range); spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, iomode == IOMODE_RW ? "RW" : "READ"); } From 7fdab069b7172f2348cf3d87e19c6c24340292bf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 20:15:57 -0400 Subject: [PATCH 029/113] NFSv4.1: Fix a race in the pNFS return-on-close code If we sleep after dropping the inode->i_lock, then we are no longer atomic with respect to the rpc_wake_up() call in pnfs_layout_remove_lseg(). Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 8 +++----- fs/nfs/pnfs.c | 22 ++++++++++++---------- fs/nfs/pnfs.h | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e605d417a006..6d5750cabd8d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2137,6 +2137,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) { struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; + struct inode *inode = calldata->inode; int call_close = 0; dprintk("%s: begin!\n", __func__); @@ -2170,16 +2171,13 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; if (calldata->roc && - pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) { - rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq, - task, NULL); + pnfs_roc_drain(inode, &calldata->roc_barrier, task)) goto out; - } } nfs_fattr_init(calldata->res.fattr); calldata->timestamp = jiffies; - if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), + if (nfs4_setup_sequence(NFS_SERVER(inode), &calldata->arg.seq_args, &calldata->res.seq_res, task)) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 9ee3bd705b94..8b32f8745337 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -807,27 +807,29 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) spin_unlock(&ino->i_lock); } -bool pnfs_roc_drain(struct inode *ino, u32 *barrier) +bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) { struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg; + u32 current_seqid; bool found = false; spin_lock(&ino->i_lock); list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); found = true; - break; + goto out; } - if (!found) { - struct pnfs_layout_hdr *lo = nfsi->layout; - u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid); + lo = nfsi->layout; + current_seqid = be32_to_cpu(lo->plh_stateid.seqid); - /* Since close does not return a layout stateid for use as - * a barrier, we choose the worst-case barrier. - */ - *barrier = current_seqid + atomic_read(&lo->plh_outstanding); - } + /* Since close does not return a layout stateid for use as + * a barrier, we choose the worst-case barrier. + */ + *barrier = current_seqid + atomic_read(&lo->plh_outstanding); +out: spin_unlock(&ino->i_lock); return found; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 9735031e1e1a..aa9fa1b1ff4a 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -210,7 +210,7 @@ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); -bool pnfs_roc_drain(struct inode *ino, u32 *barrier); +bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); @@ -442,7 +442,7 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier) } static inline bool -pnfs_roc_drain(struct inode *ino, u32 *barrier) +pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) { return false; } From 1f7977c1368afc483908281daaffd31bca5a8d1e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 20:31:51 -0400 Subject: [PATCH 030/113] NFSv4.1: Simplify the pNFS return-on-close code Confine it to the nfs4_do_close() code. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4proc.c | 6 ++---- fs/nfs/nfs4state.c | 7 ++----- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 71d407fd00a5..9cacc131a8a4 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -223,7 +223,7 @@ extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); extern int nfs4_destroy_clientid(struct nfs_client *clp); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); -extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); +extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, struct nfs4_fs_locations *, struct page *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6d5750cabd8d..8de0435caed9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2204,7 +2204,7 @@ static const struct rpc_call_ops nfs4_close_ops = { * * NOTE: Caller must be holding the sp->so_owner semaphore! */ -int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) +int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_closedata *calldata; @@ -2240,7 +2240,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; - calldata->roc = roc; + calldata->roc = pnfs_roc(state->inode); nfs_sb_active(calldata->inode->i_sb); msg.rpc_argp = &calldata->arg; @@ -2257,8 +2257,6 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) out_free_calldata: kfree(calldata); out: - if (roc) - pnfs_roc_release(state->inode); nfs4_put_open_state(state); nfs4_put_state_owner(sp); return status; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index fc6cfe68ad1d..a5331ec094a6 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -729,11 +729,8 @@ static void __nfs4_close(struct nfs4_state *state, if (!call_close) { nfs4_put_open_state(state); nfs4_put_state_owner(owner); - } else { - bool roc = pnfs_roc(state->inode); - - nfs4_do_close(state, gfp_mask, wait, roc); - } + } else + nfs4_do_close(state, gfp_mask, wait); } void nfs4_close_state(struct nfs4_state *state, fmode_t fmode) From 965938b83b19aeffdc1d16ce9947c8c127e8f59b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 15:07:45 -0400 Subject: [PATCH 031/113] NFSv4.1: Get rid of pNFS layout state "NFS_LAYOUT_INVALID" In all cases where we set NFS_LAYOUT_INVALID, we also set NFS_LAYOUT_DESTROYED. Furthermore, in all cases where we test for NFS_LAYOUT_INVALID, we should also be testing for NFS_LAYOUT_DESTROYED, since the latter means that we hold no valid layout segments. Ergo the two are redundant. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 4 +--- fs/nfs/nfs4filelayout.h | 6 ------ fs/nfs/pnfs.h | 7 ++++++- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index dac2162c3ac4..6cce57e7fe55 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -190,8 +190,6 @@ static int filelayout_async_handle_error(struct rpc_task *task, * i/o and all i/o waiting on the slot table to the MDS until * layout is destroyed and a new valid layout is obtained. */ - set_bit(NFS_LAYOUT_INVALID, - &NFS_I(inode)->layout->plh_flags); pnfs_destroy_layout(NFS_I(inode)); rpc_wake_up(&tbl->slot_tbl_waitq); goto reset; @@ -281,7 +279,7 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) { struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg); - return filelayout_test_layout_invalid(lseg->pls_layout) || + return pnfs_test_layout_destroyed(lseg->pls_layout) || filelayout_test_devid_unavailable(node); } diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 10b0f134400b..dca47d786710 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -128,12 +128,6 @@ filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node) set_bit(NFS_DEVICEID_INVALID, &node->flags); } -static inline bool -filelayout_test_layout_invalid(struct pnfs_layout_hdr *lo) -{ - return test_bit(NFS_LAYOUT_INVALID, &lo->plh_flags); -} - static inline bool filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index aa9fa1b1ff4a..aacda7fbb536 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -63,7 +63,6 @@ enum { NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ NFS_LAYOUT_ROC, /* some lseg had roc bit set */ NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ - NFS_LAYOUT_INVALID, /* layout is being destroyed */ NFS_LAYOUT_RETURNED, /* layout has already been returned */ }; @@ -279,6 +278,12 @@ pnfs_test_layout_returned(struct pnfs_layout_hdr *lo) return test_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags); } +static inline bool +pnfs_test_layout_destroyed(struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags); +} + static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { From bb346f63976823c2959b0c5917928f12cbf96e4a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 15:52:13 -0400 Subject: [PATCH 032/113] NFSv4.1: reset the inode MDS threshold counters on layout destruction Instead of resetting the inode MDS threshold counters when we mark the layout for destruction, do it as part of freeing the layout. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8b32f8745337..ac94fb86fd19 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -215,9 +215,13 @@ pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) static void destroy_layout_hdr(struct pnfs_layout_hdr *lo) { + struct nfs_inode *nfsi = NFS_I(lo->plh_inode); dprintk("%s: freeing layout cache %p\n", __func__, lo); BUG_ON(!list_empty(&lo->plh_layouts)); - NFS_I(lo->plh_inode)->layout = NULL; + nfsi->layout = NULL; + /* Reset MDS Threshold I/O counters */ + nfsi->write_io = 0; + nfsi->read_io = 0; pnfs_free_layout_hdr(lo); } @@ -461,9 +465,6 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, dprintk("%s:Begin lo %p\n", __func__, lo); if (list_empty(&lo->plh_segs)) { - /* Reset MDS Threshold I/O counters */ - NFS_I(lo->plh_inode)->write_io = 0; - NFS_I(lo->plh_inode)->read_io = 0; if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) pnfs_put_layout_hdr_locked(lo); return 0; From 57036a377600ec0900b13f29814aa19072ad3e52 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 16:33:30 -0400 Subject: [PATCH 033/113] NFSv4.1: Rename the pnfs_put_lseg_common to pnfs_layout_remove_lseg The latter name is more descriptive of the actual function. Also rename pnfs_insert_layout to pnfs_layout_insert_lseg. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ac94fb86fd19..33273b3a330f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -319,21 +319,22 @@ static void free_lseg(struct pnfs_layout_segment *lseg) struct inode *ino = lseg->pls_layout->plh_inode; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - /* Matched by pnfs_get_layout_hdr in pnfs_insert_layout */ + /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ pnfs_put_layout_hdr(NFS_I(ino)->layout); } static void -pnfs_put_lseg_common(struct pnfs_layout_segment *lseg) +pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) { - struct inode *inode = lseg->pls_layout->plh_inode; + struct inode *inode = lo->plh_inode; WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del_init(&lseg->pls_list); - if (list_empty(&lseg->pls_layout->plh_segs)) { - set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); + if (list_empty(&lo->plh_segs)) { + set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags); /* Matched by initial refcount set in alloc_init_layout_hdr */ - pnfs_put_layout_hdr_locked(lseg->pls_layout); + pnfs_put_layout_hdr_locked(lo); } rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } @@ -341,6 +342,7 @@ pnfs_put_lseg_common(struct pnfs_layout_segment *lseg) void pnfs_put_lseg(struct pnfs_layout_segment *lseg) { + struct pnfs_layout_hdr *lo; struct inode *inode; if (!lseg) @@ -349,13 +351,14 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - inode = lseg->pls_layout->plh_inode; + lo = lseg->pls_layout; + inode = lo->plh_inode; if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { LIST_HEAD(free_me); - pnfs_put_lseg_common(lseg); - list_add(&lseg->pls_list, &free_me); + pnfs_layout_remove_lseg(lo, lseg); spin_unlock(&inode->i_lock); + list_add(&lseg->pls_list, &free_me); pnfs_free_lseg_list(&free_me); } } @@ -443,7 +446,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, dprintk("%s: lseg %p ref %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount)); if (atomic_dec_and_test(&lseg->pls_refcount)) { - pnfs_put_lseg_common(lseg); + pnfs_layout_remove_lseg(lseg->pls_layout, lseg); list_add(&lseg->pls_list, tmp_list); rv = 1; } @@ -861,7 +864,7 @@ cmp_layout(struct pnfs_layout_range *l1, } static void -pnfs_insert_layout(struct pnfs_layout_hdr *lo, +pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) { struct pnfs_layout_segment *lp; @@ -1211,7 +1214,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) init_lseg(lo, lseg); lseg->pls_range = res->range; pnfs_get_lseg(lseg); - pnfs_insert_layout(lo, lseg); + pnfs_layout_insert_lseg(lo, lseg); if (res->return_on_close) { set_bit(NFS_LSEG_ROC, &lseg->pls_flags); From 01d39ce82b565961abaf1930f54ccf7b32c96b15 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 17:02:32 -0400 Subject: [PATCH 034/113] NFSv4.1: Remove redundant reference to the pnfs_layout_hdr Each layout segment already holds a reference to the pnfs_layout_hdr, so there is no need to hold an extra reference that is released once the last layout segment is freed. Ensure that pnfs_find_alloc_layout() always returns a reference to the pnfs_layout_hdr, which will be matched by the final call to pnfs_put_layout_hdr() in pnfs_update_layout(). Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 33273b3a330f..7ac5be36f132 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -331,11 +331,8 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del_init(&lseg->pls_list); - if (list_empty(&lo->plh_segs)) { + if (list_empty(&lo->plh_segs)) set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags); - /* Matched by initial refcount set in alloc_init_layout_hdr */ - pnfs_put_layout_hdr_locked(lo); - } rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } @@ -468,8 +465,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, dprintk("%s:Begin lo %p\n", __func__, lo); if (list_empty(&lo->plh_segs)) { - if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) - pnfs_put_layout_hdr_locked(lo); + set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags); return 0; } list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) @@ -929,8 +925,8 @@ pnfs_find_alloc_layout(struct inode *ino, if (nfsi->layout) { if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags)) return NULL; - else - return nfsi->layout; + pnfs_get_layout_hdr(nfsi->layout); + return nfsi->layout; } spin_unlock(&ino->i_lock); new = alloc_init_layout_hdr(ino, ctx, gfp_flags); @@ -1129,7 +1125,6 @@ pnfs_update_layout(struct inode *ino, goto out_unlock; atomic_inc(&lo->plh_outstanding); - pnfs_get_layout_hdr(lo); if (list_empty(&lo->plh_segs)) first = true; From 6622c3ea059b2fed49924b74db41d1e0f065fbd3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 17:23:11 -0400 Subject: [PATCH 035/113] NFSv4.1: Free the pnfs_layout_hdr outside the inode->i_lock None of the existing pNFS layout drivers seem to require the inode to be locked while they free the layout header. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7ac5be36f132..08663146f5f5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -213,7 +213,7 @@ pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) } static void -destroy_layout_hdr(struct pnfs_layout_hdr *lo) +pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) { struct nfs_inode *nfsi = NFS_I(lo->plh_inode); dprintk("%s: freeing layout cache %p\n", __func__, lo); @@ -222,14 +222,6 @@ destroy_layout_hdr(struct pnfs_layout_hdr *lo) /* Reset MDS Threshold I/O counters */ nfsi->write_io = 0; nfsi->read_io = 0; - pnfs_free_layout_hdr(lo); -} - -static void -pnfs_put_layout_hdr_locked(struct pnfs_layout_hdr *lo) -{ - if (atomic_dec_and_test(&lo->plh_refcount)) - destroy_layout_hdr(lo); } void @@ -238,8 +230,9 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) struct inode *inode = lo->plh_inode; if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { - destroy_layout_hdr(lo); + pnfs_detach_layout_hdr(lo); spin_unlock(&inode->i_lock); + pnfs_free_layout_hdr(lo); } } @@ -792,8 +785,12 @@ void pnfs_roc_release(struct inode *ino) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; lo->plh_block_lgets--; - pnfs_put_layout_hdr_locked(lo); - spin_unlock(&ino->i_lock); + if (atomic_dec_and_test(&lo->plh_refcount)) { + pnfs_detach_layout_hdr(lo); + spin_unlock(&ino->i_lock); + pnfs_free_layout_hdr(lo); + } else + spin_unlock(&ino->i_lock); } void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) From 9c6263819f25254f2ed48f076b50096dd5893dfb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 17:31:43 -0400 Subject: [PATCH 036/113] NFSv4.1: Clean up the removal of pnfs_layout_hdr from the server list Move the code into pnfs_free_layout_hdr(), and add checks to get_layout_by_fh_locked to ensure that they don't reference a layout that is being freed. Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 19 ++++++++++++++++++- fs/nfs/pnfs.c | 29 ++++++++++------------------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 24252fea2c9c..76b4a7a3e559 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -122,7 +122,15 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, ino = igrab(lo->plh_inode); if (!ino) continue; + spin_lock(&ino->i_lock); + /* Is this layout in the process of being freed? */ + if (NFS_I(ino)->layout != lo) { + spin_unlock(&ino->i_lock); + iput(ino); + continue; + } pnfs_get_layout_hdr(lo); + spin_unlock(&ino->i_lock); return lo; } } @@ -196,9 +204,18 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, continue; list_for_each_entry(lo, &server->layouts, plh_layouts) { - if (!igrab(lo->plh_inode)) + ino = igrab(lo->plh_inode); + if (ino) continue; + spin_lock(&ino->i_lock); + /* Is this layout in the process of being freed? */ + if (NFS_I(ino)->layout != lo) { + spin_unlock(&ino->i_lock); + iput(ino); + continue; + } pnfs_get_layout_hdr(lo); + spin_unlock(&ino->i_lock); BUG_ON(!list_empty(&lo->plh_bulk_recall)); list_add(&lo->plh_bulk_recall, &recall_list); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 08663146f5f5..11cc0ad6b409 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -207,7 +207,16 @@ pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) static void pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) { - struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); + struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; + + if (!list_empty(&lo->plh_layouts)) { + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + } put_rpccred(lo->plh_lc_cred); return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); } @@ -217,7 +226,6 @@ pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) { struct nfs_inode *nfsi = NFS_I(lo->plh_inode); dprintk("%s: freeing layout cache %p\n", __func__, lo); - BUG_ON(!list_empty(&lo->plh_layouts)); nfsi->layout = NULL; /* Reset MDS Threshold I/O counters */ nfsi->write_io = 0; @@ -480,22 +488,10 @@ void pnfs_free_lseg_list(struct list_head *free_me) { struct pnfs_layout_segment *lseg, *tmp; - struct pnfs_layout_hdr *lo; if (list_empty(free_me)) return; - lo = list_first_entry(free_me, struct pnfs_layout_segment, - pls_list)->pls_layout; - - if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) { - struct nfs_client *clp; - - clp = NFS_SERVER(lo->plh_inode)->nfs_client; - spin_lock(&clp->cl_lock); - list_del_init(&lo->plh_layouts); - spin_unlock(&clp->cl_lock); - } list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { list_del(&lseg->pls_list); free_lseg(lseg); @@ -1148,11 +1144,6 @@ pnfs_update_layout(struct inode *ino, arg.length = PAGE_CACHE_ALIGN(arg.length); lseg = send_layoutget(lo, ctx, &arg, gfp_flags); - if (!lseg && first) { - spin_lock(&clp->cl_lock); - list_del_init(&lo->plh_layouts); - spin_unlock(&clp->cl_lock); - } atomic_dec(&lo->plh_outstanding); out_put_layout_hdr: pnfs_put_layout_hdr(lo); From 905ca191cfe1ab18822d86e3ddef1b1b38832fdc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 20:46:49 -0400 Subject: [PATCH 037/113] NFSv4.1: Clean up pnfs_put_lseg() There is no longer a need to use pnfs_free_lseg_list(). Just call pnfs_free_lseg() directly. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 11cc0ad6b409..c34ba9a0a46f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -315,7 +315,7 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) lseg->pls_layout = lo; } -static void free_lseg(struct pnfs_layout_segment *lseg) +static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) { struct inode *ino = lseg->pls_layout->plh_inode; @@ -352,12 +352,9 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) lo = lseg->pls_layout; inode = lo->plh_inode; if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { - LIST_HEAD(free_me); - pnfs_layout_remove_lseg(lo, lseg); spin_unlock(&inode->i_lock); - list_add(&lseg->pls_list, &free_me); - pnfs_free_lseg_list(&free_me); + pnfs_free_lseg(lseg); } } EXPORT_SYMBOL_GPL(pnfs_put_lseg); @@ -494,7 +491,7 @@ pnfs_free_lseg_list(struct list_head *free_me) list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { list_del(&lseg->pls_list); - free_lseg(lseg); + pnfs_free_lseg(lseg); } } From 8f0d27dc5d77b084b2e2fe6d883c4d5776287842 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 20:57:11 -0400 Subject: [PATCH 038/113] NFSv4.1: Balance pnfs_layout_hdr refcount in pnfs_layout_(insert|remove)_lseg Ensure that the reference count for pnfs_layout_hdr reverts to the original value after a call to pnfs_layout_remove_lseg(). Note that the caller is expected to hold a reference to the struct pnfs_layout_hdr. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c34ba9a0a46f..bdd93b969050 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -320,8 +320,6 @@ static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) struct inode *ino = lseg->pls_layout->plh_inode; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ - pnfs_put_layout_hdr(NFS_I(ino)->layout); } static void @@ -332,6 +330,8 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del_init(&lseg->pls_list); + /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ + atomic_dec(&lo->plh_refcount); if (list_empty(&lo->plh_segs)) set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags); rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); @@ -352,9 +352,11 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) lo = lseg->pls_layout; inode = lo->plh_inode; if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); spin_unlock(&inode->i_lock); pnfs_free_lseg(lseg); + pnfs_put_layout_hdr(lo); } } EXPORT_SYMBOL_GPL(pnfs_put_lseg); From a9136d4914f61110ca9897ec65ab620075c50298 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 20:01:56 -0400 Subject: [PATCH 039/113] NFSv4.1: Get rid of pNFS spin lock debugging asserts... These are all in static declared functions that are called only once. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bdd93b969050..edc8288fd3d4 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -859,7 +859,6 @@ pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, dprintk("%s:Begin\n", __func__); - assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry(lp, &lo->plh_segs, pls_list) { if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0) continue; @@ -913,7 +912,6 @@ pnfs_find_alloc_layout(struct inode *ino, dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); - assert_spin_locked(&ino->i_lock); if (nfsi->layout) { if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags)) return NULL; @@ -970,7 +968,6 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, dprintk("%s:Begin\n", __func__); - assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && is_matching_lseg(&lseg->pls_range, range)) { From 579342785f7069d32e9637ef30d59c4256dcec3d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Sep 2012 20:37:23 -0400 Subject: [PATCH 040/113] NFSv4.1: Remove unused 'default allocation' for pnfs_alloc_layout_hdr() ...and ditto for pnfs_free_layout_hdr() Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index edc8288fd3d4..fcc72ecf2fd7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -200,8 +200,7 @@ static struct pnfs_layout_hdr * pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) { struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; - return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) : - kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags); + return ld->alloc_layout_hdr(ino, gfp_flags); } static void @@ -218,7 +217,7 @@ pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) spin_unlock(&clp->cl_lock); } put_rpccred(lo->plh_lc_cred); - return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); + return ld->free_layout_hdr(lo); } static void From 8006bfba36d42b6976ed92979f51e5f9bef2625c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 21 Sep 2012 14:48:04 -0400 Subject: [PATCH 041/113] NFSv4.1: Get rid of the NFS_LAYOUT_DESTROYED state We already have a mechanism for blocking LAYOUTGET by means of the plh_block_lgets counter. The only "service" that NFS_LAYOUT_DESTROYED provides at this point is to block layoutget once the layout segment list is empty, which basically means that you have to wait until the pnfs_layout_hdr is destroyed before you can do pNFS on that file again. This patch enables the reuse of the pnfs_layout_hdr if the layout segment list is empty. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 3 +-- fs/nfs/pnfs.c | 9 +-------- fs/nfs/pnfs.h | 7 ------- 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 6cce57e7fe55..52d847212066 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -279,8 +279,7 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) { struct nfs4_deviceid_node *node = FILELAYOUT_DEVID_NODE(lseg); - return pnfs_test_layout_destroyed(lseg->pls_layout) || - filelayout_test_devid_unavailable(node); + return filelayout_test_devid_unavailable(node); } /* diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index fcc72ecf2fd7..bda88a275071 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -331,8 +331,6 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, list_del_init(&lseg->pls_list); /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); - if (list_empty(&lo->plh_segs)) - set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags); rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } @@ -463,10 +461,8 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, dprintk("%s:Begin lo %p\n", __func__, lo); - if (list_empty(&lo->plh_segs)) { - set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags); + if (list_empty(&lo->plh_segs)) return 0; - } list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (!recall_range || should_free_lseg(&lseg->pls_range, recall_range)) { @@ -590,7 +586,6 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0) return true; return lo->plh_block_lgets || - test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || (list_empty(&lo->plh_segs) && (atomic_read(&lo->plh_outstanding) > lget)); @@ -912,8 +907,6 @@ pnfs_find_alloc_layout(struct inode *ino, dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); if (nfsi->layout) { - if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags)) - return NULL; pnfs_get_layout_hdr(nfsi->layout); return nfsi->layout; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index aacda7fbb536..92f6ce6532ba 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -62,7 +62,6 @@ enum { NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ NFS_LAYOUT_ROC, /* some lseg had roc bit set */ - NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ NFS_LAYOUT_RETURNED, /* layout has already been returned */ }; @@ -278,12 +277,6 @@ pnfs_test_layout_returned(struct pnfs_layout_hdr *lo) return test_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags); } -static inline bool -pnfs_test_layout_destroyed(struct pnfs_layout_hdr *lo) -{ - return test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags); -} - static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { From 173f77e9c5cbddb02eebe17dd9c48d39e5eb86b9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 21 Sep 2012 15:49:42 -0400 Subject: [PATCH 042/113] NFSv4.1: Clear NFS_LAYOUT_BULK_RECALL when the layout segments are freed Once all the affected layout segments have been freed up, clear the NFS_LAYOUT_BULK_RECALL flag so that we can reuse the pnfs_layout_hdr Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bda88a275071..174c51a5001c 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -331,6 +331,8 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, list_del_init(&lseg->pls_list); /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); + if (list_empty(&lo->plh_segs)) + clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } From e5929f3cff05e84f20c68df235f4768920e2e89e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 21 Sep 2012 16:37:02 -0400 Subject: [PATCH 043/113] NFSv4.1: Remove the NFS_LAYOUT_RETURNED state It serves no purpose that the test for whether or not we have valid layout segments doesn't already serve. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 7 +------ fs/nfs/pnfs.h | 19 ------------------- 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 174c51a5001c..20a1b6222ff6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -692,7 +692,7 @@ _pnfs_return_layout(struct inode *ino) spin_lock(&ino->i_lock); lo = nfsi->layout; - if (!lo || pnfs_test_layout_returned(lo)) { + if (!lo) { spin_unlock(&ino->i_lock); dprintk("NFS: %s no layout to return\n", __func__); goto out; @@ -710,7 +710,6 @@ _pnfs_return_layout(struct inode *ino) goto out; } lo->plh_block_lgets++; - pnfs_mark_layout_returned(lo); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); @@ -721,7 +720,6 @@ _pnfs_return_layout(struct inode *ino) status = -ENOMEM; pnfs_layout_io_set_failed(lo, IOMODE_RW); pnfs_layout_io_set_failed(lo, IOMODE_READ); - pnfs_clear_layout_returned(lo); pnfs_put_layout_hdr(lo); goto out; } @@ -1111,9 +1109,6 @@ pnfs_update_layout(struct inode *ino, if (list_empty(&lo->plh_segs)) first = true; - /* Enable LAYOUTRETURNs */ - pnfs_clear_layout_returned(lo); - spin_unlock(&ino->i_lock); if (first) { /* The lo must be on the clp list if there is any diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 92f6ce6532ba..6cede2c6c961 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -62,7 +62,6 @@ enum { NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ NFS_LAYOUT_ROC, /* some lseg had roc bit set */ - NFS_LAYOUT_RETURNED, /* layout has already been returned */ }; enum layoutdriver_policy_flags { @@ -259,24 +258,6 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); -static inline void -pnfs_mark_layout_returned(struct pnfs_layout_hdr *lo) -{ - set_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags); -} - -static inline void -pnfs_clear_layout_returned(struct pnfs_layout_hdr *lo) -{ - clear_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags); -} - -static inline bool -pnfs_test_layout_returned(struct pnfs_layout_hdr *lo) -{ - return test_bit(NFS_LAYOUT_RETURNED, &lo->plh_flags); -} - static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { From 65857d5768f7716da539933c2075d384b117812d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 24 Sep 2012 13:49:27 -0400 Subject: [PATCH 044/113] NFSv4.1: _pnfs_return_layout() shouldn't invalidate the layout on failure Failure of the layoutreturn allocation fails is not a good reason to mark the pnfs_layout_hdr as having failed a layoutget or i/o. Just exit cleanly. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 20a1b6222ff6..d737557747b9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -718,8 +718,9 @@ _pnfs_return_layout(struct inode *ino) lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; - pnfs_layout_io_set_failed(lo, IOMODE_RW); - pnfs_layout_io_set_failed(lo, IOMODE_READ); + spin_lock(&ino->i_lock); + lo->plh_block_lgets--; + spin_unlock(&ino->i_lock); pnfs_put_layout_hdr(lo); goto out; } From 849b286fd026a6924cc6a4315e446ed88ab983d2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 24 Sep 2012 14:18:39 -0400 Subject: [PATCH 045/113] NFSv4.1: nfs4_proc_layoutreturn must always drop the plh_block_lgets count Currently it does not do so if the RPC call failed to start. Fix is to move the decrement of plh_block_lgets into nfs4_layoutreturn_release. Also remove a redundant test of task->tk_status in nfs4_layoutreturn_done: if lrp->res.lrs_present is set, then obviously the RPC call succeeded. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8de0435caed9..ce1ebff49fd7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6346,7 +6346,6 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct nfs_server *server; - struct pnfs_layout_hdr *lo = lrp->args.layout; dprintk("--> %s\n", __func__); @@ -6358,19 +6357,20 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) rpc_restart_call_prepare(task); return; } - spin_lock(&lo->plh_inode->i_lock); - if (task->tk_status == 0 && lrp->res.lrs_present) - pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - lo->plh_block_lgets--; - spin_unlock(&lo->plh_inode->i_lock); dprintk("<-- %s\n", __func__); } static void nfs4_layoutreturn_release(void *calldata) { struct nfs4_layoutreturn *lrp = calldata; + struct pnfs_layout_hdr *lo = lrp->args.layout; dprintk("--> %s\n", __func__); + spin_lock(&lo->plh_inode->i_lock); + if (lrp->res.lrs_present) + pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + lo->plh_block_lgets--; + spin_unlock(&lo->plh_inode->i_lock); pnfs_put_layout_hdr(lrp->args.layout); kfree(calldata); dprintk("<-- %s\n", __func__); From 9b96ce71974127af0304514d310abe596426c112 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 28 Sep 2012 20:24:16 -0400 Subject: [PATCH 046/113] SUNRPC: Limit the rpciod workqueue concurrency We shouldn't need more than 1 worker thread per cpu, since rpciod is designed to run without sleeping in most cases. Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 128494ec9a64..6357fcb00c7e 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1022,7 +1022,7 @@ static int rpciod_start(void) * Create the rpciod thread and wait for it to start. */ dprintk("RPC: creating workqueue rpciod\n"); - wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0); + wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 1); rpciod_workqueue = wq; return rpciod_workqueue != NULL; } From fcb6d9c6b719b633e9b98d26d8a7937209e8bf21 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 26 Sep 2012 15:25:53 -0400 Subject: [PATCH 047/113] NFS: Always use the open stateid when checking for expired opens If we are reading through a delegation, and the delegation is OK then state->stateid will still point to a delegation stateid and not an open stateid. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ce1ebff49fd7..755ee162ee7e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1790,7 +1790,7 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state) static int nfs41_check_open_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); - nfs4_stateid *stateid = &state->stateid; + nfs4_stateid *stateid = &state->open_stateid; int status; /* If a state reset has been done, test_stateid is unneeded */ From 6938867edba929a65a167a97581231e76aeb10b4 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 26 Sep 2012 15:25:52 -0400 Subject: [PATCH 048/113] NFS: Remove bad delegations during open recovery I put the client into an open recovery loop by: Client: Open file read half Server: Expire client (echo 0 > /sys/kernel/debug/nfsd/forget_clients) Client: Drop vm cache (echo 3 > /proc/sys/vm/drop_caches) finish reading file This causes a loop because the client never updates the nfs4_state after discovering that the delegation is invalid. This means it will keep trying to read using the bad delegation rather than attempting to re-open the file. Signed-off-by: Bryan Schumaker CC: stable@vger.kernel.org [3.4+] Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 755ee162ee7e..471a75f11ea2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1774,7 +1774,11 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state) * informs us the stateid is unrecognized. */ if (status != -NFS4ERR_BAD_STATEID) nfs41_free_stateid(server, stateid); + nfs_remove_bad_delegation(state->inode); + write_seqlock(&state->seqlock); + nfs4_stateid_copy(&state->stateid, &state->open_stateid); + write_sequnlock(&state->seqlock); clear_bit(NFS_DELEGATED_STATE, &state->flags); } } From 57a51048da742c764b6ce5d028c7f334ae04d363 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 9 Aug 2012 14:05:51 -0400 Subject: [PATCH 049/113] NFS: Use kzalloc() instead of kmalloc() in the idmapper This will allocate memory that has already been zeroed, allowing us to remove the memset later on. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/idmap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index a850079467d8..9985a0aea5ff 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -632,9 +632,6 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, substring_t substr; int token, ret; - memset(im, 0, sizeof(*im)); - memset(msg, 0, sizeof(*msg)); - im->im_type = IDMAP_TYPE_GROUP; token = match_token(desc, nfs_idmap_tokens, &substr); @@ -677,7 +674,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, int ret = -ENOMEM; /* msg and im are freed in idmap_pipe_destroy_msg */ - data = kmalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out1; From 6168f62cbde8dcf4f58255794efbcdb8df603959 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Mon, 10 Sep 2012 14:00:46 -0400 Subject: [PATCH 050/113] NFSv4: Add ACCESS operation to OPEN compound The OPEN operation has no way to differentiate an open for read and an open for execution - both look like read to the server. This allowed users to read files that didn't have READ access but did have EXEC access, which is obviously wrong. This patch adds an ACCESS call to the OPEN compound to handle the difference between OPENs for reading and execution. Since we're going through the trouble of calling ACCESS, we check all possible access bits and cache the results hopefully avoiding an ACCESS call in the future. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 16 ++++++++++++- fs/nfs/nfs4proc.c | 52 ++++++++++++++++++++++++++++++++++------- fs/nfs/nfs4xdr.c | 16 +++++++++---- include/linux/nfs_fs.h | 2 ++ include/linux/nfs_xdr.h | 3 +++ 5 files changed, 76 insertions(+), 13 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 627f108ede23..ce8cb926526b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2072,7 +2072,7 @@ found: nfs_access_free_entry(entry); } -static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) { struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) @@ -2098,6 +2098,20 @@ static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *s spin_unlock(&nfs_access_lru_lock); } } +EXPORT_SYMBOL_GPL(nfs_access_add_cache); + +void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) +{ + entry->mask = 0; + if (access_result & NFS4_ACCESS_READ) + entry->mask |= MAY_READ; + if (access_result & + (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) + entry->mask |= MAY_WRITE; + if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) + entry->mask |= MAY_EXEC; +} +EXPORT_SYMBOL_GPL(nfs_access_set_mask); static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 471a75f11ea2..5b3207f557d9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -104,6 +104,8 @@ static int nfs4_map_errors(int err) return -EACCES; case -NFS4ERR_MINOR_VERS_MISMATCH: return -EPROTONOSUPPORT; + case -NFS4ERR_ACCESS: + return -EACCES; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -860,6 +862,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.fh = NFS_FH(dir); p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); + /* ask server to check for all possible rights as results are cached */ + p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | + NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE; p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); p->o_arg.id.uniquifier = sp->so_seqid.owner_id; @@ -1643,6 +1648,39 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) return status; } +static int nfs4_opendata_access(struct rpc_cred *cred, + struct nfs4_opendata *opendata, + struct nfs4_state *state, fmode_t fmode) +{ + struct nfs_access_entry cache; + u32 mask; + + /* access call failed or for some reason the server doesn't + * support any access modes -- defer access call until later */ + if (opendata->o_res.access_supported == 0) + return 0; + + mask = 0; + if (fmode & FMODE_READ) + mask |= MAY_READ; + if (fmode & FMODE_WRITE) + mask |= MAY_WRITE; + if (fmode & FMODE_EXEC) + mask |= MAY_EXEC; + + cache.cred = cred; + cache.jiffies = jiffies; + nfs_access_set_mask(&cache, opendata->o_res.access_result); + nfs_access_add_cache(state->inode, &cache); + + if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) + return 0; + + /* even though OPEN succeeded, access is denied. Close the file */ + nfs4_close_state(state, fmode); + return -NFS4ERR_ACCESS; +} + /* * Note: On error, nfs4_proc_open will free the struct nfs4_opendata */ @@ -1900,6 +1938,10 @@ static int _nfs4_do_open(struct inode *dir, if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + status = nfs4_opendata_access(cred, opendata, state, fmode); + if (status != 0) + goto err_opendata_put; + if (opendata->o_arg.open_flags & O_EXCL) { nfs4_exclusive_attrset(opendata, sattr); @@ -1945,7 +1987,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct nfs4_state *res; int status; - fmode &= FMODE_READ|FMODE_WRITE; + fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC; do { status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, &res, ctx_th); @@ -2771,13 +2813,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (!status) { - entry->mask = 0; - if (res.access & NFS4_ACCESS_READ) - entry->mask |= MAY_READ; - if (res.access & (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) - entry->mask |= MAY_WRITE; - if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) - entry->mask |= MAY_EXEC; + nfs_access_set_mask(entry, res.access); nfs_refresh_inode(inode, res.fattr); } nfs_free_fattr(res.fattr); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7ab29abb3160..657483c34e28 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -447,12 +447,14 @@ static int nfs4_stat_to_errno(int); encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_open_maxsz + \ + encode_access_maxsz + \ encode_getfh_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_open_maxsz + \ + decode_access_maxsz + \ decode_getfh_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_open_confirm_sz \ @@ -467,11 +469,13 @@ static int nfs4_stat_to_errno(int); encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_open_maxsz + \ + encode_access_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_open_maxsz + \ + decode_access_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_open_downgrade_sz \ (compound_encode_hdr_maxsz + \ @@ -2220,6 +2224,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_open(xdr, args, &hdr); encode_getfh(xdr, &hdr); + encode_access(xdr, args->access, &hdr); encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); encode_nops(&hdr); } @@ -2256,6 +2261,7 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_open(xdr, args, &hdr); + encode_access(xdr, args->access, &hdr); encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -4099,7 +4105,7 @@ out_overflow: return -EIO; } -static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) +static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) { __be32 *p; uint32_t supp, acc; @@ -4113,8 +4119,8 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) goto out_overflow; supp = be32_to_cpup(p++); acc = be32_to_cpup(p); - access->supported = supp; - access->access = acc; + *supported = supp; + *access = acc; return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -5892,7 +5898,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status != 0) goto out; - status = decode_access(xdr, res); + status = decode_access(xdr, &res->supported, &res->access); if (status != 0) goto out; decode_getfattr(xdr, res->fattr, res->server); @@ -6233,6 +6239,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, &res->fh); if (status) goto out; + decode_access(xdr, &res->access_supported, &res->access_result); decode_getfattr(xdr, res->f_attr, res->server); out: return status; @@ -6281,6 +6288,7 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, status = decode_open(xdr, res); if (status) goto out; + decode_access(xdr, &res->access_supported, &res->access_result); decode_getfattr(xdr, res->f_attr, res->server); out: return status; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 383f3313f053..334a2f5f6bf1 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -360,6 +360,8 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *); +extern void nfs_access_set_mask(struct nfs_access_entry *, u32); extern int nfs_permission(struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_release(struct inode *, struct file *); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 5da789fdf25b..655490dae953 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -334,6 +334,7 @@ struct nfs_openargs { struct nfs_seqid * seqid; int open_flags; fmode_t fmode; + u32 access; __u64 clientid; struct stateowner_id id; union { @@ -368,6 +369,8 @@ struct nfs_openres { struct nfs4_string *owner; struct nfs4_string *group_owner; struct nfs4_sequence_res seq_res; + __u32 access_supported; + __u32 access_result; }; /* From c8ceb4124b53a439edfe3fe89a646be1e067ef17 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 20 Aug 2012 18:00:06 +0400 Subject: [PATCH 051/113] NFS: pass net to nfs_callback_down() Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 4 ++-- fs/nfs/callback.h | 2 +- fs/nfs/nfs4client.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 4c8459e5bdee..51297b2d0532 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -301,7 +301,7 @@ out_err: /* * Kill the callback thread if it's no longer being used. */ -void nfs_callback_down(int minorversion) +void nfs_callback_down(int minorversion, struct net *net) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; @@ -309,7 +309,7 @@ void nfs_callback_down(int minorversion) cb_info->users--; if (cb_info->users == 0 && cb_info->task != NULL) { kthread_stop(cb_info->task); - svc_shutdown_net(cb_info->serv, &init_net); + svc_shutdown_net(cb_info->serv, net); svc_exit_thread(cb_info->rqst); cb_info->serv = NULL; cb_info->rqst = NULL; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index b44d7b128b71..309404453e95 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -194,7 +194,7 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, struct cb_process_state *cps); #if IS_ENABLED(CONFIG_NFS_V4) extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); -extern void nfs_callback_down(int minorversion); +extern void nfs_callback_down(int minorversion, struct net *net); extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid); extern int nfs4_set_callback_sessionid(struct nfs_client *clp); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 24eb663f8ed5..088a7d2e2ecb 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -84,7 +84,7 @@ error: static void nfs4_destroy_callback(struct nfs_client *clp) { if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(clp->cl_mvops->minor_version); + nfs_callback_down(clp->cl_mvops->minor_version, &init_net); } static void nfs4_shutdown_client(struct nfs_client *clp) From dd018428dce087b72d9e6a0b32e93cb8088b3aaa Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 20 Aug 2012 18:00:11 +0400 Subject: [PATCH 052/113] NFS: callback service creation function introduced This function creates service if it's not exist, or increase usage counter of the existent, and returns pointer to it. Usage counter will be droppepd by svc_destroy() later in nfs_callback_up(). Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 63 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 51297b2d0532..18efeb5f005d 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -217,12 +217,50 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, } #endif /* CONFIG_NFS_V4_1 */ +static struct svc_serv *nfs_callback_create_svc(int minorversion) +{ + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + struct svc_serv *serv; + + /* + * Check whether we're already up and running. + */ + if (cb_info->task) { + /* + * Note: increase service usage, because later in case of error + * svc_destroy() will be called. + */ + svc_get(cb_info->serv); + return cb_info->serv; + } + + /* + * Sanity check: if there's no task, + * we should be the first user ... + */ + if (cb_info->users) + printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", + cb_info->users); + + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + if (!serv) { + printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); + return ERR_PTR(-ENOMEM); + } + /* As there is only one thread we need to over-ride the + * default maximum of 80 connections + */ + serv->sv_maxconn = 1024; + dprintk("nfs_callback_create_svc: service created\n"); + return serv; +} + /* * Bring up the callback thread if it is not already up. */ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) { - struct svc_serv *serv = NULL; + struct svc_serv *serv; struct svc_rqst *rqstp; int (*callback_svc)(void *vrqstp); struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; @@ -232,19 +270,17 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) struct net *net = &init_net; mutex_lock(&nfs_callback_mutex); + + serv = nfs_callback_create_svc(minorversion); + if (IS_ERR(serv)) { + ret = PTR_ERR(serv); + goto err_create; + } + if (cb_info->users++ || cb_info->task != NULL) { nfs_callback_bc_serv(minorversion, xprt, cb_info); goto out; } - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); - if (!serv) { - ret = -ENOMEM; - goto out_err; - } - /* As there is only one thread we need to over-ride the - * default maximum of 80 connections - */ - serv->sv_maxconn = 1024; ret = svc_bind(serv, net); if (ret < 0) { @@ -285,16 +321,15 @@ out: * on both success and failure so that the refcount is 1 when the * thread exits. */ - if (serv) - svc_destroy(serv); + svc_destroy(serv); +err_create: mutex_unlock(&nfs_callback_mutex); return ret; out_err: dprintk("NFS: Couldn't create callback socket or server thread; " "err = %d\n", ret); cb_info->users--; - if (serv) - svc_shutdown_net(serv, net); + svc_shutdown_net(serv, net); goto out; } From c946556b8749beb357e2d2860e7dac757972dd3d Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 20 Aug 2012 18:00:16 +0400 Subject: [PATCH 053/113] NFS: move per-net callback thread initialization to nfs_callback_up_net() v4: 1) Callback transport creation routine selection by version simlified. This new function in now called before nfs_minorversion_callback_svc_setup()). Also few small changes: 1) current network namespace in nfs_callback_up() was replaced by transport net. 2) svc_shutdown_net() was moved prior to callback usage counter decrement (because in case of per-net data allocation faulure svc_shutdown_net() have to be skipped). Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 133 ++++++++++++++++++++++++++++---------------- fs/nfs/nfs4client.c | 2 +- 2 files changed, 87 insertions(+), 48 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 18efeb5f005d..a53b4e53d5dd 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -39,6 +39,32 @@ static struct svc_program nfs4_callback_program; unsigned short nfs_callback_tcpport6; +static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) +{ + int ret; + + ret = svc_create_xprt(serv, "tcp", net, PF_INET, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret <= 0) + goto out_err; + nfs_callback_tcpport = ret; + dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", + nfs_callback_tcpport, PF_INET, net); + + ret = svc_create_xprt(serv, "tcp", net, PF_INET6, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + if (ret > 0) { + nfs_callback_tcpport6 = ret; + dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", + nfs_callback_tcpport6, PF_INET6, net); + } else if (ret != -EAFNOSUPPORT) + goto out_err; + return 0; + +out_err: + return (ret) ? ret : -ENOMEM; +} + /* * This is the NFSv4 callback kernel thread. */ @@ -80,36 +106,21 @@ nfs4_callback_svc(void *vrqstp) static struct svc_rqst * nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) { - int ret; - - ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET, - nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); - if (ret <= 0) - goto out_err; - nfs_callback_tcpport = ret; - dprintk("NFS: Callback listener port = %u (af %u)\n", - nfs_callback_tcpport, PF_INET); - - ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6, - nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); - if (ret > 0) { - nfs_callback_tcpport6 = ret; - dprintk("NFS: Callback listener port = %u (af %u)\n", - nfs_callback_tcpport6, PF_INET6); - } else if (ret == -EAFNOSUPPORT) - ret = 0; - else - goto out_err; - return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - -out_err: - if (ret == 0) - ret = -ENOMEM; - return ERR_PTR(ret); } #if defined(CONFIG_NFS_V4_1) +static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) +{ + /* + * Create an svc_sock for the back channel service that shares the + * fore channel connection. + * Returns the input port (0) and sets the svc_serv bc_xprt on success + */ + return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0, + SVC_SOCK_ANONYMOUS); +} + /* * The callback service for NFSv4.1 callbacks */ @@ -152,19 +163,6 @@ static struct svc_rqst * nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) { struct svc_rqst *rqstp; - int ret; - - /* - * Create an svc_sock for the back channel service that shares the - * fore channel connection. - * Returns the input port (0) and sets the svc_serv bc_xprt on success - */ - ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0, - SVC_SOCK_ANONYMOUS); - if (ret < 0) { - rqstp = ERR_PTR(ret); - goto out; - } /* * Save the svc_serv in the transport so that it can @@ -180,7 +178,6 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) svc_xprt_put(serv->sv_bc_xprt); serv->sv_bc_xprt = NULL; } -out: dprintk("--> %s return %ld\n", __func__, IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0); return rqstp; @@ -204,6 +201,11 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, xprt->bc_serv = cb_info->serv; } #else +static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) +{ + return 0; +} + static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, struct svc_serv *serv, struct rpc_xprt *xprt, struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) @@ -217,6 +219,44 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, } #endif /* CONFIG_NFS_V4_1 */ +static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net) +{ + int ret; + + dprintk("NFS: create per-net callback data; net=%p\n", net); + + ret = svc_bind(serv, net); + if (ret < 0) { + printk(KERN_WARNING "NFS: bind callback service failed\n"); + goto err_bind; + } + + switch (minorversion) { + case 0: + ret = nfs4_callback_up_net(serv, net); + break; + case 1: + ret = nfs41_callback_up_net(serv, net); + break; + default: + printk(KERN_ERR "NFS: unknown callback version: %d\n", + minorversion); + ret = -EINVAL; + break; + } + + if (ret < 0) { + printk(KERN_ERR "NFS: callback service start failed\n"); + goto err_socks; + } + return 0; + +err_socks: + svc_rpcb_cleanup(serv, net); +err_bind: + return ret; +} + static struct svc_serv *nfs_callback_create_svc(int minorversion) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; @@ -267,7 +307,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) char svc_name[12]; int ret = 0; int minorversion_setup; - struct net *net = &init_net; + struct net *net = xprt->xprt_net; mutex_lock(&nfs_callback_mutex); @@ -282,11 +322,9 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) goto out; } - ret = svc_bind(serv, net); - if (ret < 0) { - printk(KERN_WARNING "NFS: bind callback service failed\n"); - goto out_err; - } + ret = nfs_callback_up_net(minorversion, serv, net); + if (ret < 0) + goto err_net; minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, serv, xprt, &rqstp, &callback_svc); @@ -326,10 +364,11 @@ err_create: mutex_unlock(&nfs_callback_mutex); return ret; out_err: + svc_shutdown_net(serv, net); +err_net: dprintk("NFS: Couldn't create callback socket or server thread; " "err = %d\n", ret); cb_info->users--; - svc_shutdown_net(serv, net); goto out; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 088a7d2e2ecb..612f5ebaabac 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -84,7 +84,7 @@ error: static void nfs4_destroy_callback(struct nfs_client *clp) { if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(clp->cl_mvops->minor_version, &init_net); + nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net); } static void nfs4_shutdown_client(struct nfs_client *clp) From 691c457ae635a063e0e4c8551ba4566eab9a17e3 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 20 Aug 2012 18:00:21 +0400 Subject: [PATCH 054/113] NFS: callback up - transport backchannel cleanup No need to assign transports backchannel server explicitly in nfs41_callback_up() - there is nfs_callback_bc_serv() function for this. By using it, nfs4_callback_up() and nfs41_callback_up() can be called without transport argument. Note: service have to be passed to nfs_callback_bc_serv() instead of callback, since callback link can be uninitialized. Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index a53b4e53d5dd..a528cb75121e 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -104,7 +104,7 @@ nfs4_callback_svc(void *vrqstp) * Prepare to bring up the NFSv4 callback service */ static struct svc_rqst * -nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) +nfs4_callback_up(struct svc_serv *serv) { return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); } @@ -160,16 +160,10 @@ nfs41_callback_svc(void *vrqstp) * Bring up the NFSv4.1 callback service */ static struct svc_rqst * -nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) +nfs41_callback_up(struct svc_serv *serv) { struct svc_rqst *rqstp; - /* - * Save the svc_serv in the transport so that it can - * be referenced when the session backchannel is initialized - */ - xprt->bc_serv = serv; - INIT_LIST_HEAD(&serv->sv_cb_list); spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); @@ -184,21 +178,25 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) } static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, - struct svc_serv *serv, struct rpc_xprt *xprt, + struct svc_serv *serv, struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) { if (minorversion) { - *rqstpp = nfs41_callback_up(serv, xprt); + *rqstpp = nfs41_callback_up(serv); *callback_svc = nfs41_callback_svc; } return minorversion; } static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, - struct nfs_callback_data *cb_info) + struct svc_serv *serv) { if (minorversion) - xprt->bc_serv = cb_info->serv; + /* + * Save the svc_serv in the transport so that it can + * be referenced when the session backchannel is initialized + */ + xprt->bc_serv = serv; } #else static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) @@ -207,14 +205,14 @@ static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) } static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, - struct svc_serv *serv, struct rpc_xprt *xprt, + struct svc_serv *serv, struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) { return 0; } static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, - struct nfs_callback_data *cb_info) + struct svc_serv *serv) { } #endif /* CONFIG_NFS_V4_1 */ @@ -318,7 +316,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) } if (cb_info->users++ || cb_info->task != NULL) { - nfs_callback_bc_serv(minorversion, xprt, cb_info); + nfs_callback_bc_serv(minorversion, xprt, serv); goto out; } @@ -326,11 +324,13 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) if (ret < 0) goto err_net; + nfs_callback_bc_serv(minorversion, xprt, serv); + minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, - serv, xprt, &rqstp, &callback_svc); + serv, &rqstp, &callback_svc); if (!minorversion_setup) { /* v4.0 callback setup */ - rqstp = nfs4_callback_up(serv, xprt); + rqstp = nfs4_callback_up(serv); callback_svc = nfs4_callback_svc; } From 8e2461444319b8f3fe47b94ea9b5d2e1dd8adadb Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 20 Aug 2012 18:00:26 +0400 Subject: [PATCH 055/113] NFS: callback service start function introduced This is just a code move, which from my POW makes code looks better. I.e. now on start we have 3 different stages: 1) Service creation. 2) Service per-net data allocation. 3) Service start. Patch also renames goto label "out_err:" into "err_start:" to reflect new changes. Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 77 +++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 32 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index a528cb75121e..5d5f9d10cfd0 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -217,6 +217,46 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, } #endif /* CONFIG_NFS_V4_1 */ +static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, + struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + int (*callback_svc)(void *vrqstp); + struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + char svc_name[12]; + int ret; + int minorversion_setup; + + nfs_callback_bc_serv(minorversion, xprt, serv); + + minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, + serv, &rqstp, &callback_svc); + if (!minorversion_setup) { + /* v4.0 callback setup */ + rqstp = nfs4_callback_up(serv); + callback_svc = nfs4_callback_svc; + } + + if (IS_ERR(rqstp)) + return PTR_ERR(rqstp); + + svc_sock_update_bufs(serv); + + sprintf(svc_name, "nfsv4.%u-svc", minorversion); + cb_info->serv = serv; + cb_info->rqst = rqstp; + cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name); + if (IS_ERR(cb_info->task)) { + ret = PTR_ERR(cb_info->task); + svc_exit_thread(cb_info->rqst); + cb_info->rqst = NULL; + cb_info->task = NULL; + return PTR_ERR(cb_info->task); + } + dprintk("nfs_callback_up: service started\n"); + return 0; +} + static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net) { int ret; @@ -299,12 +339,8 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) { struct svc_serv *serv; - struct svc_rqst *rqstp; - int (*callback_svc)(void *vrqstp); struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; - char svc_name[12]; int ret = 0; - int minorversion_setup; struct net *net = xprt->xprt_net; mutex_lock(&nfs_callback_mutex); @@ -324,34 +360,10 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) if (ret < 0) goto err_net; - nfs_callback_bc_serv(minorversion, xprt, serv); + ret = nfs_callback_start_svc(minorversion, xprt, serv); + if (ret < 0) + goto err_start; - minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, - serv, &rqstp, &callback_svc); - if (!minorversion_setup) { - /* v4.0 callback setup */ - rqstp = nfs4_callback_up(serv); - callback_svc = nfs4_callback_svc; - } - - if (IS_ERR(rqstp)) { - ret = PTR_ERR(rqstp); - goto out_err; - } - - svc_sock_update_bufs(serv); - - sprintf(svc_name, "nfsv4.%u-svc", minorversion); - cb_info->serv = serv; - cb_info->rqst = rqstp; - cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name); - if (IS_ERR(cb_info->task)) { - ret = PTR_ERR(cb_info->task); - svc_exit_thread(cb_info->rqst); - cb_info->rqst = NULL; - cb_info->task = NULL; - goto out_err; - } out: /* * svc_create creates the svc_serv with sv_nrthreads == 1, and then @@ -363,7 +375,8 @@ out: err_create: mutex_unlock(&nfs_callback_mutex); return ret; -out_err: + +err_start: svc_shutdown_net(serv, net); err_net: dprintk("NFS: Couldn't create callback socket or server thread; " From 23c20ecd44750dd42e5fd53285a17ca8d8a9b0a3 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 20 Aug 2012 18:00:31 +0400 Subject: [PATCH 056/113] NFS: callback up - users counting cleanup Usage coutner now increased only is the service was started sccessfully. Even if service is running already, then goto is not required anymore, because service creation and start will be skipped. With this patch code looks clearer. Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 5d5f9d10cfd0..64e87ec045aa 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -229,6 +229,9 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, nfs_callback_bc_serv(minorversion, xprt, serv); + if (cb_info->task) + return 0; + minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, serv, &rqstp, &callback_svc); if (!minorversion_setup) { @@ -292,6 +295,8 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n err_socks: svc_rpcb_cleanup(serv, net); err_bind: + dprintk("NFS: Couldn't create callback socket: err = %d; " + "net = %p\n", ret, net); return ret; } @@ -340,7 +345,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) { struct svc_serv *serv; struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; - int ret = 0; + int ret; struct net *net = xprt->xprt_net; mutex_lock(&nfs_callback_mutex); @@ -351,11 +356,6 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) goto err_create; } - if (cb_info->users++ || cb_info->task != NULL) { - nfs_callback_bc_serv(minorversion, xprt, serv); - goto out; - } - ret = nfs_callback_up_net(minorversion, serv, net); if (ret < 0) goto err_net; @@ -364,13 +364,14 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) if (ret < 0) goto err_start; -out: + cb_info->users++; /* * svc_create creates the svc_serv with sv_nrthreads == 1, and then * svc_prepare_thread increments that. So we need to call svc_destroy * on both success and failure so that the refcount is 1 when the * thread exits. */ +err_net: svc_destroy(serv); err_create: mutex_unlock(&nfs_callback_mutex); @@ -378,11 +379,8 @@ err_create: err_start: svc_shutdown_net(serv, net); -err_net: - dprintk("NFS: Couldn't create callback socket or server thread; " - "err = %d\n", ret); - cb_info->users--; - goto out; + dprintk("NFS: Couldn't create server thread; err = %d\n", ret); + goto err_net; } /* From bbe0a3aa4e227c8aae02a484ce1c0b655cd19055 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 20 Aug 2012 18:00:36 +0400 Subject: [PATCH 057/113] NFS: make nfs_callback_tcpport per network context Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 6 ++++-- fs/nfs/callback.h | 1 - fs/nfs/netns.h | 1 + fs/nfs/nfs4state.c | 4 +++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 64e87ec045aa..94aa9d8f3086 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -23,6 +23,7 @@ #include "nfs4_fs.h" #include "callback.h" #include "internal.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -42,14 +43,15 @@ unsigned short nfs_callback_tcpport6; static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) { int ret; + struct nfs_net *nn = net_generic(net, nfs_net_id); ret = svc_create_xprt(serv, "tcp", net, PF_INET, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_err; - nfs_callback_tcpport = ret; + nn->nfs_callback_tcpport = ret; dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", - nfs_callback_tcpport, PF_INET, net); + nn->nfs_callback_tcpport, PF_INET, net); ret = svc_create_xprt(serv, "tcp", net, PF_INET6, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 309404453e95..1c167d163683 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -208,7 +208,6 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp); #define NFS41_BC_MAX_CALLBACKS 1 extern unsigned int nfs_callback_set_tcpport; -extern unsigned short nfs_callback_tcpport; extern unsigned short nfs_callback_tcpport6; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index 0539de1b8d1f..1538d3a83cde 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -22,6 +22,7 @@ struct nfs_net { struct list_head nfs_volume_list; #if IS_ENABLED(CONFIG_NFS_V4) struct idr cb_ident_idr; /* Protected by nfs_client_lock */ + unsigned short nfs_callback_tcpport; #endif spinlock_t nfs_client_lock; struct timespec boot_time; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a5331ec094a6..716cdc20475f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -56,6 +56,7 @@ #include "delegation.h" #include "internal.h" #include "pnfs.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_STATE @@ -73,10 +74,11 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) }; unsigned short port; int status; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) goto do_confirm; - port = nfs_callback_tcpport; + port = nn->nfs_callback_tcpport; if (clp->cl_addr.ss_family == AF_INET6) port = nfs_callback_tcpport6; From 29dcc16a8e29371e11fb58fc1292e01f30ff13c5 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 20 Aug 2012 18:00:41 +0400 Subject: [PATCH 058/113] NFS: make nfs_callback_tcpport6 per network context Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 6 ++---- fs/nfs/callback.h | 1 - fs/nfs/netns.h | 1 + fs/nfs/nfs4state.c | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 94aa9d8f3086..baafa0f1e555 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -38,8 +38,6 @@ static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1]; static DEFINE_MUTEX(nfs_callback_mutex); static struct svc_program nfs4_callback_program; -unsigned short nfs_callback_tcpport6; - static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) { int ret; @@ -56,9 +54,9 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) ret = svc_create_xprt(serv, "tcp", net, PF_INET6, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret > 0) { - nfs_callback_tcpport6 = ret; + nn->nfs_callback_tcpport6 = ret; dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", - nfs_callback_tcpport6, PF_INET6, net); + nn->nfs_callback_tcpport6, PF_INET6, net); } else if (ret != -EAFNOSUPPORT) goto out_err; return 0; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 1c167d163683..c07a8d460d36 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -208,6 +208,5 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp); #define NFS41_BC_MAX_CALLBACKS 1 extern unsigned int nfs_callback_set_tcpport; -extern unsigned short nfs_callback_tcpport6; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index 1538d3a83cde..137238b012fb 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -23,6 +23,7 @@ struct nfs_net { #if IS_ENABLED(CONFIG_NFS_V4) struct idr cb_ident_idr; /* Protected by nfs_client_lock */ unsigned short nfs_callback_tcpport; + unsigned short nfs_callback_tcpport6; #endif spinlock_t nfs_client_lock; struct timespec boot_time; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 716cdc20475f..bd8ed01cb0ea 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -80,7 +80,7 @@ int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) goto do_confirm; port = nn->nfs_callback_tcpport; if (clp->cl_addr.ss_family == AF_INET6) - port = nfs_callback_tcpport6; + port = nn->nfs_callback_tcpport6; status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); if (status != 0) From b3d19c51723be69fddb64723bebb5a30fb57a483 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 20 Aug 2012 18:00:46 +0400 Subject: [PATCH 059/113] NFS: callback per-net usage counting introduced This patch also introduces refcount-aware nfs_callback_down_net() wrapper for svc_shutdown_net(). Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 19 +++++++++++++++++-- fs/nfs/netns.h | 1 + 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index baafa0f1e555..6dfdc8311f27 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -260,10 +260,25 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, return 0; } +static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + if (--nn->cb_users[minorversion]) + return; + + dprintk("NFS: destroy per-net callback data; net=%p\n", net); + svc_shutdown_net(serv, net); +} + static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net) { + struct nfs_net *nn = net_generic(net, nfs_net_id); int ret; + if (nn->cb_users[minorversion]++) + return 0; + dprintk("NFS: create per-net callback data; net=%p\n", net); ret = svc_bind(serv, net); @@ -378,7 +393,7 @@ err_create: return ret; err_start: - svc_shutdown_net(serv, net); + nfs_callback_down_net(minorversion, serv, net); dprintk("NFS: Couldn't create server thread; err = %d\n", ret); goto err_net; } @@ -391,10 +406,10 @@ void nfs_callback_down(int minorversion, struct net *net) struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; mutex_lock(&nfs_callback_mutex); + nfs_callback_down_net(minorversion, cb_info->serv, net); cb_info->users--; if (cb_info->users == 0 && cb_info->task != NULL) { kthread_stop(cb_info->task); - svc_shutdown_net(cb_info->serv, net); svc_exit_thread(cb_info->rqst); cb_info->serv = NULL; cb_info->rqst = NULL; diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index 137238b012fb..b9c7f9b1f91b 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -24,6 +24,7 @@ struct nfs_net { struct idr cb_ident_idr; /* Protected by nfs_client_lock */ unsigned short nfs_callback_tcpport; unsigned short nfs_callback_tcpport6; + int cb_users[NFS4_MAX_MINOR_VERSION + 1]; #endif spinlock_t nfs_client_lock; struct timespec boot_time; From 1dc42e04b75779d321f1d17dca3873004066f667 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 20 Aug 2012 18:00:51 +0400 Subject: [PATCH 060/113] NFS: add debug messages to callback down function Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 6dfdc8311f27..8ed0bc8cffb6 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -410,7 +410,9 @@ void nfs_callback_down(int minorversion, struct net *net) cb_info->users--; if (cb_info->users == 0 && cb_info->task != NULL) { kthread_stop(cb_info->task); + dprintk("nfs_callback_down: service stopped\n"); svc_exit_thread(cb_info->rqst); + dprintk("nfs_callback_down: service destroyed\n"); cb_info->serv = NULL; cb_info->rqst = NULL; cb_info->task = NULL; From e9406db20fecbfcab646bad157b4cfdc7cadddfb Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 18 Sep 2012 13:37:12 +0400 Subject: [PATCH 061/113] lockd: per-net NSM client creation and destruction helpers introduced NSM RPC client can be required on NFSv3 umount, when child reaper is dying (and destroying it's mount namespace). It means, that current nsproxy is set to NULL already, but creation of RPC client requires UTS namespace for gaining hostname string. This patch introduces reference counted NFS RPC clients creation and destruction helpers (similar to RPCBIND RPC clients). Signed-off-by: Stanislav Kinsbursky Cc: Signed-off-by: Trond Myklebust --- fs/lockd/mon.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++-- fs/lockd/netns.h | 4 ++++ fs/lockd/svc.c | 1 + 3 files changed, 54 insertions(+), 2 deletions(-) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 7ef14b3c5bee..38f240e104ce 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -19,6 +19,8 @@ #include +#include "netns.h" + #define NLMDBG_FACILITY NLMDBG_MONITOR #define NSM_PROGRAM 100024 #define NSM_VERSION 1 @@ -70,7 +72,7 @@ static struct rpc_clnt *nsm_create(struct net *net) }; struct rpc_create_args args = { .net = net, - .protocol = XPRT_TRANSPORT_UDP, + .protocol = XPRT_TRANSPORT_TCP, .address = (struct sockaddr *)&sin, .addrsize = sizeof(sin), .servername = "rpc.statd", @@ -83,6 +85,51 @@ static struct rpc_clnt *nsm_create(struct net *net) return rpc_create(&args); } +__maybe_unused static struct rpc_clnt *nsm_client_get(struct net *net) +{ + static DEFINE_MUTEX(nsm_create_mutex); + struct rpc_clnt *clnt; + struct lockd_net *ln = net_generic(net, lockd_net_id); + + spin_lock(&ln->nsm_clnt_lock); + if (ln->nsm_users) { + ln->nsm_users++; + clnt = ln->nsm_clnt; + spin_unlock(&ln->nsm_clnt_lock); + goto out; + } + spin_unlock(&ln->nsm_clnt_lock); + + mutex_lock(&nsm_create_mutex); + clnt = nsm_create(net); + if (!IS_ERR(clnt)) { + ln->nsm_clnt = clnt; + smp_wmb(); + ln->nsm_users = 1; + } + mutex_unlock(&nsm_create_mutex); +out: + return clnt; +} + +__maybe_unused static void nsm_client_put(struct net *net) +{ + struct lockd_net *ln = net_generic(net, lockd_net_id); + struct rpc_clnt *clnt = ln->nsm_clnt; + int shutdown = 0; + + spin_lock(&ln->nsm_clnt_lock); + if (ln->nsm_users) { + if (--ln->nsm_users) + ln->nsm_clnt = NULL; + shutdown = !ln->nsm_users; + } + spin_unlock(&ln->nsm_clnt_lock); + + if (shutdown) + rpc_shutdown_client(clnt); +} + static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, struct net *net) { @@ -111,7 +158,7 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, memset(res, 0, sizeof(*res)); msg.rpc_proc = &clnt->cl_procinfo[proc]; - status = rpc_call_sync(clnt, &msg, 0); + status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); if (status < 0) dprintk("lockd: NSM upcall RPC failed, status=%d\n", status); diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 4eee248ba96e..5010b55628b4 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -12,6 +12,10 @@ struct lockd_net { struct delayed_work grace_period_end; struct lock_manager lockd_manager; struct list_head grace_list; + + spinlock_t nsm_clnt_lock; + unsigned int nsm_users; + struct rpc_clnt *nsm_clnt; }; extern int lockd_net_id; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 31a63f87b806..7e355870d519 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -596,6 +596,7 @@ static int lockd_init_net(struct net *net) INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); INIT_LIST_HEAD(&ln->grace_list); + spin_lock_init(&ln->nsm_clnt_lock); return 0; } From 303a7ce92064c285a04c870f2dc0192fdb2968cb Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 18 Sep 2012 13:37:18 +0400 Subject: [PATCH 062/113] lockd: use rpc client's cl_nodename for id encoding Taking hostname from uts namespace if not safe, because this cuold be performind during umount operation on child reaper death. And in this case current->nsproxy is NULL already. Signed-off-by: Stanislav Kinsbursky Cc: Signed-off-by: Trond Myklebust --- fs/lockd/mon.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 38f240e104ce..e0bc36e74ceb 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -42,6 +42,7 @@ struct nsm_args { u32 proc; char *mon_name; + char *nodename; }; struct nsm_res { @@ -141,6 +142,7 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, .vers = 3, .proc = NLMPROC_NSM_NOTIFY, .mon_name = nsm->sm_mon_name, + .nodename = utsname()->nodename, }; struct rpc_message msg = { .rpc_argp = &args, @@ -477,7 +479,7 @@ static void encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) { __be32 *p; - encode_nsm_string(xdr, utsname()->nodename); + encode_nsm_string(xdr, argp->nodename); p = xdr_reserve_space(xdr, 4 + 4 + 4); *p++ = cpu_to_be32(argp->prog); *p++ = cpu_to_be32(argp->vers); From cb7323fffa85df37161f4d3be45e1f787808309c Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 18 Sep 2012 13:37:23 +0400 Subject: [PATCH 063/113] lockd: create and use per-net NSM RPC clients on MON/UNMON requests NSM RPC client can be required on NFSv3 umount, when child reaper is dying (and destroying it's mount namespace). It means, that current nsproxy is set to NULL already, but creation of RPC client requires UTS namespace for gaining hostname string. This patch creates reference-counted per-net NSM client on first monitor request and destroys it after last unmonitor request. Signed-off-by: Stanislav Kinsbursky Cc: Signed-off-by: Trond Myklebust --- fs/lockd/mon.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index e0bc36e74ceb..e4fb3ba5a58a 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -7,7 +7,6 @@ */ #include -#include #include #include #include @@ -86,7 +85,7 @@ static struct rpc_clnt *nsm_create(struct net *net) return rpc_create(&args); } -__maybe_unused static struct rpc_clnt *nsm_client_get(struct net *net) +static struct rpc_clnt *nsm_client_get(struct net *net) { static DEFINE_MUTEX(nsm_create_mutex); struct rpc_clnt *clnt; @@ -113,7 +112,7 @@ out: return clnt; } -__maybe_unused static void nsm_client_put(struct net *net) +static void nsm_client_put(struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); struct rpc_clnt *clnt = ln->nsm_clnt; @@ -132,9 +131,8 @@ __maybe_unused static void nsm_client_put(struct net *net) } static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, - struct net *net) + struct rpc_clnt *clnt) { - struct rpc_clnt *clnt; int status; struct nsm_args args = { .priv = &nsm->sm_priv, @@ -142,20 +140,14 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, .vers = 3, .proc = NLMPROC_NSM_NOTIFY, .mon_name = nsm->sm_mon_name, - .nodename = utsname()->nodename, + .nodename = clnt->cl_nodename, }; struct rpc_message msg = { .rpc_argp = &args, .rpc_resp = res, }; - clnt = nsm_create(net); - if (IS_ERR(clnt)) { - status = PTR_ERR(clnt); - dprintk("lockd: failed to create NSM upcall transport, " - "status=%d\n", status); - goto out; - } + BUG_ON(clnt == NULL); memset(res, 0, sizeof(*res)); @@ -166,8 +158,6 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, status); else status = 0; - rpc_shutdown_client(clnt); - out: return status; } @@ -187,6 +177,7 @@ int nsm_monitor(const struct nlm_host *host) struct nsm_handle *nsm = host->h_nsmhandle; struct nsm_res res; int status; + struct rpc_clnt *clnt; dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); @@ -199,7 +190,15 @@ int nsm_monitor(const struct nlm_host *host) */ nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; - status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host->net); + clnt = nsm_client_get(host->net); + if (IS_ERR(clnt)) { + status = PTR_ERR(clnt); + dprintk("lockd: failed to create NSM upcall transport, " + "status=%d, net=%p\n", status, host->net); + return status; + } + + status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt); if (unlikely(res.status != 0)) status = -EIO; if (unlikely(status < 0)) { @@ -231,9 +230,11 @@ void nsm_unmonitor(const struct nlm_host *host) if (atomic_read(&nsm->sm_count) == 1 && nsm->sm_monitored && !nsm->sm_sticky) { + struct lockd_net *ln = net_generic(host->net, lockd_net_id); + dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); - status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host->net); + status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt); if (res.status != 0) status = -EIO; if (status < 0) @@ -241,6 +242,8 @@ void nsm_unmonitor(const struct nlm_host *host) nsm->sm_name); else nsm->sm_monitored = 0; + + nsm_client_put(host->net); } } From 8cb7f74eeeb5441811d93f94b6138d4a5a9d8b20 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:23:14 -0400 Subject: [PATCH 064/113] NFS: nfs_parsed_mount_options can use unsigned int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fs/nfs/super.c: In function ‘nfs_compare_remount_data’: fs/nfs/super.c:2042:18: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] fs/nfs/super.c:2043:18: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] fs/nfs/super.c:2044:20: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] fs/nfs/super.c:2046:21: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] fs/nfs/super.c:2047:21: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] fs/nfs/super.c:2048:21: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] fs/nfs/super.c:2049:21: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] fs/nfs/super.c:2050:18: warning: comparison between signed and unsigned integer expressions [-Wsign-compare] Seen with gcc (GCC) 4.6.3 20120306 (Red Hat 4.6.3-2). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 31fdb03225cd..89560be07e4a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -101,11 +101,11 @@ struct nfs_client_initdata { */ struct nfs_parsed_mount_data { int flags; - int rsize, wsize; - int timeo, retrans; - int acregmin, acregmax, + unsigned int rsize, wsize; + unsigned int timeo, retrans; + unsigned int acregmin, acregmax, acdirmin, acdirmax; - int namlen; + unsigned int namlen; unsigned int options; unsigned int bsize; unsigned int auth_flavor_len; From ffe5a83005b0d23575ab109755b4cb5518a5d91f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:23:23 -0400 Subject: [PATCH 065/113] NFS: Slow down state manager after an unhandled error If the state manager thread is not actually able to fully recover from some situation, it wakes up waiters, who kick off a new state manager thread. Quite often the fresh invocation of the state manager is just as successful. This results in a livelock as the client dumps thousands of NFS requests a second on the network in a vain attempt to recover. Not very friendly. To mitigate this situation, add a delay in the state manager after an unhandled error, so that the client sends just a few requests every second in this case. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index bd8ed01cb0ea..38eeefd95375 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2015,6 +2015,7 @@ out_error: pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" " with error %d\n", section_sep, section, clp->cl_hostname, -status); + ssleep(1); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } From d8af9bc16c9e9506c10581111f897be04486218f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:23:34 -0400 Subject: [PATCH 066/113] SUNRPC: Clean up dprintk messages in rpc_pipe.c Clean up: The blank space in front of the message must be spaces. Tabs show up on the console as a graphical character. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 21fde99e5c56..80f5dd23417d 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1119,8 +1119,8 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) return -ENOMEM; - dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", net, - NET_NAME(net)); + dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", + net, NET_NAME(net)); sn->pipefs_sb = sb; err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_MOUNT, @@ -1155,8 +1155,8 @@ static void rpc_kill_sb(struct super_block *sb) sn->pipefs_sb = NULL; mutex_unlock(&sn->pipefs_sb_lock); put_net(net); - dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n", net, - NET_NAME(net)); + dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n", + net, NET_NAME(net)); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); From 632f0d0503accb8ab749a1165af99d344579c37b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:23:43 -0400 Subject: [PATCH 067/113] SUNRPC: Use __func__ in dprintk() in auth_gss.c Clean up: Some function names have changed, but debugging messages were never updated. Automate the construction of the function name in debugging messages. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 58 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 34c522021004..909dc0c31aab 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -239,7 +239,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct } return q; err: - dprintk("RPC: gss_fill_context returning %ld\n", -PTR_ERR(p)); + dprintk("RPC: %s returning %ld\n", __func__, -PTR_ERR(p)); return p; } @@ -301,10 +301,10 @@ __gss_find_upcall(struct rpc_pipe *pipe, uid_t uid) if (pos->uid != uid) continue; atomic_inc(&pos->count); - dprintk("RPC: gss_find_upcall found msg %p\n", pos); + dprintk("RPC: %s found msg %p\n", __func__, pos); return pos; } - dprintk("RPC: gss_find_upcall found nothing\n"); + dprintk("RPC: %s found nothing\n", __func__); return NULL; } @@ -507,8 +507,8 @@ gss_refresh_upcall(struct rpc_task *task) struct rpc_pipe *pipe; int err = 0; - dprintk("RPC: %5u gss_refresh_upcall for uid %u\n", task->tk_pid, - cred->cr_uid); + dprintk("RPC: %5u %s for uid %u\n", + task->tk_pid, __func__, cred->cr_uid); gss_msg = gss_setup_upcall(task->tk_client, gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { /* XXX: warning on the first, under the assumption we @@ -539,8 +539,8 @@ gss_refresh_upcall(struct rpc_task *task) spin_unlock(&pipe->lock); gss_release_msg(gss_msg); out: - dprintk("RPC: %5u gss_refresh_upcall for uid %u result %d\n", - task->tk_pid, cred->cr_uid, err); + dprintk("RPC: %5u %s for uid %u result %d\n", + task->tk_pid, __func__, cred->cr_uid, err); return err; } @@ -553,7 +553,7 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) DEFINE_WAIT(wait); int err = 0; - dprintk("RPC: gss_upcall for uid %u\n", cred->cr_uid); + dprintk("RPC: %s for uid %u\n", __func__, cred->cr_uid); retry: gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { @@ -594,8 +594,8 @@ out_intr: finish_wait(&gss_msg->waitqueue, &wait); gss_release_msg(gss_msg); out: - dprintk("RPC: gss_create_upcall for uid %u result %d\n", - cred->cr_uid, err); + dprintk("RPC: %s for uid %u result %d\n", + __func__, cred->cr_uid, err); return err; } @@ -681,7 +681,7 @@ err_put_ctx: err: kfree(buf); out: - dprintk("RPC: gss_pipe_downcall returning %Zd\n", err); + dprintk("RPC: %s returning %Zd\n", __func__, err); return err; } @@ -747,8 +747,8 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); if (msg->errno < 0) { - dprintk("RPC: gss_pipe_destroy_msg releasing msg %p\n", - gss_msg); + dprintk("RPC: %s releasing msg %p\n", + __func__, gss_msg); atomic_inc(&gss_msg->count); gss_unhash_msg(gss_msg); if (msg->errno == -ETIMEDOUT) @@ -976,7 +976,7 @@ gss_destroying_context(struct rpc_cred *cred) static void gss_do_free_ctx(struct gss_cl_ctx *ctx) { - dprintk("RPC: gss_free_ctx\n"); + dprintk("RPC: %s\n", __func__); gss_delete_sec_context(&ctx->gc_gss_ctx); kfree(ctx->gc_wire_ctx.data); @@ -999,7 +999,7 @@ gss_free_ctx(struct gss_cl_ctx *ctx) static void gss_free_cred(struct gss_cred *gss_cred) { - dprintk("RPC: gss_free_cred %p\n", gss_cred); + dprintk("RPC: %s cred=%p\n", __func__, gss_cred); kfree(gss_cred); } @@ -1049,8 +1049,8 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) struct gss_cred *cred = NULL; int err = -ENOMEM; - dprintk("RPC: gss_create_cred for uid %d, flavor %d\n", - acred->uid, auth->au_flavor); + dprintk("RPC: %s for uid %d, flavor %d\n", + __func__, acred->uid, auth->au_flavor); if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS))) goto out_err; @@ -1069,7 +1069,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) return &cred->gc_base; out_err: - dprintk("RPC: gss_create_cred failed with error %d\n", err); + dprintk("RPC: %s failed with error %d\n", __func__, err); return ERR_PTR(err); } @@ -1127,7 +1127,7 @@ gss_marshal(struct rpc_task *task, __be32 *p) struct kvec iov; struct xdr_buf verf_buf; - dprintk("RPC: %5u gss_marshal\n", task->tk_pid); + dprintk("RPC: %5u %s\n", task->tk_pid, __func__); *p++ = htonl(RPC_AUTH_GSS); cred_len = p++; @@ -1253,7 +1253,7 @@ gss_validate(struct rpc_task *task, __be32 *p) u32 flav,len; u32 maj_stat; - dprintk("RPC: %5u gss_validate\n", task->tk_pid); + dprintk("RPC: %5u %s\n", task->tk_pid, __func__); flav = ntohl(*p++); if ((len = ntohl(*p++)) > RPC_MAX_AUTH_SIZE) @@ -1271,20 +1271,20 @@ gss_validate(struct rpc_task *task, __be32 *p) if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) { - dprintk("RPC: %5u gss_validate: gss_verify_mic returned " - "error 0x%08x\n", task->tk_pid, maj_stat); + dprintk("RPC: %5u %s: gss_verify_mic returned error 0x%08x\n", + task->tk_pid, __func__, maj_stat); goto out_bad; } /* We leave it to unwrap to calculate au_rslack. For now we just * calculate the length of the verifier: */ cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2; gss_put_ctx(ctx); - dprintk("RPC: %5u gss_validate: gss_verify_mic succeeded.\n", - task->tk_pid); + dprintk("RPC: %5u %s: gss_verify_mic succeeded.\n", + task->tk_pid, __func__); return p + XDR_QUADLEN(len); out_bad: gss_put_ctx(ctx); - dprintk("RPC: %5u gss_validate failed.\n", task->tk_pid); + dprintk("RPC: %5u %s failed.\n", task->tk_pid, __func__); return NULL; } @@ -1466,7 +1466,7 @@ gss_wrap_req(struct rpc_task *task, struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); int status = -EIO; - dprintk("RPC: %5u gss_wrap_req\n", task->tk_pid); + dprintk("RPC: %5u %s\n", task->tk_pid, __func__); if (ctx->gc_proc != RPC_GSS_PROC_DATA) { /* The spec seems a little ambiguous here, but I think that not * wrapping context destruction requests makes the most sense. @@ -1489,7 +1489,7 @@ gss_wrap_req(struct rpc_task *task, } out: gss_put_ctx(ctx); - dprintk("RPC: %5u gss_wrap_req returning %d\n", task->tk_pid, status); + dprintk("RPC: %5u %s returning %d\n", task->tk_pid, __func__, status); return status; } @@ -1604,8 +1604,8 @@ out_decode: status = gss_unwrap_req_decode(decode, rqstp, p, obj); out: gss_put_ctx(ctx); - dprintk("RPC: %5u gss_unwrap_resp returning %d\n", task->tk_pid, - status); + dprintk("RPC: %5u %s returning %d\n", + task->tk_pid, __func__, status); return status; } From 1b63a75180c6c65c71655c250a4e6b578ba7d1c0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:23:52 -0400 Subject: [PATCH 068/113] SUNRPC: Refactor rpc_clone_client() rpc_clone_client() does most of the same tasks as rpc_new_client(), so there is an opportunity for code re-use. Create a generic helper that makes it easy to clone an RPC client while replacing any of the clnt's parameters. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 83 ++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index fa48c60aef23..afbeefab6600 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -490,59 +490,62 @@ EXPORT_SYMBOL_GPL(rpc_create); * same transport while varying parameters such as the authentication * flavour. */ -struct rpc_clnt * -rpc_clone_client(struct rpc_clnt *clnt) +static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, + struct rpc_clnt *clnt) { - struct rpc_clnt *new; struct rpc_xprt *xprt; - int err = -ENOMEM; + struct rpc_clnt *new; + int err; - new = kmemdup(clnt, sizeof(*new), GFP_KERNEL); - if (!new) - goto out_no_clnt; - new->cl_parent = clnt; - /* Turn off autobind on clones */ - new->cl_autobind = 0; - INIT_LIST_HEAD(&new->cl_tasks); - spin_lock_init(&new->cl_lock); - rpc_init_rtt(&new->cl_rtt_default, clnt->cl_timeout->to_initval); - new->cl_metrics = rpc_alloc_iostats(clnt); - if (new->cl_metrics == NULL) - goto out_no_stats; - if (clnt->cl_principal) { - new->cl_principal = kstrdup(clnt->cl_principal, GFP_KERNEL); - if (new->cl_principal == NULL) - goto out_no_principal; - } + err = -ENOMEM; rcu_read_lock(); xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); rcu_read_unlock(); if (xprt == NULL) - goto out_no_transport; - rcu_assign_pointer(new->cl_xprt, xprt); - atomic_set(&new->cl_count, 1); - err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name); - if (err != 0) - goto out_no_path; - rpc_clnt_set_nodename(new, utsname()->nodename); - if (new->cl_auth) - atomic_inc(&new->cl_auth->au_count); + goto out_err; + args->servername = xprt->servername; + + new = rpc_new_client(args, xprt); + if (IS_ERR(new)) { + err = PTR_ERR(new); + goto out_put; + } + atomic_inc(&clnt->cl_count); - rpc_register_client(new); - rpciod_up(); + new->cl_parent = clnt; + + /* Turn off autobind on clones */ + new->cl_autobind = 0; + new->cl_softrtry = clnt->cl_softrtry; + new->cl_discrtry = clnt->cl_discrtry; + new->cl_chatty = clnt->cl_chatty; return new; -out_no_path: + +out_put: xprt_put(xprt); -out_no_transport: - kfree(new->cl_principal); -out_no_principal: - rpc_free_iostats(new->cl_metrics); -out_no_stats: - kfree(new); -out_no_clnt: +out_err: dprintk("RPC: %s: returned error %d\n", __func__, err); return ERR_PTR(err); } + +/** + * rpc_clone_client - Clone an RPC client structure + * + * @clnt: RPC client whose parameters are copied + * + * Returns a fresh RPC client or an ERR_PTR. + */ +struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt) +{ + struct rpc_create_args args = { + .program = clnt->cl_program, + .prognumber = clnt->cl_prog, + .version = clnt->cl_vers, + .authflavor = clnt->cl_auth->au_flavor, + .client_name = clnt->cl_principal, + }; + return __rpc_clone_client(&args, clnt); +} EXPORT_SYMBOL_GPL(rpc_clone_client); /* From ba9b584c1dc37851d9c6ca6d0d2ccba55d9aad04 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:24:02 -0400 Subject: [PATCH 069/113] SUNRPC: Introduce rpc_clone_client_set_auth() An ULP is supposed to be able to replace a GSS rpc_auth object with another GSS rpc_auth object using rpcauth_create(). However, rpcauth_create() in 3.5 reliably fails with -EEXIST in this case. This is because when gss_create() attempts to create the upcall pipes, sometimes they are already there. For example if a pipe FS mount event occurs, or a previous GSS flavor was in use for this rpc_clnt. It turns out that's not the only problem here. While working on a fix for the above problem, we noticed that replacing an rpc_clnt's rpc_auth is not safe, since dereferencing the cl_auth field is not protected in any way. So we're deprecating the ability of rpcauth_create() to switch an rpc_clnt's security flavor during normal operation. Instead, let's add a fresh API that clones an rpc_clnt and gives the clone a new flavor before it's used. This makes immediate use of the new __rpc_clone_client() helper. This can be used in a similar fashion to rpcauth_create() when a client is hunting for the correct security flavor. Instead of replacing an rpc_clnt's security flavor in a loop, the ULP replaces the whole rpc_clnt. To fix the -EEXIST problem, any ULP logic that relies on replacing an rpc_clnt's rpc_auth with rpcauth_create() must be changed to use this API instead. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 13 ++----------- fs/nfs/nfs4namespace.c | 14 +------------- include/linux/sunrpc/clnt.h | 2 ++ net/sunrpc/clnt.c | 22 ++++++++++++++++++++++ 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 99694442b93f..143149db3440 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -668,7 +668,8 @@ int nfs_init_server_rpcclient(struct nfs_server *server, { struct nfs_client *clp = server->nfs_client; - server->client = rpc_clone_client(clp->cl_rpcclient); + server->client = rpc_clone_client_set_auth(clp->cl_rpcclient, + pseudoflavour); if (IS_ERR(server->client)) { dprintk("%s: couldn't create rpc_client!\n", __func__); return PTR_ERR(server->client); @@ -678,16 +679,6 @@ int nfs_init_server_rpcclient(struct nfs_server *server, timeo, sizeof(server->client->cl_timeout_default)); server->client->cl_timeout = &server->client->cl_timeout_default; - - if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) { - struct rpc_auth *auth; - - auth = rpcauth_create(pseudoflavour, server->client); - if (IS_ERR(auth)) { - dprintk("%s: couldn't create credcache!\n", __func__); - return PTR_ERR(auth); - } - } server->client->cl_softrtry = 0; if (server->flags & NFS_MOUNT_SOFT) server->client->cl_softrtry = 1; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 4fdeb1b7042e..79fbb61ce202 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -192,25 +192,13 @@ out: struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode, struct qstr *name) { - struct rpc_clnt *clone; - struct rpc_auth *auth; rpc_authflavor_t flavor; flavor = nfs4_negotiate_security(inode, name); if ((int)flavor < 0) return ERR_PTR((int)flavor); - clone = rpc_clone_client(clnt); - if (IS_ERR(clone)) - return clone; - - auth = rpcauth_create(flavor, clone); - if (IS_ERR(auth)) { - rpc_shutdown_client(clone); - clone = ERR_PTR(-EIO); - } - - return clone; + return rpc_clone_client_set_auth(clnt, flavor); } static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 523547ecfee2..34206b84d8da 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -130,6 +130,8 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, const struct rpc_program *, u32); void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt); struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); +struct rpc_clnt *rpc_clone_client_set_auth(struct rpc_clnt *, + rpc_authflavor_t); void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); void rpc_task_release_client(struct rpc_task *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index afbeefab6600..cdc7564b4512 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -548,6 +548,28 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_clone_client); +/** + * rpc_clone_client_set_auth - Clone an RPC client structure and set its auth + * + * @clnt: RPC client whose parameters are copied + * @auth: security flavor for new client + * + * Returns a fresh RPC client or an ERR_PTR. + */ +struct rpc_clnt * +rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor) +{ + struct rpc_create_args args = { + .program = clnt->cl_program, + .prognumber = clnt->cl_prog, + .version = clnt->cl_vers, + .authflavor = flavor, + .client_name = clnt->cl_principal, + }; + return __rpc_clone_client(&args, clnt); +} +EXPORT_SYMBOL_GPL(rpc_clone_client_set_auth); + /* * Kill all tasks for the given client. * XXX: kill their descendants as well? From 896526174ce2b6a773e187ebe5a047b68230e2c4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:24:11 -0400 Subject: [PATCH 070/113] NFS: Introduce "migration" mount option Currently, the Linux client uses a unique nfs_client_id4.id string when identifying itself to distinct NFS servers. To support transparent state migration, the Linux client will have to use the same nfs_client_id4 string for all servers it communicates with (also known as the "uniform client string" approach). Otherwise NFS servers can not recognize that open and lock state need to be merged after a file system transition. Unfortunately, there are some NFSv4.0 servers currently in the field that do not tolerate the uniform client string approach. Thus, by default, our NFSv4.0 mounts will continue to use the current approach, and we introduce a mount option that switches them to use the uniform model. Client administrators must identify which servers can be mounted with this option. Eventually most NFSv4.0 servers will be able to handle the uniform approach, and we can change the default. The first mount of a server controls the behavior for all subsequent mounts for the lifetime of that set of mounts of that server. After the last mount of that server is gone, the client erases the data structure that tracks the lease. A subsequent lease may then honor a different "migration" setting. This patch adds only the infrastructure for parsing the new mount option. Support for uniform client strings is added in a subsequent patch. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 ++ fs/nfs/super.c | 20 ++++++++++++++++++++ include/linux/nfs_fs_sb.h | 2 ++ 3 files changed, 24 insertions(+) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 143149db3440..92aed2e08bd5 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -752,6 +752,8 @@ static int nfs_init_server(struct nfs_server *server, data->timeo, data->retrans); if (data->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (server->options & NFS_OPTION_MIGRATION) + set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b8eda700584b..056138d45c11 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -88,6 +88,7 @@ enum { Opt_sharecache, Opt_nosharecache, Opt_resvport, Opt_noresvport, Opt_fscache, Opt_nofscache, + Opt_migration, Opt_nomigration, /* Mount options that take integer arguments */ Opt_port, @@ -147,6 +148,8 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_noresvport, "noresvport" }, { Opt_fscache, "fsc" }, { Opt_nofscache, "nofsc" }, + { Opt_migration, "migration" }, + { Opt_nomigration, "nomigration" }, { Opt_port, "port=%s" }, { Opt_rsize, "rsize=%s" }, @@ -676,6 +679,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->options & NFS_OPTION_FSCACHE) seq_printf(m, ",fsc"); + if (nfss->options & NFS_OPTION_MIGRATION) + seq_printf(m, ",migration"); + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) { if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) seq_printf(m, ",lookupcache=none"); @@ -1243,6 +1249,12 @@ static int nfs_parse_mount_options(char *raw, kfree(mnt->fscache_uniq); mnt->fscache_uniq = NULL; break; + case Opt_migration: + mnt->options |= NFS_OPTION_MIGRATION; + break; + case Opt_nomigration: + mnt->options &= NFS_OPTION_MIGRATION; + break; /* * options that take numeric values @@ -1535,6 +1547,10 @@ static int nfs_parse_mount_options(char *raw, if (mnt->minorversion && mnt->version != 4) goto out_minorversion_mismatch; + if (mnt->options & NFS_OPTION_MIGRATION && + mnt->version != 4 && mnt->minorversion != 0) + goto out_migration_misuse; + /* * verify that any proto=/mountproto= options match the address * familiies in the addr=/mountaddr= options. @@ -1572,6 +1588,10 @@ out_minorversion_mismatch: printk(KERN_INFO "NFS: mount option vers=%u does not support " "minorversion=%u\n", mnt->version, mnt->minorversion); return 0; +out_migration_misuse: + printk(KERN_INFO + "NFS: 'migration' not supported for this NFS version\n"); + return 0; out_nomem: printk(KERN_INFO "NFS: not enough memory to parse option\n"); return 0; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 310c63c8ab2c..2e22fc7e47cf 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -39,6 +39,7 @@ struct nfs_client { unsigned long cl_flags; /* behavior switches */ #define NFS_CS_NORESVPORT 0 /* - use ephemeral src port */ #define NFS_CS_DISCRTRY 1 /* - disconnect on RPC retry */ +#define NFS_CS_MIGRATION 2 /* - transparent state migr */ struct sockaddr_storage cl_addr; /* server identifier */ size_t cl_addrlen; char * cl_hostname; /* hostname of server */ @@ -125,6 +126,7 @@ struct nfs_server { unsigned int namelen; unsigned int options; /* extra options enabled by mount */ #define NFS_OPTION_FSCACHE 0x00000001 /* - local caching enabled */ +#define NFS_OPTION_MIGRATION 0x00000002 /* - NFSv4 migration enabled */ struct nfs_fsid fsid; __u64 maxfilesize; /* maximum file size */ From e984a55a7418f777407c7edbb2bdf5eb9559b5e2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:24:21 -0400 Subject: [PATCH 071/113] NFS: Use the same nfs_client_id4 for every server Currently, when identifying itself to NFS servers, the Linux NFS client uses a unique nfs_client_id4.id string for each server IP address it talks with. For example, when client A talks to server X, the client identifies itself using a string like "AX". The requirements for these strings are specified in detail by RFC 3530 (and bis). This form of client identification presents a problem for Transparent State Migration. When client A's state on server X is migrated to server Y, it continues to be associated with string "AX." But, according to the rules of client string construction above, client A will present string "AY" when communicating with server Y. Server Y thus has no way to know that client A should be associated with the state migrated from server X. "AX" is all but abandoned, interfering with establishing fresh state for client A on server Y. To support transparent state migration, then, NFSv4.0 clients must instead use the same nfs_client_id4.id string to identify themselves to every NFS server; something like "A". Now a client identifies itself as "A" to server X. When a file system on server X transitions to server Y, and client A identifies itself as "A" to server Y, Y will know immediately that the state associated with "A," whether it is native or migrated, is owned by the client, and can merge both into a single lease. As a pre-requisite to adding support for NFSv4 migration to the Linux NFS client, this patch changes the way Linux identifies itself to NFS servers via the SETCLIENTID (NFSv4 minor version 0) and EXCHANGE_ID (NFSv4 minor version 1) operations. In addition to removing the server's IP address from nfs_client_id4, the Linux NFS client will also no longer use its own source IP address as part of the nfs_client_id4 string. On multi-homed clients, the value of this address depends on the address family and network routing used to contact the server, thus it can be different for each server. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 51 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5b3207f557d9..461411171966 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4047,6 +4047,32 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, memcpy(bootverf->data, verf, sizeof(bootverf->data)); } +static unsigned int +nfs4_init_nonuniform_client_string(const struct nfs_client *clp, + char *buf, size_t len) +{ + unsigned int result; + + rcu_read_lock(); + result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s", + clp->cl_ipaddr, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_PROTO)); + rcu_read_unlock(); + return result; +} + +static unsigned int +nfs4_init_uniform_client_string(const struct nfs_client *clp, + char *buf, size_t len) +{ + return scnprintf(buf, len, "Linux NFSv%u.%u %s", + clp->rpc_ops->version, clp->cl_minorversion, + clp->cl_rpcclient->cl_nodename); +} + /** * nfs4_proc_setclientid - Negotiate client ID * @clp: state data structure @@ -4077,15 +4103,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, /* nfs_client_id4 */ nfs4_init_boot_verifier(clp, &sc_verifier); - rcu_read_lock(); - setclientid.sc_name_len = scnprintf(setclientid.sc_name, - sizeof(setclientid.sc_name), "%s/%s %s", - clp->cl_ipaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR), - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_PROTO)); + if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags)) + setclientid.sc_name_len = + nfs4_init_uniform_client_string(clp, + setclientid.sc_name, + sizeof(setclientid.sc_name)); + else + setclientid.sc_name_len = + nfs4_init_nonuniform_client_string(clp, + setclientid.sc_name, + sizeof(setclientid.sc_name)); /* cb_client4 */ + rcu_read_lock(); setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, sizeof(setclientid.sc_netid), rpc_peeraddr2str(clp->cl_rpcclient, @@ -5307,10 +5336,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) }; nfs4_init_boot_verifier(clp, &verifier); - args.id_len = scnprintf(args.id, sizeof(args.id), - "%s/%s", - clp->cl_ipaddr, - clp->cl_rpcclient->cl_nodename); + args.id_len = nfs4_init_uniform_client_string(clp, args.id, + sizeof(args.id)); dprintk("NFS call exchange_id auth=%s, '%.*s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, args.id_len, args.id); From 05f4c350ee02e9461c6ae3a880ea326a06835e37 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:24:32 -0400 Subject: [PATCH 072/113] NFS: Discover NFSv4 server trunking when mounting "Server trunking" is a fancy named for a multi-homed NFS server. Trunking might occur if a client sends NFS requests for a single workload to multiple network interfaces on the same server. There are some implications for NFSv4 state management that make it useful for a client to know if a single NFSv4 server instance is multi-homed. (Note this is only a consideration for NFSv4, not for legacy versions of NFS, which are stateless). If a client cares about server trunking, no NFSv4 operations can proceed until that client determines who it is talking to. Thus server IP trunking discovery must be done when the client first encounters an unfamiliar server IP address. The nfs_get_client() function walks the nfs_client_list and matches on server IP address. The outcome of that walk tells us immediately if we have an unfamiliar server IP address. It invokes nfs_init_client() in this case. Thus, nfs4_init_client() is a good spot to perform trunking discovery. Discovery requires a client to establish a fresh client ID, so our client will now send SETCLIENTID or EXCHANGE_ID as the first NFS operation after a successful ping, rather than waiting for an application to perform an operation that requires NFSv4 state. The exact process for detecting trunking is different for NFSv4.0 and NFSv4.1, so a minorversion-specific init_client callout method is introduced. CLID_INUSE recovery is important for the trunking discovery process. CLID_INUSE is a sign the server recognizes the client's nfs_client_id4 id string, but the client is using the wrong principal this time for the SETCLIENTID operation. The SETCLIENTID must be retried with a series of different principals until one works, and then the rest of trunking discovery can proceed. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 3 +- fs/nfs/internal.h | 6 + fs/nfs/nfs4_fs.h | 8 ++ fs/nfs/nfs4client.c | 253 ++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4proc.c | 4 + fs/nfs/nfs4state.c | 182 ++++++++++++++++++++++++++- include/linux/nfs_fs_sb.h | 1 + 7 files changed, 455 insertions(+), 2 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 92aed2e08bd5..57d2a5c3d933 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -498,7 +498,8 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, return nfs_found_client(cl_init, clp); } if (new) { - list_add(&new->cl_share_link, &nn->nfs_client_list); + list_add_tail(&new->cl_share_link, + &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new->cl_flags = cl_init->init_flags; return rpc_ops->init_client(new, timeparms, ip_addr, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 89560be07e4a..89a795dc3027 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -483,6 +483,12 @@ extern int _nfs4_call_sync_session(struct rpc_clnt *clnt, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply); +extern int nfs40_walk_client_list(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred); +extern int nfs41_walk_client_list(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred); /* * Determine the device name as a string diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 9cacc131a8a4..832503c7a00e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -191,6 +191,8 @@ struct nfs4_state_recovery_ops { int (*establish_clid)(struct nfs_client *, struct rpc_cred *); struct rpc_cred * (*get_clid_cred)(struct nfs_client *); int (*reclaim_complete)(struct nfs_client *); + int (*detect_trunking)(struct nfs_client *, struct nfs_client **, + struct rpc_cred *); }; struct nfs4_state_maintenance_ops { @@ -320,9 +322,15 @@ extern void nfs4_renew_state(struct work_struct *); /* nfs4state.c */ struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); +int nfs4_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **); +int nfs40_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **, struct rpc_cred *); #if defined(CONFIG_NFS_V4_1) struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); +int nfs41_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **, struct rpc_cred *); extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); #else static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 612f5ebaabac..14ddd4d30966 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -185,6 +185,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, rpc_authflavor_t authflavour) { char buf[INET6_ADDRSTRLEN + 1]; + struct nfs_client *old; int error; if (clp->cl_cons_state == NFS_CS_READY) { @@ -230,6 +231,17 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, if (!nfs4_has_session(clp)) nfs_mark_client_ready(clp, NFS_CS_READY); + + error = nfs4_discover_server_trunking(clp, &old); + if (error < 0) + goto error; + if (clp != old) { + clp->cl_preserve_clid = true; + nfs_put_client(clp); + clp = old; + atomic_inc(&clp->cl_count); + } + return clp; error: @@ -239,6 +251,247 @@ error: return ERR_PTR(error); } +/* + * Returns true if the client IDs match + */ +static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) +{ + if (a->cl_clientid != b->cl_clientid) { + dprintk("NFS: --> %s client ID %llx does not match %llx\n", + __func__, a->cl_clientid, b->cl_clientid); + return false; + } + dprintk("NFS: --> %s client ID %llx matches %llx\n", + __func__, a->cl_clientid, b->cl_clientid); + return true; +} + +/* + * SETCLIENTID just did a callback update with the callback ident in + * "drop," but server trunking discovery claims "drop" and "keep" are + * actually the same server. Swap the callback IDs so that "keep" + * will continue to use the callback ident the server now knows about, + * and so that "keep"'s original callback ident is destroyed when + * "drop" is freed. + */ +static void nfs4_swap_callback_idents(struct nfs_client *keep, + struct nfs_client *drop) +{ + struct nfs_net *nn = net_generic(keep->cl_net, nfs_net_id); + unsigned int save = keep->cl_cb_ident; + + if (keep->cl_cb_ident == drop->cl_cb_ident) + return; + + dprintk("%s: keeping callback ident %u and dropping ident %u\n", + __func__, keep->cl_cb_ident, drop->cl_cb_ident); + + spin_lock(&nn->nfs_client_lock); + + idr_replace(&nn->cb_ident_idr, keep, drop->cl_cb_ident); + keep->cl_cb_ident = drop->cl_cb_ident; + + idr_replace(&nn->cb_ident_idr, drop, save); + drop->cl_cb_ident = save; + + spin_unlock(&nn->nfs_client_lock); +} + +/** + * nfs40_walk_client_list - Find server that recognizes a client ID + * + * @new: nfs_client with client ID to test + * @result: OUT: found nfs_client, or new + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in "result." + * + * NB: nfs40_walk_client_list() relies on the new nfs_client being + * the last nfs_client on the list. + */ +int nfs40_walk_client_list(struct nfs_client *new, + struct nfs_client **result, + struct rpc_cred *cred) +{ + struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); + struct nfs_client *pos, *n, *prev = NULL; + struct nfs4_setclientid_res clid = { + .clientid = new->cl_clientid, + .confirm = new->cl_confirm, + }; + int status; + + spin_lock(&nn->nfs_client_lock); + list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { + /* If "pos" isn't marked ready, we can't trust the + * remaining fields in "pos" */ + if (pos->cl_cons_state < NFS_CS_READY) + continue; + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_proto != new->cl_proto) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + + if (pos->cl_clientid != new->cl_clientid) + continue; + + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + if (prev) + nfs_put_client(prev); + + status = nfs4_proc_setclientid_confirm(pos, &clid, cred); + if (status == 0) { + nfs4_swap_callback_idents(pos, new); + + nfs_put_client(pos); + *result = pos; + dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", + __func__, pos, atomic_read(&pos->cl_count)); + return 0; + } + if (status != -NFS4ERR_STALE_CLIENTID) { + nfs_put_client(pos); + dprintk("NFS: <-- %s status = %d, no result\n", + __func__, status); + return status; + } + + spin_lock(&nn->nfs_client_lock); + prev = pos; + } + + /* + * No matching nfs_client found. This should be impossible, + * because the new nfs_client has already been added to + * nfs_client_list by nfs_get_client(). + * + * Don't BUG(), since the caller is holding a mutex. + */ + if (prev) + nfs_put_client(prev); + spin_unlock(&nn->nfs_client_lock); + pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); + return -NFS4ERR_STALE_CLIENTID; +} + +#ifdef CONFIG_NFS_V4_1 +/* + * Returns true if the server owners match + */ +static bool +nfs4_match_serverowners(struct nfs_client *a, struct nfs_client *b) +{ + struct nfs41_server_owner *o1 = a->cl_serverowner; + struct nfs41_server_owner *o2 = b->cl_serverowner; + + if (o1->minor_id != o2->minor_id) { + dprintk("NFS: --> %s server owner minor IDs do not match\n", + __func__); + return false; + } + + if (o1->major_id_sz != o2->major_id_sz) + goto out_major_mismatch; + if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) + goto out_major_mismatch; + + dprintk("NFS: --> %s server owners match\n", __func__); + return true; + +out_major_mismatch: + dprintk("NFS: --> %s server owner major IDs do not match\n", + __func__); + return false; +} + +/** + * nfs41_walk_client_list - Find nfs_client that matches a client/server owner + * + * @new: nfs_client with client ID to test + * @result: OUT: found nfs_client, or new + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in "result." + * + * NB: nfs41_walk_client_list() relies on the new nfs_client being + * the last nfs_client on the list. + */ +int nfs41_walk_client_list(struct nfs_client *new, + struct nfs_client **result, + struct rpc_cred *cred) +{ + struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); + struct nfs_client *pos, *n, *prev = NULL; + int error; + + spin_lock(&nn->nfs_client_lock); + list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { + /* If "pos" isn't marked ready, we can't trust the + * remaining fields in "pos", especially the client + * ID and serverowner fields. Wait for CREATE_SESSION + * to finish. */ + if (pos->cl_cons_state < NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + if (prev) + nfs_put_client(prev); + prev = pos; + + error = nfs_wait_client_init_complete(pos); + if (error < 0) { + nfs_put_client(pos); + continue; + } + + spin_lock(&nn->nfs_client_lock); + } + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_proto != new->cl_proto) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + + if (!nfs4_match_clientids(pos, new)) + continue; + + if (!nfs4_match_serverowners(pos, new)) + continue; + + spin_unlock(&nn->nfs_client_lock); + dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", + __func__, pos, atomic_read(&pos->cl_count)); + + *result = pos; + return 0; + } + + /* + * No matching nfs_client found. This should be impossible, + * because the new nfs_client has already been added to + * nfs_client_list by nfs_get_client(). + * + * Don't BUG(), since the caller is holding a mutex. + */ + spin_unlock(&nn->nfs_client_lock); + pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); + return -NFS4ERR_STALE_CLIENTID; +} +#endif /* CONFIG_NFS_V4_1 */ + static void nfs4_destroy_server(struct nfs_server *server) { nfs_server_return_all_delegations(server); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 461411171966..b5834abfcbff 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5458,6 +5458,8 @@ int nfs4_destroy_clientid(struct nfs_client *clp) goto out; if (clp->cl_exchange_flags == 0) goto out; + if (clp->cl_preserve_clid) + goto out; cred = nfs4_get_exchange_id_cred(clp); ret = nfs4_proc_destroy_clientid(clp, cred); if (cred) @@ -6871,6 +6873,7 @@ static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { .recover_lock = nfs4_lock_reclaim, .establish_clid = nfs4_init_clientid, .get_clid_cred = nfs4_get_setclientid_cred, + .detect_trunking = nfs40_discover_server_trunking, }; #if defined(CONFIG_NFS_V4_1) @@ -6882,6 +6885,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { .establish_clid = nfs41_init_clientid, .get_clid_cred = nfs4_get_exchange_id_cred, .reclaim_complete = nfs41_proc_reclaim_complete, + .detect_trunking = nfs41_discover_server_trunking, }; #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 38eeefd95375..5c4286643701 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -51,6 +51,8 @@ #include #include +#include + #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" @@ -63,7 +65,7 @@ #define OPENOWNER_POOL_SIZE 8 const nfs4_stateid zero_stateid; - +static DEFINE_MUTEX(nfs_clid_init_mutex); static LIST_HEAD(nfs4_clientid_list); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -98,6 +100,55 @@ out: return status; } +/** + * nfs40_discover_server_trunking - Detect server IP address trunking (mv0) + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * @cred: credential to use for trunking test + * + * Returns zero, a negative errno, or a negative NFS4ERR status. + * If zero is returned, an nfs_client pointer is planted in + * "result". + * + * Note: The returned client may not yet be marked ready. + */ +int nfs40_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred) +{ + struct nfs4_setclientid_res clid = { + .clientid = clp->cl_clientid, + .confirm = clp->cl_confirm, + }; + unsigned short port; + int status; + + port = nfs_callback_tcpport; + if (clp->cl_addr.ss_family == AF_INET6) + port = nfs_callback_tcpport6; + + status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); + if (status != 0) + goto out; + clp->cl_clientid = clid.clientid; + clp->cl_confirm = clid.confirm; + + status = nfs40_walk_client_list(clp, result, cred); + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + case 0: + /* Sustain the lease, even if it's empty. If the clientid4 + * goes stale it's of no use for trunking discovery. */ + nfs4_schedule_state_renewal(*result); + break; + } + +out: + return status; +} + struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) { struct rpc_cred *cred = NULL; @@ -277,6 +328,32 @@ out: return status; } +/** + * nfs41_discover_server_trunking - Detect server IP address trunking (mv1) + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * @cred: credential to use for trunking test + * + * Returns NFS4_OK, a negative errno, or a negative NFS4ERR status. + * If NFS4_OK is returned, an nfs_client pointer is planted in + * "result". + * + * Note: The returned client may not yet be marked ready. + */ +int nfs41_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **result, + struct rpc_cred *cred) +{ + int status; + + status = nfs4_proc_exchange_id(clp, cred); + if (status != NFS4_OK) + return status; + + return nfs41_walk_client_list(clp, result, cred); +} + struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp) { struct rpc_cred *cred; @@ -1705,6 +1782,109 @@ static int nfs4_purge_lease(struct nfs_client *clp) return 0; } +/** + * nfs4_discover_server_trunking - Detect server IP address trunking + * + * @clp: nfs_client under test + * @result: OUT: found nfs_client, or clp + * + * Returns zero or a negative errno. If zero is returned, + * an nfs_client pointer is planted in "result". + * + * Note: since we are invoked in process context, and + * not from inside the state manager, we cannot use + * nfs4_handle_reclaim_lease_error(). + */ +int nfs4_discover_server_trunking(struct nfs_client *clp, + struct nfs_client **result) +{ + const struct nfs4_state_recovery_ops *ops = + clp->cl_mvops->reboot_recovery_ops; + rpc_authflavor_t *flavors, flav, save; + struct rpc_clnt *clnt; + struct rpc_cred *cred; + int i, len, status; + + dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname); + + len = NFS_MAX_SECFLAVORS; + flavors = kcalloc(len, sizeof(*flavors), GFP_KERNEL); + if (flavors == NULL) { + status = -ENOMEM; + goto out; + } + len = rpcauth_list_flavors(flavors, len); + if (len < 0) { + status = len; + goto out_free; + } + clnt = clp->cl_rpcclient; + save = clnt->cl_auth->au_flavor; + i = 0; + + mutex_lock(&nfs_clid_init_mutex); + status = -ENOENT; +again: + cred = ops->get_clid_cred(clp); + if (cred == NULL) + goto out_unlock; + + status = ops->detect_trunking(clp, result, cred); + put_rpccred(cred); + switch (status) { + case 0: + break; + + case -EACCES: + if (clp->cl_machine_cred == NULL) + break; + /* Handle case where the user hasn't set up machine creds */ + nfs4_clear_machine_cred(clp); + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EAGAIN: + ssleep(1); + dprintk("NFS: %s after status %d, retrying\n", + __func__, status); + goto again; + + case -NFS4ERR_CLID_INUSE: + case -NFS4ERR_WRONGSEC: + status = -EPERM; + if (i >= len) + break; + + flav = flavors[i++]; + if (flav == save) + flav = flavors[i++]; + clnt = rpc_clone_client_set_auth(clnt, flav); + if (IS_ERR(clnt)) { + status = PTR_ERR(clnt); + break; + } + clp->cl_rpcclient = clnt; + goto again; + + case -NFS4ERR_MINOR_VERS_MISMATCH: + status = -EPROTONOSUPPORT; + break; + + case -EKEYEXPIRED: + nfs4_warn_keyexpired(clp->cl_hostname); + case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery + * in nfs4_exchange_id */ + status = -EKEYEXPIRED; + } + +out_unlock: + mutex_unlock(&nfs_clid_init_mutex); +out_free: + kfree(flavors); +out: + dprintk("NFS: %s: status = %d\n", __func__, status); + return status; +} + #ifdef CONFIG_NFS_V4_1 void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 2e22fc7e47cf..a9e76ee1adca 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -82,6 +82,7 @@ struct nfs_client { /* The flags used for obtaining the clientid during EXCHANGE_ID */ u32 cl_exchange_flags; struct nfs4_session *cl_session; /* shared session */ + bool cl_preserve_clid; struct nfs41_server_owner *cl_serverowner; struct nfs41_server_scope *cl_serverscope; struct nfs41_impl_id *cl_implid; From 6f2ea7f2a3ff3cd342bface43f8b4bf5e431cf36 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:24:41 -0400 Subject: [PATCH 073/113] NFS: Add nfs4_unique_id boot parameter An optional boot parameter is introduced to allow client administrators to specify a string that the Linux NFS client can insert into its nfs_client_id4 id string, to make it both more globally unique, and to ensure that it doesn't change even if the client's nodename changes. If this boot parameter is not specified, the client's nodename is used, as before. Client installation procedures can create a unique string (typically, a UUID) which remains unchanged during the lifetime of that client instance. This works just like creating a UUID for the label of the system's root and boot volumes. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- Documentation/filesystems/nfs/nfs.txt | 44 +++++++++++++++++++++++++-- Documentation/kernel-parameters.txt | 5 +++ fs/nfs/nfs4_fs.h | 3 ++ fs/nfs/nfs4proc.c | 6 +++- fs/nfs/super.c | 5 +++ 5 files changed, 59 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/nfs/nfs.txt b/Documentation/filesystems/nfs/nfs.txt index f50f26ce6cd0..f2571c8bef74 100644 --- a/Documentation/filesystems/nfs/nfs.txt +++ b/Documentation/filesystems/nfs/nfs.txt @@ -12,9 +12,47 @@ and work is in progress on adding support for minor version 1 of the NFSv4 protocol. The purpose of this document is to provide information on some of the -upcall interfaces that are used in order to provide the NFS client with -some of the information that it requires in order to fully comply with -the NFS spec. +special features of the NFS client that can be configured by system +administrators. + + +The nfs4_unique_id parameter +============================ + +NFSv4 requires clients to identify themselves to servers with a unique +string. File open and lock state shared between one client and one server +is associated with this identity. To support robust NFSv4 state recovery +and transparent state migration, this identity string must not change +across client reboots. + +Without any other intervention, the Linux client uses a string that contains +the local system's node name. System administrators, however, often do not +take care to ensure that node names are fully qualified and do not change +over the lifetime of a client system. Node names can have other +administrative requirements that require particular behavior that does not +work well as part of an nfs_client_id4 string. + +The nfs.nfs4_unique_id boot parameter specifies a unique string that can be +used instead of a system's node name when an NFS client identifies itself to +a server. Thus, if the system's node name is not unique, or it changes, its +nfs.nfs4_unique_id stays the same, preventing collision with other clients +or loss of state during NFS reboot recovery or transparent state migration. + +The nfs.nfs4_unique_id string is typically a UUID, though it can contain +anything that is believed to be unique across all NFS clients. An +nfs4_unique_id string should be chosen when a client system is installed, +just as a system's root file system gets a fresh UUID in its label at +install time. + +The string should remain fixed for the lifetime of the client. It can be +changed safely if care is taken that the client shuts down cleanly and all +outstanding NFSv4 state has expired, to prevent loss of NFSv4 state. + +This string can be stored in an NFS client's grub.conf, or it can be provided +via a net boot facility such as PXE. It may also be specified as an nfs.ko +module parameter. Specifying a uniquifier string is not support for NFS +clients running in containers. + The DNS resolver ================ diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ad7e2e5088c1..396a1e66e0ae 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1719,6 +1719,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. will be autodetected by the client, and it will fall back to using the idmapper. To turn off this behaviour, set the value to '0'. + nfs.nfs4_unique_id= + [NFS4] Specify an additional fixed unique ident- + ification string that NFSv4 clients can insert into + their nfs_client_id4 string. This is typically a + UUID that is generated at system install time. nfs.send_implementation_id = [NFSv4.1] Send client implementation identification diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 832503c7a00e..a525fdefccde 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -380,6 +380,9 @@ extern bool nfs4_disable_idmapping; extern unsigned short max_session_slots; extern unsigned short send_implementation_id; +#define NFS4_CLIENT_ID_UNIQ_LEN (64) +extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN]; + /* nfs4sysctl.c */ #ifdef CONFIG_SYSCTL int nfs4_register_sysctl(void); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b5834abfcbff..9aa97112426f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4068,9 +4068,13 @@ static unsigned int nfs4_init_uniform_client_string(const struct nfs_client *clp, char *buf, size_t len) { + char *nodename = clp->cl_rpcclient->cl_nodename; + + if (nfs4_client_id_uniquifier[0] != '\0') + nodename = nfs4_client_id_uniquifier; return scnprintf(buf, len, "Linux NFSv%u.%u %s", clp->rpc_ops->version, clp->cl_minorversion, - clp->cl_rpcclient->cl_nodename); + nodename); } /** diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 056138d45c11..56f02a9bd6d3 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2662,6 +2662,7 @@ unsigned int nfs_idmap_cache_timeout = 600; bool nfs4_disable_idmapping = true; unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; unsigned short send_implementation_id = 1; +char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); EXPORT_SYMBOL_GPL(nfs_callback_tcpport); @@ -2669,6 +2670,7 @@ EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout); EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); EXPORT_SYMBOL_GPL(max_session_slots); EXPORT_SYMBOL_GPL(send_implementation_id); +EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); #define NFS_CALLBACK_MAXPORTNR (65535U) @@ -2694,6 +2696,8 @@ static struct kernel_param_ops param_ops_portnr = { module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); module_param(nfs_idmap_cache_timeout, int, 0644); module_param(nfs4_disable_idmapping, bool, 0644); +module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier, + NFS4_CLIENT_ID_UNIQ_LEN, 0600); MODULE_PARM_DESC(nfs4_disable_idmapping, "Turn off NFSv4 idmapping when using 'sec=sys'"); module_param(max_session_slots, ushort, 0644); @@ -2702,6 +2706,7 @@ MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " module_param(send_implementation_id, ushort, 0644); MODULE_PARM_DESC(send_implementation_id, "Send implementation ID with NFSv4.1 exchange_id"); +MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string"); MODULE_ALIAS("nfs4"); #endif /* CONFIG_NFS_V4 */ From 7acdb026818455638543b04b68d4a580c367fba8 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 24 Aug 2012 00:27:48 +0800 Subject: [PATCH 074/113] NFSv41: fix DIO write_io calculation pnfs_within_mdsthreshold() is called inside pg_init. We need to set read_io/write_io before that. Otherwise we fail pnfs_within_mdsthreshold() and IO goes to MDS. A simple test case: dd if=foo of=/mnt/pnfs/bar bs=10M count=1 oflag=direct Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 22130df16218..253d397780b1 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -467,10 +467,10 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + NFS_I(inode)->read_io += iov_length(iov, nr_segs); result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); if (!result) result = nfs_direct_wait(dreq); - NFS_I(inode)->read_io += result; out_release: nfs_direct_req_release(dreq); out: @@ -818,6 +818,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, get_dreq(dreq); atomic_inc(&inode->i_dio_count); + NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs); for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); @@ -829,7 +830,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, pos += vec->iov_len; } nfs_pageio_complete(&desc); - NFS_I(dreq->inode)->write_io += desc.pg_bytes_written; /* * If no bytes were started, return the error, and let the From dc182549d439f60c332bf74d7f220a1bccf37da6 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 24 Aug 2012 00:27:49 +0800 Subject: [PATCH 075/113] NFS41: fix error of setting blocklayoutdriver After commit e38eb650 (NFS: set_pnfs_layoutdriver() from nfs4_proc_fsinfo()), set_pnfs_layoutdriver() is called inside nfs4_proc_fsinfo(), but pnfs_blksize is not set. It causes setting blocklayoutdriver failure and pnfsblock mount failure. Cc: stable [since v3.5] Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 1 - fs/nfs/nfs4proc.c | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 57d2a5c3d933..bab3e8af574a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -849,7 +849,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - server->pnfs_blksize = fsinfo->blksize; server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9aa97112426f..e10d66f5be0a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3402,8 +3402,11 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s nfs_fattr_init(fsinfo->fattr); error = nfs4_do_fsinfo(server, fhandle, fsinfo); - if (error == 0) + if (error == 0) { + /* block layout checks this! */ + server->pnfs_blksize = fsinfo->blksize; set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); + } return error; } From 5d0e3a004f02bffab51f542fa1d5b2e2854d8545 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 24 Aug 2012 00:27:50 +0800 Subject: [PATCH 076/113] Revert "pnfsblock: bail out partial page IO" This reverts commit 159e0561e322dd8008fff59e36efff8d2bdd0b0e, in favor of a more complete fix to the alignment issue. Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 39 +++----------------------------- 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 329bfbfed37d..3c61514599a5 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -228,14 +228,6 @@ bl_end_par_io_read(void *data, int unused) schedule_work(&rdata->task.u.tk_work); } -static bool -bl_check_alignment(u64 offset, u32 len, unsigned long blkmask) -{ - if ((offset & blkmask) || (len & blkmask)) - return false; - return true; -} - static enum pnfs_try_status bl_read_pagelist(struct nfs_read_data *rdata) { @@ -252,9 +244,6 @@ bl_read_pagelist(struct nfs_read_data *rdata) dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, rdata->pages.npages, f_offset, (unsigned int)rdata->args.count); - if (!bl_check_alignment(f_offset, rdata->args.count, PAGE_CACHE_MASK)) - goto use_mds; - par = alloc_parallel(rdata); if (!par) goto use_mds; @@ -563,7 +552,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) struct bio *bio = NULL; struct pnfs_block_extent *be = NULL, *cow_read = NULL; sector_t isect, last_isect = 0, extent_length = 0; - struct parallel_io *par = NULL; + struct parallel_io *par; loff_t offset = wdata->args.offset; size_t count = wdata->args.count; struct page **pages = wdata->args.pages; @@ -574,10 +563,6 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); - /* Check for alignment first */ - if (!bl_check_alignment(offset, count, PAGE_CACHE_MASK)) - goto out_mds; - /* At this point, wdata->pages is a (sequential) list of nfs_pages. * We want to write each, and if there is an error set pnfs_error * to have it redone using nfs. @@ -1011,32 +996,14 @@ bl_clear_layoutdriver(struct nfs_server *server) return 0; } -static void -bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK)) - nfs_pageio_reset_read_mds(pgio); - else - pnfs_generic_pg_init_read(pgio, req); -} - -static void -bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - if (!bl_check_alignment(req->wb_offset, req->wb_bytes, PAGE_CACHE_MASK)) - nfs_pageio_reset_write_mds(pgio); - else - pnfs_generic_pg_init_write(pgio, req); -} - static const struct nfs_pageio_ops bl_pg_read_ops = { - .pg_init = bl_pg_init_read, + .pg_init = pnfs_generic_pg_init_read, .pg_test = pnfs_generic_pg_test, .pg_doio = pnfs_generic_pg_readpages, }; static const struct nfs_pageio_ops bl_pg_write_ops = { - .pg_init = bl_pg_init_write, + .pg_init = pnfs_generic_pg_init_write, .pg_test = pnfs_generic_pg_test, .pg_doio = pnfs_generic_pg_writepages, }; From fe6e1e8d9fad86873eb74a26e80a8f91f9e870b5 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 24 Aug 2012 00:27:51 +0800 Subject: [PATCH 077/113] pnfsblock: fix partial page buffer wirte If applications use flock to protect its write range, generic NFS will not do read-modify-write cycle at page cache level. Therefore LD should know how to handle non-sector aligned writes. Otherwise there will be data corruption. Cc: stable Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 177 ++++++++++++++++++++++++++++--- fs/nfs/blocklayout/blocklayout.h | 1 + 2 files changed, 166 insertions(+), 12 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 3c61514599a5..a9fe644a12d1 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -162,25 +162,39 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, return bio; } -static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, +static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, struct page *page, struct pnfs_block_extent *be, void (*end_io)(struct bio *, int err), - struct parallel_io *par) + struct parallel_io *par, + unsigned int offset, int len) { + isect = isect + (offset >> SECTOR_SHIFT); + dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, + npg, rw, (unsigned long long)isect, offset, len); retry: if (!bio) { bio = bl_alloc_init_bio(npg, isect, be, end_io, par); if (!bio) return ERR_PTR(-ENOMEM); } - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + if (bio_add_page(bio, page, len, offset) < len) { bio = bl_submit_bio(rw, bio); goto retry; } return bio; } +static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, + sector_t isect, struct page *page, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par) +{ + return do_add_page_to_bio(bio, npg, rw, isect, page, be, + end_io, par, 0, PAGE_CACHE_SIZE); +} + /* This is basically copied from mpage_end_io_read */ static void bl_end_io_read(struct bio *bio, int err) { @@ -450,6 +464,106 @@ map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) return; } +static void +bl_read_single_end_io(struct bio *bio, int error) +{ + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct page *page = bvec->bv_page; + + /* Only one page in bvec */ + unlock_page(page); +} + +static int +bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be, + unsigned int offset, unsigned int len) +{ + struct bio *bio; + struct page *shadow_page; + sector_t isect; + char *kaddr, *kshadow_addr; + int ret = 0; + + dprintk("%s: offset %u len %u\n", __func__, offset, len); + + shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (shadow_page == NULL) + return -ENOMEM; + + bio = bio_alloc(GFP_NOIO, 1); + if (bio == NULL) + return -ENOMEM; + + isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) + + (offset / SECTOR_SIZE); + + bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_bdev = be->be_mdev; + bio->bi_end_io = bl_read_single_end_io; + + lock_page(shadow_page); + if (bio_add_page(bio, shadow_page, + SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) { + unlock_page(shadow_page); + bio_put(bio); + return -EIO; + } + + submit_bio(READ, bio); + wait_on_page_locked(shadow_page); + if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) { + ret = -EIO; + } else { + kaddr = kmap_atomic(page); + kshadow_addr = kmap_atomic(shadow_page); + memcpy(kaddr + offset, kshadow_addr + offset, len); + kunmap_atomic(kshadow_addr); + kunmap_atomic(kaddr); + } + __free_page(shadow_page); + bio_put(bio); + + return ret; +} + +static int +bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be, + unsigned int dirty_offset, unsigned int dirty_len, + bool full_page) +{ + int ret = 0; + unsigned int start, end; + + if (full_page) { + start = 0; + end = PAGE_CACHE_SIZE; + } else { + start = round_down(dirty_offset, SECTOR_SIZE); + end = round_up(dirty_offset + dirty_len, SECTOR_SIZE); + } + + dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len); + if (!be) { + zero_user_segments(page, start, dirty_offset, + dirty_offset + dirty_len, end); + if (start == 0 && end == PAGE_CACHE_SIZE && + trylock_page(page)) { + SetPageUptodate(page); + unlock_page(page); + } + return ret; + } + + if (start != dirty_offset) + ret = bl_do_readpage_sync(page, be, start, dirty_offset - start); + + if (!ret && (dirty_offset + dirty_len < end)) + ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len, + end - dirty_offset - dirty_len); + + return ret; +} + /* Given an unmapped page, zero it or read in page for COW, page is locked * by caller. */ @@ -483,7 +597,6 @@ init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) SetPageUptodate(page); cleanup: - bl_put_extent(cow_read); if (bh) free_buffer_head(bh); if (ret) { @@ -555,6 +668,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) struct parallel_io *par; loff_t offset = wdata->args.offset; size_t count = wdata->args.count; + unsigned int pg_offset, pg_len, saved_len; struct page **pages = wdata->args.pages; struct page *page; pgoff_t index; @@ -659,10 +773,11 @@ next_page: if (!extent_length) { /* We've used up the previous extent */ bl_put_extent(be); + bl_put_extent(cow_read); bio = bl_submit_bio(WRITE, bio); /* Get the next one */ be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), - isect, NULL); + isect, &cow_read); if (!be || !is_writable(be, isect)) { header->pnfs_error = -EINVAL; goto out; @@ -679,7 +794,26 @@ next_page: extent_length = be->be_length - (isect - be->be_f_offset); } - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + + dprintk("%s offset %lld count %Zu\n", __func__, offset, count); + pg_offset = offset & ~PAGE_CACHE_MASK; + if (pg_offset + count > PAGE_CACHE_SIZE) + pg_len = PAGE_CACHE_SIZE - pg_offset; + else + pg_len = count; + + saved_len = pg_len; + if (be->be_state == PNFS_BLOCK_INVALID_DATA && + !bl_is_sector_init(be->be_inval, isect)) { + ret = bl_read_partial_page_sync(pages[i], cow_read, + pg_offset, pg_len, true); + if (ret) { + dprintk("%s bl_read_partial_page_sync fail %d\n", + __func__, ret); + header->pnfs_error = ret; + goto out; + } + ret = bl_mark_sectors_init(be->be_inval, isect, PAGE_CACHE_SECTORS); if (unlikely(ret)) { @@ -688,15 +822,35 @@ next_page: header->pnfs_error = ret; goto out; } + + /* Expand to full page write */ + pg_offset = 0; + pg_len = PAGE_CACHE_SIZE; + } else if ((pg_offset & (SECTOR_SIZE - 1)) || + (pg_len & (SECTOR_SIZE - 1))){ + /* ahh, nasty case. We have to do sync full sector + * read-modify-write cycles. + */ + unsigned int saved_offset = pg_offset; + ret = bl_read_partial_page_sync(pages[i], be, pg_offset, + pg_len, false); + pg_offset = round_down(pg_offset, SECTOR_SIZE); + pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE) + - pg_offset; } - bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE, + + + bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE, isect, pages[i], be, - bl_end_io_write, par); + bl_end_io_write, par, + pg_offset, pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; goto out; } + offset += saved_len; + count -= saved_len; isect += PAGE_CACHE_SECTORS; last_isect = isect; extent_length -= PAGE_CACHE_SECTORS; @@ -714,17 +868,16 @@ next_page: } write_done: - wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset); - if (count < wdata->res.count) { - wdata->res.count = count; - } + wdata->res.count = wdata->args.count; out: bl_put_extent(be); + bl_put_extent(cow_read); bl_submit_bio(WRITE, bio); put_parallel(par); return PNFS_ATTEMPTED; out_mds: bl_put_extent(be); + bl_put_extent(cow_read); kfree(par); return PNFS_NOT_ATTEMPTED; } diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 03350690118e..39bb51a8dd18 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -41,6 +41,7 @@ #define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) +#define SECTOR_SIZE (1 << SECTOR_SHIFT) struct block_mount_id { spinlock_t bm_lock; /* protects list */ From f742dc4a32587bff50b13dde9d8894b96851951a Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 24 Aug 2012 00:27:52 +0800 Subject: [PATCH 078/113] pnfsblock: fix non-aligned DIO read For DIO read, if it is not sector aligned, we should reject it and resend via MDS. Otherwise there might be data corruption. Also teach bl_read_pagelist to handle partial page reads for DIO. Cc: stable [since v3.4] Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 64 ++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 8 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index a9fe644a12d1..61e04fb7c4b8 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -252,8 +252,11 @@ bl_read_pagelist(struct nfs_read_data *rdata) sector_t isect, extent_length = 0; struct parallel_io *par; loff_t f_offset = rdata->args.offset; + size_t bytes_left = rdata->args.count; + unsigned int pg_offset, pg_len; struct page **pages = rdata->args.pages; int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; + const bool is_dio = (header->dreq != NULL); dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, rdata->pages.npages, f_offset, (unsigned int)rdata->args.count); @@ -287,36 +290,53 @@ bl_read_pagelist(struct nfs_read_data *rdata) extent_length = min(extent_length, cow_length); } } + + if (is_dio) { + pg_offset = f_offset & ~PAGE_CACHE_MASK; + if (pg_offset + bytes_left > PAGE_CACHE_SIZE) + pg_len = PAGE_CACHE_SIZE - pg_offset; + else + pg_len = bytes_left; + + f_offset += pg_len; + bytes_left -= pg_len; + isect += (pg_offset >> SECTOR_SHIFT); + } else { + pg_offset = 0; + pg_len = PAGE_CACHE_SIZE; + } + hole = is_hole(be, isect); if (hole && !cow_read) { bio = bl_submit_bio(READ, bio); /* Fill hole w/ zeroes w/o accessing device */ dprintk("%s Zeroing page for hole\n", __func__); - zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); + zero_user_segment(pages[i], pg_offset, pg_len); print_page(pages[i]); SetPageUptodate(pages[i]); } else { struct pnfs_block_extent *be_read; be_read = (hole && cow_read) ? cow_read : be; - bio = bl_add_page_to_bio(bio, rdata->pages.npages - i, + bio = do_add_page_to_bio(bio, rdata->pages.npages - i, READ, isect, pages[i], be_read, - bl_end_io_read, par); + bl_end_io_read, par, + pg_offset, pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; goto out; } } - isect += PAGE_CACHE_SECTORS; + isect += (pg_len >> SECTOR_SHIFT); extent_length -= PAGE_CACHE_SECTORS; } if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { rdata->res.eof = 1; - rdata->res.count = header->inode->i_size - f_offset; + rdata->res.count = header->inode->i_size - rdata->args.offset; } else { - rdata->res.count = (isect << SECTOR_SHIFT) - f_offset; + rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset; } out: bl_put_extent(be); @@ -1149,9 +1169,37 @@ bl_clear_layoutdriver(struct nfs_server *server) return 0; } +static bool +is_aligned_req(struct nfs_page *req, unsigned int alignment) +{ + return IS_ALIGNED(req->wb_offset, alignment) && + IS_ALIGNED(req->wb_bytes, alignment); +} + +static void +bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + if (pgio->pg_dreq != NULL && + !is_aligned_req(req, SECTOR_SIZE)) + nfs_pageio_reset_read_mds(pgio); + else + pnfs_generic_pg_init_read(pgio, req); +} + +static bool +bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + if (pgio->pg_dreq != NULL && + !is_aligned_req(req, SECTOR_SIZE)) + return false; + + return pnfs_generic_pg_test(pgio, prev, req); +} + static const struct nfs_pageio_ops bl_pg_read_ops = { - .pg_init = pnfs_generic_pg_init_read, - .pg_test = pnfs_generic_pg_test, + .pg_init = bl_pg_init_read, + .pg_test = bl_pg_test_read, .pg_doio = pnfs_generic_pg_readpages, }; From 96c9eae638765c2bf2ca4f5a6325484f9bb69aa7 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 24 Aug 2012 00:27:53 +0800 Subject: [PATCH 079/113] pnfsblock: fix non-aligned DIO write For DIO writes, if it is not blocksize aligned, we need to do internal serialization. It may slow down writers anyway. So we just bail them out and resend to MDS. Cc: stable [since v3.4] Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 34 +++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 61e04fb7c4b8..af3ef0e68491 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -685,7 +685,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) struct bio *bio = NULL; struct pnfs_block_extent *be = NULL, *cow_read = NULL; sector_t isect, last_isect = 0, extent_length = 0; - struct parallel_io *par; + struct parallel_io *par = NULL; loff_t offset = wdata->args.offset; size_t count = wdata->args.count; unsigned int pg_offset, pg_len, saved_len; @@ -697,6 +697,13 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); + + if (header->dreq != NULL && + (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) || + !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) { + dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n"); + goto out_mds; + } /* At this point, wdata->pages is a (sequential) list of nfs_pages. * We want to write each, and if there is an error set pnfs_error * to have it redone using nfs. @@ -1197,6 +1204,27 @@ bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return pnfs_generic_pg_test(pgio, prev, req); } +void +bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + if (pgio->pg_dreq != NULL && + !is_aligned_req(req, PAGE_CACHE_SIZE)) + nfs_pageio_reset_write_mds(pgio); + else + pnfs_generic_pg_init_write(pgio, req); +} + +static bool +bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + if (pgio->pg_dreq != NULL && + !is_aligned_req(req, PAGE_CACHE_SIZE)) + return false; + + return pnfs_generic_pg_test(pgio, prev, req); +} + static const struct nfs_pageio_ops bl_pg_read_ops = { .pg_init = bl_pg_init_read, .pg_test = bl_pg_test_read, @@ -1204,8 +1232,8 @@ static const struct nfs_pageio_ops bl_pg_read_ops = { }; static const struct nfs_pageio_ops bl_pg_write_ops = { - .pg_init = pnfs_generic_pg_init_write, - .pg_test = pnfs_generic_pg_test, + .pg_init = bl_pg_init_write, + .pg_test = bl_pg_test_write, .pg_doio = pnfs_generic_pg_writepages, }; From 4e266229dbb0782d91b75633322edd632794b86d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 5 Sep 2012 14:38:03 +0800 Subject: [PATCH 080/113] pnfsblock: use list_move_tail instead of list_del/list_add_tail Using list_move_tail() instead of list_del() + list_add_tail(). spatch with a semantic match is used to found this problem. (http://coccinelle.lip6.fr/) Signed-off-by: Wei Yongjun Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/extents.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 1f9a6032796b..9c3e117c3ed1 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -683,8 +683,7 @@ encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); p = xdr_encode_hyper(p, 0LL); *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); - list_del(&lce->bse_node); - list_add_tail(&lce->bse_node, &bl->bl_committing); + list_move_tail(&lce->bse_node, &bl->bl_committing); bl->bl_count--; count++; } From ee34e13620d0678d420ce50101aaef94ab81fc74 Mon Sep 17 00:00:00 2001 From: Yanchuan Nian Date: Mon, 10 Sep 2012 08:40:16 +0800 Subject: [PATCH 081/113] NFS: Remove unnecessary semicolons (fs/nfs/client.c) There are some unnecessary semicolons in function find_nfs_version. Just remove them. Signed-off-by: Yanchuan Nian Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index bab3e8af574a..8b39a42ac35e 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -93,10 +93,10 @@ static struct nfs_subversion *find_nfs_version(unsigned int version) spin_unlock(&nfs_version_lock); return nfs; } - }; + } spin_unlock(&nfs_version_lock); - return ERR_PTR(-EPROTONOSUPPORT);; + return ERR_PTR(-EPROTONOSUPPORT); } struct nfs_subversion *get_nfs_version(unsigned int version) From 7297cb682acb506ada2e01fbc9b447b04d69936c Mon Sep 17 00:00:00 2001 From: Daniel Walter Date: Wed, 26 Sep 2012 21:51:46 +0200 Subject: [PATCH 082/113] nfs: replace strict_strto* with kstrto* [nfs] replace strict_str* with kstr* variants * replace string conversions with newer kstr* functions Signed-off-by: Daniel Walter Signed-off-by: Trond Myklebust --- fs/nfs/idmap.c | 4 ++-- fs/nfs/super.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 9985a0aea5ff..27dde503a6b6 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -158,7 +158,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re return 0; memcpy(buf, name, namelen); buf[namelen] = '\0'; - if (strict_strtoul(buf, 0, &val) != 0) + if (kstrtoul(buf, 0, &val) != 0) return 0; *res = val; return 1; @@ -364,7 +364,7 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *typ if (data_size <= 0) { ret = -EINVAL; } else { - ret = strict_strtol(id_str, 10, &id_long); + ret = kstrtol(id_str, 10, &id_long); *id = (__u32)id_long; } return ret; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 56f02a9bd6d3..a719bc0640b4 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1112,7 +1112,7 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option) string = match_strdup(args); if (string == NULL) return -ENOMEM; - rc = strict_strtoul(string, 10, option); + rc = kstrtoul(string, 10, option); kfree(string); return rc; @@ -2681,7 +2681,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp) if (!val) return -EINVAL; - ret = strict_strtoul(val, 0, &num); + ret = kstrtoul(val, 0, &num); if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) return -EINVAL; *((unsigned int *)kp->arg) = num; From 9f62387d6e26532bcbfb15606956074192ee526a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Oct 2012 16:17:31 -0700 Subject: [PATCH 083/113] NFSv4: Fix up a merge conflict between migration and container changes nfs_callback_tcpport is now per-net_namespace. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 5c4286643701..3da8130d4460 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -121,12 +121,13 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, .clientid = clp->cl_clientid, .confirm = clp->cl_confirm, }; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); unsigned short port; int status; - port = nfs_callback_tcpport; + port = nn->nfs_callback_tcpport; if (clp->cl_addr.ss_family == AF_INET6) - port = nfs_callback_tcpport6; + port = nn->nfs_callback_tcpport6; status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred, &clid); if (status != 0) From 758201e2c94b7d26ea0ac64e55cab1d53742780a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Oct 2012 16:33:18 -0700 Subject: [PATCH 084/113] NFSv4: Fix the minor version callback channel startup The current spaghetti code confuses some versions of gcc (and just looks ugly as hell)! Clean up... Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 8ed0bc8cffb6..2245bef50f37 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -177,15 +178,11 @@ nfs41_callback_up(struct svc_serv *serv) return rqstp; } -static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, - struct svc_serv *serv, +static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) { - if (minorversion) { - *rqstpp = nfs41_callback_up(serv); - *callback_svc = nfs41_callback_svc; - } - return minorversion; + *rqstpp = nfs41_callback_up(serv); + *callback_svc = nfs41_callback_svc; } static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, @@ -204,11 +201,11 @@ static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net) return 0; } -static inline int nfs_minorversion_callback_svc_setup(u32 minorversion, - struct svc_serv *serv, +static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) { - return 0; + *rqstpp = ERR_PTR(-ENOTSUPP); + *callback_svc = ERR_PTR(-ENOTSUPP); } static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, @@ -225,19 +222,21 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; char svc_name[12]; int ret; - int minorversion_setup; nfs_callback_bc_serv(minorversion, xprt, serv); if (cb_info->task) return 0; - minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, - serv, &rqstp, &callback_svc); - if (!minorversion_setup) { + switch (minorversion) { + case 0: /* v4.0 callback setup */ rqstp = nfs4_callback_up(serv); callback_svc = nfs4_callback_svc; + break; + default: + nfs_minorversion_callback_svc_setup(serv, + &rqstp, &callback_svc); } if (IS_ERR(rqstp)) From f9d640f3a4f043f7dff66ad7bd8cb29ec145c41d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Oct 2012 16:37:51 -0700 Subject: [PATCH 085/113] NFSv4: nfs4_match_clientids is only used by NFSv4.1 Fix another compiler warning. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 14ddd4d30966..8466e6046ff5 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -251,21 +251,6 @@ error: return ERR_PTR(error); } -/* - * Returns true if the client IDs match - */ -static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) -{ - if (a->cl_clientid != b->cl_clientid) { - dprintk("NFS: --> %s client ID %llx does not match %llx\n", - __func__, a->cl_clientid, b->cl_clientid); - return false; - } - dprintk("NFS: --> %s client ID %llx matches %llx\n", - __func__, a->cl_clientid, b->cl_clientid); - return true; -} - /* * SETCLIENTID just did a callback update with the callback ident in * "drop," but server trunking discovery claims "drop" and "keep" are @@ -383,6 +368,21 @@ int nfs40_walk_client_list(struct nfs_client *new, } #ifdef CONFIG_NFS_V4_1 +/* + * Returns true if the client IDs match + */ +static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) +{ + if (a->cl_clientid != b->cl_clientid) { + dprintk("NFS: --> %s client ID %llx does not match %llx\n", + __func__, a->cl_clientid, b->cl_clientid); + return false; + } + dprintk("NFS: --> %s client ID %llx matches %llx\n", + __func__, a->cl_clientid, b->cl_clientid); + return true; +} + /* * Returns true if the server owners match */ From 47b803c8d2e0545221397d175e3563f9f3695628 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Mon, 1 Oct 2012 20:42:32 -0400 Subject: [PATCH 086/113] NFSv4.0 reclaim reboot state when re-establishing clientid We should reclaim reboot state when the clientid is stale. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 3da8130d4460..24a3ab492df2 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1680,8 +1680,8 @@ out: return nfs4_recovery_handle_error(clp, status); } -/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors - * on EXCHANGE_ID for v4.1 +/* Set NFS4CLNT_LEASE_EXPIRED and reclaim reboot state for all v4.0 errors + * and for recoverable errors on EXCHANGE_ID for v4.1 */ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) { @@ -1691,8 +1691,12 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) return -ESERVERFAULT; /* Lease confirmation error: retry after purging the lease */ ssleep(1); + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + break; case -NFS4ERR_STALE_CLIENTID: clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_state_clear_reclaim_reboot(clp); + nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_CLID_INUSE: pr_err("NFS: Server %s reports our clientid is in use\n", From ca57ccc48f6a9a3ec655f87acebab82bf01088e7 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 2 Oct 2012 14:18:54 +0400 Subject: [PATCH 087/113] nfs: include NFSv4 header in netns.h Build error: fs/nfs/netns.h:27:15: error: 'NFS4_MAX_MINOR_VERSION' undeclared here (not in a function) Reported-by: Fengguang Wu Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/netns.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index b9c7f9b1f91b..8ee1fab83268 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -5,6 +5,7 @@ #ifndef __NFS_NETNS_H__ #define __NFS_NETNS_H__ +#include #include #include From 0b37d20ca225f57fff9237bc75d11d3995c8fed6 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 2 Oct 2012 14:18:59 +0400 Subject: [PATCH 088/113] nfs: declare nfs_callback_tcp_port in header Sparse warning: fs/nfs/super.c:2638:16: sparse: symbol 'nfs_callback_tcpport' was not declared. Should it be static? Reported-by: Fengguang Wu Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/callback.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index c07a8d460d36..4251c2ae06ad 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -208,5 +208,6 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp); #define NFS41_BC_MAX_CALLBACKS 1 extern unsigned int nfs_callback_set_tcpport; +extern unsigned short nfs_callback_tcpport; #endif /* __LINUX_FS_NFS_CALLBACK_H */ From 3dd4f8ef7b26d5d4ada6e2f4b92d99b5e2255f72 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 2 Oct 2012 14:19:04 +0400 Subject: [PATCH 089/113] nfs: declare nfs_xdev_mount as static Sparse warning: fs/nfs/super.c:2517:15: warning: symbol 'nfs_xdev_mount' was not declared. Should it be static? Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a719bc0640b4..b4079bbd30dd 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2514,7 +2514,7 @@ EXPORT_SYMBOL_GPL(nfs_kill_super); /* * Clone an NFS2/3/4 server record on xdev traversal (FSID-change) */ -struct dentry * +static struct dentry * nfs_xdev_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { From 22e243096104a741a0872328160c89d121a648c2 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 2 Oct 2012 14:19:09 +0400 Subject: [PATCH 090/113] nfs: include nfs4_fh.h in nfs4sysctl.c Sparse warnings: fs/nfs/nfs4sysctl.c:56:5: warning: symbol 'nfs4_register_sysctl' was not declared. Should it be static? fs/nfs/nfs4sysctl.c:64:6: warning: symbol 'nfs4_unregister_sysctl' was not declared. Should it be static? Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/nfs4sysctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 5729bc8aa75d..2628d921b7e3 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -9,6 +9,7 @@ #include #include +#include "nfs4_fs.h" #include "callback.h" static const int nfs_set_port_min = 0; From 4e437e95ae8fce164a97f4d67866c9a7e7ed9335 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 2 Oct 2012 14:19:14 +0400 Subject: [PATCH 091/113] nfs: include internal.h in getroot.h Sparse warning: fs/nfs/nfs4getroot.c:11:5: warning: symbol 'nfs4_get_rootfh' was not declared. Should it be static? Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfs/getroot.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 4654ced096a6..033803c36644 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -32,6 +32,8 @@ #include +#include "internal.h" + #define NFSDBG_FACILITY NFSDBG_CLIENT /* From 6f018efac14eb267d3ba0aa4294594b556147dba Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Oct 2012 08:29:14 -0700 Subject: [PATCH 092/113] NFSv4.1: bl_pg_init_write should be static Reported-by: Fengguang Wu Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index af3ef0e68491..a34014a7f9a5 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -1204,7 +1204,7 @@ bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return pnfs_generic_pg_test(pgio, prev, req); } -void +static void bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { if (pgio->pg_dreq != NULL && From ee314c2a35ee7ea7ffa72e2aca83b041138f3358 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Oct 2012 17:25:48 -0700 Subject: [PATCH 093/113] NFSv4.1: Handle BAD_STATEID and EXPIRED errors in layoutget If the layoutget call returns a stateid error, we want to invalidate the layout stateid, and/or recover the open stateid. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e10d66f5be0a..1e0faf9fa078 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6272,26 +6272,44 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; - struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo; + struct nfs4_state *state = NULL; dprintk("--> %s\n", __func__); if (!nfs4_sequence_done(task, &lgp->res.seq_res)) - return; + goto out; switch (task->tk_status) { case 0: - break; + goto out; case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: task->tk_status = -NFS4ERR_DELAY; - /* Fall through */ - default: - if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return; + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_BAD_STATEID: + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (!lo || list_empty(&lo->plh_segs)) { + spin_unlock(&inode->i_lock); + /* If the open stateid was bad, then recover it. */ + state = lgp->args.ctx->state; + } else { + LIST_HEAD(head); + + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + spin_unlock(&inode->i_lock); + /* Mark the bad layout state as invalid, then + * retry using the open stateid. */ + pnfs_free_lseg_list(&head); } } + if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + rpc_restart_call_prepare(task); +out: dprintk("<-- %s\n", __func__); } From c2ccc084eb46ae718a200ad9c2606c258bf79a25 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 2 Oct 2012 09:18:12 -0700 Subject: [PATCH 094/113] NFS: nfs41_walk_client_list(): re-lock before iterating Sparse identified an execution path in nfs41_walk_client_list() where the nfs_client_lock is not re-acquired before taking the next loop iteration. fs/nfs/nfs4client.c:437:9: sparse: context imbalance in 'nfs41_walk_client_list' - different lock contexts for basic block Signed-off-by: Chuck Lever Cc: Fengguang Wu Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 8466e6046ff5..6bacfde1319a 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -450,6 +450,7 @@ int nfs41_walk_client_list(struct nfs_client *new, error = nfs_wait_client_init_complete(pos); if (error < 0) { nfs_put_client(pos); + spin_lock(&nn->nfs_client_lock); continue; } From fd4835708ffd3387a76df2d1d3021717b0b63761 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 2 Oct 2012 11:33:22 -0700 Subject: [PATCH 095/113] NFSv4.1: don't do two EXCHANGE_IDs on mount Since the addition of NFSv4 server trunking detection the mount context calls nfs4_proc_exchange_id then schedules the state manager, which also calls nfs4_proc_exchange_id. Setting the NFS4CLNT_LEASE_CONFIRM bit makes the state manager skip the unneeded EXCHANGE_ID and continue on with session creation. Reported-by: Jorge Mora Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 24a3ab492df2..c351e6b39838 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -351,6 +351,7 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, status = nfs4_proc_exchange_id(clp, cred); if (status != NFS4_OK) return status; + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); return nfs41_walk_client_list(clp, result, cred); } From ddfc4e171292d63d7e3f8c95ff9c3ef9932870ce Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Tue, 2 Oct 2012 16:01:38 -0400 Subject: [PATCH 096/113] NFS: Set key construction data for the legacy upcall This prevents a null pointer dereference when nfs_idmap_complete_pipe_upcall_locked() calls complete_request_key(). Fixes a regression caused by commit 0cac12023 (NFSv4: Ensure that idmap_pipe_downcall sanity-checks the downcall data). Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/idmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 675b389cba5b..9cc4a3fbf4b0 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -707,6 +707,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, msg = &data->pipe_msg; im = &data->idmap_msg; data->idmap = idmap; + data->key_cons = cons; ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); if (ret < 0) From bbd3a8eee82a2a6f4aa1cce60ccb014f25e5c560 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 2 Oct 2012 14:49:51 -0700 Subject: [PATCH 097/113] NFSv4: don't check MAY_WRITE access bit in OPEN Don't check MAY_WRITE as a newly created file may not have write mode bits, but POSIX allows the creating process to write regardless. This is ok because NFSv4 OPEN ops handle write permissions correctly - the ACCESS in the OPEN compound is to differentiate READ v EXEC permissions. Fixes a regression due to commit 6168f62c (NFSv4: Add ACCESS operation to OPEN compound) Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1e0faf9fa078..ccada6856f0d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1661,10 +1661,10 @@ static int nfs4_opendata_access(struct rpc_cred *cred, return 0; mask = 0; + /* don't check MAY_WRITE - a newly created file may not have + * write mode bits, but POSIX allows the creating process to write */ if (fmode & FMODE_READ) mask |= MAY_READ; - if (fmode & FMODE_WRITE) - mask |= MAY_WRITE; if (fmode & FMODE_EXEC) mask |= MAY_EXEC; @@ -1673,7 +1673,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred, nfs_access_set_mask(&cache, opendata->o_res.access_result); nfs_access_add_cache(state->inode, &cache); - if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) + if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0) return 0; /* even though OPEN succeeded, access is denied. Close the file */ From ae2bb03236fc978bdf673c19d39832500793b83c Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 2 Oct 2012 14:49:52 -0700 Subject: [PATCH 098/113] NFSv4: don't put ACCESS in OPEN compound if O_EXCL Don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS will return permission denied for all bits until close. Fixes a regression due to commit 6168f62c (NFSv4: Add ACCESS operation to OPEN compound) Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 +++++++++--- fs/nfs/nfs4xdr.c | 12 ++++++++---- include/linux/nfs_xdr.h | 1 + 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ccada6856f0d..21cfac7c2ff8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -862,9 +862,15 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.fh = NFS_FH(dir); p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); - /* ask server to check for all possible rights as results are cached */ - p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | - NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE; + /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS + * will return permission denied for all bits until close */ + if (!(flags & O_EXCL)) { + /* ask server to check for all possible rights as results + * are cached */ + p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | + NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE; + p->o_res.access_request = p->o_arg.access; + } p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); p->o_arg.id.uniquifier = sp->so_seqid.owner_id; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 657483c34e28..0d6030510fe2 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2224,7 +2224,8 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_open(xdr, args, &hdr); encode_getfh(xdr, &hdr); - encode_access(xdr, args->access, &hdr); + if (args->access) + encode_access(xdr, args->access, &hdr); encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); encode_nops(&hdr); } @@ -2261,7 +2262,8 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_open(xdr, args, &hdr); - encode_access(xdr, args->access, &hdr); + if (args->access) + encode_access(xdr, args->access, &hdr); encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -6239,7 +6241,8 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, &res->fh); if (status) goto out; - decode_access(xdr, &res->access_supported, &res->access_result); + if (res->access_request) + decode_access(xdr, &res->access_supported, &res->access_result); decode_getfattr(xdr, res->f_attr, res->server); out: return status; @@ -6288,7 +6291,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, status = decode_open(xdr, res); if (status) goto out; - decode_access(xdr, &res->access_supported, &res->access_result); + if (res->access_request) + decode_access(xdr, &res->access_supported, &res->access_result); decode_getfattr(xdr, res->f_attr, res->server); out: return status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 655490dae953..a73ea89789d1 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -369,6 +369,7 @@ struct nfs_openres { struct nfs4_string *owner; struct nfs4_string *group_owner; struct nfs4_sequence_res seq_res; + __u32 access_request; __u32 access_supported; __u32 access_result; }; From 251ec410c495cb93c7ae2cb4beda29205d9bd35f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Oct 2012 15:41:05 -0700 Subject: [PATCH 099/113] NFSv4.1: Fix another refcount issue in pnfs_find_alloc_layout Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d737557747b9..fe9968a62cad 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -907,18 +907,19 @@ pnfs_find_alloc_layout(struct inode *ino, dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); - if (nfsi->layout) { - pnfs_get_layout_hdr(nfsi->layout); - return nfsi->layout; - } + if (nfsi->layout != NULL) + goto out_existing; spin_unlock(&ino->i_lock); new = alloc_init_layout_hdr(ino, ctx, gfp_flags); spin_lock(&ino->i_lock); - if (likely(nfsi->layout == NULL)) /* Won the race? */ + if (likely(nfsi->layout == NULL)) { /* Won the race? */ nfsi->layout = new; - else - pnfs_free_layout_hdr(new); + return new; + } + pnfs_free_layout_hdr(new); +out_existing: + pnfs_get_layout_hdr(nfsi->layout); return nfsi->layout; } From 038d6493763d900797dfeb555502d3b0d8103fba Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Oct 2012 16:38:41 -0700 Subject: [PATCH 100/113] NFSv4.1: Always set the layout stateid if this is the first layoutget If the list of layout segments is empty, we must unconditionally set the layout stateid. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index fe9968a62cad..c8c1d0cc197a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -558,7 +558,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if ((int)(newseq - oldseq) > 0) { + if (list_empty(&lo->plh_segs) || (int)(newseq - oldseq) > 0) { nfs4_stateid_copy(&lo->plh_stateid, new); if (update_barrier) { u32 new_barrier = be32_to_cpu(new->seqid); @@ -1181,6 +1181,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) dprintk("%s forget reply due to state\n", __func__); goto out_forget_reply; } + + /* Done processing layoutget. Set the layout stateid */ + pnfs_set_layout_stateid(lo, &res->stateid, false); + init_lseg(lo, lseg); lseg->pls_range = res->range; pnfs_get_lseg(lseg); @@ -1191,8 +1195,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); } - /* Done processing layoutget. Set the layout stateid */ - pnfs_set_layout_stateid(lo, &res->stateid, false); spin_unlock(&ino->i_lock); return lseg; out: From 5a65503f3dbdb4aa1cd6cb58c479c015d093292b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Oct 2012 16:47:14 -0700 Subject: [PATCH 101/113] NFSv4.1: Deal with wraparound issues when updating the layout stateid ...and add a helper function. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c8c1d0cc197a..f1387e87513f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -549,6 +549,15 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) } } +/* + * Compare 2 layout stateid sequence ids, to see which is newer, + * taking into account wraparound issues. + */ +static bool pnfs_seqid_is_newer(u32 s1, u32 s2) +{ + return (s32)s1 - (s32)s2 > 0; +} + /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -558,7 +567,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if (list_empty(&lo->plh_segs) || (int)(newseq - oldseq) > 0) { + if (list_empty(&lo->plh_segs) || pnfs_seqid_is_newer(newseq, oldseq)) { nfs4_stateid_copy(&lo->plh_stateid, new); if (update_barrier) { u32 new_barrier = be32_to_cpu(new->seqid); From 25a1a6211dd2fcbf0e45a07030703e2a42d7aa87 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Oct 2012 16:56:49 -0700 Subject: [PATCH 102/113] NFSv4.1: Deal with wraparound when updating the layout "barrier" seqid ...and fix a bug in pnfs_set_layout_stateid. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index f1387e87513f..de827251bdac 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -572,7 +572,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, if (update_barrier) { u32 new_barrier = be32_to_cpu(new->seqid); - if ((int)(new_barrier - lo->plh_barrier)) + if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) lo->plh_barrier = new_barrier; } else { /* Because of wraparound, we want to keep the barrier @@ -593,9 +593,12 @@ static bool pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, int lget) { - if ((stateid) && - (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0) - return true; + if (stateid != NULL) { + u32 seqid = be32_to_cpu(stateid->seqid); + + if (!pnfs_seqid_is_newer(seqid, lo->plh_barrier)) + return true; + } return lo->plh_block_lgets || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || (list_empty(&lo->plh_segs) && From 807d66d80221920729a8d4abfa04246546a6d3fa Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Oct 2012 17:09:00 -0700 Subject: [PATCH 103/113] NFSv4: nfs4_open_done first must check that GETATTR decoded a file type ...before it can check the validity of that file type. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 21cfac7c2ff8..68438aa4f08e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1537,7 +1537,8 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) return; if (task->tk_status == 0) { - switch (data->o_res.f_attr->mode & S_IFMT) { + if (data->o_res.f_attr->valid & NFS_ATTR_FATTR_TYPE) { + switch (data->o_res.f_attr->mode & S_IFMT) { case S_IFREG: break; case S_IFLNK: @@ -1548,6 +1549,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) break; default: data->rpc_status = -ENOTDIR; + } } renew_lease(data->o_res.server, data->timestamp); if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)) From e23008ec81ef37b7b271669ce5d2de2643b2dc75 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 2 Oct 2012 21:07:32 -0400 Subject: [PATCH 104/113] NFSv4 reduce attribute requests for open reclaim We currently make no distinction in attribute requests between normal OPENs and OPEN with CLAIM_PREVIOUS. This offers more possibility of failures in the GETATTR response which foils OPEN reclaim attempts. Reduce the requested attributes to the bare minimum needed to update the reclaim open stateid and split nfs4_opendata_to_nfs4_state processing accordingly. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 114 +++++++++++++++++++++++++++++++++++----------- fs/nfs/nfs4xdr.c | 2 +- 2 files changed, 89 insertions(+), 27 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 68438aa4f08e..2b62e3f79c91 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -152,6 +152,12 @@ static const u32 nfs4_pnfs_open_bitmap[3] = { FATTR4_WORD2_MDSTHRESHOLD }; +static const u32 nfs4_open_noattr_bitmap[3] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_FILEID, +}; + const u32 nfs4_statfs_bitmap[2] = { FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE @@ -1126,11 +1132,80 @@ out_return_state: return state; } -static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +static void +nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state) +{ + struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client; + struct nfs_delegation *delegation; + int delegation_flags = 0; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(state->inode)->delegation); + if (delegation) + delegation_flags = delegation->flags; + rcu_read_unlock(); + if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) { + pr_err_ratelimited("NFS: Broken NFSv4 server %s is " + "returning a delegation for " + "OPEN(CLAIM_DELEGATE_CUR)\n", + clp->cl_hostname); + } else if ((delegation_flags & 1UL<inode, + data->owner->so_cred, + &data->o_res); + else + nfs_inode_reclaim_delegation(state->inode, + data->owner->so_cred, + &data->o_res); +} + +/* + * Check the inode attributes against the CLAIM_PREVIOUS returned attributes + * and update the nfs4_state. + */ +static struct nfs4_state * +_nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) +{ + struct inode *inode = data->state->inode; + struct nfs4_state *state = data->state; + int ret; + + if (!data->rpc_done) { + ret = data->rpc_status; + goto err; + } + + ret = -ESTALE; + if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) || + !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) || + !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE)) + goto err; + + ret = -ENOMEM; + state = nfs4_get_open_state(inode, data->owner); + if (state == NULL) + goto err; + + ret = nfs_refresh_inode(inode, &data->f_attr); + if (ret) + goto err; + + if (data->o_res.delegation_type != 0) + nfs4_opendata_check_deleg(data, state); + update_open_stateid(state, &data->o_res.stateid, NULL, + data->o_arg.fmode); + + return state; +err: + return ERR_PTR(ret); + +} + +static struct nfs4_state * +_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) { struct inode *inode; struct nfs4_state *state = NULL; - struct nfs_delegation *delegation; int ret; if (!data->rpc_done) { @@ -1149,30 +1224,8 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data state = nfs4_get_open_state(inode, data->owner); if (state == NULL) goto err_put_inode; - if (data->o_res.delegation_type != 0) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - int delegation_flags = 0; - - rcu_read_lock(); - delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation) - delegation_flags = delegation->flags; - rcu_read_unlock(); - if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) { - pr_err_ratelimited("NFS: Broken NFSv4 server %s is " - "returning a delegation for " - "OPEN(CLAIM_DELEGATE_CUR)\n", - clp->cl_hostname); - } else if ((delegation_flags & 1UL<inode, - data->owner->so_cred, - &data->o_res); - else - nfs_inode_reclaim_delegation(state->inode, - data->owner->so_cred, - &data->o_res); - } - + if (data->o_res.delegation_type != 0) + nfs4_opendata_check_deleg(data, state); update_open_stateid(state, &data->o_res.stateid, NULL, data->o_arg.fmode); iput(inode); @@ -1184,6 +1237,14 @@ err: return ERR_PTR(ret); } +static struct nfs4_state * +nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) + return _nfs4_opendata_reclaim_to_nfs4_state(data); + return _nfs4_opendata_to_nfs4_state(data); +} + static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) { struct nfs_inode *nfsi = NFS_I(state->inode); @@ -1505,6 +1566,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0d6030510fe2..40836ee5dc3a 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2264,7 +2264,7 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, encode_open(xdr, args, &hdr); if (args->access) encode_access(xdr, args->access, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); encode_nops(&hdr); } From 8544a9dc18a1aa787b85425ecc9233b0a9adaff4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Oct 2012 10:54:50 -0700 Subject: [PATCH 105/113] NFSv4.1: Remove the dependency on CONFIG_EXPERIMENTAL CONFIG_EXPERIMENTAL is deprecated and, regardless of that, this code is being enabled in most newer distributions. Signed-off-by: Trond Myklebust --- fs/nfs/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index db7ad719628a..13ca196385f5 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -95,8 +95,8 @@ config NFS_SWAP This option enables swapon to work on files located on NFS mounts. config NFS_V4_1 - bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" - depends on NFS_V4 && EXPERIMENTAL + bool "NFS client support for NFSv4.1" + depends on NFS_V4 select SUNRPC_BACKCHANNEL help This option enables support for minor version 1 of the NFSv4 protocol From 5f65753033d8c5a53e65810bff3832e8282c68d1 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 3 Oct 2012 02:39:34 -0400 Subject: [PATCH 106/113] NFSv4 set open access operation call flag in nfs4_init_opendata_res nfs4_open_recover_helper zeros the nfs4_opendata result structures, removing the result access_request information which leads to an XDR decode error. Move the setting of the result access_request field to nfs4_init_opendata_res which sets all the other required nfs4_opendata result fields and is shared between the open and recover open paths. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2b62e3f79c91..68b21d81b7ac 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -840,6 +840,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) p->o_res.seqid = p->o_arg.seqid; p->c_res.seqid = p->c_arg.seqid; p->o_res.server = p->o_arg.server; + p->o_res.access_request = p->o_arg.access; nfs_fattr_init(&p->f_attr); nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name); } @@ -875,7 +876,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, * are cached */ p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE; - p->o_res.access_request = p->o_arg.access; } p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); From 0f35ad6f688e9b7bcaa918a42130695822906f11 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 4 Oct 2012 16:28:17 -0700 Subject: [PATCH 107/113] NFSv4.1: Deal with seqid wraparound in the pNFS return-on-close code Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index de827251bdac..42613bd19f8e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -801,7 +801,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; - if ((int)(barrier - lo->plh_barrier) > 0) + if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); } From 22aaf71495570b31350c37fd0aa736551bbaa3c9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 4 Oct 2012 16:32:22 -0700 Subject: [PATCH 108/113] NFSv4.1: Ensure that the layout sequence id stays 'close' to the current Clamp the layout barrier sequence id to the current sequence id minus the maximum number of outstanding layoutget requests. Also ensure that we correctly initialise lo->plh_barrier if there are no layout segments associated to this layout header. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 42613bd19f8e..861dd97b569e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -563,28 +563,23 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier) { - u32 oldseq, newseq; + u32 oldseq, newseq, new_barrier; + int empty = list_empty(&lo->plh_segs); oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if (list_empty(&lo->plh_segs) || pnfs_seqid_is_newer(newseq, oldseq)) { + if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { nfs4_stateid_copy(&lo->plh_stateid, new); if (update_barrier) { - u32 new_barrier = be32_to_cpu(new->seqid); - - if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) - lo->plh_barrier = new_barrier; + new_barrier = be32_to_cpu(new->seqid); } else { /* Because of wraparound, we want to keep the barrier - * "close" to the current seqids. It needs to be - * within 2**31 to count as "behind", so if it - * gets too near that limit, give us a litle leeway - * and bring it to within 2**30. - * NOTE - and yes, this is all unsigned arithmetic. + * "close" to the current seqids. */ - if (unlikely((newseq - lo->plh_barrier) > (3 << 29))) - lo->plh_barrier = newseq - (1 << 30); + new_barrier = newseq - atomic_read(&lo->plh_outstanding); } + if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + lo->plh_barrier = new_barrier; } } From 19c54abab79404c027ff61f13468e78a3e2a0065 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 5 Oct 2012 16:56:58 -0700 Subject: [PATCH 109/113] NFSv4.1: Cleanup ugliness in pnfs_layoutgets_blocked() Split it into two functions, one which checks if layoutgets are blocked, and one which checks if the layout stateid has expired. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 861dd97b569e..bd9769296e4e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -583,17 +583,19 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, } } +static bool +pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid) +{ + u32 seqid = be32_to_cpu(stateid->seqid); + + return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); +} + /* lget is set to 1 if called from inside send_layoutget call chain */ static bool -pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, - int lget) +pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget) { - if (stateid != NULL) { - u32 seqid = be32_to_cpu(stateid->seqid); - - if (!pnfs_seqid_is_newer(seqid, lo->plh_barrier)) - return true; - } return lo->plh_block_lgets || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || (list_empty(&lo->plh_segs) && @@ -608,7 +610,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - if (pnfs_layoutgets_blocked(lo, NULL, 1)) { + if (pnfs_layoutgets_blocked(lo, 1)) { status = -EAGAIN; } else if (list_empty(&lo->plh_segs)) { int seq; @@ -1111,7 +1113,7 @@ pnfs_update_layout(struct inode *ino, if (lseg) goto out_unlock; - if (pnfs_layoutgets_blocked(lo, NULL, 0)) + if (pnfs_layoutgets_blocked(lo, 0)) goto out_unlock; atomic_inc(&lo->plh_outstanding); @@ -1184,7 +1186,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget_reply; } - if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) { + if (pnfs_layoutgets_blocked(lo, 1) || + pnfs_layout_stateid_blocked(lo, &res->stateid)) { dprintk("%s forget reply due to state\n", __func__); goto out_forget_reply; } From 35754bc00e94e598c432ad02f7a3d3063c4402e3 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 25 Sep 2012 14:55:57 +0800 Subject: [PATCH 110/113] NFS: track direct IO left bytes Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 253d397780b1..4be8673ee18d 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -78,6 +78,7 @@ struct nfs_direct_req { atomic_t io_count; /* i/os we're waiting for */ spinlock_t lock; /* protect completion state */ ssize_t count, /* bytes actually processed */ + bytes_left, /* bytes left to be sent */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ @@ -390,6 +391,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de user_addr += req_len; pos += req_len; count -= req_len; + dreq->bytes_left -= req_len; } /* The nfs_page now hold references to these pages */ nfs_direct_release_pages(pagevec, npages); @@ -457,6 +459,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, goto out; dreq->inode = inode; + dreq->bytes_left = iov_length(iov, nr_segs); dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { @@ -710,6 +713,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d user_addr += req_len; pos += req_len; count -= req_len; + dreq->bytes_left -= req_len; } /* The nfs_page now hold references to these pages */ nfs_direct_release_pages(pagevec, npages); @@ -860,6 +864,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, goto out; dreq->inode = inode; + dreq->bytes_left = count; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { From 6296556f0b31eaff29f2a3aee2c17b7eae895b98 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 25 Sep 2012 14:55:57 +0800 Subject: [PATCH 111/113] NFS41: send real write size in layoutget For buffer write, block layout client scan inode mapping to find next hole and use offset-to-hole as layoutget length. Object layout client uses offset-to-isize as layoutget length. For direct write, both block layout and object layout use dreq->bytes_left. Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 38 +++++++++++++++++++++++++++++--- fs/nfs/direct.c | 7 ++++++ fs/nfs/internal.h | 1 + fs/nfs/objlayout/objio_osd.c | 9 +++++++- fs/nfs/pnfs.c | 6 +++-- fs/nfs/pnfs.h | 3 ++- 6 files changed, 57 insertions(+), 7 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index a34014a7f9a5..f1027b06a1a9 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -37,6 +37,7 @@ #include /* struct bio */ #include /* various write calls */ #include +#include #include "../pnfs.h" #include "../internal.h" @@ -1204,14 +1205,45 @@ bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return pnfs_generic_pg_test(pgio, prev, req); } +/* + * Return the number of contiguous bytes for a given inode + * starting at page frame idx. + */ +static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t end; + + /* Optimize common case that writes from 0 to end of file */ + end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); + if (end != NFS_I(inode)->npages) { + rcu_read_lock(); + end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX); + rcu_read_unlock(); + } + + if (!end) + return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT); + else + return (end - idx) << PAGE_CACHE_SHIFT; +} + static void bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { if (pgio->pg_dreq != NULL && - !is_aligned_req(req, PAGE_CACHE_SIZE)) + !is_aligned_req(req, PAGE_CACHE_SIZE)) { nfs_pageio_reset_write_mds(pgio); - else - pnfs_generic_pg_init_write(pgio, req); + } else { + u64 wb_size; + if (pgio->pg_dreq == NULL) + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, + req->wb_index); + else + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pnfs_generic_pg_init_write(pgio, req, wb_size); + } } static bool diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4be8673ee18d..cae26cbd59ee 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -191,6 +192,12 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq) kref_put(&dreq->kref, nfs_direct_req_free); } +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq) +{ + return dreq->bytes_left; +} +EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); + /* * Collects and returns the final error value/byte-count. */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 89a795dc3027..59b133c5d652 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -464,6 +464,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode) { inode_dio_wait(inode); } +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_read_data *); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index ea6d111b03e9..be731e6b7b9c 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -41,6 +41,7 @@ #include #include "objlayout.h" +#include "../internal.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -606,8 +607,14 @@ static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout, void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { unsigned long stripe_end = 0; + u64 wb_size; - pnfs_generic_pg_init_write(pgio, req); + if (pgio->pg_dreq == NULL) + wb_size = i_size_read(pgio->pg_inode) - req_offset(req); + else + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pnfs_generic_pg_init_write(pgio, req, wb_size); if (unlikely(pgio->pg_lseg == NULL)) return; /* Not pNFS */ diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bd9769296e4e..9a2bcce45282 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1240,7 +1240,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); void -pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, u64 wb_size) { BUG_ON(pgio->pg_lseg != NULL); @@ -1248,10 +1249,11 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page * nfs_pageio_reset_write_mds(pgio); return; } + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), - req->wb_bytes, + wb_size, IOMODE_RW, GFP_NOFS); /* If no lseg, fall back to write through mds */ diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 6cede2c6c961..2d722dba1111 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -186,7 +186,8 @@ void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); -void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); +void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, u64 wb_size); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); From 1fd937bd7583e618df0528f0268b210f265d6910 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 25 Sep 2012 14:55:57 +0800 Subject: [PATCH 112/113] NFS41: send real read size in layoutget For buffer read, use offst-to-isize. For direct read, use dreq->bytes_left. Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 9a2bcce45282..fe624c91bd00 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1220,16 +1220,24 @@ out_forget_reply: void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { + u64 rd_size = req->wb_bytes; + BUG_ON(pgio->pg_lseg != NULL); if (req->wb_offset != req->wb_pgbase) { nfs_pageio_reset_read_mds(pgio); return; } + + if (pgio->pg_dreq == NULL) + rd_size = i_size_read(pgio->pg_inode) - req_offset(req); + else + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), - req->wb_bytes, + rd_size, IOMODE_READ, GFP_KERNEL); /* If no lseg, fall back to read through mds */ From af283885b70248268617955a5ea5476647bd556b Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 25 Sep 2012 14:55:57 +0800 Subject: [PATCH 113/113] pnfsblock: cleanup nfs4_blkdev_get It is not needed at all and it is messing with return values... Reported-by: Wei Yongjun Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.h | 1 - fs/nfs/blocklayout/blocklayoutdev.c | 25 +++++-------------------- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 39bb51a8dd18..f4891bde8851 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -173,7 +173,6 @@ struct bl_msg_hdr { /* blocklayoutdev.c */ ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); void bl_pipe_destroy_msg(struct rpc_pipe_msg *); -struct block_device *nfs4_blkdev_get(dev_t dev); int nfs4_blkdev_put(struct block_device *bdev); struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, struct pnfs_device *dev); diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index c96554245ccf..a86c5bdad9e3 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -53,22 +53,6 @@ static int decode_sector_number(__be32 **rp, sector_t *sp) return 0; } -/* Open a block_device by device number. */ -struct block_device *nfs4_blkdev_get(dev_t dev) -{ - struct block_device *bd; - - dprintk("%s enter\n", __func__); - bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); - if (IS_ERR(bd)) - goto fail; - return bd; -fail: - dprintk("%s failed to open device : %ld\n", - __func__, PTR_ERR(bd)); - return NULL; -} - /* * Release the block device */ @@ -172,11 +156,12 @@ nfs4_blk_decode_device(struct nfs_server *server, goto out; } - bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); + bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor), + FMODE_READ, NULL); if (IS_ERR(bd)) { - rc = PTR_ERR(bd); - dprintk("%s failed to open device : %d\n", __func__, rc); - rv = ERR_PTR(rc); + dprintk("%s failed to open device : %ld\n", __func__, + PTR_ERR(bd)); + rv = ERR_CAST(bd); goto out; }