From 5a7a613a47a715711b3f2d3322a0eac21d459166 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 22 Feb 2013 12:53:43 -0500 Subject: [PATCH 01/10] NFS: Don't allow NFS silly-renamed files to be deleted, no signal Commit 73ca100 broke the code that prevents the client from deleting a silly renamed dentry. This affected "delete on last close" semantics as after that commit, nothing prevented removal of silly-renamed files. As a result, a process holding a file open could easily get an ESTALE on the file in a directory where some other process issued 'rm -rf some_dir_containing_the_file' twice. Before the commit, any attempt at unlinking silly renamed files would fail inside may_delete() with -EBUSY because of the DCACHE_NFSFS_RENAMED flag. The following testcase demonstrates the problem: tail -f /nfsmnt/dir/file & rm -rf /nfsmnt/dir rm -rf /nfsmnt/dir # second removal does not fail, 'tail' process receives ESTALE The problem with the above commit is that it unhashes the old and new dentries from the lookup path, even in the normal case when a signal is not encountered and it would have been safe to call d_move. Unfortunately the old dentry has the special DCACHE_NFSFS_RENAMED flag set on it. Unhashing has the side-effect that future lookups call d_alloc(), allocating a new dentry without the special flag for any silly-renamed files. As a result, subsequent calls to unlink silly renamed files do not fail but allow the removal to go through. This will result in ESTALE errors for any other process doing operations on the file. To fix this, go back to using d_move on success. For the signal case, it's unclear what we may safely do beyond d_drop. Reported-by: Dave Wysochanski Signed-off-by: Trond Myklebust Acked-by: Jeff Layton Cc: stable@vger.kernel.org --- fs/nfs/unlink.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index d26a32f5b53b..1f1f38f0c5d5 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -335,20 +335,14 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct inode *old_dir = data->old_dir; struct inode *new_dir = data->new_dir; struct dentry *old_dentry = data->old_dentry; - struct dentry *new_dentry = data->new_dentry; if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { rpc_restart_call_prepare(task); return; } - if (task->tk_status != 0) { + if (task->tk_status != 0) nfs_cancel_async_unlink(old_dentry); - return; - } - - d_drop(old_dentry); - d_drop(new_dentry); } /** @@ -549,6 +543,18 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) error = rpc_wait_for_completion_task(task); if (error == 0) error = task->tk_status; + switch (error) { + case 0: + /* The rename succeeded */ + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + d_move(dentry, sdentry); + break; + case -ERESTARTSYS: + /* The result of the rename is unknown. Play it safe by + * forcing a new lookup */ + d_drop(dentry); + d_drop(sdentry); + } rpc_put_task(task); out_dput: dput(sdentry); From a9a6b52ee1baa865283a91eb8d443ee91adfca56 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 22 Feb 2013 14:57:57 -0500 Subject: [PATCH 02/10] SUNRPC: Don't start the retransmission timer when out of socket space If the socket is full, we're better off just waiting until it empties, or until the connection is broken. The reason why we generally don't want to time out is that the call to xprt->ops->release_xprt() will trigger a connection reset, which isn't helpful... Let's make an exception for soft RPC calls, since they have to provide timeout guarantees. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- net/sunrpc/xprt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 846c34fdee9f..b7478d5e7ffd 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -487,13 +487,17 @@ EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks); * xprt_wait_for_buffer_space - wait for transport output buffer to clear * @task: task to be put to sleep * @action: function pointer to be executed after wait + * + * Note that we only set the timer for the case of RPC_IS_SOFT(), since + * we don't in general want to force a socket disconnection due to + * an incomplete RPC call transmission. */ void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - task->tk_timeout = req->rq_timeout; + task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; rpc_sleep_on(&xprt->pending, task, action); } EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); From 78f33277f96430ea001c39e952f6b8200b2ab850 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 24 Feb 2013 09:55:57 -0500 Subject: [PATCH 03/10] pnfs: fix resend_to_mds for directio Pass the directio request on pageio_init to clean up the API. Percolate pg_dreq from original nfs_pageio_descriptor to the pnfs_{read,write}_done_resend_to_mds and use it on respective call to nfs_pageio_init_{read,write} on the newly created nfs_pageio_descriptor. Reproduced by command: mount -o vers=4.1 server:/ /mnt dd bs=128k count=8 if=/dev/zero of=/mnt/dd.out oflag=direct BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: [] atomic_inc+0x4/0x9 [nfs] PGD 34786067 PUD 34794067 PMD 0 Oops: 0002 [#1] SMP Modules linked in: nfs_layout_nfsv41_files nfsv4 nfs nfsd lockd nfs_acl auth_rpcgss exportfs sunrpc btrfs zlib_deflate libcrc32c ipv6 autofs4 CPU 1 Pid: 259, comm: kworker/1:2 Not tainted 3.8.0-rc6 #2 Bochs Bochs RIP: 0010:[] [] atomic_inc+0x4/0x9 [nfs] RSP: 0018:ffff880038f8fa68 EFLAGS: 00010206 RAX: ffffffffa021a6a9 RBX: ffff880038f8fb48 RCX: 00000000000a0000 RDX: ffffffffa021e616 RSI: ffff8800385e9a40 RDI: 0000000000000028 RBP: ffff880038f8fa68 R08: ffffffff81ad6720 R09: ffff8800385e9510 R10: ffffffffa0228450 R11: ffff880038e87418 R12: ffff8800385e9a40 R13: ffff8800385e9a70 R14: ffff880038f8fb38 R15: ffffffffa0148878 FS: 0000000000000000(0000) GS:ffff88003e400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000028 CR3: 0000000034789000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kworker/1:2 (pid: 259, threadinfo ffff880038f8e000, task ffff880038302480) Stack: ffff880038f8fa78 ffffffffa021a6bf ffff880038f8fa88 ffffffffa021bb82 ffff880038f8fae8 ffffffffa021f454 ffff880038f8fae8 ffffffff8109689d ffff880038f8fab8 ffffffff00000006 0000000000000000 ffff880038f8fb48 Call Trace: [] nfs_direct_pgio_init+0x16/0x18 [nfs] [] nfs_pgheader_init+0x6a/0x6c [nfs] [] nfs_generic_pg_writepages+0x51/0xf8 [nfs] [] ? mark_held_locks+0x71/0x99 [] ? rpc_release_resources_task+0x37/0x37 [sunrpc] [] nfs_pageio_doio+0x1a/0x43 [nfs] [] nfs_pageio_complete+0x16/0x2c [nfs] [] pnfs_write_done_resend_to_mds+0x95/0xc5 [nfsv4] [] ? rpc_release_resources_task+0x37/0x37 [sunrpc] [] filelayout_reset_write+0x8c/0x99 [nfs_layout_nfsv41_files] [] filelayout_write_done_cb+0x4d/0xc1 [nfs_layout_nfsv41_files] [] nfs4_write_done+0x36/0x49 [nfsv4] [] nfs_writeback_done+0x53/0x1cc [nfs] [] nfs_writeback_done_common+0xe/0x10 [nfs] [] filelayout_write_call_done+0x28/0x2a [nfs_layout_nfsv41_files] [] rpc_exit_task+0x29/0x87 [sunrpc] [] __rpc_execute+0x11d/0x3cc [sunrpc] [] ? trace_hardirqs_on_caller+0x117/0x173 [] rpc_async_schedule+0x27/0x32 [sunrpc] [] ? __rpc_execute+0x3cc/0x3cc [sunrpc] [] process_one_work+0x226/0x422 [] ? process_one_work+0x159/0x422 [] ? lock_acquired+0x210/0x249 [] ? __rpc_execute+0x3cc/0x3cc [sunrpc] [] worker_thread+0x126/0x1c4 [] ? manage_workers+0x240/0x240 [] kthread+0xb1/0xb9 [] ? __kthread_parkme+0x65/0x65 [] ret_from_fork+0x7c/0xb0 [] ? __kthread_parkme+0x65/0x65 Code: 00 83 38 02 74 12 48 81 4b 50 00 00 01 00 c7 83 60 07 00 00 01 00 00 00 48 89 df e8 55 fe ff ff 5b 41 5c 5d c3 66 90 55 48 89 e5 ff 07 5d c3 55 48 89 e5 f0 ff 0f 0f 94 c0 84 c0 0f 95 c0 0f RIP [] atomic_inc+0x4/0x9 [nfs] RSP CR2: 0000000000000028 Signed-off-by: Benny Halevy Cc: stable@kernel.org [>= 3.6] Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 6 ++++-- fs/nfs/pnfs.c | 14 ++++++++++---- fs/nfs/pnfs.h | 6 ++++-- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 194c48410336..49eeb044c109 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -99,7 +99,8 @@ static void filelayout_reset_write(struct nfs_write_data *data) task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } } @@ -119,7 +120,8 @@ static void filelayout_reset_read(struct nfs_read_data *data) task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6be70f622b62..97767c8683f9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1422,13 +1422,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops) + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor pgio; LIST_HEAD(failed); /* Resend all requests through the MDS */ nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops); + pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1463,7 +1465,8 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } /* @@ -1578,13 +1581,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops) + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor pgio; LIST_HEAD(failed); /* Resend all requests through the MDS */ nfs_pageio_init_read(&pgio, inode, compl_ops); + pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1615,7 +1620,8 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, &hdr->pages, - hdr->completion_ops); + hdr->completion_ops, + hdr->dreq); } /* diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 97cb358bb882..94ba80417748 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -230,9 +230,11 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops); + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq); int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, - const struct nfs_pgio_completion_ops *compl_ops); + const struct nfs_pgio_completion_ops *compl_ops, + struct nfs_direct_req *dreq); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); /* nfs4_deviceid_flags */ From a47970ff7814718fec31b7d966747c6aa1a3545f Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Mon, 25 Feb 2013 21:27:33 -0500 Subject: [PATCH 04/10] NFSv4.1: Hold reference to layout hdr in layoutget This fixes an oops where a LAYOUTGET is in still in the rpciod queue, but the requesting processes has been killed. Without this, killing the process does the final pnfs_put_layout_hdr() and sets NFS_I(inode)->layout to NULL while the LAYOUTGET rpc task still references it. Example oops: BUG: unable to handle kernel NULL pointer dereference at 0000000000000080 IP: [] pnfs_choose_layoutget_stateid+0x37/0xef [nfsv4] PGD 7365b067 PUD 7365d067 PMD 0 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC Modules linked in: nfs_layout_nfsv41_files nfsv4 auth_rpcgss nfs lockd sunrpc ipt_MASQUERADE ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 iptable_nat nf_nat_ipv4 nf_nat iptable_mangle ip6table_filter ip6_tables ppdev e1000 i2c_piix4 i2c_core shpchp parport_pc parport crc32c_intel aesni_intel xts aes_x86_64 lrw gf128mul ablk_helper cryptd mptspi scsi_transport_spi mptscsih mptbase floppy autofs4 CPU 0 Pid: 27, comm: kworker/0:1 Not tainted 3.8.0-dros_cthon2013+ #4 VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform RIP: 0010:[] [] pnfs_choose_layoutget_stateid+0x37/0xef [nfsv4] RSP: 0018:ffff88007b0c1c88 EFLAGS: 00010246 RAX: ffff88006ed36678 RBX: 0000000000000000 RCX: 0000000ea877e3bc RDX: ffff88007a729da8 RSI: 0000000000000000 RDI: ffff88007a72b958 RBP: ffff88007b0c1ca8 R08: 0000000000000002 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff88007a72b958 R13: ffff88007a729da8 R14: 0000000000000000 R15: ffffffffa011077e FS: 0000000000000000(0000) GS:ffff88007f600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000080 CR3: 00000000735f8000 CR4: 00000000001407f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kworker/0:1 (pid: 27, threadinfo ffff88007b0c0000, task ffff88007c2fa0c0) Stack: ffff88006fc05388 ffff88007a72b908 ffff88007b240900 ffff88006fc05388 ffff88007b0c1cd8 ffffffffa01a2170 ffff88007b240900 ffff88007b240900 ffff88007b240970 ffffffffa011077e ffff88007b0c1ce8 ffffffffa0110791 Call Trace: [] nfs4_layoutget_prepare+0x7b/0x92 [nfsv4] [] ? __rpc_atrun+0x15/0x15 [sunrpc] [] rpc_prepare_task+0x13/0x15 [sunrpc] Reported-by: Tigran Mkrtchyan Signed-off-by: Weston Andros Adamson Cc: stable@kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index eae83bf96c6d..655237fc46df 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6129,11 +6129,13 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) static void nfs4_layoutget_release(void *calldata) { struct nfs4_layoutget *lgp = calldata; - struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); size_t max_pages = max_response_pages(server); dprintk("--> %s\n", __func__); nfs4_free_pages(lgp->args.layout.pages, max_pages); + pnfs_put_layout_hdr(NFS_I(inode)->layout); put_nfs_open_context(lgp->args.ctx); kfree(calldata); dprintk("<-- %s\n", __func__); @@ -6148,7 +6150,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { struct pnfs_layout_segment * nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) { - struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct inode *inode = lgp->args.inode; + struct nfs_server *server = NFS_SERVER(inode); size_t max_pages = max_response_pages(server); struct rpc_task *task; struct rpc_message msg = { @@ -6178,6 +6181,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); + + /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ + pnfs_get_layout_hdr(NFS_I(inode)->layout); + task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return ERR_CAST(task); From f6488c9ba51d65410e2dbc4345413c0d9120971e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 Feb 2013 20:10:34 -0500 Subject: [PATCH 05/10] nfs: don't allow nfs_find_actor to match inodes of the wrong type Benny Halevy reported the following oops when testing RHEL6: <7>nfs_update_inode: inode 892950 mode changed, 0040755 to 0100644 <1>BUG: unable to handle kernel NULL pointer dereference at (null) <1>IP: [] nfs_closedir+0x15/0x30 [nfs] <4>PGD 81448a067 PUD 831632067 PMD 0 <4>Oops: 0000 [#1] SMP <4>last sysfs file: /sys/kernel/mm/redhat_transparent_hugepage/enabled <4>CPU 6 <4>Modules linked in: fuse bonding 8021q garp ebtable_nat ebtables be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i libcxgbi cxgb3 mdio ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi softdog bridge stp llc xt_physdev ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_multiport iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables ipv6 dm_round_robin dm_multipath objlayoutdriver2(U) nfs(U) lockd fscache auth_rpcgss nfs_acl sunrpc vhost_net macvtap macvlan tun kvm_intel kvm be2net igb dca ptp pps_core microcode serio_raw sg iTCO_wdt iTCO_vendor_support i7core_edac edac_core shpchp ext4 mbcache jbd2 sd_mod crc_t10dif ahci dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan] <4> <4>Pid: 6332, comm: dd Not tainted 2.6.32-358.el6.x86_64 #1 HP ProLiant DL170e G6 /ProLiant DL170e G6 <4>RIP: 0010:[] [] nfs_closedir+0x15/0x30 [nfs] <4>RSP: 0018:ffff88081458bb98 EFLAGS: 00010292 <4>RAX: ffffffffa02a52b0 RBX: 0000000000000000 RCX: 0000000000000003 <4>RDX: ffffffffa02e45a0 RSI: ffff88081440b300 RDI: ffff88082d5f5760 <4>RBP: ffff88081458bba8 R08: 0000000000000000 R09: 0000000000000000 <4>R10: 0000000000000772 R11: 0000000000400004 R12: 0000000040000008 <4>R13: ffff88082d5f5760 R14: ffff88082d6e8800 R15: ffff88082f12d780 <4>FS: 00007f728f37e700(0000) GS:ffff8800456c0000(0000) knlGS:0000000000000000 <4>CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b <4>CR2: 0000000000000000 CR3: 0000000831279000 CR4: 00000000000007e0 <4>DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 <4>DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 <4>Process dd (pid: 6332, threadinfo ffff88081458a000, task ffff88082fa0e040) <4>Stack: <4> 0000000040000008 ffff88081440b300 ffff88081458bbf8 ffffffff81182745 <4> ffff88082d5f5760 ffff88082d6e8800 ffff88081458bbf8 ffffffffffffffea <4> ffff88082f12d780 ffff88082d6e8800 ffffffffa02a50a0 ffff88082d5f5760 <4>Call Trace: <4> [] __fput+0xf5/0x210 <4> [] ? do_open+0x0/0x20 [nfs] <4> [] fput+0x25/0x30 <4> [] __dentry_open+0x27e/0x360 <4> [] ? inotify_d_instantiate+0x2a/0x60 <4> [] lookup_instantiate_filp+0x69/0x90 <4> [] nfs_intent_set_file+0x59/0x90 [nfs] <4> [] nfs_atomic_lookup+0x1bb/0x310 [nfs] <4> [] __lookup_hash+0x102/0x160 <4> [] ? selinux_inode_permission+0x72/0xb0 <4> [] lookup_hash+0x3a/0x50 <4> [] do_filp_open+0x2eb/0xdd0 <4> [] ? __do_page_fault+0x1ec/0x480 <4> [] ? alloc_fd+0x92/0x160 <4> [] do_sys_open+0x69/0x140 <4> [] ? sys_lseek+0x66/0x80 <4> [] sys_open+0x20/0x30 <4> [] system_call_fastpath+0x16/0x1b <4>Code: 65 48 8b 04 25 c8 cb 00 00 83 a8 44 e0 ff ff 01 5b 41 5c c9 c3 90 55 48 89 e5 53 48 83 ec 08 0f 1f 44 00 00 48 8b 9e a0 00 00 00 <48> 8b 3b e8 13 0c f7 ff 48 89 df e8 ab 3d ec e0 48 83 c4 08 31 <1>RIP [] nfs_closedir+0x15/0x30 [nfs] <4> RSP <4>CR2: 0000000000000000 I think this is ultimately due to a bug on the server. The client had previously found a directory dentry. It then later tried to do an atomic open on a new (regular file) dentry. The attributes it got back had the same filehandle as the previously found directory inode. It then tried to put the filp because it failed the aops tests for O_DIRECT opens, and oopsed here because the ctx was still NULL. Obviously the root cause here is a server issue, but we can take steps to mitigate this on the client. When nfs_fhget is called, we always know what type of inode it is. In the event that there's a broken or malicious server on the other end of the wire, the client can end up crashing because the wrong ops are set on it. Have nfs_find_actor check that the inode type is correct after checking the fileid. The fileid check should rarely ever match, so it should only rarely ever get to this check. In the case where we have a broken server, we may see two different inodes with the same i_ino, but the client should be able to cope with them without crashing. This should fix the oops reported here: https://bugzilla.redhat.com/show_bug.cgi?id=913660 Reported-by: Benny Halevy Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6acc73c80d7f..f52c99f4785b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -237,6 +237,8 @@ nfs_find_actor(struct inode *inode, void *opaque) if (NFS_FILEID(inode) != fattr->fileid) return 0; + if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode)) + return 0; if (nfs_compare_fh(NFS_FH(inode), fh)) return 0; if (is_bad_inode(inode) || NFS_STALE(inode)) From 7aa262b522ee5a7f1bf9b8fe34539b262ba6da0a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 28 Feb 2013 16:19:59 -0800 Subject: [PATCH 06/10] NFSv4: Fix another open/open_recovery deadlock If we don't release the open seqid before we wait for state recovery, then we may end up deadlocking the state recovery thread. This patch addresses a new deadlock that was introduced by commit c21443c2c792cd9b463646d982b0fe48aa6feb0f (NFSv4: Fix a reboot recovery race when opening a file) Reported-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 655237fc46df..37eb38566fbc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1158,6 +1158,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) data->o_arg.fmode); iput(inode); out: + nfs_release_seqid(data->o_arg.seqid); return state; err_put_inode: iput(inode); From eb97cbb459c656b542101091f67ce112b3e56183 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 28 Feb 2013 20:30:08 -0500 Subject: [PATCH 07/10] PNFS: set the default DS timeout to 60 seconds The client should have 60 second default timeouts for DS operations, not 6 seconds. NFS4_DEF_DS_TIMEO is used as "timeout in tenths of a second" in nfs_init_timeout_values (and is not used anywhere else). This matches up with the description of the module param dataserver_timeo. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 8c07241fe52b..b8da95548d3d 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -36,7 +36,7 @@ * Default data server connection timeout and retrans vaules. * Set by module paramters dataserver_timeo and dataserver_retrans. */ -#define NFS4_DEF_DS_TIMEO 60 +#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */ #define NFS4_DEF_DS_RETRANS 5 /* From edddbb1eda61753c886a3c5e159293a7b3a9e30a Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 28 Feb 2013 20:30:09 -0500 Subject: [PATCH 08/10] SUNRPC: add call to get configured timeout Returns the configured timeout for the xprt of the rpc client. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 34206b84d8da..21d52d0dc15c 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -160,6 +160,7 @@ void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int); int rpc_protocol(struct rpc_clnt *); struct net * rpc_net_ns(struct rpc_clnt *); size_t rpc_max_payload(struct rpc_clnt *); +unsigned long rpc_get_timeout(struct rpc_clnt *clnt); void rpc_force_rebind(struct rpc_clnt *); size_t rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t); const char *rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index a9f7906c1a6a..09dc0c2b56a3 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1195,6 +1195,21 @@ size_t rpc_max_payload(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_max_payload); +/** + * rpc_get_timeout - Get timeout for transport in tenths of seconds + * @clnt: RPC client to query + */ +unsigned long rpc_get_timeout(struct rpc_clnt *clnt) +{ + unsigned long ret; + + rcu_read_lock(); + ret = rcu_dereference(clnt->cl_xprt)->timeout->to_initval; + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_get_timeout); + /** * rpc_force_rebind - force transport to check that remote port is unchanged * @clnt: client to rebind From 3000512137602b84d1ad5fd89d62984993a19bb6 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 28 Feb 2013 20:30:10 -0500 Subject: [PATCH 09/10] NFSv4.1: LAYOUTGET EDELAY loops timeout to the MDS The client will currently try LAYOUTGETs forever if a server is returning NFS4ERR_LAYOUTTRYLATER or NFS4ERR_RECALLCONFLICT - even if the client no longer needs the layout (ie process killed, unmounted). This patch uses the DS timeout value (module parameter 'dataserver_timeo' via rpc layer) to set an upper limit of how long the client tries LATOUTGETs in this situation. Once the timeout is reached, IO is redirected to the MDS. This also changes how the client checks if a layout is on the clp list to avoid a double list_add. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 ++++++++- fs/nfs/pnfs.c | 7 +++---- include/linux/nfs_xdr.h | 1 + 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 37eb38566fbc..b2671cb0f901 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -93,6 +93,8 @@ static int nfs4_map_errors(int err) return err; switch (err) { case -NFS4ERR_RESOURCE: + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: return -EREMOTEIO; case -NFS4ERR_WRONGSEC: return -EPERM; @@ -6046,6 +6048,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; struct nfs4_state *state = NULL; + unsigned long timeo, giveup; dprintk("--> %s\n", __func__); @@ -6057,7 +6060,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) goto out; case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: - task->tk_status = -NFS4ERR_DELAY; + timeo = rpc_get_timeout(task->tk_client); + giveup = lgp->args.timestamp + timeo; + if (time_after(giveup, jiffies)) + task->tk_status = -NFS4ERR_DELAY; break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: @@ -6178,6 +6184,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) return ERR_PTR(-ENOMEM); } lgp->args.layout.pglen = max_pages * PAGE_SIZE; + lgp->args.timestamp = jiffies; lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 97767c8683f9..48ac5aad6258 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1181,7 +1181,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_client *clp = server->nfs_client; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg = NULL; - bool first = false; + bool first; if (!pnfs_enabled_sb(NFS_SERVER(ino))) goto out; @@ -1215,10 +1215,9 @@ pnfs_update_layout(struct inode *ino, goto out_unlock; atomic_inc(&lo->plh_outstanding); - if (list_empty(&lo->plh_segs)) - first = true; - + first = list_empty(&lo->plh_layouts) ? true : false; spin_unlock(&ino->i_lock); + if (first) { /* The lo must be on the clp list if there is any * chance of a CB_LAYOUTRECALL(FILE) coming in. diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 29adb12c7ecf..2250cab6fc4b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -233,6 +233,7 @@ struct nfs4_layoutget_args { struct inode *inode; struct nfs_open_context *ctx; nfs4_stateid stateid; + unsigned long timestamp; struct nfs4_layoutdriver_data layout; }; From 512e4b291c0e97af24619a91f3e8963697da00d8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 2 Mar 2013 15:54:11 -0800 Subject: [PATCH 10/10] SUNRPC: One line comment fix Reported-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 09dc0c2b56a3..f922fe88b4ff 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1196,7 +1196,7 @@ size_t rpc_max_payload(struct rpc_clnt *clnt) EXPORT_SYMBOL_GPL(rpc_max_payload); /** - * rpc_get_timeout - Get timeout for transport in tenths of seconds + * rpc_get_timeout - Get timeout for transport in units of HZ * @clnt: RPC client to query */ unsigned long rpc_get_timeout(struct rpc_clnt *clnt)