diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst
index 33d588a01ace..0e98edd353b5 100644
--- a/Documentation/filesystems/nfs/exporting.rst
+++ b/Documentation/filesystems/nfs/exporting.rst
@@ -154,6 +154,11 @@ struct which has the following members:
     to find potential names, and matches inode numbers to find the correct
     match.
 
+  flags
+    Some filesystems may need to be handled differently than others. The
+    export_operations struct also includes a flags field that allows the
+    filesystem to communicate such information to nfsd. See the Export
+    Operations Flags section below for more explanation.
 
 A filehandle fragment consists of an array of 1 or more 4byte words,
 together with a one byte "type".
@@ -163,3 +168,50 @@ generated by encode_fh, in which case it will have been padded with nuls.
 Rather, the encode_fh routine should choose a "type" which indicates
 the decode_fh how much of the filehandle is valid, and how it should
 be interpreted.
+
+Export Operations Flags
+-----------------------
+In addition to the operation vector pointers, struct export_operations also
+contains a "flags" field that allows the filesystem to communicate to nfsd
+that it may want to do things differently when dealing with it. The
+following flags are defined:
+
+  EXPORT_OP_NOWCC - disable NFSv3 WCC attributes on this filesystem
+    RFC 1813 recommends that servers always send weak cache consistency
+    (WCC) data to the client after each operation. The server should
+    atomically collect attributes about the inode, do an operation on it,
+    and then collect the attributes afterward. This allows the client to
+    skip issuing GETATTRs in some situations but means that the server
+    is calling vfs_getattr for almost all RPCs. On some filesystems
+    (particularly those that are clustered or networked) this is expensive
+    and atomicity is difficult to guarantee. This flag indicates to nfsd
+    that it should skip providing WCC attributes to the client in NFSv3
+    replies when doing operations on this filesystem. Consider enabling
+    this on filesystems that have an expensive ->getattr inode operation,
+    or when atomicity between pre and post operation attribute collection
+    is impossible to guarantee.
+
+  EXPORT_OP_NOSUBTREECHK - disallow subtree checking on this fs
+    Many NFS operations deal with filehandles, which the server must then
+    vet to ensure that they live inside of an exported tree. When the
+    export consists of an entire filesystem, this is trivial. nfsd can just
+    ensure that the filehandle lives on the filesystem. When only part of
+    a filesystem is exported, however, nfsd must walk the ancestors of the
+    inode to ensure that it's within an exported subtree. This is an
+    expensive operation and not all filesystems can support it properly.
+    This flag exempts the filesystem from subtree checking and causes
+    exportfs to get back an error if it tries to enable subtree checking
+    on it.
+
+  EXPORT_OP_CLOSE_BEFORE_UNLINK - always close cached files before unlinking
+    On some exportable filesystems (such as NFS) unlinking a file that
+    is still open can cause a fair bit of extra work. For instance,
+    the NFS client will do a "sillyrename" to ensure that the file
+    sticks around while it's still open. When reexporting, that open
+    file is held by nfsd so we usually end up doing a sillyrename, and
+    then deleting the sillyrenamed file just afterward, when the link
+    count actually goes to zero. Sometimes this delete can race
+    with other operations (for instance an rmdir of the parent directory).
+    This flag causes nfsd to close any open files for this inode _before_
+    calling into the vfs to do an unlink or a rename that would replace
+    an existing file.
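To make the documentation above concrete: a filesystem opts in by setting
these bits in the .flags field of its export_operations. The following is an
illustrative sketch only (the example_* names are hypothetical; the
fs/nfs/export.c hunk below is the real in-tree user):

	static const struct export_operations example_export_ops = {
		.encode_fh	= example_encode_fh,
		.fh_to_dentry	= example_fh_to_dentry,
		.flags		= EXPORT_OP_NOWCC | EXPORT_OP_NOSUBTREECHK,
	};

With EXPORT_OP_NOSUBTREECHK set, exportfs refuses an export that requests
subtree checking (see the check_export() hunk further down); with
EXPORT_OP_NOWCC set, nfsd skips pre/post attribute collection in NFSv3
replies for that filesystem.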
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 2dd55b172d57..0106eba46d5a 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -417,9 +417,11 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, } EXPORT_SYMBOL_GPL(exportfs_encode_fh); -struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, - int fh_len, int fileid_type, - int (*acceptable)(void *, struct dentry *), void *context) +struct dentry * +exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, + int fileid_type, + int (*acceptable)(void *, struct dentry *), + void *context) { const struct export_operations *nop = mnt->mnt_sb->s_export_op; struct dentry *result, *alias;
@@ -432,10 +434,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, if (!nop || !nop->fh_to_dentry) return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); - if (PTR_ERR(result) == -ENOMEM) - return ERR_CAST(result); if (IS_ERR_OR_NULL(result)) - return ERR_PTR(-ESTALE); + return result; /* * If no acceptance criteria was specified by caller, a disconnected
@@ -561,10 +561,26 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, err_result: dput(result); - if (err != -ENOMEM) - err = -ESTALE; return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(exportfs_decode_fh_raw); + +struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, + int fh_len, int fileid_type, + int (*acceptable)(void *, struct dentry *), + void *context) +{ + struct dentry *ret; + + ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, + acceptable, context); + if (IS_ERR_OR_NULL(ret)) { + if (ret == ERR_PTR(-ENOMEM)) + return ret; + return ERR_PTR(-ESTALE); + } + return ret; +} EXPORT_SYMBOL_GPL(exportfs_decode_fh); MODULE_LICENSE("GPL");
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 08108b6d2fa1..3be6836074ae 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -697,7 +697,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, xdr_init_decode_pages(&xdr, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&xdr, scratch); status = -EIO; p = xdr_inline_decode(&xdr, 4);
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index dec5880ac6de..acb1d22907da 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -510,7 +510,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out; xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&xdr, scratch); p = xdr_inline_decode(&xdr, sizeof(__be32)); if (!p)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4e011adaf967..8a24fe20dccf 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -576,7 +576,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en goto out_nopages; xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); do { if (entry->label)
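These conversions (and the ones that follow) all share one pattern: the
caller already holds a scratch page, so the new helper presumably just
wraps the old call. A sketch, assuming the real definition lives in
include/linux/sunrpc/xdr.h (not part of this excerpt):

	static inline void
	xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page)
	{
		/* same effect as the open-coded calls being replaced */
		xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE);
	}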
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index 3430d6891e89..7412bb164fa7 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -171,4 +171,7 @@ const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, + .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| + EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| + EXPORT_OP_NOATOMIC_ATTR, };
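Besides the three flags documented in exporting.rst above, nfs_export_ops
also sets EXPORT_OP_REMOTE_FS and EXPORT_OP_NOATOMIC_ATTR, which come from
elsewhere in this series. All five bits presumably sit next to the ops
table in include/linux/exportfs.h; roughly (a sketch, with assumed values):

	#define EXPORT_OP_NOWCC			(0x1)  /* no NFSv3 WCC data */
	#define EXPORT_OP_NOSUBTREECHK		(0x2)  /* no subtree checking */
	#define EXPORT_OP_CLOSE_BEFORE_UNLINK	(0x4)  /* close files before unlink */
	#define EXPORT_OP_REMOTE_FS		(0x8)  /* filesystem is remote */
	#define EXPORT_OP_NOATOMIC_ATTR		(0x10) /* no atomic pre/post attrs */

	struct export_operations {
		...
		unsigned long	flags;
	};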
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 7f5aa0403e16..d158a500c25c 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -666,7 +666,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, return -ENOMEM; xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), * num_fh (4) */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index d913e818858f..86c3f7e69ec4 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -82,7 +82,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err; xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); /* Get the stripe count (number of stripe index) */ p = xdr_inline_decode(&stream, 4);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 24bf5797f88a..4252ce633533 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -378,7 +378,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); /* stripe unit and mirror_array_cnt */ rc = -EIO;
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 3eda40a320a5..c9b61b818ec1 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -69,7 +69,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, INIT_LIST_HEAD(&dsaddrs); xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); /* multipath count */ p = xdr_inline_decode(&stream, 4);
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 8432bd6b95f0..ea7dd8cbfac9 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1539,7 +1539,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_buffer(xdr, page_address(res->scratch), PAGE_SIZE); + xdr_set_scratch_page(xdr, res->scratch); status = decode_compound_hdr(xdr, &hdr); if (status)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c6dbfcae7517..2eabe5add344 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6403,10 +6403,8 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, struct compound_hdr hdr; int status; - if (res->acl_scratch != NULL) { - void *p = page_address(res->acl_scratch); - xdr_set_scratch_buffer(xdr, p, PAGE_SIZE); - } + if (res->acl_scratch != NULL) + xdr_set_scratch_page(xdr, res->acl_scratch); status = decode_compound_hdr(xdr, &hdr); if (status) goto out;
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index b73d9dd37f73..26f2a50eceac 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -69,10 +69,14 @@ __state_in_grace(struct net *net, bool open) if (!open) return !list_empty(grace_list); + spin_lock(&grace_lock); list_for_each_entry(lm, grace_list, list) { - if (lm->block_opens) + if (lm->block_opens) { + spin_unlock(&grace_lock); return true; + } } + spin_unlock(&grace_lock); return false; }
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 21e404e7cb68..81e7bb12aca6 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -408,6 +408,12 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid) return -EINVAL; } + if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK && + !(*flags & NFSEXP_NOSUBTREECHECK)) { + dprintk("%s: %s does not support subtree checking!\n", + __func__, inode->i_sb->s_type->name); + return -EINVAL; + } return 0; }
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 3c6c2f7d1688..d77c624c61f6 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -685,6 +685,7 @@ nfsd_file_cache_init(void) if (IS_ERR(nfsd_file_fsnotify_group)) { pr_err("nfsd: unable to create fsnotify group: %ld\n", PTR_ERR(nfsd_file_fsnotify_group)); + ret = PTR_ERR(nfsd_file_fsnotify_group); nfsd_file_fsnotify_group = NULL; goto out_notifier; }
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 6a900f770dd2..b0f66604532a 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -185,10 +185,6 @@ out: /* * XDR decode functions */ -static int nfsaclsvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) -{ - return 1; -} static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) {
@@ -255,15 +251,6 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) * XDR encode functions */ -/* - * There must be an encoding function for void results so svc_process - * will work properly. - */ -static int nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_ressize_check(rqstp, p); -} - /* GETACL */ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) {
@@ -378,10 +365,10 @@ struct nfsd3_voidargs { int dummy; }; static const struct svc_procedure nfsd_acl_procedures2[5] = { [ACLPROC2_NULL] = { .pc_func = nfsacld_proc_null, - .pc_decode = nfsaclsvc_decode_voidarg, - .pc_encode = nfsaclsvc_encode_voidres, - .pc_argsize = sizeof(struct nfsd3_voidargs), - .pc_ressize = sizeof(struct nfsd3_voidargs), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, },
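The per-version NULL argument/result helpers removed here (and in the v3 and
v4 files below) are replaced by shared ones. Judging from the removed bodies,
the common helpers plausibly look like this sketch (the real definitions land
in fs/nfsd/nfssvc.c and fs/nfsd/nfsd.h, outside this excerpt):

	struct nfsd_voidargs { };
	struct nfsd_voidres { };

	int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
	{
		return 1;	/* NULL has no arguments; nothing to decode */
	}

	int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
	{
		return xdr_ressize_check(rqstp, p);	/* reply is empty */
	}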
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 34a394e50e1d..7c30876a31a1 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -245,10 +245,10 @@ struct nfsd3_voidargs { int dummy; }; static const struct svc_procedure nfsd_acl_procedures3[3] = { [ACLPROC3_NULL] = { .pc_func = nfsd3_proc_null, - .pc_decode = nfs3svc_decode_voidarg, - .pc_encode = nfs3svc_encode_voidres, - .pc_argsize = sizeof(struct nfsd3_voidargs), - .pc_ressize = sizeof(struct nfsd3_voidargs), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, },
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index a633044b0dc1..76931f4f57c3 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -689,12 +689,9 @@ out: #define nfsd3_mkdirargs nfsd3_createargs #define nfsd3_readdirplusargs nfsd3_readdirargs #define nfsd3_fhandleargs nfsd_fhandle -#define nfsd3_fhandleres nfsd3_attrstat #define nfsd3_attrstatres nfsd3_attrstat #define nfsd3_wccstatres nfsd3_attrstat #define nfsd3_createres nfsd3_diropres -#define nfsd3_voidres nfsd3_voidargs -struct nfsd3_voidargs { int dummy; }; #define ST 1 /* status*/ #define FH 17 /* filehandle with length */
@@ -705,10 +702,10 @@ struct nfsd3_voidargs { int dummy; }; static const struct svc_procedure nfsd_procedures3[22] = { [NFS3PROC_NULL] = { .pc_func = nfsd3_proc_null, - .pc_decode = nfs3svc_decode_voidarg, - .pc_encode = nfs3svc_encode_voidres, - .pc_argsize = sizeof(struct nfsd3_voidargs), - .pc_ressize = sizeof(struct nfsd3_voidres), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, },
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2277f83da250..821db21ba072 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -206,7 +206,7 @@ static __be32 * encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) { struct dentry *dentry = fhp->fh_dentry; - if (dentry && d_really_is_positive(dentry)) { + if (!fhp->fh_no_wcc && dentry && d_really_is_positive(dentry)) { __be32 err; struct kstat stat;
@@ -259,11 +259,11 @@ void fill_pre_wcc(struct svc_fh *fhp) { struct inode *inode; struct kstat stat; + bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); __be32 err; - if (fhp->fh_pre_saved) + if (fhp->fh_no_wcc || fhp->fh_pre_saved) return; - inode = d_inode(fhp->fh_dentry); err = fh_getattr(fhp, &stat); if (err) {
@@ -272,11 +272,12 @@ void fill_pre_wcc(struct svc_fh *fhp) stat.ctime = inode->i_ctime; stat.size = inode->i_size; } + if (v4) + fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); fhp->fh_pre_mtime = stat.mtime; fhp->fh_pre_ctime = stat.ctime; fhp->fh_pre_size = stat.size; - fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); fhp->fh_pre_saved = true; }
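fill_pre_wcc() now bails out early when fh_no_wcc is set. That field is not
assigned anywhere in this hunk; presumably fh_verify()/nfsd_set_fh_dentry()
derives it from the new export flag, roughly along these lines (a
hypothetical sketch, not the authoritative assignment):

	/* sketch: flag the filehandle when its export opted out of WCC data */
	fhp->fh_no_wcc = exp->ex_path.dentry->d_sb->s_export_op->flags &
				EXPORT_OP_NOWCC;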
@@ -285,30 +286,30 @@ void fill_pre_wcc(struct svc_fh *fhp) */ void fill_post_wcc(struct svc_fh *fhp) { + bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); + struct inode *inode = d_inode(fhp->fh_dentry); __be32 err; + if (fhp->fh_no_wcc) + return; + if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, &fhp->fh_post_attr); - fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, - d_inode(fhp->fh_dentry)); if (err) { fhp->fh_post_saved = false; - /* Grab the ctime anyway - set_change_info might use it */ - fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime; + fhp->fh_post_attr.ctime = inode->i_ctime; } else fhp->fh_post_saved = true; + if (v4) + fhp->fh_post_change = + nfsd4_change_attribute(&fhp->fh_post_attr, inode); } /* * XDR decode functions */ -int -nfs3svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) -{ - return 1; -} int nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p)
@@ -642,12 +643,6 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p) * XDR encode functions */ -int -nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_ressize_check(rqstp, p); -} - /* GETATTR */ int nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p)
@@ -707,6 +702,7 @@ int nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readlinkres *resp = rqstp->rq_resp; + struct kvec *head = rqstp->rq_res.head; *p++ = resp->status; p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -720,6 +716,8 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) *p = 0; rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); } + if (svc_encode_result_payload(rqstp, head->iov_len, resp->len)) + return 0; return 1; } else return xdr_ressize_check(rqstp, p);
@@ -730,6 +728,7 @@ int nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readres *resp = rqstp->rq_resp; + struct kvec *head = rqstp->rq_res.head; *p++ = resp->status; p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -746,6 +745,9 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p) *p = 0; rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3); } + if (svc_encode_result_payload(rqstp, head->iov_len, + resp->count)) + return 0; return 1; } else return xdr_ressize_check(rqstp, p);
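Both fill_pre_wcc() and fill_post_wcc() now generate the NFSv4 change
attribute only for v4 filehandles (detected via fh_maxsize == NFS4_FHSIZE).
For reference, nfsd4_change_attribute() folds ctime and the inode version
together; one plausible shape, shown as a sketch rather than the
authoritative fs/nfsd/nfsfh.h definition:

	static inline u64 nfsd4_change_attribute(struct kstat *stat,
						 struct inode *inode)
	{
		u64 chattr;

		chattr = stat->ctime.tv_sec;	/* ctime seconds up high */
		chattr <<= 30;
		chattr += stat->ctime.tv_nsec;	/* nanoseconds below */
		chattr += inode_query_iversion(inode);
		return chattr;
	}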
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e83b21778816..4727b7f03c5b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -257,8 +257,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * in NFSv4 as in v3 except EXCLUSIVE4_1. */ current->fs->umask = open->op_umask; - status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, - open->op_fname.len, &open->op_iattr, + status = do_nfsd_create(rqstp, current_fh, open->op_fname, + open->op_fnamelen, &open->op_iattr, *resfh, open->op_createmode, (u32 *)open->op_verf.data, &open->op_truncate, &open->op_created);
@@ -283,7 +283,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * a chance to an acquire a delegation if appropriate. */ status = nfsd_lookup(rqstp, current_fh, - open->op_fname.data, open->op_fname.len, *resfh); + open->op_fname, open->op_fnamelen, *resfh); if (status) goto out; status = nfsd_check_obj_isreg(*resfh);
@@ -360,7 +360,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, bool reclaim = false; dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", - (int)open->op_fname.len, open->op_fname.data, + (int)open->op_fnamelen, open->op_fname, open->op_openowner); /* This check required by spec. */
@@ -1023,8 +1023,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_how_written = write->wr_stable_how; - nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist, - &write->wr_head, write->wr_buflen); + nvecs = svc_fill_write_vector(rqstp, write->wr_payload.pages, + write->wr_payload.head, write->wr_buflen); WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
@@ -1425,7 +1425,7 @@ static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync) return status; } -static int dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) +static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) { dst->cp_src_pos = src->cp_src_pos; dst->cp_dst_pos = src->cp_dst_pos;
@@ -1444,8 +1444,6 @@ static int dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) memcpy(&dst->stateid, &src->stateid, sizeof(src->stateid)); memcpy(&dst->c_fh, &src->c_fh, sizeof(src->c_fh)); dst->ss_mnt = src->ss_mnt; - - return 0; } static void cleanup_async_copy(struct nfsd4_copy *copy)
@@ -1539,9 +1537,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, refcount_set(&async_copy->refcount, 1); memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid, sizeof(copy->cp_stateid)); - status = dup_copy_fields(copy, async_copy); - if (status) - goto out_err; + dup_copy_fields(copy, async_copy); async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); if (IS_ERR(async_copy->copy_task))
@@ -2276,7 +2272,7 @@ static void svcxdr_init_encode(struct svc_rqst *rqstp, xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; /* Tail and page_len should be zero at this point: */ buf->len = buf->head[0].iov_len; - xdr->scratch.iov_len = 0; + xdr_reset_scratch_buffer(xdr); xdr->page_ptr = buf->pages - 1; buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) - rqstp->rq_auth_slack;
@@ -3282,7 +3278,7 @@ int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) void warn_on_nonidempotent_op(struct nfsd4_op *op) { if (OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING) { - pr_err("unable to encode reply to nonidempotent op %d (%s)\n", + pr_err("unable to encode reply to nonidempotent op %u (%s)\n", op->opnum, nfsd4_op_name(op->opnum)); WARN_ON_ONCE(1); }
@@ -3295,16 +3291,13 @@ static const char *nfsd4_op_name(unsigned opnum) return "unknown_operation"; } -#define nfsd4_voidres nfsd4_voidargs -struct nfsd4_voidargs { int dummy; }; - static const struct svc_procedure nfsd_procedures4[2] = { [NFSPROC4_NULL] = { .pc_func = nfsd4_proc_null, - .pc_decode = nfs4svc_decode_voidarg, - .pc_encode = nfs4svc_encode_voidres, - .pc_argsize = sizeof(struct nfsd4_voidargs), - .pc_ressize = sizeof(struct nfsd4_voidres), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 1, },
diff --git a/fs/nfsd/nfs4state.c
b/fs/nfsd/nfs4state.c index d7f27ed6b794..1d2cd6a88f61 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -769,6 +769,7 @@ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid, spin_lock(&nn->s2s_cp_lock); new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT); stid->stid.si_opaque.so_id = new_id; + stid->stid.si_generation = 1; spin_unlock(&nn->s2s_cp_lock); idr_preload_end(); if (new_id < 0) @@ -3066,7 +3067,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, rpc_ntop(sa, addr_str, sizeof(addr_str)); dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " - "ip_addr=%s flags %x, spa_how %d\n", + "ip_addr=%s flags %x, spa_how %u\n", __func__, rqstp, exid, exid->clname.len, exid->clname.data, addr_str, exid->flags, exid->spa_how); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 833a2c64dfe8..45ee6b12ce5b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -54,6 +54,8 @@ #include "pnfs.h" #include "filecache.h" +#include "trace.h" + #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include #endif @@ -90,6 +92,8 @@ check_filename(char *str, int len) if (len == 0) return nfserr_inval; + if (len > NFS4_MAXNAMLEN) + return nfserr_nametoolong; if (isdotent(str, len)) return nfserr_badname; for (i = 0; i < len; i++) @@ -98,122 +102,6 @@ check_filename(char *str, int len) return 0; } -#define DECODE_HEAD \ - __be32 *p; \ - __be32 status -#define DECODE_TAIL \ - status = 0; \ -out: \ - return status; \ -xdr_error: \ - dprintk("NFSD: xdr error (%s:%d)\n", \ - __FILE__, __LINE__); \ - status = nfserr_bad_xdr; \ - goto out - -#define READMEM(x,nbytes) do { \ - x = (char *)p; \ - p += XDR_QUADLEN(nbytes); \ -} while (0) -#define SAVEMEM(x,nbytes) do { \ - if (!(x = (p==argp->tmp || p == argp->tmpp) ? \ - savemem(argp, p, nbytes) : \ - (char *)p)) { \ - dprintk("NFSD: xdr error (%s:%d)\n", \ - __FILE__, __LINE__); \ - goto xdr_error; \ - } \ - p += XDR_QUADLEN(nbytes); \ -} while (0) -#define COPYMEM(x,nbytes) do { \ - memcpy((x), p, nbytes); \ - p += XDR_QUADLEN(nbytes); \ -} while (0) - -/* READ_BUF, read_buf(): nbytes must be <= PAGE_SIZE */ -#define READ_BUF(nbytes) do { \ - if (nbytes <= (u32)((char *)argp->end - (char *)argp->p)) { \ - p = argp->p; \ - argp->p += XDR_QUADLEN(nbytes); \ - } else if (!(p = read_buf(argp, nbytes))) { \ - dprintk("NFSD: xdr error (%s:%d)\n", \ - __FILE__, __LINE__); \ - goto xdr_error; \ - } \ -} while (0) - -static void next_decode_page(struct nfsd4_compoundargs *argp) -{ - argp->p = page_address(argp->pagelist[0]); - argp->pagelist++; - if (argp->pagelen < PAGE_SIZE) { - argp->end = argp->p + XDR_QUADLEN(argp->pagelen); - argp->pagelen = 0; - } else { - argp->end = argp->p + (PAGE_SIZE>>2); - argp->pagelen -= PAGE_SIZE; - } -} - -static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) -{ - /* We want more bytes than seem to be available. - * Maybe we need a new page, maybe we have just run out - */ - unsigned int avail = (char *)argp->end - (char *)argp->p; - __be32 *p; - - if (argp->pagelen == 0) { - struct kvec *vec = &argp->rqstp->rq_arg.tail[0]; - - if (!argp->tail) { - argp->tail = true; - avail = vec->iov_len; - argp->p = vec->iov_base; - argp->end = vec->iov_base + avail; - } - - if (avail < nbytes) - return NULL; - - p = argp->p; - argp->p += XDR_QUADLEN(nbytes); - return p; - } - - if (avail + argp->pagelen < nbytes) - return NULL; - if (avail + PAGE_SIZE < nbytes) /* need more than a page !! 
*/ - return NULL; - /* ok, we can do it with the current plus the next page */ - if (nbytes <= sizeof(argp->tmp)) - p = argp->tmp; - else { - kfree(argp->tmpp); - p = argp->tmpp = kmalloc(nbytes, GFP_KERNEL); - if (!p) - return NULL; - - } - /* - * The following memcpy is safe because read_buf is always - * called with nbytes > avail, and the two cases above both - * guarantee p points to at least nbytes bytes. - */ - memcpy(p, argp->p, avail); - next_decode_page(argp); - memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); - argp->p += XDR_QUADLEN(nbytes - avail); - return p; -} - -static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp) -{ - unsigned int this = (char *)argp->end - (char *)argp->p; - - return this + argp->pagelen; -} - static int zero_clientid(clientid_t *clid) { return (clid->cl_boot == 0) && (clid->cl_id == 0); @@ -259,118 +147,243 @@ svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len) return p; } -static __be32 -svcxdr_construct_vector(struct nfsd4_compoundargs *argp, struct kvec *head, - struct page ***pagelist, u32 buflen) -{ - int avail; - int len; - int pages; - - /* Sorry .. no magic macros for this.. * - * READ_BUF(write->wr_buflen); - * SAVEMEM(write->wr_buf, write->wr_buflen); - */ - avail = (char *)argp->end - (char *)argp->p; - if (avail + argp->pagelen < buflen) { - dprintk("NFSD: xdr error (%s:%d)\n", - __FILE__, __LINE__); - return nfserr_bad_xdr; - } - head->iov_base = argp->p; - head->iov_len = avail; - *pagelist = argp->pagelist; - - len = XDR_QUADLEN(buflen) << 2; - if (len >= avail) { - len -= avail; - - pages = len >> PAGE_SHIFT; - argp->pagelist += pages; - argp->pagelen -= pages * PAGE_SIZE; - len -= pages * PAGE_SIZE; - - next_decode_page(argp); - } - argp->p += XDR_QUADLEN(len); - - return 0; -} - -/** - * savemem - duplicate a chunk of memory for later processing - * @argp: NFSv4 compound argument structure to be freed with - * @p: pointer to be duplicated - * @nbytes: length to be duplicated - * - * Returns a pointer to a copy of @nbytes bytes of memory at @p - * that are preserved until processing of the NFSv4 compound - * operation described by @argp finishes. +/* + * NFSv4 basic data type decoders */ -static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) -{ - void *ret; - ret = svcxdr_tmpalloc(argp, nbytes); - if (!ret) - return NULL; - memcpy(ret, p, nbytes); - return ret; +/* + * This helper handles variable-length opaques which belong to protocol + * elements that this implementation does not support. 
+ */ +static __be32 +nfsd4_decode_ignored_string(struct nfsd4_compoundargs *argp, u32 maxlen) +{ + u32 len; + + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) + return nfserr_bad_xdr; + if (maxlen && len > maxlen) + return nfserr_bad_xdr; + if (!xdr_inline_decode(argp->xdr, len)) + return nfserr_bad_xdr; + + return nfs_ok; } static __be32 -nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec64 *tv) +nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) { - DECODE_HEAD; + __be32 *p; + u32 len; - READ_BUF(12); + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) + return nfserr_bad_xdr; + if (len == 0 || len > NFS4_OPAQUE_LIMIT) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, len); + if (!p) + return nfserr_bad_xdr; + o->data = svcxdr_tmpalloc(argp, len); + if (!o->data) + return nfserr_jukebox; + o->len = len; + memcpy(o->data, p, len); + + return nfs_ok; +} + +static __be32 +nfsd4_decode_component4(struct nfsd4_compoundargs *argp, char **namp, u32 *lenp) +{ + __be32 *p, status; + + if (xdr_stream_decode_u32(argp->xdr, lenp) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, *lenp); + if (!p) + return nfserr_bad_xdr; + status = check_filename((char *)p, *lenp); + if (status) + return status; + *namp = svcxdr_tmpalloc(argp, *lenp); + if (!*namp) + return nfserr_jukebox; + memcpy(*namp, p, *lenp); + + return nfs_ok; +} + +static __be32 +nfsd4_decode_nfstime4(struct nfsd4_compoundargs *argp, struct timespec64 *tv) +{ + __be32 *p; + + p = xdr_inline_decode(argp->xdr, XDR_UNIT * 3); + if (!p) + return nfserr_bad_xdr; p = xdr_decode_hyper(p, &tv->tv_sec); tv->tv_nsec = be32_to_cpup(p++); if (tv->tv_nsec >= (u32)1000000000) return nfserr_inval; - - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) +nfsd4_decode_verifier4(struct nfsd4_compoundargs *argp, nfs4_verifier *verf) { - u32 bmlen; - DECODE_HEAD; + __be32 *p; - bmval[0] = 0; - bmval[1] = 0; - bmval[2] = 0; + p = xdr_inline_decode(argp->xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_bad_xdr; + memcpy(verf->data, p, sizeof(verf->data)); + return nfs_ok; +} - READ_BUF(4); - bmlen = be32_to_cpup(p++); - if (bmlen > 1000) - goto xdr_error; +/** + * nfsd4_decode_bitmap4 - Decode an NFSv4 bitmap4 + * @argp: NFSv4 compound argument structure + * @bmval: pointer to an array of u32's to decode into + * @bmlen: size of the @bmval array + * + * The server needs to return nfs_ok rather than nfserr_bad_xdr when + * encountering bitmaps containing bits it does not recognize. This + * includes bits in bitmap words past WORDn, where WORDn is the last + * bitmap WORD the implementation currently supports. Thus we are + * careful here to simply ignore bits in bitmap words that this + * implementation has yet to support explicitly. 
+ * + * Return values: + * %nfs_ok: @bmval populated successfully + * %nfserr_bad_xdr: the encoded bitmap was invalid + */ +static __be32 +nfsd4_decode_bitmap4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen) +{ + u32 i, count; + __be32 *p; - READ_BUF(bmlen << 2); - if (bmlen > 0) - bmval[0] = be32_to_cpup(p++); - if (bmlen > 1) - bmval[1] = be32_to_cpup(p++); - if (bmlen > 2) - bmval[2] = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + /* request sanity */ + if (count > 1000) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, count << 2); + if (!p) + return nfserr_bad_xdr; + i = 0; + while (i < count) + bmval[i++] = be32_to_cpup(p++); + while (i < bmlen) + bmval[i++] = 0; - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, - struct iattr *iattr, struct nfs4_acl **acl, - struct xdr_netobj *label, int *umask) +nfsd4_decode_nfsace4(struct nfsd4_compoundargs *argp, struct nfs4_ace *ace) { - int expected_len, len = 0; - u32 dummy32; - char *buf; + __be32 *p, status; + u32 length; + + if (xdr_stream_decode_u32(argp->xdr, &ace->type) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &ace->flag) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &ace->access_mask) < 0) + return nfserr_bad_xdr; + + if (xdr_stream_decode_u32(argp->xdr, &length) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, length); + if (!p) + return nfserr_bad_xdr; + ace->whotype = nfs4_acl_get_whotype((char *)p, length); + if (ace->whotype != NFS4_ACL_WHO_NAMED) + status = nfs_ok; + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + status = nfsd_map_name_to_gid(argp->rqstp, + (char *)p, length, &ace->who_gid); + else + status = nfsd_map_name_to_uid(argp->rqstp, + (char *)p, length, &ace->who_uid); + + return status; +} + +/* A counted array of nfsace4's */ +static noinline __be32 +nfsd4_decode_acl(struct nfsd4_compoundargs *argp, struct nfs4_acl **acl) +{ + struct nfs4_ace *ace; + __be32 status; + u32 count; + + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + + if (count > xdr_stream_remaining(argp->xdr) / 20) + /* + * Even with 4-byte names there wouldn't be + * space for that many aces; something fishy is + * going on: + */ + return nfserr_fbig; + + *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(count)); + if (*acl == NULL) + return nfserr_jukebox; + + (*acl)->naces = count; + for (ace = (*acl)->aces; ace < (*acl)->aces + count; ace++) { + status = nfsd4_decode_nfsace4(argp, ace); + if (status) + return status; + } + + return nfs_ok; +} + +static noinline __be32 +nfsd4_decode_security_label(struct nfsd4_compoundargs *argp, + struct xdr_netobj *label) +{ + u32 lfs, pi, length; + __be32 *p; + + if (xdr_stream_decode_u32(argp->xdr, &lfs) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &pi) < 0) + return nfserr_bad_xdr; + + if (xdr_stream_decode_u32(argp->xdr, &length) < 0) + return nfserr_bad_xdr; + if (length > NFS4_MAXLABELLEN) + return nfserr_badlabel; + p = xdr_inline_decode(argp->xdr, length); + if (!p) + return nfserr_bad_xdr; + label->len = length; + label->data = svcxdr_dupstr(argp, p, length); + if (!label->data) + return nfserr_jukebox; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen, + struct iattr *iattr, struct nfs4_acl **acl, + struct xdr_netobj *label, int *umask) +{ + unsigned int starting_pos; + u32 attrlist4_count; + 
__be32 *p, status; - DECODE_HEAD; iattr->ia_valid = 0; - if ((status = nfsd4_decode_bitmap(argp, bmval))) - return status; + status = nfsd4_decode_bitmap4(argp, bmval, bmlen); + if (status) + return nfserr_bad_xdr; if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 @@ -380,96 +393,69 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_attrnotsupp; } - READ_BUF(4); - expected_len = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &attrlist4_count) < 0) + return nfserr_bad_xdr; + starting_pos = xdr_stream_pos(argp->xdr); if (bmval[0] & FATTR4_WORD0_SIZE) { - READ_BUF(8); - len += 8; - p = xdr_decode_hyper(p, &iattr->ia_size); + u64 size; + + if (xdr_stream_decode_u64(argp->xdr, &size) < 0) + return nfserr_bad_xdr; + iattr->ia_size = size; iattr->ia_valid |= ATTR_SIZE; } if (bmval[0] & FATTR4_WORD0_ACL) { - u32 nace; - struct nfs4_ace *ace; - - READ_BUF(4); len += 4; - nace = be32_to_cpup(p++); - - if (nace > compoundargs_bytes_left(argp)/20) - /* - * Even with 4-byte names there wouldn't be - * space for that many aces; something fishy is - * going on: - */ - return nfserr_fbig; - - *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace)); - if (*acl == NULL) - return nfserr_jukebox; - - (*acl)->naces = nace; - for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { - READ_BUF(16); len += 16; - ace->type = be32_to_cpup(p++); - ace->flag = be32_to_cpup(p++); - ace->access_mask = be32_to_cpup(p++); - dummy32 = be32_to_cpup(p++); - READ_BUF(dummy32); - len += XDR_QUADLEN(dummy32) << 2; - READMEM(buf, dummy32); - ace->whotype = nfs4_acl_get_whotype(buf, dummy32); - status = nfs_ok; - if (ace->whotype != NFS4_ACL_WHO_NAMED) - ; - else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) - status = nfsd_map_name_to_gid(argp->rqstp, - buf, dummy32, &ace->who_gid); - else - status = nfsd_map_name_to_uid(argp->rqstp, - buf, dummy32, &ace->who_uid); - if (status) - return status; - } + status = nfsd4_decode_acl(argp, acl); + if (status) + return status; } else *acl = NULL; if (bmval[1] & FATTR4_WORD1_MODE) { - READ_BUF(4); - len += 4; - iattr->ia_mode = be32_to_cpup(p++); + u32 mode; + + if (xdr_stream_decode_u32(argp->xdr, &mode) < 0) + return nfserr_bad_xdr; + iattr->ia_mode = mode; iattr->ia_mode &= (S_IFMT | S_IALLUGO); iattr->ia_valid |= ATTR_MODE; } if (bmval[1] & FATTR4_WORD1_OWNER) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - READ_BUF(dummy32); - len += (XDR_QUADLEN(dummy32) << 2); - READMEM(buf, dummy32); - if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) + u32 length; + + if (xdr_stream_decode_u32(argp->xdr, &length) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, length); + if (!p) + return nfserr_bad_xdr; + status = nfsd_map_name_to_uid(argp->rqstp, (char *)p, length, + &iattr->ia_uid); + if (status) return status; iattr->ia_valid |= ATTR_UID; } if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - READ_BUF(dummy32); - len += (XDR_QUADLEN(dummy32) << 2); - READMEM(buf, dummy32); - if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) + u32 length; + + if (xdr_stream_decode_u32(argp->xdr, &length) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, length); + if (!p) + return nfserr_bad_xdr; + status = nfsd_map_name_to_gid(argp->rqstp, (char *)p, length, + &iattr->ia_gid); + if (status) return status; iattr->ia_valid |= ATTR_GID; } if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { - 
READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - switch (dummy32) { + u32 set_it; + + if (xdr_stream_decode_u32(argp->xdr, &set_it) < 0) + return nfserr_bad_xdr; + switch (set_it) { case NFS4_SET_TO_CLIENT_TIME: - len += 12; - status = nfsd4_decode_time(argp, &iattr->ia_atime); + status = nfsd4_decode_nfstime4(argp, &iattr->ia_atime); if (status) return status; iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); @@ -478,17 +464,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, iattr->ia_valid |= ATTR_ATIME; break; default: - goto xdr_error; + return nfserr_bad_xdr; } } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - switch (dummy32) { + u32 set_it; + + if (xdr_stream_decode_u32(argp->xdr, &set_it) < 0) + return nfserr_bad_xdr; + switch (set_it) { case NFS4_SET_TO_CLIENT_TIME: - len += 12; - status = nfsd4_decode_time(argp, &iattr->ia_mtime); + status = nfsd4_decode_nfstime4(argp, &iattr->ia_mtime); if (status) return status; iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); @@ -497,222 +483,329 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, iattr->ia_valid |= ATTR_MTIME; break; default: - goto xdr_error; + return nfserr_bad_xdr; } } - label->len = 0; if (IS_ENABLED(CONFIG_NFSD_V4_SECURITY_LABEL) && bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */ - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */ - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - READ_BUF(dummy32); - if (dummy32 > NFS4_MAXLABELLEN) - return nfserr_badlabel; - len += (XDR_QUADLEN(dummy32) << 2); - READMEM(buf, dummy32); - label->len = dummy32; - label->data = svcxdr_dupstr(argp, buf, dummy32); - if (!label->data) - return nfserr_jukebox; + status = nfsd4_decode_security_label(argp, label); + if (status) + return status; } if (bmval[2] & FATTR4_WORD2_MODE_UMASK) { + u32 mode, mask; + if (!umask) - goto xdr_error; - READ_BUF(8); - len += 8; - dummy32 = be32_to_cpup(p++); - iattr->ia_mode = dummy32 & (S_IFMT | S_IALLUGO); - dummy32 = be32_to_cpup(p++); - *umask = dummy32 & S_IRWXUGO; + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &mode) < 0) + return nfserr_bad_xdr; + iattr->ia_mode = mode & (S_IFMT | S_IALLUGO); + if (xdr_stream_decode_u32(argp->xdr, &mask) < 0) + return nfserr_bad_xdr; + *umask = mask & S_IRWXUGO; iattr->ia_valid |= ATTR_MODE; } - if (len != expected_len) - goto xdr_error; - DECODE_TAIL; + /* request sanity: did attrlist4 contain the expected number of words? 
*/ + if (attrlist4_count != xdr_stream_pos(argp->xdr) - starting_pos) + return nfserr_bad_xdr; + + return nfs_ok; } static __be32 -nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) +nfsd4_decode_stateid4(struct nfsd4_compoundargs *argp, stateid_t *sid) { - DECODE_HEAD; + __be32 *p; - READ_BUF(sizeof(stateid_t)); + p = xdr_inline_decode(argp->xdr, NFS4_STATEID_SIZE); + if (!p) + return nfserr_bad_xdr; sid->si_generation = be32_to_cpup(p++); - COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); - - DECODE_TAIL; + memcpy(&sid->si_opaque, p, sizeof(sid->si_opaque)); + return nfs_ok; } static __be32 -nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) +nfsd4_decode_clientid4(struct nfsd4_compoundargs *argp, clientid_t *clientid) { - DECODE_HEAD; + __be32 *p; - READ_BUF(4); - access->ac_req_access = be32_to_cpup(p++); - - DECODE_TAIL; + p = xdr_inline_decode(argp->xdr, sizeof(__be64)); + if (!p) + return nfserr_bad_xdr; + memcpy(clientid, p, sizeof(*clientid)); + return nfs_ok; } -static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) +static __be32 +nfsd4_decode_state_owner4(struct nfsd4_compoundargs *argp, + clientid_t *clientid, struct xdr_netobj *owner) { - DECODE_HEAD; - struct user_namespace *userns = nfsd_user_namespace(argp->rqstp); - u32 dummy, uid, gid; - char *machine_name; - int i; - int nr_secflavs; + __be32 status; + + status = nfsd4_decode_clientid4(argp, clientid); + if (status) + return status; + return nfsd4_decode_opaque(argp, owner); +} + +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_decode_deviceid4(struct nfsd4_compoundargs *argp, + struct nfsd4_deviceid *devid) +{ + __be32 *p; + + p = xdr_inline_decode(argp->xdr, NFS4_DEVICEID4_SIZE); + if (!p) + return nfserr_bad_xdr; + memcpy(devid, p, sizeof(*devid)); + return nfs_ok; +} + +static __be32 +nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutcommit *lcp) +{ + if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0) + return nfserr_bad_xdr; + if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES) + return nfserr_bad_xdr; + if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX) + return nfserr_bad_xdr; + + if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0) + return nfserr_bad_xdr; + if (lcp->lc_up_len > 0) { + lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len); + if (!lcp->lc_up_layout) + return nfserr_bad_xdr; + } + + return nfs_ok; +} + +static __be32 +nfsd4_decode_layoutreturn4(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutreturn *lrp) +{ + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_return_type) < 0) + return nfserr_bad_xdr; + switch (lrp->lr_return_type) { + case RETURN_FILE: + if (xdr_stream_decode_u64(argp->xdr, &lrp->lr_seg.offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lrp->lr_seg.length) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lrp->lr_sid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &lrp->lrf_body_len) < 0) + return nfserr_bad_xdr; + if (lrp->lrf_body_len > 0) { + lrp->lrf_body = xdr_inline_decode(argp->xdr, lrp->lrf_body_len); + if (!lrp->lrf_body) + return nfserr_bad_xdr; + } + break; + case RETURN_FSID: + case RETURN_ALL: + lrp->lr_seg.offset = 0; + lrp->lr_seg.length = NFS4_MAX_UINT64; + break; + default: + return nfserr_bad_xdr; + } + + return nfs_ok; +} + +#endif /* CONFIG_NFSD_PNFS */ + +static __be32 +nfsd4_decode_sessionid4(struct nfsd4_compoundargs *argp, + 
struct nfs4_sessionid *sessionid) +{ + __be32 *p; + + p = xdr_inline_decode(argp->xdr, NFS4_MAX_SESSIONID_LEN); + if (!p) + return nfserr_bad_xdr; + memcpy(sessionid->data, p, sizeof(sessionid->data)); + return nfs_ok; +} + +/* Defined in Appendix A of RFC 5531 */ +static __be32 +nfsd4_decode_authsys_parms(struct nfsd4_compoundargs *argp, + struct nfsd4_cb_sec *cbs) +{ + u32 stamp, gidcount, uid, gid; + __be32 *p, status; + + if (xdr_stream_decode_u32(argp->xdr, &stamp) < 0) + return nfserr_bad_xdr; + /* machine name */ + status = nfsd4_decode_ignored_string(argp, 255); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &uid) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &gid) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &gidcount) < 0) + return nfserr_bad_xdr; + if (gidcount > 16) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, gidcount << 2); + if (!p) + return nfserr_bad_xdr; + if (cbs->flavor == (u32)(-1)) { + struct user_namespace *userns = nfsd_user_namespace(argp->rqstp); + + kuid_t kuid = make_kuid(userns, uid); + kgid_t kgid = make_kgid(userns, gid); + if (uid_valid(kuid) && gid_valid(kgid)) { + cbs->uid = kuid; + cbs->gid = kgid; + cbs->flavor = RPC_AUTH_UNIX; + } else { + dprintk("RPC_AUTH_UNIX with invalid uid or gid, ignoring!\n"); + } + } + + return nfs_ok; +} + +static __be32 +nfsd4_decode_gss_cb_handles4(struct nfsd4_compoundargs *argp, + struct nfsd4_cb_sec *cbs) +{ + __be32 status; + u32 service; + + dprintk("RPC_AUTH_GSS callback secflavor not supported!\n"); + + if (xdr_stream_decode_u32(argp->xdr, &service) < 0) + return nfserr_bad_xdr; + if (service < RPC_GSS_SVC_NONE || service > RPC_GSS_SVC_PRIVACY) + return nfserr_bad_xdr; + /* gcbp_handle_from_server */ + status = nfsd4_decode_ignored_string(argp, 0); + if (status) + return status; + /* gcbp_handle_from_client */ + status = nfsd4_decode_ignored_string(argp, 0); + if (status) + return status; + + return nfs_ok; +} + +/* a counted array of callback_sec_parms4 items */ +static __be32 +nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) +{ + u32 i, secflavor, nr_secflavs; + __be32 status; /* callback_sec_params4 */ - READ_BUF(4); - nr_secflavs = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &nr_secflavs) < 0) + return nfserr_bad_xdr; if (nr_secflavs) cbs->flavor = (u32)(-1); else /* Is this legal? 
Be generous, take it to mean AUTH_NONE: */ cbs->flavor = 0; + for (i = 0; i < nr_secflavs; ++i) { - READ_BUF(4); - dummy = be32_to_cpup(p++); - switch (dummy) { + if (xdr_stream_decode_u32(argp->xdr, &secflavor) < 0) + return nfserr_bad_xdr; + switch (secflavor) { case RPC_AUTH_NULL: - /* Nothing to read */ + /* void */ if (cbs->flavor == (u32)(-1)) cbs->flavor = RPC_AUTH_NULL; break; case RPC_AUTH_UNIX: - READ_BUF(8); - /* stamp */ - dummy = be32_to_cpup(p++); - - /* machine name */ - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - SAVEMEM(machine_name, dummy); - - /* uid, gid */ - READ_BUF(8); - uid = be32_to_cpup(p++); - gid = be32_to_cpup(p++); - - /* more gids */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - if (cbs->flavor == (u32)(-1)) { - kuid_t kuid = make_kuid(userns, uid); - kgid_t kgid = make_kgid(userns, gid); - if (uid_valid(kuid) && gid_valid(kgid)) { - cbs->uid = kuid; - cbs->gid = kgid; - cbs->flavor = RPC_AUTH_UNIX; - } else { - dprintk("RPC_AUTH_UNIX with invalid" - "uid or gid ignoring!\n"); - } - } + status = nfsd4_decode_authsys_parms(argp, cbs); + if (status) + return status; break; case RPC_AUTH_GSS: - dprintk("RPC_AUTH_GSS callback secflavor " - "not supported!\n"); - READ_BUF(8); - /* gcbp_service */ - dummy = be32_to_cpup(p++); - /* gcbp_handle_from_server */ - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); - /* gcbp_handle_from_client */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); + status = nfsd4_decode_gss_cb_handles4(argp, cbs); + if (status) + return status; break; default: - dprintk("Illegal callback secflavor\n"); return nfserr_inval; } } - DECODE_TAIL; + + return nfs_ok; } -static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) + +/* + * NFSv4 operation argument decoders + */ + +static __be32 +nfsd4_decode_access(struct nfsd4_compoundargs *argp, + struct nfsd4_access *access) { - DECODE_HEAD; - - READ_BUF(4); - bc->bc_cb_program = be32_to_cpup(p++); - nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); - - DECODE_TAIL; -} - -static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) -{ - DECODE_HEAD; - - READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); - COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); - bcts->dir = be32_to_cpup(p++); - /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker - * could help us figure out we should be using it. 
*/ - DECODE_TAIL; + if (xdr_stream_decode_u32(argp->xdr, &access->ac_req_access) < 0) + return nfserr_bad_xdr; + return nfs_ok; } static __be32 nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) { - DECODE_HEAD; - - READ_BUF(4); - close->cl_seqid = be32_to_cpup(p++); - return nfsd4_decode_stateid(argp, &close->cl_stateid); - - DECODE_TAIL; + if (xdr_stream_decode_u32(argp->xdr, &close->cl_seqid) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_stateid4(argp, &close->cl_stateid); } static __be32 nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) { - DECODE_HEAD; - - READ_BUF(12); - p = xdr_decode_hyper(p, &commit->co_offset); - commit->co_count = be32_to_cpup(p++); - - DECODE_TAIL; + if (xdr_stream_decode_u64(argp->xdr, &commit->co_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0) + return nfserr_bad_xdr; + return nfs_ok; } static __be32 nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) { - DECODE_HEAD; + __be32 *p, status; - READ_BUF(4); - create->cr_type = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &create->cr_type) < 0) + return nfserr_bad_xdr; switch (create->cr_type) { case NF4LNK: - READ_BUF(4); - create->cr_datalen = be32_to_cpup(p++); - READ_BUF(create->cr_datalen); + if (xdr_stream_decode_u32(argp->xdr, &create->cr_datalen) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, create->cr_datalen); + if (!p) + return nfserr_bad_xdr; create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen); if (!create->cr_data) return nfserr_jukebox; break; case NF4BLK: case NF4CHR: - READ_BUF(8); - create->cr_specdata1 = be32_to_cpup(p++); - create->cr_specdata2 = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &create->cr_specdata1) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &create->cr_specdata2) < 0) + return nfserr_bad_xdr; break; case NF4SOCK: case NF4FIFO: @@ -720,151 +813,210 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create default: break; } - - READ_BUF(4); - create->cr_namelen = be32_to_cpup(p++); - READ_BUF(create->cr_namelen); - SAVEMEM(create->cr_name, create->cr_namelen); - if ((status = check_filename(create->cr_name, create->cr_namelen))) + status = nfsd4_decode_component4(argp, &create->cr_name, + &create->cr_namelen); + if (status) + return status; + status = nfsd4_decode_fattr4(argp, create->cr_bmval, + ARRAY_SIZE(create->cr_bmval), + &create->cr_iattr, &create->cr_acl, + &create->cr_label, &create->cr_umask); + if (status) return status; - status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, - &create->cr_acl, &create->cr_label, - &create->cr_umask); - if (status) - goto out; - - DECODE_TAIL; + return nfs_ok; } static inline __be32 nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) { - return nfsd4_decode_stateid(argp, &dr->dr_stateid); + return nfsd4_decode_stateid4(argp, &dr->dr_stateid); } static inline __be32 nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) { - return nfsd4_decode_bitmap(argp, getattr->ga_bmval); + return nfsd4_decode_bitmap4(argp, getattr->ga_bmval, + ARRAY_SIZE(getattr->ga_bmval)); } static __be32 nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) { - DECODE_HEAD; + return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen); +} - READ_BUF(4); - link->li_namelen = be32_to_cpup(p++); - 
READ_BUF(link->li_namelen); - SAVEMEM(link->li_name, link->li_namelen); - if ((status = check_filename(link->li_name, link->li_namelen))) +static __be32 +nfsd4_decode_open_to_lock_owner4(struct nfsd4_compoundargs *argp, + struct nfsd4_lock *lock) +{ + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_new_open_seqid) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lock->lk_new_open_stateid); + if (status) return status; + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_new_lock_seqid) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_state_owner4(argp, &lock->lk_new_clientid, + &lock->lk_new_owner); +} - DECODE_TAIL; +static __be32 +nfsd4_decode_exist_lock_owner4(struct nfsd4_compoundargs *argp, + struct nfsd4_lock *lock) +{ + __be32 status; + + status = nfsd4_decode_stateid4(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_old_lock_seqid) < 0) + return nfserr_bad_xdr; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +{ + if (xdr_stream_decode_bool(argp->xdr, &lock->lk_is_new) < 0) + return nfserr_bad_xdr; + if (lock->lk_is_new) + return nfsd4_decode_open_to_lock_owner4(argp, lock); + return nfsd4_decode_exist_lock_owner4(argp, lock); } static __be32 nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) { - DECODE_HEAD; - - /* - * type, reclaim(boolean), offset, length, new_lock_owner(boolean) - */ - READ_BUF(28); - lock->lk_type = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0) + return nfserr_bad_xdr; if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) - goto xdr_error; - lock->lk_reclaim = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &lock->lk_offset); - p = xdr_decode_hyper(p, &lock->lk_length); - lock->lk_is_new = be32_to_cpup(p++); - - if (lock->lk_is_new) { - READ_BUF(4); - lock->lk_new_open_seqid = be32_to_cpup(p++); - status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); - if (status) - return status; - READ_BUF(8 + sizeof(clientid_t)); - lock->lk_new_lock_seqid = be32_to_cpup(p++); - COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); - lock->lk_new_owner.len = be32_to_cpup(p++); - READ_BUF(lock->lk_new_owner.len); - READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); - } else { - status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); - if (status) - return status; - READ_BUF(4); - lock->lk_old_lock_seqid = be32_to_cpup(p++); - } - - DECODE_TAIL; + return nfserr_bad_xdr; + if (xdr_stream_decode_bool(argp->xdr, &lock->lk_reclaim) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lock->lk_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lock->lk_length) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_locker4(argp, lock); } static __be32 nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) { - DECODE_HEAD; - - READ_BUF(32); - lockt->lt_type = be32_to_cpup(p++); - if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) - goto xdr_error; - p = xdr_decode_hyper(p, &lockt->lt_offset); - p = xdr_decode_hyper(p, &lockt->lt_length); - COPYMEM(&lockt->lt_clientid, 8); - lockt->lt_owner.len = be32_to_cpup(p++); - READ_BUF(lockt->lt_owner.len); - READMEM(lockt->lt_owner.data, lockt->lt_owner.len); - - DECODE_TAIL; + if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0) + return nfserr_bad_xdr; + if ((lockt->lt_type < 
NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lockt->lt_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lockt->lt_length) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_state_owner4(argp, &lockt->lt_clientid, + &lockt->lt_owner); } static __be32 nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) { - DECODE_HEAD; + __be32 status; - READ_BUF(8); - locku->lu_type = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &locku->lu_type) < 0) + return nfserr_bad_xdr; if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) - goto xdr_error; - locku->lu_seqid = be32_to_cpup(p++); - status = nfsd4_decode_stateid(argp, &locku->lu_stateid); + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &locku->lu_seqid) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &locku->lu_stateid); if (status) return status; - READ_BUF(16); - p = xdr_decode_hyper(p, &locku->lu_offset); - p = xdr_decode_hyper(p, &locku->lu_length); + if (xdr_stream_decode_u64(argp->xdr, &locku->lu_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &locku->lu_length) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) { - DECODE_HEAD; + return nfsd4_decode_component4(argp, &lookup->lo_name, &lookup->lo_len); +} - READ_BUF(4); - lookup->lo_len = be32_to_cpup(p++); - READ_BUF(lookup->lo_len); - SAVEMEM(lookup->lo_name, lookup->lo_len); - if ((status = check_filename(lookup->lo_name, lookup->lo_len))) - return status; +static __be32 +nfsd4_decode_createhow4(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +{ + __be32 status; - DECODE_TAIL; + if (xdr_stream_decode_u32(argp->xdr, &open->op_createmode) < 0) + return nfserr_bad_xdr; + switch (open->op_createmode) { + case NFS4_CREATE_UNCHECKED: + case NFS4_CREATE_GUARDED: + status = nfsd4_decode_fattr4(argp, open->op_bmval, + ARRAY_SIZE(open->op_bmval), + &open->op_iattr, &open->op_acl, + &open->op_label, &open->op_umask); + if (status) + return status; + break; + case NFS4_CREATE_EXCLUSIVE: + status = nfsd4_decode_verifier4(argp, &open->op_verf); + if (status) + return status; + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (argp->minorversion < 1) + return nfserr_bad_xdr; + status = nfsd4_decode_verifier4(argp, &open->op_verf); + if (status) + return status; + status = nfsd4_decode_fattr4(argp, open->op_bmval, + ARRAY_SIZE(open->op_bmval), + &open->op_iattr, &open->op_acl, + &open->op_label, &open->op_umask); + if (status) + return status; + break; + default: + return nfserr_bad_xdr; + } + + return nfs_ok; +} + +static __be32 +nfsd4_decode_openflag4(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +{ + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &open->op_create) < 0) + return nfserr_bad_xdr; + switch (open->op_create) { + case NFS4_OPEN_NOCREATE: + break; + case NFS4_OPEN_CREATE: + status = nfsd4_decode_createhow4(argp, open); + if (status) + return status; + break; + default: + return nfserr_bad_xdr; + } + + return nfs_ok; } static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *share_access, u32 *deleg_want, u32 *deleg_when) { - __be32 *p; u32 w; - READ_BUF(4); - w = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &w) < 0) + return nfserr_bad_xdr; *share_access = w & NFS4_SHARE_ACCESS_MASK; *deleg_want = w & 
NFS4_SHARE_WANT_MASK; if (deleg_when) @@ -907,45 +1059,73 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED): return nfs_ok; } -xdr_error: return nfserr_bad_xdr; } static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) { - __be32 *p; - - READ_BUF(4); - *x = be32_to_cpup(p++); - /* Note: unlinke access bits, deny bits may be zero. */ + if (xdr_stream_decode_u32(argp->xdr, x) < 0) + return nfserr_bad_xdr; + /* Note: unlike access bits, deny bits may be zero. */ if (*x & ~NFS4_SHARE_DENY_BOTH) return nfserr_bad_xdr; + return nfs_ok; -xdr_error: - return nfserr_bad_xdr; } -static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) +static __be32 +nfsd4_decode_open_claim4(struct nfsd4_compoundargs *argp, + struct nfsd4_open *open) { - __be32 *p; + __be32 status; - READ_BUF(4); - o->len = be32_to_cpup(p++); - - if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) + if (xdr_stream_decode_u32(argp->xdr, &open->op_claim_type) < 0) return nfserr_bad_xdr; + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + status = nfsd4_decode_component4(argp, &open->op_fname, + &open->op_fnamelen); + if (status) + return status; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + if (xdr_stream_decode_u32(argp->xdr, &open->op_delegate_type) < 0) + return nfserr_bad_xdr; + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + status = nfsd4_decode_stateid4(argp, &open->op_delegate_stateid); + if (status) + return status; + status = nfsd4_decode_component4(argp, &open->op_fname, + &open->op_fnamelen); + if (status) + return status; + break; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + if (argp->minorversion < 1) + return nfserr_bad_xdr; + /* void */ + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + if (argp->minorversion < 1) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &open->op_delegate_stateid); + if (status) + return status; + break; + default: + return nfserr_bad_xdr; + } - READ_BUF(o->len); - SAVEMEM(o->data, o->len); return nfs_ok; -xdr_error: - return nfserr_bad_xdr; } static __be32 nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) { - DECODE_HEAD; + __be32 status; u32 dummy; memset(open->op_bmval, 0, sizeof(open->op_bmval)); @@ -953,160 +1133,79 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) open->op_openowner = NULL; open->op_xdr_error = 0; - /* seqid, share_access, share_deny, clientid, ownerlen */ - READ_BUF(4); - open->op_seqid = be32_to_cpup(p++); - /* decode, yet ignore deleg_when until supported */ + if (xdr_stream_decode_u32(argp->xdr, &open->op_seqid) < 0) + return nfserr_bad_xdr; + /* deleg_want is ignored */ status = nfsd4_decode_share_access(argp, &open->op_share_access, &open->op_deleg_want, &dummy); if (status) - goto xdr_error; + return status; status = nfsd4_decode_share_deny(argp, &open->op_share_deny); if (status) - goto xdr_error; - READ_BUF(sizeof(clientid_t)); - COPYMEM(&open->op_clientid, sizeof(clientid_t)); - status = nfsd4_decode_opaque(argp, &open->op_owner); + return status; + status = nfsd4_decode_state_owner4(argp, &open->op_clientid, + &open->op_owner); if (status) - goto xdr_error; - READ_BUF(4); - open->op_create = be32_to_cpup(p++); - switch (open->op_create) { - case NFS4_OPEN_NOCREATE: - break; - case NFS4_OPEN_CREATE: - READ_BUF(4); - open->op_createmode = be32_to_cpup(p++); - switch (open->op_createmode) { - case 
NFS4_CREATE_UNCHECKED: - case NFS4_CREATE_GUARDED: - status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label, - &open->op_umask); - if (status) - goto out; - break; - case NFS4_CREATE_EXCLUSIVE: - READ_BUF(NFS4_VERIFIER_SIZE); - COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); - break; - case NFS4_CREATE_EXCLUSIVE4_1: - if (argp->minorversion < 1) - goto xdr_error; - READ_BUF(NFS4_VERIFIER_SIZE); - COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); - status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label, - &open->op_umask); - if (status) - goto out; - break; - default: - goto xdr_error; - } - break; - default: - goto xdr_error; - } - - /* open_claim */ - READ_BUF(4); - open->op_claim_type = be32_to_cpup(p++); - switch (open->op_claim_type) { - case NFS4_OPEN_CLAIM_NULL: - case NFS4_OPEN_CLAIM_DELEGATE_PREV: - READ_BUF(4); - open->op_fname.len = be32_to_cpup(p++); - READ_BUF(open->op_fname.len); - SAVEMEM(open->op_fname.data, open->op_fname.len); - if ((status = check_filename(open->op_fname.data, open->op_fname.len))) - return status; - break; - case NFS4_OPEN_CLAIM_PREVIOUS: - READ_BUF(4); - open->op_delegate_type = be32_to_cpup(p++); - break; - case NFS4_OPEN_CLAIM_DELEGATE_CUR: - status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); - if (status) - return status; - READ_BUF(4); - open->op_fname.len = be32_to_cpup(p++); - READ_BUF(open->op_fname.len); - SAVEMEM(open->op_fname.data, open->op_fname.len); - if ((status = check_filename(open->op_fname.data, open->op_fname.len))) - return status; - break; - case NFS4_OPEN_CLAIM_FH: - case NFS4_OPEN_CLAIM_DELEG_PREV_FH: - if (argp->minorversion < 1) - goto xdr_error; - /* void */ - break; - case NFS4_OPEN_CLAIM_DELEG_CUR_FH: - if (argp->minorversion < 1) - goto xdr_error; - status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); - if (status) - return status; - break; - default: - goto xdr_error; - } - - DECODE_TAIL; + return status; + status = nfsd4_decode_openflag4(argp, open); + if (status) + return status; + return nfsd4_decode_open_claim4(argp, open); } static __be32 nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) { - DECODE_HEAD; + __be32 status; if (argp->minorversion >= 1) return nfserr_notsupp; - status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); + status = nfsd4_decode_stateid4(argp, &open_conf->oc_req_stateid); if (status) return status; - READ_BUF(4); - open_conf->oc_seqid = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &open_conf->oc_seqid) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) { - DECODE_HEAD; - - status = nfsd4_decode_stateid(argp, &open_down->od_stateid); + __be32 status; + + status = nfsd4_decode_stateid4(argp, &open_down->od_stateid); if (status) return status; - READ_BUF(4); - open_down->od_seqid = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &open_down->od_seqid) < 0) + return nfserr_bad_xdr; + /* deleg_want is ignored */ status = nfsd4_decode_share_access(argp, &open_down->od_share_access, &open_down->od_deleg_want, NULL); if (status) return status; - status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny); - if (status) - return status; - DECODE_TAIL; + return nfsd4_decode_share_deny(argp, &open_down->od_share_deny); } static __be32 nfsd4_decode_putfh(struct nfsd4_compoundargs 
*argp, struct nfsd4_putfh *putfh) { - DECODE_HEAD; + __be32 *p; - READ_BUF(4); - putfh->pf_fhlen = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &putfh->pf_fhlen) < 0) + return nfserr_bad_xdr; if (putfh->pf_fhlen > NFS4_FHSIZE) - goto xdr_error; - READ_BUF(putfh->pf_fhlen); - SAVEMEM(putfh->pf_fhval, putfh->pf_fhlen); + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, putfh->pf_fhlen); + if (!p) + return nfserr_bad_xdr; + putfh->pf_fhval = svcxdr_tmpalloc(argp, putfh->pf_fhlen); + if (!putfh->pf_fhval) + return nfserr_jukebox; + memcpy(putfh->pf_fhval, p, putfh->pf_fhlen); - DECODE_TAIL; + return nfs_ok; } static __be32 @@ -1120,109 +1219,68 @@ nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) static __be32 nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) { - DECODE_HEAD; + __be32 status; - status = nfsd4_decode_stateid(argp, &read->rd_stateid); + status = nfsd4_decode_stateid4(argp, &read->rd_stateid); if (status) return status; - READ_BUF(12); - p = xdr_decode_hyper(p, &read->rd_offset); - read->rd_length = be32_to_cpup(p++); + if (xdr_stream_decode_u64(argp->xdr, &read->rd_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &read->rd_length) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) { - DECODE_HEAD; + __be32 status; - READ_BUF(24); - p = xdr_decode_hyper(p, &readdir->rd_cookie); - COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data)); - readdir->rd_dircount = be32_to_cpup(p++); - readdir->rd_maxcount = be32_to_cpup(p++); - if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval))) - goto out; + if (xdr_stream_decode_u64(argp->xdr, &readdir->rd_cookie) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_verifier4(argp, &readdir->rd_verf); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &readdir->rd_dircount) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &readdir->rd_maxcount) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_uint32_array(argp->xdr, readdir->rd_bmval, + ARRAY_SIZE(readdir->rd_bmval)) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) { - DECODE_HEAD; - - READ_BUF(4); - remove->rm_namelen = be32_to_cpup(p++); - READ_BUF(remove->rm_namelen); - SAVEMEM(remove->rm_name, remove->rm_namelen); - if ((status = check_filename(remove->rm_name, remove->rm_namelen))) - return status; - - DECODE_TAIL; + return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen); } static __be32 nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) { - DECODE_HEAD; + __be32 status; - READ_BUF(4); - rename->rn_snamelen = be32_to_cpup(p++); - READ_BUF(rename->rn_snamelen); - SAVEMEM(rename->rn_sname, rename->rn_snamelen); - READ_BUF(4); - rename->rn_tnamelen = be32_to_cpup(p++); - READ_BUF(rename->rn_tnamelen); - SAVEMEM(rename->rn_tname, rename->rn_tnamelen); - if ((status = check_filename(rename->rn_sname, rename->rn_snamelen))) + status = nfsd4_decode_component4(argp, &rename->rn_sname, &rename->rn_snamelen); + if (status) return status; - if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen))) - return status; - - DECODE_TAIL; + return nfsd4_decode_component4(argp, &rename->rn_tname, &rename->rn_tnamelen); } static __be32 nfsd4_decode_renew(struct nfsd4_compoundargs 
*argp, clientid_t *clientid) { - DECODE_HEAD; - - if (argp->minorversion >= 1) - return nfserr_notsupp; - - READ_BUF(sizeof(clientid_t)); - COPYMEM(clientid, sizeof(clientid_t)); - - DECODE_TAIL; + return nfsd4_decode_clientid4(argp, clientid); } static __be32 nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, struct nfsd4_secinfo *secinfo) { - DECODE_HEAD; - - READ_BUF(4); - secinfo->si_namelen = be32_to_cpup(p++); - READ_BUF(secinfo->si_namelen); - SAVEMEM(secinfo->si_name, secinfo->si_namelen); - status = check_filename(secinfo->si_name, secinfo->si_namelen); - if (status) - return status; - DECODE_TAIL; -} - -static __be32 -nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, - struct nfsd4_secinfo_no_name *sin) -{ - DECODE_HEAD; - - READ_BUF(4); - sin->sin_style = be32_to_cpup(p++); - DECODE_TAIL; + return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen); } static __be32 @@ -1230,362 +1288,381 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta { __be32 status; - status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); + status = nfsd4_decode_stateid4(argp, &setattr->sa_stateid); if (status) return status; - return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, - &setattr->sa_acl, &setattr->sa_label, NULL); + return nfsd4_decode_fattr4(argp, setattr->sa_bmval, + ARRAY_SIZE(setattr->sa_bmval), + &setattr->sa_iattr, &setattr->sa_acl, + &setattr->sa_label, NULL); } static __be32 nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) { - DECODE_HEAD; + __be32 *p, status; if (argp->minorversion >= 1) return nfserr_notsupp; - READ_BUF(NFS4_VERIFIER_SIZE); - COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE); - + status = nfsd4_decode_verifier4(argp, &setclientid->se_verf); + if (status) + return status; status = nfsd4_decode_opaque(argp, &setclientid->se_name); if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_prog) < 0) return nfserr_bad_xdr; - READ_BUF(8); - setclientid->se_callback_prog = be32_to_cpup(p++); - setclientid->se_callback_netid_len = be32_to_cpup(p++); - READ_BUF(setclientid->se_callback_netid_len); - SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); - READ_BUF(4); - setclientid->se_callback_addr_len = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_netid_len) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, setclientid->se_callback_netid_len); + if (!p) + return nfserr_bad_xdr; + setclientid->se_callback_netid_val = svcxdr_tmpalloc(argp, + setclientid->se_callback_netid_len); + if (!setclientid->se_callback_netid_val) + return nfserr_jukebox; + memcpy(setclientid->se_callback_netid_val, p, + setclientid->se_callback_netid_len); - READ_BUF(setclientid->se_callback_addr_len); - SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); - READ_BUF(4); - setclientid->se_callback_ident = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_addr_len) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, setclientid->se_callback_addr_len); + if (!p) + return nfserr_bad_xdr; + setclientid->se_callback_addr_val = svcxdr_tmpalloc(argp, + setclientid->se_callback_addr_len); + if (!setclientid->se_callback_addr_val) + return nfserr_jukebox; + memcpy(setclientid->se_callback_addr_val, p, + setclientid->se_callback_addr_len); + if (xdr_stream_decode_u32(argp->xdr, 
&setclientid->se_callback_ident) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) { - DECODE_HEAD; + __be32 status; if (argp->minorversion >= 1) return nfserr_notsupp; - READ_BUF(8 + NFS4_VERIFIER_SIZE); - COPYMEM(&scd_c->sc_clientid, 8); - COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE); - - DECODE_TAIL; + status = nfsd4_decode_clientid4(argp, &scd_c->sc_clientid); + if (status) + return status; + return nfsd4_decode_verifier4(argp, &scd_c->sc_confirm); } /* Also used for NVERIFY */ static __be32 nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) { - DECODE_HEAD; + __be32 *p, status; - if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval))) - goto out; + status = nfsd4_decode_bitmap4(argp, verify->ve_bmval, + ARRAY_SIZE(verify->ve_bmval)); + if (status) + return status; /* For convenience's sake, we compare raw xdr'd attributes in * nfsd4_proc_verify */ - READ_BUF(4); - verify->ve_attrlen = be32_to_cpup(p++); - READ_BUF(verify->ve_attrlen); - SAVEMEM(verify->ve_attrval, verify->ve_attrlen); + if (xdr_stream_decode_u32(argp->xdr, &verify->ve_attrlen) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, verify->ve_attrlen); + if (!p) + return nfserr_bad_xdr; + verify->ve_attrval = svcxdr_tmpalloc(argp, verify->ve_attrlen); + if (!verify->ve_attrval) + return nfserr_jukebox; + memcpy(verify->ve_attrval, p, verify->ve_attrlen); - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) { - DECODE_HEAD; + __be32 status; - status = nfsd4_decode_stateid(argp, &write->wr_stateid); + status = nfsd4_decode_stateid4(argp, &write->wr_stateid); if (status) return status; - READ_BUF(16); - p = xdr_decode_hyper(p, &write->wr_offset); - write->wr_stable_how = be32_to_cpup(p++); + if (xdr_stream_decode_u64(argp->xdr, &write->wr_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &write->wr_stable_how) < 0) + return nfserr_bad_xdr; if (write->wr_stable_how > NFS_FILE_SYNC) - goto xdr_error; - write->wr_buflen = be32_to_cpup(p++); + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &write->wr_buflen) < 0) + return nfserr_bad_xdr; + if (!xdr_stream_subsegment(argp->xdr, &write->wr_payload, write->wr_buflen)) + return nfserr_bad_xdr; - status = svcxdr_construct_vector(argp, &write->wr_head, - &write->wr_pagelist, write->wr_buflen); - if (status) - return status; - - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) { - DECODE_HEAD; + __be32 status; if (argp->minorversion >= 1) return nfserr_notsupp; - READ_BUF(12); - COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t)); - rlockowner->rl_owner.len = be32_to_cpup(p++); - READ_BUF(rlockowner->rl_owner.len); - READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); + status = nfsd4_decode_state_owner4(argp, &rlockowner->rl_clientid, + &rlockowner->rl_owner); + if (status) + return status; if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid)) return nfserr_inval; - DECODE_TAIL; + + return nfs_ok; +} + +static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) +{ + if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); +} + 
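+/* + * BIND_CONN_TO_SESSION4args (RFC 8881, Section 18.34) carries a + * sessionid4, a channel_dir_from_client4, and the boolean + * bctsa_use_conn_in_rdma_mode. The decoder below reads that boolean + * into a local variable purely to advance the XDR stream; binding a + * connection in RDMA mode is not supported here, so the decoded + * value is deliberately ignored. + */ + 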
+static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +{ + u32 use_conn_in_rdma_mode; + __be32 status; + + status = nfsd4_decode_sessionid4(argp, &bcts->sessionid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &bcts->dir) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &use_conn_in_rdma_mode) < 0) + return nfserr_bad_xdr; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_state_protect_ops(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + __be32 status; + + status = nfsd4_decode_bitmap4(argp, exid->spo_must_enforce, + ARRAY_SIZE(exid->spo_must_enforce)); + if (status) + return nfserr_bad_xdr; + status = nfsd4_decode_bitmap4(argp, exid->spo_must_allow, + ARRAY_SIZE(exid->spo_must_allow)); + if (status) + return nfserr_bad_xdr; + + return nfs_ok; +} + +/* + * This implementation currently does not support SP4_SSV. + * This decoder simply skips over these arguments. + */ +static noinline __be32 +nfsd4_decode_ssv_sp_parms(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + u32 count, window, num_gss_handles; + __be32 status; + + /* ssp_ops */ + status = nfsd4_decode_state_protect_ops(argp, exid); + if (status) + return status; + + /* ssp_hash_algs<> */ + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + while (count--) { + status = nfsd4_decode_ignored_string(argp, 0); + if (status) + return status; + } + + /* ssp_encr_algs<> */ + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + while (count--) { + status = nfsd4_decode_ignored_string(argp, 0); + if (status) + return status; + } + + if (xdr_stream_decode_u32(argp->xdr, &window) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &num_gss_handles) < 0) + return nfserr_bad_xdr; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_state_protect4_a(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &exid->spa_how) < 0) + return nfserr_bad_xdr; + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + status = nfsd4_decode_state_protect_ops(argp, exid); + if (status) + return status; + break; + case SP4_SSV: + status = nfsd4_decode_ssv_sp_parms(argp, exid); + if (status) + return status; + break; + default: + return nfserr_bad_xdr; + } + + return nfs_ok; +} + +static __be32 +nfsd4_decode_nfs_impl_id4(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + __be32 status; + u32 count; + + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + switch (count) { + case 0: + break; + case 1: + /* Note that RFC 8881 places no length limit on + * nii_domain, but this implementation permits no + * more than NFS4_OPAQUE_LIMIT bytes */ + status = nfsd4_decode_opaque(argp, &exid->nii_domain); + if (status) + return status; + /* Note that RFC 8881 places no length limit on + * nii_name, but this implementation permits no + * more than NFS4_OPAQUE_LIMIT bytes */ + status = nfsd4_decode_opaque(argp, &exid->nii_name); + if (status) + return status; + status = nfsd4_decode_nfstime4(argp, &exid->nii_time); + if (status) + return status; + break; + default: + return nfserr_bad_xdr; + } + + return nfs_ok; } static __be32 nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, struct nfsd4_exchange_id *exid) { - int dummy, tmp; - DECODE_HEAD; - - READ_BUF(NFS4_VERIFIER_SIZE); - COPYMEM(exid->verifier.data, 
NFS4_VERIFIER_SIZE); + __be32 status; + status = nfsd4_decode_verifier4(argp, &exid->verifier); + if (status) + return status; status = nfsd4_decode_opaque(argp, &exid->clname); if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &exid->flags) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_state_protect4_a(argp, exid); + if (status) + return status; + return nfsd4_decode_nfs_impl_id4(argp, exid); +} + +static __be32 +nfsd4_decode_channel_attrs4(struct nfsd4_compoundargs *argp, + struct nfsd4_channel_attrs *ca) +{ + __be32 *p; + + p = xdr_inline_decode(argp->xdr, XDR_UNIT * 7); + if (!p) return nfserr_bad_xdr; - READ_BUF(4); - exid->flags = be32_to_cpup(p++); - - /* Ignore state_protect4_a */ - READ_BUF(4); - exid->spa_how = be32_to_cpup(p++); - switch (exid->spa_how) { - case SP4_NONE: + /* headerpadsz is ignored */ + p++; + ca->maxreq_sz = be32_to_cpup(p++); + ca->maxresp_sz = be32_to_cpup(p++); + ca->maxresp_cached = be32_to_cpup(p++); + ca->maxops = be32_to_cpup(p++); + ca->maxreqs = be32_to_cpup(p++); + ca->nr_rdma_attrs = be32_to_cpup(p); + switch (ca->nr_rdma_attrs) { + case 0: break; - case SP4_MACH_CRED: - /* spo_must_enforce */ - status = nfsd4_decode_bitmap(argp, - exid->spo_must_enforce); - if (status) - goto out; - /* spo_must_allow */ - status = nfsd4_decode_bitmap(argp, exid->spo_must_allow); - if (status) - goto out; - break; - case SP4_SSV: - /* ssp_ops */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - p += dummy; - - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - p += dummy; - - /* ssp_hash_algs<> */ - READ_BUF(4); - tmp = be32_to_cpup(p++); - while (tmp--) { - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); - } - - /* ssp_encr_algs<> */ - READ_BUF(4); - tmp = be32_to_cpup(p++); - while (tmp--) { - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); - } - - /* ignore ssp_window and ssp_num_gss_handles: */ - READ_BUF(8); + case 1: + if (xdr_stream_decode_u32(argp->xdr, &ca->rdma_attrs) < 0) + return nfserr_bad_xdr; break; default: - goto xdr_error; + return nfserr_bad_xdr; } - READ_BUF(4); /* nfs_impl_id4 array length */ - dummy = be32_to_cpup(p++); - - if (dummy > 1) - goto xdr_error; - - if (dummy == 1) { - status = nfsd4_decode_opaque(argp, &exid->nii_domain); - if (status) - goto xdr_error; - - /* nii_name */ - status = nfsd4_decode_opaque(argp, &exid->nii_name); - if (status) - goto xdr_error; - - /* nii_date */ - status = nfsd4_decode_time(argp, &exid->nii_time); - if (status) - goto xdr_error; - } - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, struct nfsd4_create_session *sess) { - DECODE_HEAD; + __be32 status; - READ_BUF(16); - COPYMEM(&sess->clientid, 8); - sess->seqid = be32_to_cpup(p++); - sess->flags = be32_to_cpup(p++); + status = nfsd4_decode_clientid4(argp, &sess->clientid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &sess->seqid) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &sess->flags) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_channel_attrs4(argp, &sess->fore_channel); + if (status) + return status; + status = nfsd4_decode_channel_attrs4(argp, &sess->back_channel); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &sess->callback_prog) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_cb_sec(argp, &sess->cb_sec); + if (status) + return status; - /* Fore channel attrs */ - 
READ_BUF(28); - p++; /* headerpadsz is always 0 */ - sess->fore_channel.maxreq_sz = be32_to_cpup(p++); - sess->fore_channel.maxresp_sz = be32_to_cpup(p++); - sess->fore_channel.maxresp_cached = be32_to_cpup(p++); - sess->fore_channel.maxops = be32_to_cpup(p++); - sess->fore_channel.maxreqs = be32_to_cpup(p++); - sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++); - if (sess->fore_channel.nr_rdma_attrs == 1) { - READ_BUF(4); - sess->fore_channel.rdma_attrs = be32_to_cpup(p++); - } else if (sess->fore_channel.nr_rdma_attrs > 1) { - dprintk("Too many fore channel attr bitmaps!\n"); - goto xdr_error; - } - - /* Back channel attrs */ - READ_BUF(28); - p++; /* headerpadsz is always 0 */ - sess->back_channel.maxreq_sz = be32_to_cpup(p++); - sess->back_channel.maxresp_sz = be32_to_cpup(p++); - sess->back_channel.maxresp_cached = be32_to_cpup(p++); - sess->back_channel.maxops = be32_to_cpup(p++); - sess->back_channel.maxreqs = be32_to_cpup(p++); - sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++); - if (sess->back_channel.nr_rdma_attrs == 1) { - READ_BUF(4); - sess->back_channel.rdma_attrs = be32_to_cpup(p++); - } else if (sess->back_channel.nr_rdma_attrs > 1) { - dprintk("Too many back channel attr bitmaps!\n"); - goto xdr_error; - } - - READ_BUF(4); - sess->callback_prog = be32_to_cpup(p++); - nfsd4_decode_cb_sec(argp, &sess->cb_sec); - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_session *destroy_session) { - DECODE_HEAD; - READ_BUF(NFS4_MAX_SESSIONID_LEN); - COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN); - - DECODE_TAIL; + return nfsd4_decode_sessionid4(argp, &destroy_session->sessionid); } static __be32 nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_free_stateid *free_stateid) { - DECODE_HEAD; - - READ_BUF(sizeof(stateid_t)); - free_stateid->fr_stateid.si_generation = be32_to_cpup(p++); - COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t)); - - DECODE_TAIL; -} - -static __be32 -nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, - struct nfsd4_sequence *seq) -{ - DECODE_HEAD; - - READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); - COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); - seq->seqid = be32_to_cpup(p++); - seq->slotid = be32_to_cpup(p++); - seq->maxslots = be32_to_cpup(p++); - seq->cachethis = be32_to_cpup(p++); - - DECODE_TAIL; -} - -static __be32 -nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) -{ - int i; - __be32 *p, status; - struct nfsd4_test_stateid_id *stateid; - - READ_BUF(4); - test_stateid->ts_num_ids = ntohl(*p++); - - INIT_LIST_HEAD(&test_stateid->ts_stateid_list); - - for (i = 0; i < test_stateid->ts_num_ids; i++) { - stateid = svcxdr_tmpalloc(argp, sizeof(*stateid)); - if (!stateid) { - status = nfserrno(-ENOMEM); - goto out; - } - - INIT_LIST_HEAD(&stateid->ts_id_list); - list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list); - - status = nfsd4_decode_stateid(argp, &stateid->ts_id_stateid); - if (status) - goto out; - } - - status = 0; -out: - return status; -xdr_error: - dprintk("NFSD: xdr error (%s:%d)\n", __FILE__, __LINE__); - status = nfserr_bad_xdr; - goto out; -} - -static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc) -{ - DECODE_HEAD; - - READ_BUF(8); - COPYMEM(&dc->clientid, 8); - - DECODE_TAIL; -} - -static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct 
nfsd4_reclaim_complete *rc) -{ - DECODE_HEAD; - - READ_BUF(4); - rc->rca_one_fs = be32_to_cpup(p++); - - DECODE_TAIL; + return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid); } #ifdef CONFIG_NFSD_PNFS @@ -1593,244 +1670,264 @@ static __be32 nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, struct nfsd4_getdeviceinfo *gdev) { - DECODE_HEAD; - u32 num, i; + __be32 status; - READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4); - COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid)); - gdev->gd_layout_type = be32_to_cpup(p++); - gdev->gd_maxcount = be32_to_cpup(p++); - num = be32_to_cpup(p++); - if (num) { - if (num > 1000) - goto xdr_error; - READ_BUF(4 * num); - gdev->gd_notify_types = be32_to_cpup(p++); - for (i = 1; i < num; i++) { - if (be32_to_cpup(p++)) { - status = nfserr_inval; - goto out; - } - } + status = nfsd4_decode_deviceid4(argp, &gdev->gd_devid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &gdev->gd_layout_type) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &gdev->gd_maxcount) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_uint32_array(argp->xdr, + &gdev->gd_notify_types, 1) < 0) + return nfserr_bad_xdr; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutcommit *lcp) +{ + __be32 *p, status; + + if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.length) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_bool(argp->xdr, &lcp->lc_reclaim) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lcp->lc_sid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_newoffset) < 0) + return nfserr_bad_xdr; + if (lcp->lc_newoffset) { + if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_last_wr) < 0) + return nfserr_bad_xdr; + } else + lcp->lc_last_wr = 0; + p = xdr_inline_decode(argp->xdr, XDR_UNIT); + if (!p) + return nfserr_bad_xdr; + if (xdr_item_is_present(p)) { + status = nfsd4_decode_nfstime4(argp, &lcp->lc_mtime); + if (status) + return status; + } else { + lcp->lc_mtime.tv_nsec = UTIME_NOW; } - DECODE_TAIL; + return nfsd4_decode_layoutupdate4(argp, lcp); } static __be32 nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, struct nfsd4_layoutget *lgp) { - DECODE_HEAD; + __be32 status; - READ_BUF(36); - lgp->lg_signal = be32_to_cpup(p++); - lgp->lg_layout_type = be32_to_cpup(p++); - lgp->lg_seg.iomode = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &lgp->lg_seg.offset); - p = xdr_decode_hyper(p, &lgp->lg_seg.length); - p = xdr_decode_hyper(p, &lgp->lg_minlength); - - status = nfsd4_decode_stateid(argp, &lgp->lg_sid); + if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_signal) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_layout_type) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_seg.iomode) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_seg.offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_seg.length) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_minlength) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lgp->lg_sid); if (status) return status; + if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_maxcount) < 0) + return nfserr_bad_xdr; - READ_BUF(4); - lgp->lg_maxcount = be32_to_cpup(p++); - - DECODE_TAIL; -} - -static __be32 
-nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutcommit *lcp) -{ - DECODE_HEAD; - u32 timechange; - - READ_BUF(20); - p = xdr_decode_hyper(p, &lcp->lc_seg.offset); - p = xdr_decode_hyper(p, &lcp->lc_seg.length); - lcp->lc_reclaim = be32_to_cpup(p++); - - status = nfsd4_decode_stateid(argp, &lcp->lc_sid); - if (status) - return status; - - READ_BUF(4); - lcp->lc_newoffset = be32_to_cpup(p++); - if (lcp->lc_newoffset) { - READ_BUF(8); - p = xdr_decode_hyper(p, &lcp->lc_last_wr); - } else - lcp->lc_last_wr = 0; - READ_BUF(4); - timechange = be32_to_cpup(p++); - if (timechange) { - status = nfsd4_decode_time(argp, &lcp->lc_mtime); - if (status) - return status; - } else { - lcp->lc_mtime.tv_nsec = UTIME_NOW; - } - READ_BUF(8); - lcp->lc_layout_type = be32_to_cpup(p++); - - /* - * Save the layout update in XDR format and let the layout driver deal - * with it later. - */ - lcp->lc_up_len = be32_to_cpup(p++); - if (lcp->lc_up_len > 0) { - READ_BUF(lcp->lc_up_len); - READMEM(lcp->lc_up_layout, lcp->lc_up_len); - } - - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, struct nfsd4_layoutreturn *lrp) { - DECODE_HEAD; - - READ_BUF(16); - lrp->lr_reclaim = be32_to_cpup(p++); - lrp->lr_layout_type = be32_to_cpup(p++); - lrp->lr_seg.iomode = be32_to_cpup(p++); - lrp->lr_return_type = be32_to_cpup(p++); - if (lrp->lr_return_type == RETURN_FILE) { - READ_BUF(16); - p = xdr_decode_hyper(p, &lrp->lr_seg.offset); - p = xdr_decode_hyper(p, &lrp->lr_seg.length); - - status = nfsd4_decode_stateid(argp, &lrp->lr_sid); - if (status) - return status; - - READ_BUF(4); - lrp->lrf_body_len = be32_to_cpup(p++); - if (lrp->lrf_body_len > 0) { - READ_BUF(lrp->lrf_body_len); - READMEM(lrp->lrf_body, lrp->lrf_body_len); - } - } else { - lrp->lr_seg.offset = 0; - lrp->lr_seg.length = NFS4_MAX_UINT64; - } - - DECODE_TAIL; + if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_layout_type) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_seg.iomode) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_layoutreturn4(argp, lrp); } #endif /* CONFIG_NFSD_PNFS */ +static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo_no_name *sin) +{ + if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0) + return nfserr_bad_xdr; + return nfs_ok; +} + +static __be32 +nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, + struct nfsd4_sequence *seq) +{ + __be32 *p, status; + + status = nfsd4_decode_sessionid4(argp, &seq->sessionid); + if (status) + return status; + p = xdr_inline_decode(argp->xdr, XDR_UNIT * 4); + if (!p) + return nfserr_bad_xdr; + seq->seqid = be32_to_cpup(p++); + seq->slotid = be32_to_cpup(p++); + seq->maxslots = be32_to_cpup(p++); + seq->cachethis = be32_to_cpup(p); + + return nfs_ok; +} + +static __be32 +nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) +{ + struct nfsd4_test_stateid_id *stateid; + __be32 status; + u32 i; + + if (xdr_stream_decode_u32(argp->xdr, &test_stateid->ts_num_ids) < 0) + return nfserr_bad_xdr; + + INIT_LIST_HEAD(&test_stateid->ts_stateid_list); + for (i = 0; i < test_stateid->ts_num_ids; i++) { + stateid = svcxdr_tmpalloc(argp, sizeof(*stateid)); + if (!stateid) + return nfserrno(-ENOMEM); /* XXX: not jukebox? 
*/ + INIT_LIST_HEAD(&stateid->ts_id_list); + list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list); + status = nfsd4_decode_stateid4(argp, &stateid->ts_id_stateid); + if (status) + return status; + } + + return nfs_ok; +} + +static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, + struct nfsd4_destroy_clientid *dc) +{ + return nfsd4_decode_clientid4(argp, &dc->clientid); +} + +static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, + struct nfsd4_reclaim_complete *rc) +{ + if (xdr_stream_decode_bool(argp->xdr, &rc->rca_one_fs) < 0) + return nfserr_bad_xdr; + return nfs_ok; +} + static __be32 nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, struct nfsd4_fallocate *fallocate) { - DECODE_HEAD; + __be32 status; - status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid); + status = nfsd4_decode_stateid4(argp, &fallocate->falloc_stateid); if (status) return status; + if (xdr_stream_decode_u64(argp->xdr, &fallocate->falloc_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &fallocate->falloc_length) < 0) + return nfserr_bad_xdr; - READ_BUF(16); - p = xdr_decode_hyper(p, &fallocate->falloc_offset); - xdr_decode_hyper(p, &fallocate->falloc_length); - - DECODE_TAIL; -} - -static __be32 -nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) -{ - DECODE_HEAD; - - status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid); - if (status) - return status; - status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid); - if (status) - return status; - - READ_BUF(8 + 8 + 8); - p = xdr_decode_hyper(p, &clone->cl_src_pos); - p = xdr_decode_hyper(p, &clone->cl_dst_pos); - p = xdr_decode_hyper(p, &clone->cl_count); - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp, struct nl4_server *ns) { - DECODE_HEAD; struct nfs42_netaddr *naddr; + __be32 *p; - READ_BUF(4); - ns->nl4_type = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &ns->nl4_type) < 0) + return nfserr_bad_xdr; /* currently support for 1 inter-server source server */ switch (ns->nl4_type) { case NL4_NETADDR: naddr = &ns->u.nl4_addr; - READ_BUF(4); - naddr->netid_len = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &naddr->netid_len) < 0) + return nfserr_bad_xdr; if (naddr->netid_len > RPCBIND_MAXNETIDLEN) - goto xdr_error; + return nfserr_bad_xdr; - READ_BUF(naddr->netid_len + 4); /* 4 for uaddr len */ - COPYMEM(naddr->netid, naddr->netid_len); + p = xdr_inline_decode(argp->xdr, naddr->netid_len); + if (!p) + return nfserr_bad_xdr; + memcpy(naddr->netid, p, naddr->netid_len); - naddr->addr_len = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &naddr->addr_len) < 0) + return nfserr_bad_xdr; if (naddr->addr_len > RPCBIND_MAXUADDRLEN) - goto xdr_error; + return nfserr_bad_xdr; - READ_BUF(naddr->addr_len); - COPYMEM(naddr->addr, naddr->addr_len); + p = xdr_inline_decode(argp->xdr, naddr->addr_len); + if (!p) + return nfserr_bad_xdr; + memcpy(naddr->addr, p, naddr->addr_len); break; default: - goto xdr_error; + return nfserr_bad_xdr; } - DECODE_TAIL; + + return nfs_ok; } static __be32 nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) { - DECODE_HEAD; struct nl4_server *ns_dummy; - int i, count; + u32 consecutive, i, count; + __be32 status; - status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid); + status = nfsd4_decode_stateid4(argp, &copy->cp_src_stateid); if (status) return status; - status = nfsd4_decode_stateid(argp, 
&copy->cp_dst_stateid); + status = nfsd4_decode_stateid4(argp, &copy->cp_dst_stateid); if (status) return status; + if (xdr_stream_decode_u64(argp->xdr, &copy->cp_src_pos) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &copy->cp_dst_pos) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &copy->cp_count) < 0) + return nfserr_bad_xdr; + /* ca_consecutive: we always do consecutive copies */ + if (xdr_stream_decode_u32(argp->xdr, &consecutive) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &copy->cp_synchronous) < 0) + return nfserr_bad_xdr; - READ_BUF(8 + 8 + 8 + 4 + 4 + 4); - p = xdr_decode_hyper(p, &copy->cp_src_pos); - p = xdr_decode_hyper(p, &copy->cp_dst_pos); - p = xdr_decode_hyper(p, &copy->cp_count); - p++; /* ca_consecutive: we always do consecutive copies */ - copy->cp_synchronous = be32_to_cpup(p++); - - count = be32_to_cpup(p++); - + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; copy->cp_intra = false; if (count == 0) { /* intra-server copy */ copy->cp_intra = true; - goto intra; + return nfs_ok; } - /* decode all the supplied server addresses but use first */ + /* decode all the supplied server addresses but use only the first */ status = nfsd4_decode_nl4_server(argp, &copy->cp_src); if (status) return status; ns_dummy = kmalloc(sizeof(struct nl4_server), GFP_KERNEL); if (ns_dummy == NULL) - return nfserrno(-ENOMEM); + return nfserrno(-ENOMEM); /* XXX: jukebox? */ for (i = 0; i < count - 1; i++) { status = nfsd4_decode_nl4_server(argp, ns_dummy); if (status) { @@ -1839,16 +1936,8 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) } } kfree(ns_dummy); -intra: - DECODE_TAIL; -} - -static __be32 -nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp, - struct nfsd4_offload_status *os) -{ - return nfsd4_decode_stateid(argp, &os->stateid); + return nfs_ok; } static __be32 @@ -1857,26 +1946,54 @@ nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, { __be32 status; - status = nfsd4_decode_stateid(argp, &cn->cpn_src_stateid); + status = nfsd4_decode_stateid4(argp, &cn->cpn_src_stateid); if (status) return status; return nfsd4_decode_nl4_server(argp, &cn->cpn_dst); } +static __be32 +nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp, + struct nfsd4_offload_status *os) +{ + return nfsd4_decode_stateid4(argp, &os->stateid); +} + static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { - DECODE_HEAD; + __be32 status; - status = nfsd4_decode_stateid(argp, &seek->seek_stateid); + status = nfsd4_decode_stateid4(argp, &seek->seek_stateid); if (status) return status; + if (xdr_stream_decode_u64(argp->xdr, &seek->seek_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &seek->seek_whence) < 0) + return nfserr_bad_xdr; - READ_BUF(8 + 4); - p = xdr_decode_hyper(p, &seek->seek_offset); - seek->seek_whence = be32_to_cpup(p); + return nfs_ok; - DECODE_TAIL; +static __be32 +nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) +{ + __be32 status; + + status = nfsd4_decode_stateid4(argp, &clone->cl_src_stateid); + if (status) + return status; + status = nfsd4_decode_stateid4(argp, &clone->cl_dst_stateid); + if (status) + return status; + if (xdr_stream_decode_u64(argp->xdr, &clone->cl_src_pos) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &clone->cl_dst_pos) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &clone->cl_count) < 0) + return nfserr_bad_xdr; + + return nfs_ok; } /* 
@@ -1889,13 +2006,14 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) */ /* - * Decode data into buffer. Uses head and pages constructed by - * svcxdr_construct_vector. + * Decode data into buffer. */ static __be32 -nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct kvec *head, - struct page **pages, char **bufp, u32 buflen) +nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct xdr_buf *xdr, + char **bufp, u32 buflen) { + struct page **pages = xdr->pages; + struct kvec *head = xdr->head; char *tmp, *dp; u32 len; @@ -1938,25 +2056,22 @@ nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct kvec *head, static __be32 nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep) { - DECODE_HEAD; char *name, *sp, *dp; u32 namelen, cnt; + __be32 *p; - READ_BUF(4); - namelen = be32_to_cpup(p++); - + if (xdr_stream_decode_u32(argp->xdr, &namelen) < 0) + return nfserr_bad_xdr; if (namelen > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) return nfserr_nametoolong; - if (namelen == 0) - goto xdr_error; - - READ_BUF(namelen); - + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, namelen); + if (!p) + return nfserr_bad_xdr; name = svcxdr_tmpalloc(argp, namelen + XATTR_USER_PREFIX_LEN + 1); if (!name) return nfserr_jukebox; - memcpy(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); /* @@ -1969,14 +2084,14 @@ nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep) while (cnt-- > 0) { if (*sp == '\0') - goto xdr_error; + return nfserr_bad_xdr; *dp++ = *sp++; } *dp = '\0'; *namep = name; - DECODE_TAIL; + return nfs_ok; } /* @@ -2008,13 +2123,11 @@ static __be32 nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, struct nfsd4_setxattr *setxattr) { - DECODE_HEAD; u32 flags, maxcount, size; - struct kvec head; - struct page **pagelist; + __be32 status; - READ_BUF(4); - flags = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &flags) < 0) + return nfserr_bad_xdr; if (flags > SETXATTR4_REPLACE) return nfserr_inval; @@ -2027,33 +2140,32 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, maxcount = svc_max_payload(argp->rqstp); maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount); - READ_BUF(4); - size = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &size) < 0) + return nfserr_bad_xdr; if (size > maxcount) return nfserr_xattr2big; setxattr->setxa_len = size; if (size > 0) { - status = svcxdr_construct_vector(argp, &head, &pagelist, size); - if (status) - return status; + struct xdr_buf payload; - status = nfsd4_vbuf_from_vector(argp, &head, pagelist, - &setxattr->setxa_buf, size); + if (!xdr_stream_subsegment(argp->xdr, &payload, size)) + return nfserr_bad_xdr; + status = nfsd4_vbuf_from_vector(argp, &payload, + &setxattr->setxa_buf, size); } - DECODE_TAIL; + return status; } static __be32 nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, struct nfsd4_listxattrs *listxattrs) { - DECODE_HEAD; u32 maxcount; - READ_BUF(12); - p = xdr_decode_hyper(p, &listxattrs->lsxa_cookie); + if (xdr_stream_decode_u64(argp->xdr, &listxattrs->lsxa_cookie) < 0) + return nfserr_bad_xdr; /* * If the cookie is too large to have even one user.x attribute @@ -2063,7 +2175,8 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, (XATTR_LIST_MAX / (XATTR_USER_PREFIX_LEN + 2))) return nfserr_badcookie; - maxcount = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &maxcount) < 0) + return nfserr_bad_xdr; if (maxcount < 8) /* Always need at least 2 words (length and one character) */ return nfserr_inval; @@ 
-2071,7 +2184,7 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, maxcount = min(maxcount, svc_max_payload(argp->rqstp)); listxattrs->lsxa_maxcount = maxcount; - DECODE_TAIL; + return nfs_ok; } static __be32 @@ -2198,43 +2311,54 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op) return true; } -static __be32 +static int nfsd4_decode_compound(struct nfsd4_compoundargs *argp) { - DECODE_HEAD; struct nfsd4_op *op; bool cachethis = false; int auth_slack= argp->rqstp->rq_auth_slack; int max_reply = auth_slack + 8; /* opcnt, status */ int readcount = 0; int readbytes = 0; + __be32 *p; int i; - READ_BUF(4); - argp->taglen = be32_to_cpup(p++); - READ_BUF(argp->taglen); - SAVEMEM(argp->tag, argp->taglen); - READ_BUF(8); - argp->minorversion = be32_to_cpup(p++); - argp->opcnt = be32_to_cpup(p++); - max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2); + if (xdr_stream_decode_u32(argp->xdr, &argp->taglen) < 0) + return 0; + max_reply += XDR_UNIT; + argp->tag = NULL; + if (unlikely(argp->taglen)) { + if (argp->taglen > NFSD4_MAX_TAGLEN) + return 0; + p = xdr_inline_decode(argp->xdr, argp->taglen); + if (!p) + return 0; + argp->tag = svcxdr_tmpalloc(argp, argp->taglen); + if (!argp->tag) + return 0; + memcpy(argp->tag, p, argp->taglen); + max_reply += xdr_align_size(argp->taglen); + } + + if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0) + return 0; + if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0) + return 0; - if (argp->taglen > NFSD4_MAX_TAGLEN) - goto xdr_error; /* * NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS * here, so we return success at the xdr level so that * nfsd4_proc can handle this is an NFS-level error. */ if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND) - return 0; + return 1; if (argp->opcnt > ARRAY_SIZE(argp->iops)) { argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); if (!argp->ops) { argp->ops = argp->iops; dprintk("nfsd: couldn't allocate room for COMPOUND\n"); - goto xdr_error; + return 0; } } @@ -2245,12 +2369,16 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) op = &argp->ops[i]; op->replay = NULL; - READ_BUF(4); - op->opnum = be32_to_cpup(p++); - - if (nfsd4_opnum_in_range(argp, op)) + if (xdr_stream_decode_u32(argp->xdr, &op->opnum) < 0) + return 0; + if (nfsd4_opnum_in_range(argp, op)) { op->status = nfsd4_dec_ops[op->opnum](argp, &op->u); - else { + if (op->status != nfs_ok) + trace_nfsd_compound_decode_err(argp->rqstp, + argp->opcnt, i, + op->opnum, + op->status); + } else { op->opnum = OP_ILLEGAL; op->status = nfserr_op_illegal; } @@ -2289,7 +2417,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); - DECODE_TAIL; + return 1; } static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, @@ -2298,12 +2426,8 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, if (exp->ex_flags & NFSEXP_V4ROOT) { *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time)); *p++ = 0; - } else if (IS_I_VERSION(inode)) { + } else p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode)); - } else { - *p++ = cpu_to_be32(stat->ctime.tv_sec); - *p++ = cpu_to_be32(stat->ctime.tv_nsec); - } return p; } @@ -2335,15 +2459,8 @@ static __be32 *encode_time_delta(__be32 *p, struct inode *inode) static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c) { *p++ = cpu_to_be32(c->atomic); - if (c->change_supported) { - p = xdr_encode_hyper(p, 
c->before_change); - p = xdr_encode_hyper(p, c->after_change); - } else { - *p++ = cpu_to_be32(c->before_ctime_sec); - *p++ = cpu_to_be32(c->before_ctime_nsec); - *p++ = cpu_to_be32(c->after_ctime_sec); - *p++ = cpu_to_be32(c->after_ctime_nsec); - } + p = xdr_encode_hyper(p, c->before_change); + p = xdr_encode_hyper(p, c->after_change); return p; } @@ -2558,7 +2675,7 @@ static u32 nfs4_file_type(umode_t mode) case S_IFREG: return NF4REG; case S_IFSOCK: return NF4SOCK; default: return NF4BAD; - }; + } } static inline __be32 @@ -3194,16 +3311,6 @@ out_acl: goto out; } - if (bmval2 & FATTR4_WORD2_CHANGE_ATTR_TYPE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - if (IS_I_VERSION(d_inode(dentry))) - *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR); - else - *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_TIME_METADATA); - } - #ifdef CONFIG_NFSD_V4_SECURITY_LABEL if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { status = nfsd4_encode_security_label(xdr, rqstp, context, @@ -3756,8 +3863,8 @@ static __be32 nfsd4_encode_splice_read( { struct xdr_stream *xdr = &resp->xdr; struct xdr_buf *buf = xdr->buf; + int status, space_left; u32 eof; - int space_left; __be32 nfserr; __be32 *p = xdr->p - 2; @@ -3768,14 +3875,13 @@ static __be32 nfsd4_encode_splice_read( nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, file, read->rd_offset, &maxcount, &eof); read->rd_length = maxcount; - if (nfserr) { - /* - * nfsd_splice_actor may have already messed with the - * page length; reset it so as not to confuse - * xdr_truncate_encode: - */ - buf->page_len = 0; - return nfserr; + if (nfserr) + goto out_err; + status = svc_encode_result_payload(read->rd_rqstp, + buf->head[0].iov_len, maxcount); + if (status) { + nfserr = nfserrno(status); + goto out_err; } *(p++) = htonl(eof); @@ -3806,6 +3912,15 @@ static __be32 nfsd4_encode_splice_read( xdr->end = (__be32 *)((void *)xdr->end + space_left); return 0; + +out_err: + /* + * nfsd_splice_actor may have already messed with the + * page length; reset it so as not to confuse + * xdr_truncate_encode in our caller. 
+ */ + buf->page_len = 0; + return nfserr; } static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, @@ -3829,7 +3944,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, read->rd_length = maxcount; if (nfserr) return nfserr; - if (svc_encode_read_payload(resp->rqstp, starting_len + 8, maxcount)) + if (svc_encode_result_payload(resp->rqstp, starting_len + 8, maxcount)) return nfserr_io; xdr_truncate_encode(xdr, starting_len + 8 + xdr_align_size(maxcount)); @@ -3897,6 +4012,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd int zero = 0; struct xdr_stream *xdr = &resp->xdr; int length_offset = xdr->buf->len; + int status; __be32 *p; p = xdr_reserve_space(xdr, 4); @@ -3917,9 +4033,13 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd (char *)p, &maxcount); if (nfserr == nfserr_isdir) nfserr = nfserr_inval; - if (nfserr) { - xdr_truncate_encode(xdr, length_offset); - return nfserr; + if (nfserr) + goto out_err; + status = svc_encode_result_payload(readlink->rl_rqstp, length_offset, + maxcount); + if (status) { + nfserr = nfserrno(status); + goto out_err; } wire_count = htonl(maxcount); @@ -3929,6 +4049,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, &zero, 4 - (maxcount&3)); return 0; + +out_err: + xdr_truncate_encode(xdr, length_offset); + return nfserr; } static __be32 @@ -4575,7 +4699,7 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, __be32 *p; nfserr = nfsd42_encode_write_res(resp, &copy->cp_res, - copy->cp_synchronous); + !!copy->cp_synchronous); if (nfserr) return nfserr; @@ -5182,10 +5306,12 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) if (op->status && opdesc && !(opdesc->op_flags & OP_NONTRIVIAL_ERROR_ENCODE)) goto status; - BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || + BUG_ON(op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || !nfsd4_enc_ops[op->opnum]); encoder = nfsd4_enc_ops[op->opnum]; op->status = encoder(resp, op->status, &op->u); + if (op->status) + trace_nfsd_compound_encode_err(rqstp, op->opnum, op->status); if (opdesc && opdesc->op_release) opdesc->op_release(&op->u); xdr_commit_encode(xdr); @@ -5254,12 +5380,6 @@ nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen); } -int -nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_ressize_check(rqstp, p); -} - void nfsd4_release_compoundargs(struct svc_rqst *rqstp) { struct nfsd4_compoundargs *args = rqstp->rq_argp; @@ -5268,8 +5388,6 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp) kfree(args->ops); args->ops = args->iops; } - kfree(args->tmpp); - args->tmpp = NULL; while (args->to_free) { struct svcxdr_tmpbuf *tb = args->to_free; args->to_free = tb->next; @@ -5277,34 +5395,19 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp) } } -int -nfs4svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) -{ - return 1; -} - int nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd4_compoundargs *args = rqstp->rq_argp; - if (rqstp->rq_arg.head[0].iov_len % 4) { - /* client is nuts */ - dprintk("%s: compound not properly padded! 
(peeraddr=%pISc xid=0x%x)", - __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid)); - return 0; - } - args->p = p; - args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; - args->pagelist = rqstp->rq_arg.pages; - args->pagelen = rqstp->rq_arg.page_len; - args->tail = false; - args->tmpp = NULL; + /* svcxdr_tmp_alloc */ args->to_free = NULL; + + args->xdr = &rqstp->rq_arg_stream; args->ops = args->iops; args->rqstp = rqstp; - return !nfsd4_decode_compound(args); + return nfsd4_decode_compound(args); } int diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index cb742e17e04a..d63cf8196fed 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -73,6 +73,14 @@ extern unsigned long nfsd_drc_mem_used; extern const struct seq_operations nfs_exports_op; +/* + * Common void argument and result helpers + */ +struct nfsd_voidargs { }; +struct nfsd_voidres { }; +int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p); +int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p); + /* * Function prototypes. */ @@ -387,7 +395,6 @@ void nfsd_lockd_shutdown(void); #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ - FATTR4_WORD2_CHANGE_ATTR_TYPE | \ FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS | \ FATTR4_WORD2_XATTR_SUPPORT) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index c81dbbad8792..66f2ef67792a 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -268,12 +268,20 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) if (fileid_type == FILEID_ROOT) dentry = dget(exp->ex_path.dentry); else { - dentry = exportfs_decode_fh(exp->ex_path.mnt, fid, - data_left, fileid_type, - nfsd_acceptable, exp); - if (IS_ERR_OR_NULL(dentry)) + dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid, + data_left, fileid_type, + nfsd_acceptable, exp); + if (IS_ERR_OR_NULL(dentry)) { trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp, dentry ? PTR_ERR(dentry) : -ESTALE); + switch (PTR_ERR(dentry)) { + case -ENOMEM: + case -ETIMEDOUT: + break; + default: + dentry = ERR_PTR(-ESTALE); + } + } } if (dentry == NULL) goto out; @@ -291,6 +299,20 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) fhp->fh_dentry = dentry; fhp->fh_export = exp; + + switch (rqstp->rq_vers) { + case 4: + if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR) + fhp->fh_no_atomic_attr = true; + break; + case 3: + if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOWCC) + fhp->fh_no_wcc = true; + break; + case 2: + fhp->fh_no_wcc = true; + } + return 0; out: exp_put(exp); @@ -559,6 +581,9 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, */ set_version_and_fsid_type(fhp, exp, ref_fh); + /* If we have a ref_fh, then copy the fh_no_wcc setting from it. */ + fhp->fh_no_wcc = ref_fh ? 
ref_fh->fh_no_wcc : false; + if (ref_fh == fhp) fh_put(ref_fh); @@ -662,6 +687,7 @@ fh_put(struct svc_fh *fhp) exp_put(exp); fhp->fh_export = NULL; } + fhp->fh_no_wcc = false; return; } diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 56cfbc361561..cb20c2cd3469 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -35,6 +35,12 @@ typedef struct svc_fh { bool fh_locked; /* inode locked by us */ bool fh_want_write; /* remount protection taken */ + bool fh_no_wcc; /* no wcc data needed */ + bool fh_no_atomic_attr; + /* + * wcc data is not atomic with + * operation + */ int fh_flags; /* FH flags */ #ifdef CONFIG_NFSD_V3 bool fh_post_saved; /* post-op attrs saved */ @@ -54,7 +60,6 @@ typedef struct svc_fh { struct kstat fh_post_attr; /* full attrs after operation */ u64 fh_post_change; /* nfsv4 change; see above */ #endif /* CONFIG_NFSD_V3 */ - } svc_fh; #define NFSD4_FH_FOREIGN (1<<0) #define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f)) @@ -259,13 +264,16 @@ fh_clear_wcc(struct svc_fh *fhp) static inline u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) { - u64 chattr; + if (IS_I_VERSION(inode)) { + u64 chattr; - chattr = stat->ctime.tv_sec; - chattr <<= 30; - chattr += stat->ctime.tv_nsec; - chattr += inode_query_iversion(inode); - return chattr; + chattr = stat->ctime.tv_sec; + chattr <<= 30; + chattr += stat->ctime.tv_nsec; + chattr += inode_query_iversion(inode); + return chattr; + } else + return time_to_chattr(&stat->ctime); } extern void fill_pre_wcc(struct svc_fh *fhp); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0d71549f9d42..9473d048efec 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -609,7 +609,6 @@ nfsd_proc_statfs(struct svc_rqst *rqstp) * NFSv2 Server procedures. * Only the results of non-idempotent operations are cached. 
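
Editorial note: the reworked nfsd4_change_attribute() above splits into two flavours. Filesystems with a real change counter keep the ctime-plus-i_version encoding, while everything else falls back to the new time_to_chattr() helper. A commented restatement, with no changes to the logic:

static u64 example_change_attr(struct kstat *stat, struct inode *inode)
{
	if (IS_I_VERSION(inode)) {
		u64 chattr = stat->ctime.tv_sec;

		/*
		 * tv_nsec is always below 10^9 < 2^30, so a 30-bit
		 * shift keeps seconds and nanoseconds from colliding;
		 * the i_version counter is folded into the low bits.
		 */
		chattr <<= 30;
		chattr += stat->ctime.tv_nsec;
		chattr += inode_query_iversion(inode);
		return chattr;
	}
	/* No change counter at all: fake one up from ctime alone */
	return time_to_chattr(&stat->ctime);
}

time_to_chattr() uses a full 32-bit shift for the same reason: the nanoseconds field can never overflow into the seconds bits.
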
*/ -struct nfsd_void { int dummy; }; #define ST 1 /* status */ #define FH 8 /* filehandle */ @@ -618,10 +617,10 @@ struct nfsd_void { int dummy; }; static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_NULL] = { .pc_func = nfsd_proc_null, - .pc_decode = nfssvc_decode_void, - .pc_encode = nfssvc_encode_void, - .pc_argsize = sizeof(struct nfsd_void), - .pc_ressize = sizeof(struct nfsd_void), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, }, @@ -647,10 +646,10 @@ static const struct svc_procedure nfsd_procedures2[18] = { }, [NFSPROC_ROOT] = { .pc_func = nfsd_proc_root, - .pc_decode = nfssvc_decode_void, - .pc_encode = nfssvc_encode_void, - .pc_argsize = sizeof(struct nfsd_void), - .pc_ressize = sizeof(struct nfsd_void), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, }, @@ -685,10 +684,10 @@ static const struct svc_procedure nfsd_procedures2[18] = { }, [NFSPROC_WRITECACHE] = { .pc_func = nfsd_proc_writecache, - .pc_decode = nfssvc_decode_void, - .pc_encode = nfssvc_encode_void, - .pc_argsize = sizeof(struct nfsd_void), - .pc_ressize = sizeof(struct nfsd_void), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, }, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 27b1ad136150..00384c332f9b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -29,6 +29,8 @@ #include "netns.h" #include "filecache.h" +#include "trace.h" + #define NFSDDBG_FACILITY NFSDDBG_SVC bool inter_copy_offload_enable; @@ -527,8 +529,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) return; nfsd_shutdown_net(net); - printk(KERN_WARNING "nfsd: last server has exited, flushing export " - "cache\n"); + pr_info("nfsd: last server has exited, flushing export cache\n"); nfsd_export_flush(net); } @@ -1009,17 +1010,16 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) struct kvec *resv = &rqstp->rq_res.head[0]; __be32 *p; - dprintk("nfsd_dispatch: vers %d proc %d\n", - rqstp->rq_vers, rqstp->rq_proc); - if (nfs_request_too_big(rqstp, proc)) - goto out_too_large; + goto out_decode_err; /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) */ rqstp->rq_cachetype = proc->pc_cachetype; + + svcxdr_init_decode(rqstp); if (!proc->pc_decode(rqstp, argv->iov_base)) goto out_decode_err; @@ -1050,29 +1050,51 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) out_cached_reply: return 1; -out_too_large: - dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers); - *statp = rpc_garbage_args; - return 1; - out_decode_err: - dprintk("nfsd: failed to decode arguments!\n"); + trace_nfsd_garbage_args_err(rqstp); *statp = rpc_garbage_args; return 1; out_update_drop: - dprintk("nfsd: Dropping request; may be revisited later\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); out_dropit: return 0; out_encode_err: - dprintk("nfsd: failed to encode result!\n"); + trace_nfsd_cant_encode_err(rqstp); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); *statp = rpc_system_err; return 1; } +/** + * nfssvc_decode_voidarg - Decode void arguments + * @rqstp: Server RPC 
transaction context + * @p: buffer containing arguments to decode + * + * Return values: + * %0: Arguments were not valid + * %1: Decoding was successful + */ +int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) +{ + return 1; +} + +/** + * nfssvc_encode_voidres - Encode void results + * @rqstp: Server RPC transaction context + * @p: buffer in which to encode results + * + * Return values: + * %0: Local error while encoding + * %1: Encoding was successful + */ +int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) +{ + return xdr_ressize_check(rqstp, p); +} + int nfsd_pool_stats_open(struct inode *inode, struct file *file) { int ret; diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 8a288c8fcd57..7aa6e8aca2c1 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -192,11 +192,6 @@ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *f /* * XDR decode functions */ -int -nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_argsize_check(rqstp, p); -} int nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) @@ -423,11 +418,6 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) /* * XDR encode functions */ -int -nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_ressize_check(rqstp, p); -} int nfssvc_encode_stat(struct svc_rqst *rqstp, __be32 *p) @@ -469,6 +459,7 @@ int nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_readlinkres *resp = rqstp->rq_resp; + struct kvec *head = rqstp->rq_res.head; *p++ = resp->status; if (resp->status != nfs_ok) @@ -483,6 +474,8 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) *p = 0; rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); } + if (svc_encode_result_payload(rqstp, head->iov_len, resp->len)) + return 0; return 1; } @@ -490,6 +483,7 @@ int nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_readres *resp = rqstp->rq_resp; + struct kvec *head = rqstp->rq_res.head; *p++ = resp->status; if (resp->status != nfs_ok) @@ -507,6 +501,8 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p) *p = 0; rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3); } + if (svc_encode_result_payload(rqstp, head->iov_len, resp->count)) + return 0; return 1; } diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c index 90967466a1e5..f008b95ceec2 100644 --- a/fs/nfsd/trace.c +++ b/fs/nfsd/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 99bf07800cd0..92a0973dd671 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -12,6 +12,100 @@ #include "export.h" #include "nfsfh.h" +#define NFSD_TRACE_PROC_ARG_FIELDS \ + __field(unsigned int, netns_ino) \ + __field(u32, xid) \ + __array(unsigned char, server, sizeof(struct sockaddr_in6)) \ + __array(unsigned char, client, sizeof(struct sockaddr_in6)) + +#define NFSD_TRACE_PROC_ARG_ASSIGNMENTS \ + do { \ + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \ + __entry->xid = be32_to_cpu(rqstp->rq_xid); \ + memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \ + rqstp->rq_xprt->xpt_locallen); \ + memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \ + rqstp->rq_xprt->xpt_remotelen); \ + } while (0); + +#define NFSD_TRACE_PROC_RES_FIELDS \ + __field(unsigned int, netns_ino) \ + __field(u32, xid) \ + __field(unsigned long, status) \ + __array(unsigned char, server, sizeof(struct sockaddr_in6)) \ + __array(unsigned char, client, sizeof(struct sockaddr_in6)) + +#define 
NFSD_TRACE_PROC_RES_ASSIGNMENTS(error) \ + do { \ + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \ + __entry->xid = be32_to_cpu(rqstp->rq_xid); \ + __entry->status = be32_to_cpu(error); \ + memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \ + rqstp->rq_xprt->xpt_locallen); \ + memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \ + rqstp->rq_xprt->xpt_remotelen); \ + } while (0); + +TRACE_EVENT(nfsd_garbage_args_err, + TP_PROTO( + const struct svc_rqst *rqstp + ), + TP_ARGS(rqstp), + TP_STRUCT__entry( + NFSD_TRACE_PROC_ARG_FIELDS + + __field(u32, vers) + __field(u32, proc) + ), + TP_fast_assign( + NFSD_TRACE_PROC_ARG_ASSIGNMENTS + + __entry->vers = rqstp->rq_vers; + __entry->proc = rqstp->rq_proc; + ), + TP_printk("xid=0x%08x vers=%u proc=%u", + __entry->xid, __entry->vers, __entry->proc + ) +); + +TRACE_EVENT(nfsd_cant_encode_err, + TP_PROTO( + const struct svc_rqst *rqstp + ), + TP_ARGS(rqstp), + TP_STRUCT__entry( + NFSD_TRACE_PROC_ARG_FIELDS + + __field(u32, vers) + __field(u32, proc) + ), + TP_fast_assign( + NFSD_TRACE_PROC_ARG_ASSIGNMENTS + + __entry->vers = rqstp->rq_vers; + __entry->proc = rqstp->rq_proc; + ), + TP_printk("xid=0x%08x vers=%u proc=%u", + __entry->xid, __entry->vers, __entry->proc + ) +); + +#define show_nfsd_may_flags(x) \ + __print_flags(x, "|", \ + { NFSD_MAY_EXEC, "EXEC" }, \ + { NFSD_MAY_WRITE, "WRITE" }, \ + { NFSD_MAY_READ, "READ" }, \ + { NFSD_MAY_SATTR, "SATTR" }, \ + { NFSD_MAY_TRUNC, "TRUNC" }, \ + { NFSD_MAY_LOCK, "LOCK" }, \ + { NFSD_MAY_OWNER_OVERRIDE, "OWNER_OVERRIDE" }, \ + { NFSD_MAY_LOCAL_ACCESS, "LOCAL_ACCESS" }, \ + { NFSD_MAY_BYPASS_GSS_ON_ROOT, "BYPASS_GSS_ON_ROOT" }, \ + { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }, \ + { NFSD_MAY_BYPASS_GSS, "BYPASS_GSS" }, \ + { NFSD_MAY_READ_IF_EXEC, "READ_IF_EXEC" }, \ + { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" }) + TRACE_EVENT(nfsd_compound, TP_PROTO(const struct svc_rqst *rqst, u32 args_opcnt), @@ -51,6 +145,56 @@ TRACE_EVENT(nfsd_compound_status, __get_str(name), __entry->status) ) +TRACE_EVENT(nfsd_compound_decode_err, + TP_PROTO( + const struct svc_rqst *rqstp, + u32 args_opcnt, + u32 resp_opcnt, + u32 opnum, + __be32 status + ), + TP_ARGS(rqstp, args_opcnt, resp_opcnt, opnum, status), + TP_STRUCT__entry( + NFSD_TRACE_PROC_RES_FIELDS + + __field(u32, args_opcnt) + __field(u32, resp_opcnt) + __field(u32, opnum) + ), + TP_fast_assign( + NFSD_TRACE_PROC_RES_ASSIGNMENTS(status) + + __entry->args_opcnt = args_opcnt; + __entry->resp_opcnt = resp_opcnt; + __entry->opnum = opnum; + ), + TP_printk("op=%u/%u opnum=%u status=%lu", + __entry->resp_opcnt, __entry->args_opcnt, + __entry->opnum, __entry->status) +); + +TRACE_EVENT(nfsd_compound_encode_err, + TP_PROTO( + const struct svc_rqst *rqstp, + u32 opnum, + __be32 status + ), + TP_ARGS(rqstp, opnum, status), + TP_STRUCT__entry( + NFSD_TRACE_PROC_RES_FIELDS + + __field(u32, opnum) + ), + TP_fast_assign( + NFSD_TRACE_PROC_RES_ASSIGNMENTS(status) + + __entry->opnum = opnum; + ), + TP_printk("opnum=%u status=%lu", + __entry->opnum, __entry->status) +); + + DECLARE_EVENT_CLASS(nfsd_fh_err_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -421,6 +565,9 @@ TRACE_EVENT(nfsd_clid_inuse_err, __entry->cl_boot, __entry->cl_id) ) +/* + * from fs/nfsd/filecache.h + */ TRACE_DEFINE_ENUM(NFSD_FILE_HASHED); TRACE_DEFINE_ENUM(NFSD_FILE_PENDING); TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ); @@ -435,13 +582,6 @@ TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED); { 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \ { 1 << NFSD_FILE_REFERENCED, "REFERENCED"}) -/* FIXME: This 
should probably be fleshed out in the future. */ -#define show_nf_may(val) \ - __print_flags(val, "|", \ - { NFSD_MAY_READ, "READ" }, \ - { NFSD_MAY_WRITE, "WRITE" }, \ - { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }) - DECLARE_EVENT_CLASS(nfsd_file_class, TP_PROTO(struct nfsd_file *nf), TP_ARGS(nf), @@ -461,12 +601,12 @@ DECLARE_EVENT_CLASS(nfsd_file_class, __entry->nf_may = nf->nf_may; __entry->nf_file = nf->nf_file; ), - TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p", + TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p", __entry->nf_hashval, __entry->nf_inode, __entry->nf_ref, show_nf_flags(__entry->nf_flags), - show_nf_may(__entry->nf_may), + show_nfsd_may_flags(__entry->nf_may), __entry->nf_file) ) @@ -492,10 +632,10 @@ TRACE_EVENT(nfsd_file_acquire, __field(u32, xid) __field(unsigned int, hash) __field(void *, inode) - __field(unsigned int, may_flags) + __field(unsigned long, may_flags) __field(int, nf_ref) __field(unsigned long, nf_flags) - __field(unsigned char, nf_may) + __field(unsigned long, nf_may) __field(struct file *, nf_file) __field(u32, status) ), @@ -512,12 +652,12 @@ TRACE_EVENT(nfsd_file_acquire, __entry->status = be32_to_cpu(status); ), - TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u", + TP_printk("xid=0x%x hash=0x%x inode=%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=%p status=%u", __entry->xid, __entry->hash, __entry->inode, - show_nf_may(__entry->may_flags), __entry->nf_ref, - show_nf_flags(__entry->nf_flags), - show_nf_may(__entry->nf_may), __entry->nf_file, - __entry->status) + show_nfsd_may_flags(__entry->may_flags), + __entry->nf_ref, show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), + __entry->nf_file, __entry->status) ); DECLARE_EVENT_CLASS(nfsd_file_search_class, @@ -533,7 +673,7 @@ DECLARE_EVENT_CLASS(nfsd_file_search_class, __entry->hash = hash; __entry->found = found; ), - TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash, + TP_printk("hash=0x%x inode=%p found=%d", __entry->hash, __entry->inode, __entry->found) ); @@ -561,7 +701,7 @@ TRACE_EVENT(nfsd_file_fsnotify_handle_event, __entry->mode = inode->i_mode; __entry->mask = mask; ), - TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode, + TP_printk("inode=%p nlink=%u mode=0%ho mask=0x%x", __entry->inode, __entry->nlink, __entry->mode, __entry->mask) ); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 1ecaceebee13..04937e51de56 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -978,18 +978,25 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, __be32 *verf) { struct file *file = nf->nf_file; + struct super_block *sb = file_inode(file)->i_sb; struct svc_export *exp; struct iov_iter iter; __be32 nfserr; int host_err; int use_wgather; loff_t pos = offset; + unsigned long exp_op_flags = 0; unsigned int pflags = current->flags; rwf_t flags = 0; + bool restore_flags = false; trace_nfsd_write_opened(rqstp, fhp, offset, *cnt); - if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) + if (sb->s_export_op) + exp_op_flags = sb->s_export_op->flags; + + if (test_bit(RQ_LOCAL, &rqstp->rq_flags) && + !(exp_op_flags & EXPORT_OP_REMOTE_FS)) { /* * We want throttling in balance_dirty_pages() * and shrink_inactive_list() to only consider @@ -998,6 +1005,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, * the client's dirty pages or its congested queue. 
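
Editorial note: the hunk below gates PF_LOCAL_THROTTLE on both RQ_LOCAL and the absence of EXPORT_OP_REMOTE_FS, and remembers whether it set the flag so the restore is symmetric. A condensed sketch of the save/restore pattern (the helper name is made up; the flag and current_restore_flags() are the real kernel interfaces):

static void example_throttled_write(struct svc_rqst *rqstp,
				    unsigned long exp_op_flags)
{
	unsigned int pflags = current->flags;
	bool restore_flags = false;

	if (test_bit(RQ_LOCAL, &rqstp->rq_flags) &&
	    !(exp_op_flags & EXPORT_OP_REMOTE_FS)) {
		/* Loopback mount onto a local fs: throttle dirty pages */
		current->flags |= PF_LOCAL_THROTTLE;
		restore_flags = true;
	}

	/* ... perform the VFS write here ... */

	if (restore_flags)
		current_restore_flags(pflags, PF_LOCAL_THROTTLE);
}
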
*/ current->flags |= PF_LOCAL_THROTTLE; + restore_flags = true; + } exp = fhp->fh_export; use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); @@ -1049,7 +1058,7 @@ out_nfserr: trace_nfsd_write_err(rqstp, fhp, offset, host_err); nfserr = nfserrno(host_err); } - if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) + if (restore_flags) current_restore_flags(pflags, PF_LOCAL_THROTTLE); return nfserr; } @@ -1724,7 +1733,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct inode *fdir, *tdir; __be32 err; int host_err; - bool has_cached = false; + bool close_cached = false; err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) @@ -1783,8 +1792,9 @@ retry: if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - if (nfsd_has_cached_files(ndentry)) { - has_cached = true; + if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) && + nfsd_has_cached_files(ndentry)) { + close_cached = true; goto out_dput_old; } else { host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); @@ -1805,7 +1815,7 @@ retry: * as that would do the wrong thing if the two directories * were the same, so again we do it by hand. */ - if (!has_cached) { + if (!close_cached) { fill_post_wcc(ffhp); fill_post_wcc(tfhp); } @@ -1819,8 +1829,8 @@ retry: * shouldn't be done with locks held however, so we delay it until this * point and then reattempt the whole shebang. */ - if (has_cached) { - has_cached = false; + if (close_cached) { + close_cached = false; nfsd_close_cached_files(ndentry); dput(ndentry); goto retry; @@ -1872,7 +1882,8 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, type = d_inode(rdentry)->i_mode & S_IFMT; if (type != S_IFDIR) { - nfsd_close_cached_files(rdentry); + if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) + nfsd_close_cached_files(rdentry); host_err = vfs_unlink(dirp, rdentry, NULL); } else { host_err = vfs_rmdir(dirp, rdentry); diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 0ff336b0b25f..ad77387734cc 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -144,7 +144,6 @@ union nfsd_xdrstore { #define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) -int nfssvc_decode_void(struct svc_rqst *, __be32 *); int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *); int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *); int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *); @@ -156,7 +155,6 @@ int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *); -int nfssvc_encode_void(struct svc_rqst *, __be32 *); int nfssvc_encode_stat(struct svc_rqst *, __be32 *); int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *); int nfssvc_encode_diropres(struct svc_rqst *, __be32 *); diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index ae6fa6c9cb46..456fcd7a1038 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -273,7 +273,6 @@ union nfsd3_xdrstore { #define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) -int nfs3svc_decode_voidarg(struct svc_rqst *, __be32 *); int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *); int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *); @@ -290,7 +289,6 @@ int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_readdirplusargs(struct 
svc_rqst *, __be32 *); int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *); -int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *); int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *); int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *); int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 679d40af1bbb..a60ff5ce1a37 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -76,12 +76,7 @@ static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) struct nfsd4_change_info { u32 atomic; - bool change_supported; - u32 before_ctime_sec; - u32 before_ctime_nsec; u64 before_change; - u32 after_ctime_sec; - u32 after_ctime_nsec; u64 after_change; }; @@ -252,7 +247,8 @@ struct nfsd4_listxattrs { struct nfsd4_open { u32 op_claim_type; /* request */ - struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ + u32 op_fnamelen; + char * op_fname; /* request - everything but CLAIM_PREV */ u32 op_delegate_type; /* request - CLAIM_PREV only */ stateid_t op_delegate_stateid; /* request - response */ u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */ @@ -385,13 +381,6 @@ struct nfsd4_setclientid_confirm { nfs4_verifier sc_confirm; }; -struct nfsd4_saved_compoundargs { - __be32 *p; - __be32 *end; - int pagelen; - struct page **pagelist; -}; - struct nfsd4_test_stateid_id { __be32 ts_id_status; stateid_t ts_id_stateid; @@ -419,8 +408,7 @@ struct nfsd4_write { u64 wr_offset; /* request */ u32 wr_stable_how; /* request */ u32 wr_buflen; /* request */ - struct kvec wr_head; - struct page ** wr_pagelist; /* request */ + struct xdr_buf wr_payload; /* request */ u32 wr_bytes_written; /* response */ u32 wr_how_written; /* response */ @@ -433,7 +421,7 @@ struct nfsd4_exchange_id { u32 flags; clientid_t clientid; u32 seqid; - int spa_how; + u32 spa_how; u32 spo_must_enforce[3]; u32 spo_must_allow[3]; struct xdr_netobj nii_domain; @@ -554,7 +542,7 @@ struct nfsd4_copy { bool cp_intra; /* both */ - bool cp_synchronous; + u32 cp_synchronous; /* response */ struct nfsd42_write_res cp_res; @@ -615,7 +603,7 @@ struct nfsd4_copy_notify { }; struct nfsd4_op { - int opnum; + u32 opnum; const struct nfsd4_operation * opdesc; __be32 status; union nfsd4_op_u { @@ -696,15 +684,8 @@ struct svcxdr_tmpbuf { struct nfsd4_compoundargs { /* scratch variables for XDR decode */ - __be32 * p; - __be32 * end; - struct page ** pagelist; - int pagelen; - bool tail; - __be32 tmp[8]; - __be32 * tmpp; + struct xdr_stream *xdr; struct svcxdr_tmpbuf *to_free; - struct svc_rqst *rqstp; u32 taglen; @@ -767,22 +748,14 @@ static inline void set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { BUG_ON(!fhp->fh_pre_saved); - cinfo->atomic = (u32)fhp->fh_post_saved; - cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry)); + cinfo->atomic = (u32)(fhp->fh_post_saved && !fhp->fh_no_atomic_attr); cinfo->before_change = fhp->fh_pre_change; cinfo->after_change = fhp->fh_post_change; - cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; - cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; - cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; - cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; - } bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp); -int nfs4svc_decode_voidarg(struct svc_rqst *, __be32 *); -int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *); int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *); int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *); __be32 
nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 3ceb72b67a7a..9f4d4bcbf251 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -213,12 +213,25 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); +#define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ +#define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ +#define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ +#define EXPORT_OP_REMOTE_FS (0x8) /* Filesystem is remote */ +#define EXPORT_OP_NOATOMIC_ATTR (0x10) /* Filesystem cannot supply + atomic attribute updates + */ + unsigned long flags; }; extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, int *max_len, struct inode *parent); extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, int connectable); +extern struct dentry *exportfs_decode_fh_raw(struct vfsmount *mnt, + struct fid *fid, int fh_len, + int fileid_type, + int (*acceptable)(void *, struct dentry *), + void *context); extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *), void *context); diff --git a/include/linux/iversion.h b/include/linux/iversion.h index 2917ef990d43..3bfebde5a1a6 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -328,6 +328,19 @@ inode_query_iversion(struct inode *inode) return cur >> I_VERSION_QUERIED_SHIFT; } +/* + * For filesystems without any sort of change attribute, the best we can + * do is fake one up from the ctime: + */ +static inline u64 time_to_chattr(struct timespec64 *t) +{ + u64 chattr = t->tv_sec; + + chattr <<= 32; + chattr += t->tv_nsec; + return chattr; +} + /** * inode_eq_iversion_raw - check whether the raw i_version counter has changed * @inode: inode to check diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 9dc7eeac924f..5b4c67c91f56 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -385,13 +385,6 @@ enum lock_type4 { NFS4_WRITEW_LT = 4 }; -enum change_attr_type4 { - NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR = 0, - NFS4_CHANGE_TYPE_IS_VERSION_COUNTER = 1, - NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS = 2, - NFS4_CHANGE_TYPE_IS_TIME_METADATA = 3, - NFS4_CHANGE_TYPE_IS_UNDEFINED = 4 -}; /* Mandatory Attributes */ #define FATTR4_WORD0_SUPPORTED_ATTRS (1UL << 0) @@ -459,7 +452,6 @@ enum change_attr_type4 { #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_CLONE_BLKSIZE (1UL << 13) -#define FATTR4_WORD2_CHANGE_ATTR_TYPE (1UL << 15) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) #define FATTR4_WORD2_MODE_UMASK (1UL << 17) #define FATTR4_WORD2_XATTR_SUPPORT (1UL << 18) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 386628b36bc7..34c2a69820e9 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -247,6 +247,8 @@ struct svc_rqst { size_t rq_xprt_hlen; /* xprt header len */ struct xdr_buf rq_arg; + struct xdr_stream rq_arg_stream; + struct page *rq_scratch_page; struct xdr_buf rq_res; struct page *rq_pages[RPCSVC_MAXPAGES + 1]; struct page * *rq_respages; /* points into rq_pages */ @@ -519,9 +521,9 @@ void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, 
int cpu); char * svc_print_addr(struct svc_rqst *, char *, size_t); -int svc_encode_read_payload(struct svc_rqst *rqstp, - unsigned int offset, - unsigned int length); +int svc_encode_result_payload(struct svc_rqst *rqstp, + unsigned int offset, + unsigned int length); unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages, struct kvec *first, size_t total); @@ -557,4 +559,18 @@ static inline void svc_reserve_auth(struct svc_rqst *rqstp, int space) svc_reserve(rqstp, space + rqstp->rq_auth_slack); } +/** + * svcxdr_init_decode - Prepare an xdr_stream for svc Call decoding + * @rqstp: controlling server RPC transaction context + * + */ +static inline void svcxdr_init_decode(struct svc_rqst *rqstp) +{ + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct kvec *argv = rqstp->rq_arg.head; + + xdr_init_decode(xdr, &rqstp->rq_arg, argv->iov_base, NULL); + xdr_set_scratch_page(xdr, rqstp->rq_scratch_page); +} + #endif /* SUNRPC_SVC_H */ diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 9dc3a3b88391..294b56e61522 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -47,6 +47,8 @@ #include #include #include +#include + #include #include @@ -142,10 +144,15 @@ struct svc_rdma_recv_ctxt { unsigned int rc_page_count; unsigned int rc_hdr_count; u32 rc_inv_rkey; - __be32 *rc_write_list; - __be32 *rc_reply_chunk; - unsigned int rc_read_payload_offset; - unsigned int rc_read_payload_length; + __be32 rc_msgtype; + + struct svc_rdma_pcl rc_call_pcl; + + struct svc_rdma_pcl rc_read_pcl; + struct svc_rdma_chunk *rc_cur_result_payload; + struct svc_rdma_pcl rc_write_pcl; + struct svc_rdma_pcl rc_reply_pcl; + struct page *rc_pages[RPCSVC_MAXPAGES]; }; @@ -171,6 +178,8 @@ extern void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp, /* svc_rdma_recvfrom.c */ extern void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma); extern bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma); +extern struct svc_rdma_recv_ctxt * + svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma); extern void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt); extern void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma); @@ -179,16 +188,15 @@ extern int svc_rdma_recvfrom(struct svc_rqst *); /* svc_rdma_rw.c */ extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); -extern int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, - struct svc_rqst *rqstp, - struct svc_rdma_recv_ctxt *head, __be32 *p); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - __be32 *wr_ch, struct xdr_buf *xdr, - unsigned int offset, - unsigned long length); + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr); extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, - struct xdr_buf *xdr); + const struct xdr_buf *xdr); +extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head); /* svc_rdma_sendto.c */ extern void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma); @@ -201,14 +209,14 @@ extern int svc_rdma_send(struct svcxprt_rdma *rdma, extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_recv_ctxt *rctxt, - struct xdr_buf *xdr); + const struct xdr_buf *xdr); extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, struct svc_rdma_recv_ctxt *rctxt, int status); extern int 
svc_rdma_sendto(struct svc_rqst *); -extern int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset, - unsigned int length); +extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length); /* svc_rdma_transport.c */ extern struct svc_xprt_class svc_rdma_class; diff --git a/include/linux/sunrpc/svc_rdma_pcl.h b/include/linux/sunrpc/svc_rdma_pcl.h new file mode 100644 index 000000000000..7516ad0fae80 --- /dev/null +++ b/include/linux/sunrpc/svc_rdma_pcl.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020, Oracle and/or its affiliates + */ + +#ifndef SVC_RDMA_PCL_H +#define SVC_RDMA_PCL_H + +#include + +struct svc_rdma_segment { + u32 rs_handle; + u32 rs_length; + u64 rs_offset; +}; + +struct svc_rdma_chunk { + struct list_head ch_list; + + u32 ch_position; + u32 ch_length; + u32 ch_payload_length; + + u32 ch_segcount; + struct svc_rdma_segment ch_segments[]; +}; + +struct svc_rdma_pcl { + unsigned int cl_count; + struct list_head cl_chunks; +}; + +/** + * pcl_init - Initialize a parsed chunk list + * @pcl: parsed chunk list to initialize + * + */ +static inline void pcl_init(struct svc_rdma_pcl *pcl) +{ + INIT_LIST_HEAD(&pcl->cl_chunks); +} + +/** + * pcl_is_empty - Return true if parsed chunk list is empty + * @pcl: parsed chunk list + * + */ +static inline bool pcl_is_empty(const struct svc_rdma_pcl *pcl) +{ + return list_empty(&pcl->cl_chunks); +} + +/** + * pcl_first_chunk - Return first chunk in a parsed chunk list + * @pcl: parsed chunk list + * + * Returns the first chunk in the list, or NULL if the list is empty. + */ +static inline struct svc_rdma_chunk * +pcl_first_chunk(const struct svc_rdma_pcl *pcl) +{ + if (pcl_is_empty(pcl)) + return NULL; + return list_first_entry(&pcl->cl_chunks, struct svc_rdma_chunk, + ch_list); +} + +/** + * pcl_next_chunk - Return next chunk in a parsed chunk list + * @pcl: a parsed chunk list + * @chunk: chunk in @pcl + * + * Returns the next chunk in the list, or NULL if @chunk is already last. 
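
Editorial note: taken together, these accessors and the iteration macros defined just below give consumers a simple walk over a parsed chunk list. A minimal sketch, assuming the list was already populated by one of the pcl_alloc_* constructors (the function name is illustrative):

static u32 example_pcl_total_length(struct svc_rdma_pcl *pcl)
{
	struct svc_rdma_chunk *chunk;
	struct svc_rdma_segment *segment;
	u32 total = 0;

	pcl_for_each_chunk(chunk, pcl) {
		pcl_for_each_segment(segment, chunk)
			total += segment->rs_length;
	}
	return total;
}
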
+ */ +static inline struct svc_rdma_chunk * +pcl_next_chunk(const struct svc_rdma_pcl *pcl, struct svc_rdma_chunk *chunk) +{ + if (list_is_last(&chunk->ch_list, &pcl->cl_chunks)) + return NULL; + return list_next_entry(chunk, ch_list); +} + +/** + * pcl_for_each_chunk - Iterate over chunks in a parsed chunk list + * @pos: the loop cursor + * @pcl: a parsed chunk list + */ +#define pcl_for_each_chunk(pos, pcl) \ + for (pos = list_first_entry(&(pcl)->cl_chunks, struct svc_rdma_chunk, ch_list); \ + &pos->ch_list != &(pcl)->cl_chunks; \ + pos = list_next_entry(pos, ch_list)) + +/** + * pcl_for_each_segment - Iterate over segments in a parsed chunk + * @pos: the loop cursor + * @chunk: a parsed chunk + */ +#define pcl_for_each_segment(pos, chunk) \ + for (pos = &(chunk)->ch_segments[0]; \ + pos <= &(chunk)->ch_segments[(chunk)->ch_segcount - 1]; \ + pos++) + +/** + * pcl_chunk_end_offset - Return offset of byte range following @chunk + * @chunk: chunk in @pcl + * + * Returns starting offset of the region just after @chunk + */ +static inline unsigned int +pcl_chunk_end_offset(const struct svc_rdma_chunk *chunk) +{ + return xdr_align_size(chunk->ch_position + chunk->ch_payload_length); +} + +struct svc_rdma_recv_ctxt; + +extern void pcl_free(struct svc_rdma_pcl *pcl); +extern bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p); +extern bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p); +extern bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_pcl *pcl, __be32 *p); +extern int pcl_process_nonpayloads(const struct svc_rdma_pcl *pcl, + const struct xdr_buf *xdr, + int (*actor)(const struct xdr_buf *, + void *), + void *data); + +#endif /* SVC_RDMA_PCL_H */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index aca35ab5cff2..92455e0d5244 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -21,8 +21,8 @@ struct svc_xprt_ops { int (*xpo_has_wspace)(struct svc_xprt *); int (*xpo_recvfrom)(struct svc_rqst *); int (*xpo_sendto)(struct svc_rqst *); - int (*xpo_read_payload)(struct svc_rqst *, unsigned int, - unsigned int); + int (*xpo_result_payload)(struct svc_rqst *, unsigned int, + unsigned int); void (*xpo_release_rqst)(struct svc_rqst *); void (*xpo_detach)(struct svc_xprt *); void (*xpo_free)(struct svc_xprt *); diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 9548d075e06d..9b35ce50cf2b 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -183,7 +183,8 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p) */ extern void xdr_shift_buf(struct xdr_buf *, size_t); extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); -extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); +extern int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, + unsigned int base, unsigned int len); extern void xdr_buf_trim(struct xdr_buf *, unsigned int); extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); @@ -247,13 +248,57 @@ extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst); extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, struct page **pages, unsigned int len); -extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen); extern __be32 *xdr_inline_decode(struct 
xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); extern uint64_t xdr_align_data(struct xdr_stream *, uint64_t, uint32_t); extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t); +extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, + unsigned int len); + +/** + * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. + * @xdr: pointer to xdr_stream struct + * @buf: pointer to an empty buffer + * @buflen: size of 'buf' + * + * The scratch buffer is used when decoding from an array of pages. + * If an xdr_inline_decode() call spans across page boundaries, then + * we copy the data into the scratch buffer in order to allow linear + * access. + */ +static inline void +xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) +{ + xdr->scratch.iov_base = buf; + xdr->scratch.iov_len = buflen; +} + +/** + * xdr_set_scratch_page - Attach a scratch buffer for decoding data + * @xdr: pointer to xdr_stream struct + * @page: an anonymous page + * + * See xdr_set_scratch_buffer(). + */ +static inline void +xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page) +{ + xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE); +} + +/** + * xdr_reset_scratch_buffer - Clear scratch buffer information + * @xdr: pointer to xdr_stream struct + * + * See xdr_set_scratch_buffer(). + */ +static inline void +xdr_reset_scratch_buffer(struct xdr_stream *xdr) +{ + xdr_set_scratch_buffer(xdr, NULL, 0); +} /** * xdr_stream_remaining - Return the number of bytes remaining in the stream @@ -505,6 +550,27 @@ static inline bool xdr_item_is_present(const __be32 *p) return *p != xdr_zero; } +/** + * xdr_stream_decode_bool - Decode a boolean + * @xdr: pointer to xdr_stream + * @ptr: pointer to a u32 in which to store the result + * + * Return values: + * %0 on success + * %-EBADMSG on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_decode_bool(struct xdr_stream *xdr, __u32 *ptr) +{ + const size_t count = sizeof(*ptr); + __be32 *p = xdr_inline_decode(xdr, count); + + if (unlikely(!p)) + return -EBADMSG; + *ptr = (*p != xdr_zero); + return 0; +} + /** * xdr_stream_decode_u32 - Decode a 32-bit integer * @xdr: pointer to xdr_stream @@ -526,6 +592,27 @@ xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr) return 0; } +/** + * xdr_stream_decode_u64 - Decode a 64-bit integer + * @xdr: pointer to xdr_stream + * @ptr: location to store 64-bit integer + * + * Return values: + * %0 on success + * %-EBADMSG on XDR buffer overflow + */ +static inline ssize_t +xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr) +{ + const size_t count = sizeof(*ptr); + __be32 *p = xdr_inline_decode(xdr, count); + + if (unlikely(!p)) + return -EBADMSG; + xdr_decode_hyper(p, ptr); + return 0; +} + /** * xdr_stream_decode_opaque_fixed - Decode fixed length opaque xdr data * @xdr: pointer to xdr_stream diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index bf1065772228..896aafc37b09 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1410,101 +1410,112 @@ DEFINE_BADREQ_EVENT(drop); DEFINE_BADREQ_EVENT(badproc); DEFINE_BADREQ_EVENT(parse); -DECLARE_EVENT_CLASS(svcrdma_segment_event, +TRACE_EVENT(svcrdma_encode_wseg, 
TP_PROTO( + const struct svc_rdma_send_ctxt *ctxt, + u32 segno, u32 handle, u32 length, u64 offset ), - TP_ARGS(handle, length, offset), + TP_ARGS(ctxt, segno, handle, length, offset), TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) + __field(u32, segno) __field(u32, handle) __field(u32, length) __field(u64, offset) ), TP_fast_assign( + __entry->cq_id = ctxt->sc_cid.ci_queue_id; + __entry->completion_id = ctxt->sc_cid.ci_completion_id; + __entry->segno = segno; __entry->handle = handle; __entry->length = length; __entry->offset = offset; ), - TP_printk("%u@0x%016llx:0x%08x", - __entry->length, (unsigned long long)__entry->offset, - __entry->handle + TP_printk("cq_id=%u cid=%d segno=%u %u@0x%016llx:0x%08x", + __entry->cq_id, __entry->completion_id, + __entry->segno, __entry->length, + (unsigned long long)__entry->offset, __entry->handle ) ); -#define DEFINE_SEGMENT_EVENT(name) \ - DEFINE_EVENT(svcrdma_segment_event, svcrdma_##name,\ - TP_PROTO( \ - u32 handle, \ - u32 length, \ - u64 offset \ - ), \ - TP_ARGS(handle, length, offset)) - -DEFINE_SEGMENT_EVENT(decode_wseg); -DEFINE_SEGMENT_EVENT(encode_rseg); -DEFINE_SEGMENT_EVENT(send_rseg); -DEFINE_SEGMENT_EVENT(encode_wseg); -DEFINE_SEGMENT_EVENT(send_wseg); - -DECLARE_EVENT_CLASS(svcrdma_chunk_event, +TRACE_EVENT(svcrdma_decode_rseg, TP_PROTO( - u32 length + const struct rpc_rdma_cid *cid, + const struct svc_rdma_chunk *chunk, + const struct svc_rdma_segment *segment ), - TP_ARGS(length), + TP_ARGS(cid, chunk, segment), TP_STRUCT__entry( - __field(u32, length) - ), - - TP_fast_assign( - __entry->length = length; - ), - - TP_printk("length=%u", - __entry->length - ) -); - -#define DEFINE_CHUNK_EVENT(name) \ - DEFINE_EVENT(svcrdma_chunk_event, svcrdma_##name, \ - TP_PROTO( \ - u32 length \ - ), \ - TP_ARGS(length)) - -DEFINE_CHUNK_EVENT(send_pzr); -DEFINE_CHUNK_EVENT(encode_write_chunk); -DEFINE_CHUNK_EVENT(send_write_chunk); -DEFINE_CHUNK_EVENT(encode_read_chunk); -DEFINE_CHUNK_EVENT(send_reply_chunk); - -TRACE_EVENT(svcrdma_send_read_chunk, - TP_PROTO( - u32 length, - u32 position - ), - - TP_ARGS(length, position), - - TP_STRUCT__entry( - __field(u32, length) + __field(u32, cq_id) + __field(int, completion_id) + __field(u32, segno) __field(u32, position) + __field(u32, handle) + __field(u32, length) + __field(u64, offset) ), TP_fast_assign( - __entry->length = length; - __entry->position = position; + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; + __entry->segno = chunk->ch_segcount; + __entry->position = chunk->ch_position; + __entry->handle = segment->rs_handle; + __entry->length = segment->rs_length; + __entry->offset = segment->rs_offset; ), - TP_printk("length=%u position=%u", - __entry->length, __entry->position + TP_printk("cq_id=%u cid=%d segno=%u position=%u %u@0x%016llx:0x%08x", + __entry->cq_id, __entry->completion_id, + __entry->segno, __entry->position, __entry->length, + (unsigned long long)__entry->offset, __entry->handle + ) +); + +TRACE_EVENT(svcrdma_decode_wseg, + TP_PROTO( + const struct rpc_rdma_cid *cid, + const struct svc_rdma_chunk *chunk, + u32 segno + ), + + TP_ARGS(cid, chunk, segno), + + TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) + __field(u32, segno) + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + ), + + TP_fast_assign( + const struct svc_rdma_segment *segment = + &chunk->ch_segments[segno]; + + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; + __entry->segno = 
segno; + __entry->handle = segment->rs_handle; + __entry->length = segment->rs_length; + __entry->offset = segment->rs_offset; + ), + + TP_printk("cq_id=%u cid=%d segno=%u %u@0x%016llx:0x%08x", + __entry->cq_id, __entry->completion_id, + __entry->segno, __entry->length, + (unsigned long long)__entry->offset, __entry->handle ) ); @@ -1581,6 +1592,7 @@ DECLARE_EVENT_CLASS(svcrdma_dma_map_class, TP_ARGS(rdma, dma_addr, length)) DEFINE_SVC_DMA_EVENT(dma_map_page); +DEFINE_SVC_DMA_EVENT(dma_map_err); DEFINE_SVC_DMA_EVENT(dma_unmap_page); TRACE_EVENT(svcrdma_dma_map_rw_err, @@ -1699,20 +1711,30 @@ TRACE_EVENT(svcrdma_small_wrch_err, TRACE_EVENT(svcrdma_send_pullup, TP_PROTO( - unsigned int len + const struct svc_rdma_send_ctxt *ctxt, + unsigned int msglen ), - TP_ARGS(len), + TP_ARGS(ctxt, msglen), TP_STRUCT__entry( - __field(unsigned int, len) + __field(u32, cq_id) + __field(int, completion_id) + __field(unsigned int, hdrlen) + __field(unsigned int, msglen) ), TP_fast_assign( - __entry->len = len; + __entry->cq_id = ctxt->sc_cid.ci_queue_id; + __entry->completion_id = ctxt->sc_cid.ci_completion_id; + __entry->hdrlen = ctxt->sc_hdrbuf.len, + __entry->msglen = msglen; ), - TP_printk("len=%u", __entry->len) + TP_printk("cq_id=%u cid=%d hdr=%u msg=%u (total %u)", + __entry->cq_id, __entry->completion_id, + __entry->hdrlen, __entry->msglen, + __entry->hdrlen + __entry->msglen) ); TRACE_EVENT(svcrdma_send_err, @@ -1819,7 +1841,7 @@ TRACE_EVENT(svcrdma_rq_post_err, ) ); -TRACE_EVENT(svcrdma_post_chunk, +DECLARE_EVENT_CLASS(svcrdma_post_chunk_class, TP_PROTO( const struct rpc_rdma_cid *cid, int sqecount @@ -1845,6 +1867,19 @@ TRACE_EVENT(svcrdma_post_chunk, ) ); +#define DEFINE_POST_CHUNK_EVENT(name) \ + DEFINE_EVENT(svcrdma_post_chunk_class, \ + svcrdma_post_##name##_chunk, \ + TP_PROTO( \ + const struct rpc_rdma_cid *cid, \ + int sqecount \ + ), \ + TP_ARGS(cid, sqecount)) + +DEFINE_POST_CHUNK_EVENT(read); +DEFINE_POST_CHUNK_EVENT(write); +DEFINE_POST_CHUNK_EVENT(reply); + DEFINE_COMPLETION_EVENT(svcrdma_wc_read); DEFINE_COMPLETION_EVENT(svcrdma_wc_write); diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 2a03263b5f9d..58994e013022 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1500,30 +1500,6 @@ SVC_RQST_FLAG_LIST #define show_rqstp_flags(flags) \ __print_flags(flags, "|", SVC_RQST_FLAG_LIST) -TRACE_EVENT(svc_recv, - TP_PROTO(struct svc_rqst *rqst, int len), - - TP_ARGS(rqst, len), - - TP_STRUCT__entry( - __field(u32, xid) - __field(int, len) - __field(unsigned long, flags) - __string(addr, rqst->rq_xprt->xpt_remotebuf) - ), - - TP_fast_assign( - __entry->xid = be32_to_cpu(rqst->rq_xid); - __entry->len = len; - __entry->flags = rqst->rq_flags; - __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); - ), - - TP_printk("addr=%s xid=0x%08x len=%d flags=%s", - __get_str(addr), __entry->xid, __entry->len, - show_rqstp_flags(__entry->flags)) -); - TRACE_DEFINE_ENUM(SVC_GARBAGE); TRACE_DEFINE_ENUM(SVC_SYSERR); TRACE_DEFINE_ENUM(SVC_VALID); diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index af9c7f43859c..d1c003a25b0f 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -200,7 +200,7 @@ static int gssp_call(struct net *net, struct rpc_message *msg) static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) { - int i; + unsigned int i; for (i = 0; i < arg->npages && arg->pages[i]; i++) __free_page(arg->pages[i]); @@ -210,14 +210,19 @@ static 
void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) { + unsigned int i; + arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE); arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL); - /* - * XXX: actual pages are allocated by xdr layer in - * xdr_partial_copy_from_skb. - */ if (!arg->pages) return -ENOMEM; + for (i = 0; i < arg->npages; i++) { + arg->pages[i] = alloc_page(GFP_KERNEL); + if (!arg->pages[i]) { + gssp_free_receive_pages(arg); + return -ENOMEM; + } + } return 0; } diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 2ff7b7083eba..d79f12c2550a 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -771,7 +771,6 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req, xdr_inline_pages(&req->rq_rcv_buf, PAGE_SIZE/2 /* pretty arbitrary */, arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE); - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; done: if (err) dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err); @@ -789,7 +788,7 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, scratch = alloc_page(GFP_KERNEL); if (!scratch) return -ENOMEM; - xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(xdr, scratch); /* res->status */ err = gssx_dec_status(xdr, &res->status); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 20c93b68505e..1a2c1c44bb00 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -778,7 +778,6 @@ void cache_clean_deferred(void *owner) */ static DEFINE_SPINLOCK(queue_lock); -static DEFINE_MUTEX(queue_io_mutex); struct cache_queue { struct list_head list; @@ -906,44 +905,26 @@ static ssize_t cache_do_downcall(char *kaddr, const char __user *buf, return ret; } -static ssize_t cache_slow_downcall(const char __user *buf, - size_t count, struct cache_detail *cd) -{ - static char write_buf[32768]; /* protected by queue_io_mutex */ - ssize_t ret = -EINVAL; - - if (count >= sizeof(write_buf)) - goto out; - mutex_lock(&queue_io_mutex); - ret = cache_do_downcall(write_buf, buf, count, cd); - mutex_unlock(&queue_io_mutex); -out: - return ret; -} - static ssize_t cache_downcall(struct address_space *mapping, const char __user *buf, size_t count, struct cache_detail *cd) { - struct page *page; - char *kaddr; + char *write_buf; ssize_t ret = -ENOMEM; - if (count >= PAGE_SIZE) - goto out_slow; + if (count >= 32768) { /* 32k is max userland buffer, lets check anyway */ + ret = -EINVAL; + goto out; + } - page = find_or_create_page(mapping, 0, GFP_KERNEL); - if (!page) - goto out_slow; + write_buf = kvmalloc(count + 1, GFP_KERNEL); + if (!write_buf) + goto out; - kaddr = kmap(page); - ret = cache_do_downcall(kaddr, buf, count, cd); - kunmap(page); - unlock_page(page); - put_page(page); + ret = cache_do_downcall(write_buf, buf, count, cd); + kvfree(write_buf); +out: return ret; -out_slow: - return cache_slow_downcall(buf, count, cd); } static ssize_t cache_write(struct file *filp, const char __user *buf, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c211b607239e..4187745887f0 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -614,6 +614,10 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) rqstp->rq_server = serv; rqstp->rq_pool = pool; + rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0); + if (!rqstp->rq_scratch_page) + goto out_enomem; + rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, 
GFP_KERNEL, node); if (!rqstp->rq_argp) goto out_enomem; @@ -842,6 +846,7 @@ void svc_rqst_free(struct svc_rqst *rqstp) { svc_release_buffer(rqstp); + put_page(rqstp->rq_scratch_page); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); @@ -1622,7 +1627,7 @@ u32 svc_max_payload(const struct svc_rqst *rqstp) EXPORT_SYMBOL_GPL(svc_max_payload); /** - * svc_encode_read_payload - mark a range of bytes as a READ payload + * svc_encode_result_payload - mark a range of bytes as a result payload * @rqstp: svc_rqst to operate on * @offset: payload's byte offset in rqstp->rq_res * @length: size of payload, in bytes @@ -1630,12 +1635,13 @@ EXPORT_SYMBOL_GPL(svc_max_payload); * Returns zero on success, or a negative errno if a permanent * error occurred. */ -int svc_encode_read_payload(struct svc_rqst *rqstp, unsigned int offset, - unsigned int length) +int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) { - return rqstp->rq_xprt->xpt_ops->xpo_read_payload(rqstp, offset, length); + return rqstp->rq_xprt->xpt_ops->xpo_result_payload(rqstp, offset, + length); } -EXPORT_SYMBOL_GPL(svc_encode_read_payload); +EXPORT_SYMBOL_GPL(svc_encode_result_payload); /** * svc_fill_write_vector - Construct data argument for VFS write call diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 43cf8dbde898..5fb9164aa690 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -813,8 +813,6 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) len = svc_deferred_recv(rqstp); else len = xprt->xpt_ops->xpo_recvfrom(rqstp); - if (len > 0) - trace_svc_xdr_recvfrom(rqstp, &rqstp->rq_arg); rqstp->rq_stime = ktime_get(); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); @@ -868,7 +866,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (serv->sv_stats) serv->sv_stats->netcnt++; - trace_svc_recv(rqstp, len); + trace_svc_xdr_recvfrom(rqstp, &rqstp->rq_arg); return len; out_release: rqstp->rq_res.len = 0; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c2752e2b9ce3..b248f2349437 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -181,8 +181,8 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) } } -static int svc_sock_read_payload(struct svc_rqst *rqstp, unsigned int offset, - unsigned int length) +static int svc_sock_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) { return 0; } @@ -635,7 +635,7 @@ static const struct svc_xprt_ops svc_udp_ops = { .xpo_create = svc_udp_create, .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, - .xpo_read_payload = svc_sock_read_payload, + .xpo_result_payload = svc_sock_result_payload, .xpo_release_rqst = svc_udp_release_rqst, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, @@ -1123,7 +1123,7 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, - .xpo_read_payload = svc_sock_read_payload, + .xpo_result_payload = svc_sock_result_payload, .xpo_release_rqst = svc_tcp_release_rqst, .xpo_detach = svc_tcp_sock_detach, .xpo_free = svc_sock_free, diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 71e03b930b70..757560a3b06b 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -669,7 +669,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct kvec *iov = buf->head; int scratch_len = buf->buflen - buf->page_len 
- buf->tail[0].iov_len; - xdr_set_scratch_buffer(xdr, NULL, 0); + xdr_reset_scratch_buffer(xdr); BUG_ON(scratch_len < 0); xdr->buf = buf; xdr->iov = iov; @@ -713,7 +713,7 @@ inline void xdr_commit_encode(struct xdr_stream *xdr) page = page_address(*xdr->page_ptr); memcpy(xdr->scratch.iov_base, page, shift); memmove(page, page + shift, (void *)xdr->p - page); - xdr->scratch.iov_len = 0; + xdr_reset_scratch_buffer(xdr); } EXPORT_SYMBOL_GPL(xdr_commit_encode); @@ -743,8 +743,7 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, * the "scratch" iov to track any temporarily unused fragment of * space at the end of the previous buffer: */ - xdr->scratch.iov_base = xdr->p; - xdr->scratch.iov_len = frag1bytes; + xdr_set_scratch_buffer(xdr, xdr->p, frag1bytes); p = page_address(*xdr->page_ptr); /* * Note this is where the next encode will start after we've @@ -1052,8 +1051,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst) { xdr->buf = buf; - xdr->scratch.iov_base = NULL; - xdr->scratch.iov_len = 0; + xdr_reset_scratch_buffer(xdr); xdr->nwords = XDR_QUADLEN(buf->len); if (buf->head[0].iov_len != 0) xdr_set_iov(xdr, buf->head, buf->len); @@ -1101,24 +1099,6 @@ static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) return p; } -/** - * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. - * @xdr: pointer to xdr_stream struct - * @buf: pointer to an empty buffer - * @buflen: size of 'buf' - * - * The scratch buffer is used when decoding from an array of pages. - * If an xdr_inline_decode() call spans across page boundaries, then - * we copy the data into the scratch buffer in order to allow linear - * access. - */ -void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) -{ - xdr->scratch.iov_base = buf; - xdr->scratch.iov_len = buflen; -} -EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer); - static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes) { __be32 *p; @@ -1379,9 +1359,8 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov); * * Returns -1 if base of length are out of bounds. */ -int -xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, - unsigned int base, unsigned int len) +int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, + unsigned int base, unsigned int len) { subbuf->buflen = subbuf->len = len; if (base < buf->head[0].iov_len) { @@ -1428,6 +1407,51 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, } EXPORT_SYMBOL_GPL(xdr_buf_subsegment); +/** + * xdr_stream_subsegment - set @subbuf to a portion of @xdr + * @xdr: an xdr_stream set up for decoding + * @subbuf: the result buffer + * @nbytes: length of @xdr to extract, in bytes + * + * Sets up @subbuf to represent a portion of @xdr. The portion + * starts at the current offset in @xdr, and extends for a length + * of @nbytes. If this is successful, @xdr is advanced to the next + * position following that portion. + * + * Return values: + * %true: @subbuf has been initialized, and @xdr has been advanced. 
+ *   %false: a bounds error has occurred
+ */
+bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
+			   unsigned int nbytes)
+{
+	unsigned int remaining, offset, len;
+
+	if (xdr_buf_subsegment(xdr->buf, subbuf, xdr_stream_pos(xdr), nbytes))
+		return false;
+
+	if (subbuf->head[0].iov_len)
+		if (!__xdr_inline_decode(xdr, subbuf->head[0].iov_len))
+			return false;
+
+	remaining = subbuf->page_len;
+	offset = subbuf->page_base;
+	while (remaining) {
+		len = min_t(unsigned int, remaining, PAGE_SIZE) - offset;
+
+		if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr))
+			return false;
+		if (!__xdr_inline_decode(xdr, len))
+			return false;
+
+		remaining -= len;
+		offset = 0;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_subsegment);
+
 /**
  * xdr_buf_trim - lop at most "len" bytes off the end of "buf"
  * @buf: buf to be trimmed
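
[Editorial sketch, not part of the patch: a server-side decoder combining the per-rqst
scratch page added in net/sunrpc/svc.c with the new xdr_stream_subsegment(). The
argument struct and the decode function are hypothetical illustrations; only
xdr_set_scratch_page(), xdr_stream_decode_u32() and xdr_stream_subsegment() are real.]

    #include <linux/sunrpc/svc.h>
    #include <linux/sunrpc/xdr.h>

    struct example_args {		/* hypothetical argument struct */
    	u32 count;
    	struct xdr_buf payload;
    };

    static bool example_decode(struct svc_rqst *rqstp, struct xdr_stream *xdr,
    			   struct example_args *args)
    {
    	/* Let xdr_inline_decode() linearize items that span page
    	 * boundaries in the receive buffer.
    	 */
    	xdr_set_scratch_page(xdr, rqstp->rq_scratch_page);

    	if (xdr_stream_decode_u32(xdr, &args->count) < 0)
    		return false;
    	/* Carve the next count bytes out of the stream, no data copy */
    	return xdr_stream_subsegment(xdr, &args->payload, args->count);
    }
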
+ */ + +#include +#include + +#include "xprt_rdma.h" +#include + +/** + * pcl_free - Release all memory associated with a parsed chunk list + * @pcl: parsed chunk list + * + */ +void pcl_free(struct svc_rdma_pcl *pcl) +{ + while (!list_empty(&pcl->cl_chunks)) { + struct svc_rdma_chunk *chunk; + + chunk = pcl_first_chunk(pcl); + list_del(&chunk->ch_list); + kfree(chunk); + } +} + +static struct svc_rdma_chunk *pcl_alloc_chunk(u32 segcount, u32 position) +{ + struct svc_rdma_chunk *chunk; + + chunk = kmalloc(struct_size(chunk, ch_segments, segcount), GFP_KERNEL); + if (!chunk) + return NULL; + + chunk->ch_position = position; + chunk->ch_length = 0; + chunk->ch_payload_length = 0; + chunk->ch_segcount = 0; + return chunk; +} + +static struct svc_rdma_chunk * +pcl_lookup_position(struct svc_rdma_pcl *pcl, u32 position) +{ + struct svc_rdma_chunk *pos; + + pcl_for_each_chunk(pos, pcl) { + if (pos->ch_position == position) + return pos; + } + return NULL; +} + +static void pcl_insert_position(struct svc_rdma_pcl *pcl, + struct svc_rdma_chunk *chunk) +{ + struct svc_rdma_chunk *pos; + + pcl_for_each_chunk(pos, pcl) { + if (pos->ch_position > chunk->ch_position) + break; + } + __list_add(&chunk->ch_list, pos->ch_list.prev, &pos->ch_list); + pcl->cl_count++; +} + +static void pcl_set_read_segment(const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_chunk *chunk, + u32 handle, u32 length, u64 offset) +{ + struct svc_rdma_segment *segment; + + segment = &chunk->ch_segments[chunk->ch_segcount]; + segment->rs_handle = handle; + segment->rs_length = length; + segment->rs_offset = offset; + + trace_svcrdma_decode_rseg(&rctxt->rc_cid, chunk, segment); + + chunk->ch_length += length; + chunk->ch_segcount++; +} + +/** + * pcl_alloc_call - Construct a parsed chunk list for the Call body + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list + * + * Assumptions: + * - The incoming Read list has already been sanity checked. + * - cl_count is already set to the number of segments in + * the un-decoded list. + * - The list might not be in order by position. + * + * Return values: + * %true: Parsed chunk list was successfully constructed, and + * cl_count is updated to be the number of chunks (ie. + * unique positions) in the Read list. + * %false: Memory allocation failed. + */ +bool pcl_alloc_call(struct svc_rdma_recv_ctxt *rctxt, __be32 *p) +{ + struct svc_rdma_pcl *pcl = &rctxt->rc_call_pcl; + unsigned int i, segcount = pcl->cl_count; + + pcl->cl_count = 0; + for (i = 0; i < segcount; i++) { + struct svc_rdma_chunk *chunk; + u32 position, handle, length; + u64 offset; + + p++; /* skip the list discriminator */ + p = xdr_decode_read_segment(p, &position, &handle, + &length, &offset); + if (position != 0) + continue; + + if (pcl_is_empty(pcl)) { + chunk = pcl_alloc_chunk(segcount, position); + if (!chunk) + return false; + pcl_insert_position(pcl, chunk); + } else { + chunk = list_first_entry(&pcl->cl_chunks, + struct svc_rdma_chunk, + ch_list); + } + + pcl_set_read_segment(rctxt, chunk, handle, length, offset); + } + + return true; +} + +/** + * pcl_alloc_read - Construct a parsed chunk list for normal Read chunks + * @rctxt: Ingress receive context + * @p: Start of an un-decoded Read list + * + * Assumptions: + * - The incoming Read list has already been sanity checked. + * - cl_count is already set to the number of segments in + * the un-decoded list. + * - The list might not be in order by position. 
+ *
+ * Return values:
+ *   %true: Parsed chunk list was successfully constructed, and
+ *	    cl_count is updated to be the number of chunks (i.e.
+ *	    unique position values) in the Read list.
+ *   %false: Memory allocation failed.
+ *
+ * TODO:
+ * - Check for chunk range overlaps
+ */
+bool pcl_alloc_read(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
+{
+	struct svc_rdma_pcl *pcl = &rctxt->rc_read_pcl;
+	unsigned int i, segcount = pcl->cl_count;
+
+	pcl->cl_count = 0;
+	for (i = 0; i < segcount; i++) {
+		struct svc_rdma_chunk *chunk;
+		u32 position, handle, length;
+		u64 offset;
+
+		p++;	/* skip the list discriminator */
+		p = xdr_decode_read_segment(p, &position, &handle,
+					    &length, &offset);
+		if (position == 0)
+			continue;
+
+		chunk = pcl_lookup_position(pcl, position);
+		if (!chunk) {
+			chunk = pcl_alloc_chunk(segcount, position);
+			if (!chunk)
+				return false;
+			pcl_insert_position(pcl, chunk);
+		}
+
+		pcl_set_read_segment(rctxt, chunk, handle, length, offset);
+	}
+
+	return true;
+}
+
+/**
+ * pcl_alloc_write - Construct a parsed chunk list from a Write list
+ * @rctxt: Ingress receive context
+ * @pcl: Parsed chunk list to populate
+ * @p: Start of an un-decoded Write list
+ *
+ * Assumptions:
+ * - The incoming Write list has already been sanity checked, and
+ * - cl_count is set to the number of chunks in the un-decoded list.
+ *
+ * Return values:
+ *   %true: Parsed chunk list was successfully constructed.
+ *   %false: Memory allocation failed.
+ */
+bool pcl_alloc_write(struct svc_rdma_recv_ctxt *rctxt,
+		     struct svc_rdma_pcl *pcl, __be32 *p)
+{
+	struct svc_rdma_segment *segment;
+	struct svc_rdma_chunk *chunk;
+	unsigned int i, j;
+	u32 segcount;
+
+	for (i = 0; i < pcl->cl_count; i++) {
+		p++;	/* skip the list discriminator */
+		segcount = be32_to_cpup(p++);
+
+		chunk = pcl_alloc_chunk(segcount, 0);
+		if (!chunk)
+			return false;
+		list_add_tail(&chunk->ch_list, &pcl->cl_chunks);
+
+		for (j = 0; j < segcount; j++) {
+			segment = &chunk->ch_segments[j];
+			p = xdr_decode_rdma_segment(p, &segment->rs_handle,
+						    &segment->rs_length,
+						    &segment->rs_offset);
+			trace_svcrdma_decode_wseg(&rctxt->rc_cid, chunk, j);
+
+			chunk->ch_length += segment->rs_length;
+			chunk->ch_segcount++;
+		}
+	}
+	return true;
+}
+
+static int pcl_process_region(const struct xdr_buf *xdr,
+			      unsigned int offset, unsigned int length,
+			      int (*actor)(const struct xdr_buf *, void *),
+			      void *data)
+{
+	struct xdr_buf subbuf;
+
+	if (!length)
+		return 0;
+	if (xdr_buf_subsegment(xdr, &subbuf, offset, length))
+		return -EMSGSIZE;
+	return actor(&subbuf, data);
+}
+
+/**
+ * pcl_process_nonpayloads - Process non-payload regions inside @xdr
+ * @pcl: Chunk list to process
+ * @xdr: xdr_buf to process
+ * @actor: Function to invoke on each non-payload region
+ * @data: Arguments for @actor
+ *
+ * This mechanism must ignore not only result payloads that were already
+ * sent via RDMA Write, but also XDR padding for those payloads that
+ * the upper layer has added.
+ *
+ * Assumptions:
+ *   The xdr->len and ch_position fields are aligned to 4-byte multiples.
+ *
+ * Returns:
+ *   On success, zero,
+ *   %-EMSGSIZE on XDR buffer overflow, or
+ *   The return value of @actor
+ */
+int pcl_process_nonpayloads(const struct svc_rdma_pcl *pcl,
+			    const struct xdr_buf *xdr,
+			    int (*actor)(const struct xdr_buf *, void *),
+			    void *data)
+{
+	struct svc_rdma_chunk *chunk, *next;
+	unsigned int start;
+	int ret;
+
+	chunk = pcl_first_chunk(pcl);
+
+	/* No result payloads were generated */
+	if (!chunk || !chunk->ch_payload_length)
+		return actor(xdr, data);
+
+	/* Process the region before the first result payload */
+	ret = pcl_process_region(xdr, 0, chunk->ch_position, actor, data);
+	if (ret < 0)
+		return ret;
+
+	/* Process the regions between each middle result payload */
+	while ((next = pcl_next_chunk(pcl, chunk))) {
+		if (!next->ch_payload_length)
+			break;
+
+		start = pcl_chunk_end_offset(chunk);
+		ret = pcl_process_region(xdr, start, next->ch_position - start,
+					 actor, data);
+		if (ret < 0)
+			return ret;
+
+		chunk = next;
+	}
+
+	/* Process the region after the last result payload */
+	start = pcl_chunk_end_offset(chunk);
+	ret = pcl_process_region(xdr, start, xdr->len - start, actor, data);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
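
[Editorial sketch, not part of the patch: traversing a parsed chunk list with the
iterators this new file relies on (pcl_for_each_chunk() and pcl_for_each_segment(),
declared in svc_rdma_pcl.h). The summing helper itself is a made-up illustration.]

    #include <linux/sunrpc/svc_rdma_pcl.h>

    /* Hypothetical helper: total registered length of every segment in
     * every chunk of a parsed chunk list.
     */
    static unsigned int example_pcl_total_length(struct svc_rdma_pcl *pcl)
    {
    	const struct svc_rdma_segment *segment;
    	struct svc_rdma_chunk *chunk;
    	unsigned int total = 0;

    	pcl_for_each_chunk(chunk, pcl)
    		pcl_for_each_segment(segment, chunk)
    			total += segment->rs_length;
    	return total;
    }
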
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index c6ea2903c21a..cbdb71247755 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -93,6 +93,7 @@
  * (see rdma_read_complete() below).
  */
 
+#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <asm/unaligned.h>
 #include <rdma/ib_verbs.h>
@@ -143,6 +144,10 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
 		goto fail2;
 
 	svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
+	pcl_init(&ctxt->rc_call_pcl);
+	pcl_init(&ctxt->rc_read_pcl);
+	pcl_init(&ctxt->rc_write_pcl);
+	pcl_init(&ctxt->rc_reply_pcl);
 
 	ctxt->rc_recv_wr.next = NULL;
 	ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe;
@@ -189,8 +194,13 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
 	}
 }
 
-static struct svc_rdma_recv_ctxt *
-svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
+/**
+ * svc_rdma_recv_ctxt_get - Allocate a recv_ctxt
+ * @rdma: controlling svcxprt_rdma
+ *
+ * Returns a recv_ctxt or (rarely) NULL if none are available.
+ */
+struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_recv_ctxt *ctxt;
 	struct llist_node *node;
@@ -202,7 +212,6 @@ svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
 
 out:
 	ctxt->rc_page_count = 0;
-	ctxt->rc_read_payload_length = 0;
 	return ctxt;
 
 out_empty:
@@ -226,6 +235,11 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
 	for (i = 0; i < ctxt->rc_page_count; i++)
 		put_page(ctxt->rc_pages[i]);
 
+	pcl_free(&ctxt->rc_call_pcl);
+	pcl_free(&ctxt->rc_read_pcl);
+	pcl_free(&ctxt->rc_write_pcl);
+	pcl_free(&ctxt->rc_reply_pcl);
+
 	if (!ctxt->rc_temp)
 		llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
 	else
@@ -385,100 +399,123 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
 	arg->len = ctxt->rc_byte_len;
 }
 
-/* This accommodates the largest possible Write chunk.
- */
-#define MAX_BYTES_WRITE_CHUNK ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT))
-
-/* This accommodates the largest possible Position-Zero
- * Read chunk or Reply chunk.
- */
-#define MAX_BYTES_SPECIAL_CHUNK ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT))
-
-/* Sanity check the Read list.
+/**
+ * xdr_count_read_segments - Count number of Read segments in Read list
+ * @rctxt: Ingress receive context
+ * @p: Start of an un-decoded Read list
  *
- * Implementation limits:
- * - This implementation supports only one Read chunk.
+ * Before allocating anything, ensure the ingress Read list is safe
+ * to use.
  *
- * Sanity checks:
- * - Read list does not overflow Receive buffer.
- * - Segment size limited by largest NFS data payload.
- *
- * The segment count is limited to how many segments can
- * fit in the transport header without overflowing the
- * buffer. That's about 40 Read segments for a 1KB inline
- * threshold.
+ * The segment count is limited to how many segments can fit in the
+ * transport header without overflowing the buffer. That's about 40
+ * Read segments for a 1KB inline threshold.
  *
  * Return values:
- * %true: Read list is valid. @rctxt's xdr_stream is updated
- *	  to point to the first byte past the Read list.
- * %false: Read list is corrupt. @rctxt's xdr_stream is left
- *	   in an unknown state.
+ *   %true: Read list is valid. @rctxt's xdr_stream is updated to point
+ *	    to the first byte past the Read list. rc_read_pcl and
+ *	    rc_call_pcl cl_count fields are set to the number of
+ *	    Read segments in the list.
+ *   %false: Read list is corrupt. @rctxt's xdr_stream is left in an
+ *	     unknown state.
  */
-static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt)
+static bool xdr_count_read_segments(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
 {
-	u32 position, len;
-	bool first;
-	__be32 *p;
-
-	p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
-	if (!p)
-		return false;
-
-	len = 0;
-	first = true;
+	rctxt->rc_call_pcl.cl_count = 0;
+	rctxt->rc_read_pcl.cl_count = 0;
 	while (xdr_item_is_present(p)) {
+		u32 position, handle, length;
+		u64 offset;
+
 		p = xdr_inline_decode(&rctxt->rc_stream,
 				      rpcrdma_readseg_maxsz * sizeof(*p));
 		if (!p)
 			return false;
 
-		if (first) {
-			position = be32_to_cpup(p);
-			first = false;
-		} else if (be32_to_cpup(p) != position) {
-			return false;
+		xdr_decode_read_segment(p, &position, &handle,
+					&length, &offset);
+		if (position) {
+			if (position & 3)
+				return false;
+			++rctxt->rc_read_pcl.cl_count;
+		} else {
+			++rctxt->rc_call_pcl.cl_count;
 		}
-		p += 2;
-		len += be32_to_cpup(p);
 
 		p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
 		if (!p)
 			return false;
 	}
-	return len <= MAX_BYTES_SPECIAL_CHUNK;
+	return true;
 }
 
-/* The segment count is limited to how many segments can
- * fit in the transport header without overflowing the
- * buffer. That's about 60 Write segments for a 1KB inline
- * threshold.
+/* Sanity check the Read list.
+ *
+ * Sanity checks:
+ * - Read list does not overflow Receive buffer.
+ * - Chunk size limited by largest NFS data payload.
+ *
+ * Return values:
+ *   %true: Read list is valid. @rctxt's xdr_stream is updated
+ *	    to point to the first byte past the Read list.
+ *   %false: Read list is corrupt. @rctxt's xdr_stream is left
+ *	     in an unknown state.
 */
-static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen)
+static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt)
 {
-	u32 i, segcount, total;
 	__be32 *p;
 
 	p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
 	if (!p)
 		return false;
-	segcount = be32_to_cpup(p);
+	if (!xdr_count_read_segments(rctxt, p))
+		return false;
+	if (!pcl_alloc_call(rctxt, p))
+		return false;
+	return pcl_alloc_read(rctxt, p);
+}
 
-	total = 0;
-	for (i = 0; i < segcount; i++) {
-		u32 handle, length;
-		u64 offset;
+static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt)
+{
+	u32 segcount;
+	__be32 *p;
 
-		p = xdr_inline_decode(&rctxt->rc_stream,
-				      rpcrdma_segment_maxsz * sizeof(*p));
+	if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount))
+		return false;
+
+	/* A bogus segcount causes this buffer overflow check to fail. */
+	p = xdr_inline_decode(&rctxt->rc_stream,
+			      segcount * rpcrdma_segment_maxsz * sizeof(*p));
+	return p != NULL;
+}
+
+/**
+ * xdr_count_write_chunks - Count number of Write chunks in Write list
+ * @rctxt: Received header and decoding state
+ * @p: start of an un-decoded Write list
+ *
+ * Before allocating anything, ensure the ingress Write list is
+ * safe to use.
+ *
+ * Return values:
+ *   %true: Write list is valid. @rctxt's xdr_stream is updated
+ *	    to point to the first byte past the Write list, and
+ *	    the number of Write chunks is in rc_write_pcl.cl_count.
+ *   %false: Write list is corrupt. @rctxt's xdr_stream is left
+ *	     in an indeterminate state.
+ */
+static bool xdr_count_write_chunks(struct svc_rdma_recv_ctxt *rctxt, __be32 *p)
+{
+	rctxt->rc_write_pcl.cl_count = 0;
+	while (xdr_item_is_present(p)) {
+		if (!xdr_check_write_chunk(rctxt))
+			return false;
+		++rctxt->rc_write_pcl.cl_count;
+		p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
 		if (!p)
 			return false;
-
-		xdr_decode_rdma_segment(p, &handle, &length, &offset);
-		trace_svcrdma_decode_wseg(handle, length, offset);
-
-		total += length;
 	}
-	return total <= maxlen;
+	return true;
 }
 
 /* Sanity check the Write list.
@@ -498,24 +535,18 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen)
  */
 static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt)
 {
-	u32 chcount = 0;
 	__be32 *p;
 
 	p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
 	if (!p)
 		return false;
-	rctxt->rc_write_list = p;
-	while (xdr_item_is_present(p)) {
-		if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK))
-			return false;
-		++chcount;
-		p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
-		if (!p)
-			return false;
-	}
-	if (!chcount)
-		rctxt->rc_write_list = NULL;
-	return chcount < 2;
+	if (!xdr_count_write_chunks(rctxt, p))
+		return false;
+	if (!pcl_alloc_write(rctxt, &rctxt->rc_write_pcl, p))
+		return false;
+
+	rctxt->rc_cur_result_payload = pcl_first_chunk(&rctxt->rc_write_pcl);
+	return true;
 }
 
 /* Sanity check the Reply chunk.
@@ -537,13 +568,14 @@ static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt)
 	p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
 	if (!p)
 		return false;
-	rctxt->rc_reply_chunk = NULL;
-	if (xdr_item_is_present(p)) {
-		if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK))
-			return false;
-		rctxt->rc_reply_chunk = p;
-	}
-	return true;
+
+	if (!xdr_item_is_present(p))
+		return true;
+	if (!xdr_check_write_chunk(rctxt))
+		return false;
+
+	rctxt->rc_reply_pcl.cl_count = 1;
+	return pcl_alloc_write(rctxt, &rctxt->rc_reply_pcl, p);
 }
 
 /* RPC-over-RDMA Version One private extension: Remote Invalidation.
@@ -552,60 +584,53 @@ static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt)
  *
  * If there is exactly one distinct R_key in the received transport
  * header, set rc_inv_rkey to that R_key. Otherwise, set it to zero.
- *
- * Perform this operation while the received transport header is
- * still in the CPU cache.
  */
 static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma,
 				  struct svc_rdma_recv_ctxt *ctxt)
 {
-	__be32 inv_rkey, *p;
-	u32 i, segcount;
+	struct svc_rdma_segment *segment;
+	struct svc_rdma_chunk *chunk;
+	u32 inv_rkey;
 
 	ctxt->rc_inv_rkey = 0;
 	if (!rdma->sc_snd_w_inv)
 		return;
 
-	inv_rkey = xdr_zero;
-	p = ctxt->rc_recv_buf;
-	p += rpcrdma_fixed_maxsz;
-
-	/* Read list */
-	while (xdr_item_is_present(p++)) {
-		p++;	/* position */
-		if (inv_rkey == xdr_zero)
-			inv_rkey = *p;
-		else if (inv_rkey != *p)
-			return;
-		p += 4;
-	}
-
-	/* Write list */
-	while (xdr_item_is_present(p++)) {
-		segcount = be32_to_cpup(p++);
-		for (i = 0; i < segcount; i++) {
-			if (inv_rkey == xdr_zero)
-				inv_rkey = *p;
-			else if (inv_rkey != *p)
+	inv_rkey = 0;
+	pcl_for_each_chunk(chunk, &ctxt->rc_call_pcl) {
+		pcl_for_each_segment(segment, chunk) {
+			if (inv_rkey == 0)
+				inv_rkey = segment->rs_handle;
+			else if (inv_rkey != segment->rs_handle)
 				return;
-			p += 4;
 		}
 	}
-
-	/* Reply chunk */
-	if (xdr_item_is_present(p++)) {
-		segcount = be32_to_cpup(p++);
-		for (i = 0; i < segcount; i++) {
-			if (inv_rkey == xdr_zero)
-				inv_rkey = *p;
-			else if (inv_rkey != *p)
+	pcl_for_each_chunk(chunk, &ctxt->rc_read_pcl) {
+		pcl_for_each_segment(segment, chunk) {
+			if (inv_rkey == 0)
+				inv_rkey = segment->rs_handle;
+			else if (inv_rkey != segment->rs_handle)
 				return;
-			p += 4;
 		}
 	}
-
-	ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey);
+	pcl_for_each_chunk(chunk, &ctxt->rc_write_pcl) {
+		pcl_for_each_segment(segment, chunk) {
+			if (inv_rkey == 0)
+				inv_rkey = segment->rs_handle;
+			else if (inv_rkey != segment->rs_handle)
+				return;
+		}
+	}
+	pcl_for_each_chunk(chunk, &ctxt->rc_reply_pcl) {
+		pcl_for_each_segment(segment, chunk) {
+			if (inv_rkey == 0)
+				inv_rkey = segment->rs_handle;
+			else if (inv_rkey != segment->rs_handle)
+				return;
+		}
+	}
+	ctxt->rc_inv_rkey = inv_rkey;
 }
 
 /**
@@ -641,7 +666,8 @@ static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg,
 	if (*p != rpcrdma_version)
 		goto out_version;
 	p += 2;
-	switch (*p) {
+	rctxt->rc_msgtype = *p;
+	switch (rctxt->rc_msgtype) {
 	case rdma_msg:
 		break;
 	case rdma_nomsg:
@@ -735,30 +761,28 @@ static void svc_rdma_send_error(struct svcxprt_rdma *rdma,
  * the RPC/RDMA header small and fixed in size, so it is
 * straightforward to check the RPC header's direction field.
 */
-static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
-					  __be32 *rdma_resp)
+static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt,
+						struct svc_rdma_recv_ctxt *rctxt)
 {
-	__be32 *p;
+	__be32 *p = rctxt->rc_recv_buf;
 
 	if (!xprt->xpt_bc_xprt)
 		return false;
 
-	p = rdma_resp + 3;
-	if (*p++ != rdma_msg)
+	if (rctxt->rc_msgtype != rdma_msg)
 		return false;
 
-	if (*p++ != xdr_zero)
+	if (!pcl_is_empty(&rctxt->rc_call_pcl))
 		return false;
-	if (*p++ != xdr_zero)
+	if (!pcl_is_empty(&rctxt->rc_read_pcl))
 		return false;
-	if (*p++ != xdr_zero)
+	if (!pcl_is_empty(&rctxt->rc_write_pcl))
+		return false;
+	if (!pcl_is_empty(&rctxt->rc_reply_pcl))
 		return false;
 
-	/* XID sanity */
-	if (*p++ != *rdma_resp)
-		return false;
-	/* call direction */
-	if (*p == cpu_to_be32(RPC_CALL))
+	/* RPC call direction */
+	if (*(p + 8) == cpu_to_be32(RPC_CALL))
 		return false;
 
 	return true;
@@ -800,7 +824,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	struct svcxprt_rdma *rdma_xprt =
 		container_of(xprt, struct svcxprt_rdma, sc_xprt);
 	struct svc_rdma_recv_ctxt *ctxt;
-	__be32 *p;
 	int ret;
 
 	rqstp->rq_xprt_ctxt = NULL;
@@ -833,7 +856,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	rqstp->rq_respages = rqstp->rq_pages;
 	rqstp->rq_next_page = rqstp->rq_respages;
 
-	p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
 	ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt);
 	if (ret < 0)
 		goto out_err;
@@ -841,14 +863,14 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 		goto out_drop;
 	rqstp->rq_xprt_hlen = ret;
 
-	if (svc_rdma_is_backchannel_reply(xprt, p))
+	if (svc_rdma_is_reverse_direction_reply(xprt, ctxt))
 		goto out_backchannel;
 
 	svc_rdma_get_inv_rkey(rdma_xprt, ctxt);
 
-	p += rpcrdma_fixed_maxsz;
-	if (*p != xdr_zero)
-		goto out_readchunk;
+	if (!pcl_is_empty(&ctxt->rc_read_pcl) ||
+	    !pcl_is_empty(&ctxt->rc_call_pcl))
+		goto out_readlist;
 
 complete:
 	rqstp->rq_xprt_ctxt = ctxt;
@@ -856,10 +878,10 @@ complete:
 	svc_xprt_copy_addrs(rqstp, xprt);
 	return rqstp->rq_arg.len;
 
-out_readchunk:
-	ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p);
+out_readlist:
+	ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
 	if (ret < 0)
-		goto out_postfail;
+		goto out_readfail;
 	return 0;
 
 out_err:
@@ -867,7 +889,7 @@ out_err:
 	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
 	return 0;
 
-out_postfail:
+out_readfail:
 	if (ret == -EINVAL)
 		svc_rdma_send_error(rdma_xprt, ctxt, ret);
 	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
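
[Editorial worked example with made-up values, illustrating how the checks above
classify an ingress Read list: a list carrying three segments at XDR positions 0, 0
and 36 ends up as rc_call_pcl containing one chunk (position 0, two segments) and
rc_read_pcl containing one chunk (position 36, one segment). svc_rdma_recvfrom()
then takes the out_readlist path, since at least one of the two lists is non-empty,
and the pull strategy is chosen from which lists are populated.]
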
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 80a0c0e87590..0b63e1321d74 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -190,14 +190,14 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
  * - Stores arguments for the SGL constructor functions
  */
 struct svc_rdma_write_info {
+	const struct svc_rdma_chunk	*wi_chunk;
+
 	/* write state of this chunk */
 	unsigned int		wi_seg_off;
 	unsigned int		wi_seg_no;
-	unsigned int		wi_nsegs;
-	__be32			*wi_segs;
 
 	/* SGL constructor arguments */
-	struct xdr_buf		*wi_xdr;
+	const struct xdr_buf	*wi_xdr;
 	unsigned char		*wi_base;
 	unsigned int		wi_next_off;
 
@@ -205,7 +205,8 @@ struct svc_rdma_write_info {
 };
 
 static struct svc_rdma_write_info *
-svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
+svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma,
+			  const struct svc_rdma_chunk *chunk)
 {
 	struct svc_rdma_write_info *info;
 
@@ -213,10 +214,9 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
 	if (!info)
 		return info;
 
+	info->wi_chunk = chunk;
 	info->wi_seg_off = 0;
 	info->wi_seg_no = 0;
-	info->wi_nsegs = be32_to_cpup(++chunk);
-	info->wi_segs = ++chunk;
 	svc_rdma_cc_init(rdma, &info->wi_cc);
 	info->wi_cc.cc_cqe.done = svc_rdma_write_done;
 	return info;
@@ -258,11 +258,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
 /* State for pulling a Read chunk.
  */
 struct svc_rdma_read_info {
+	struct svc_rqst			*ri_rqst;
 	struct svc_rdma_recv_ctxt	*ri_readctxt;
-	unsigned int			ri_position;
 	unsigned int			ri_pageno;
 	unsigned int			ri_pageoff;
-	unsigned int			ri_chunklen;
+	unsigned int			ri_totalbytes;
 
 	struct svc_rdma_chunk_ctxt	ri_cc;
 };
@@ -358,7 +358,6 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
 	do {
 		if (atomic_sub_return(cc->cc_sqecount,
 				      &rdma->sc_sq_avail) > 0) {
-			trace_svcrdma_post_chunk(&cc->cc_cid, cc->cc_sqecount);
 			ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
 			if (ret)
 				break;
@@ -405,7 +404,7 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
 				    struct svc_rdma_rw_ctxt *ctxt)
 {
 	unsigned int sge_no, sge_bytes, page_off, page_no;
-	struct xdr_buf *xdr = info->wi_xdr;
+	const struct xdr_buf *xdr = info->wi_xdr;
 	struct scatterlist *sg;
 	struct page **page;
 
@@ -443,40 +442,36 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
 {
 	struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
 	struct svcxprt_rdma *rdma = cc->cc_rdma;
+	const struct svc_rdma_segment *seg;
 	struct svc_rdma_rw_ctxt *ctxt;
-	__be32 *seg;
 	int ret;
 
-	seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
 	do {
 		unsigned int write_len;
-		u32 handle, length;
 		u64 offset;
 
-		if (info->wi_seg_no >= info->wi_nsegs)
+		seg = &info->wi_chunk->ch_segments[info->wi_seg_no];
+		if (!seg)
 			goto out_overflow;
 
-		xdr_decode_rdma_segment(seg, &handle, &length, &offset);
-		offset += info->wi_seg_off;
-
-		write_len = min(remaining, length - info->wi_seg_off);
+		write_len = min(remaining, seg->rs_length - info->wi_seg_off);
+		if (!write_len)
+			goto out_overflow;
 		ctxt = svc_rdma_get_rw_ctxt(rdma,
 					    (write_len >> PAGE_SHIFT) + 2);
 		if (!ctxt)
 			return -ENOMEM;
 
 		constructor(info, write_len, ctxt);
-		ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, handle,
+		offset = seg->rs_offset + info->wi_seg_off;
+		ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle,
 					   DMA_TO_DEVICE);
 		if (ret < 0)
 			return -EIO;
 
-		trace_svcrdma_send_wseg(handle, write_len, offset);
-
 		list_add(&ctxt->rw_list, &cc->cc_rwctxts);
 		cc->cc_sqecount += ret;
 
-		if (write_len == length - info->wi_seg_off) {
-			seg += 4;
+		if (write_len == seg->rs_length - info->wi_seg_off) {
 			info->wi_seg_no++;
 			info->wi_seg_off = 0;
 		} else {
@@ -489,31 +484,46 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
 
 out_overflow:
 	trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no,
-				     info->wi_nsegs);
+				     info->wi_chunk->ch_segcount);
 	return -E2BIG;
 }
 
-/* Send one of an xdr_buf's kvecs by itself. To send a Reply
- * chunk, the whole RPC Reply is written back to the client.
- * This function writes either the head or tail of the xdr_buf
- * containing the Reply.
+/**
+ * svc_rdma_iov_write - Construct RDMA Writes from an iov
+ * @info: pointer to write arguments
+ * @iov: kvec to write
+ *
+ * Returns:
+ *   On success, returns zero
+ *   %-E2BIG if the client-provided Write chunk is too small
+ *   %-ENOMEM if a resource has been exhausted
+ *   %-EIO if an rdma-rw error occurred
 */
-static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
-				  struct kvec *vec)
+static int svc_rdma_iov_write(struct svc_rdma_write_info *info,
+			      const struct kvec *iov)
 {
-	info->wi_base = vec->iov_base;
+	info->wi_base = iov->iov_base;
 	return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
-				     vec->iov_len);
+				     iov->iov_len);
 }
 
-/* Send an xdr_buf's page list by itself. A Write chunk is just
- * the page list. A Reply chunk is @xdr's head, page list, and
- * tail. This function is shared between the two types of chunk.
+/**
+ * svc_rdma_pages_write - Construct RDMA Writes from pages
+ * @info: pointer to write arguments
+ * @xdr: xdr_buf with pages to write
+ * @offset: offset into the content of @xdr
+ * @length: number of bytes to write
+ *
+ * Returns:
+ *   On success, returns zero
+ *   %-E2BIG if the client-provided Write chunk is too small
+ *   %-ENOMEM if a resource has been exhausted
+ *   %-EIO if an rdma-rw error occurred
 */
-static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
-				      struct xdr_buf *xdr,
-				      unsigned int offset,
-				      unsigned long length)
+static int svc_rdma_pages_write(struct svc_rdma_write_info *info,
+				const struct xdr_buf *xdr,
+				unsigned int offset,
+				unsigned long length)
 {
 	info->wi_xdr = xdr;
 	info->wi_next_off = offset - xdr->head[0].iov_len;
@@ -521,13 +531,49 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
 				     length);
 }
 
+/**
+ * svc_rdma_xb_write - Construct RDMA Writes to write an xdr_buf
+ * @xdr: xdr_buf to write
+ * @data: pointer to write arguments
+ *
+ * Returns:
+ *   On success, returns the total length of @xdr
+ *   %-E2BIG if the client-provided Write chunk is too small
+ *   %-ENOMEM if a resource has been exhausted
+ *   %-EIO if an rdma-rw error occurred
+ */
+static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data)
+{
+	struct svc_rdma_write_info *info = data;
+	int ret;
+
+	if (xdr->head[0].iov_len) {
+		ret = svc_rdma_iov_write(info, &xdr->head[0]);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (xdr->page_len) {
+		ret = svc_rdma_pages_write(info, xdr, xdr->head[0].iov_len,
+					   xdr->page_len);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (xdr->tail[0].iov_len) {
+		ret = svc_rdma_iov_write(info, &xdr->tail[0]);
+		if (ret < 0)
+			return ret;
+	}
+
+	return xdr->len;
+}
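
[Editorial sketch, not part of the patch: svc_rdma_xb_write() is designed to be the
@actor passed to pcl_process_nonpayloads(), as svc_rdma_send_reply_chunk() below
does. The wrapper function here is a hypothetical illustration of that pairing.]

    /* Hypothetical wrapper: issue RDMA Writes for every region of @xdr
     * that was not already conveyed to the client as a result payload.
     */
    static int example_write_nonpayloads(struct svc_rdma_write_info *info,
    				     const struct svc_rdma_recv_ctxt *rctxt,
    				     const struct xdr_buf *xdr)
    {
    	return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
    				       svc_rdma_xb_write, info);
    }
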
 
 /**
  * svc_rdma_send_write_chunk - Write all segments in a Write chunk
  * @rdma: controlling RDMA transport
- * @wr_ch: Write chunk provided by client
+ * @chunk: Write chunk provided by the client
  * @xdr: xdr_buf containing the data payload
- * @offset: payload's byte offset in @xdr
- * @length: size of payload, in bytes
  *
  * Returns a non-negative number of bytes the chunk consumed, or
  *	%-E2BIG if the payload was larger than the Write chunk,
@@ -536,30 +582,28 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
  *	%-ENOTCONN if posting failed (connection is lost),
  *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
  */
-int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
-			      struct xdr_buf *xdr,
-			      unsigned int offset, unsigned long length)
+int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
+			      const struct svc_rdma_chunk *chunk,
+			      const struct xdr_buf *xdr)
 {
 	struct svc_rdma_write_info *info;
+	struct svc_rdma_chunk_ctxt *cc;
 	int ret;
 
-	if (!length)
-		return 0;
-
-	info = svc_rdma_write_info_alloc(rdma, wr_ch);
+	info = svc_rdma_write_info_alloc(rdma, chunk);
 	if (!info)
 		return -ENOMEM;
+	cc = &info->wi_cc;
 
-	ret = svc_rdma_send_xdr_pagelist(info, xdr, offset, length);
-	if (ret < 0)
+	ret = svc_rdma_xb_write(xdr, info);
+	if (ret != xdr->len)
 		goto out_err;
 
-	ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+	trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
+	ret = svc_rdma_post_chunk_ctxt(cc);
 	if (ret < 0)
 		goto out_err;
-
-	trace_svcrdma_send_write_chunk(xdr->page_len);
-	return length;
+	return xdr->len;
 
 out_err:
 	svc_rdma_write_info_free(info);
@@ -581,62 +625,62 @@ out_err:
  */
 int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
 			      const struct svc_rdma_recv_ctxt *rctxt,
-			      struct xdr_buf *xdr)
+			      const struct xdr_buf *xdr)
 {
 	struct svc_rdma_write_info *info;
-	int consumed, ret;
+	struct svc_rdma_chunk_ctxt *cc;
+	struct svc_rdma_chunk *chunk;
+	int ret;
 
-	info = svc_rdma_write_info_alloc(rdma, rctxt->rc_reply_chunk);
+	if (pcl_is_empty(&rctxt->rc_reply_pcl))
+		return 0;
+
+	chunk = pcl_first_chunk(&rctxt->rc_reply_pcl);
+	info = svc_rdma_write_info_alloc(rdma, chunk);
 	if (!info)
 		return -ENOMEM;
+	cc = &info->wi_cc;
 
-	ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
-	if (ret < 0)
-		goto out_err;
-	consumed = xdr->head[0].iov_len;
-
-	/* Send the page list in the Reply chunk only if the
-	 * client did not provide Write chunks.
-	 */
-	if (!rctxt->rc_write_list && xdr->page_len) {
-		ret = svc_rdma_send_xdr_pagelist(info, xdr,
-						 xdr->head[0].iov_len,
-						 xdr->page_len);
-		if (ret < 0)
-			goto out_err;
-		consumed += xdr->page_len;
-	}
-
-	if (xdr->tail[0].iov_len) {
-		ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
-		if (ret < 0)
-			goto out_err;
-		consumed += xdr->tail[0].iov_len;
-	}
-
-	ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+	ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+				      svc_rdma_xb_write, info);
 	if (ret < 0)
 		goto out_err;
 
-	trace_svcrdma_send_reply_chunk(consumed);
-	return consumed;
+	trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount);
+	ret = svc_rdma_post_chunk_ctxt(cc);
+	if (ret < 0)
+		goto out_err;
+
+	return xdr->len;
 
 out_err:
 	svc_rdma_write_info_free(info);
 	return ret;
 }
 
+/**
+ * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment
+ * @info: context for ongoing I/O
+ * @segment: co-ordinates of remote memory to be read
+ *
+ * Returns:
+ *   %0: the Read WR chain was constructed successfully
+ *   %-EINVAL: there were not enough rq_pages to finish
+ *   %-ENOMEM: allocating local resources failed
+ *   %-EIO: a DMA mapping error occurred
+ */
 static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
-				       struct svc_rqst *rqstp,
-				       u32 rkey, u32 len, u64 offset)
+				       const struct svc_rdma_segment *segment)
 {
 	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
 	struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
+	struct svc_rqst *rqstp = info->ri_rqst;
 	struct svc_rdma_rw_ctxt *ctxt;
-	unsigned int sge_no, seg_len;
+	unsigned int sge_no, seg_len, len;
 	struct scatterlist *sg;
 	int ret;
 
+	len = segment->rs_length;
 	sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT;
 	ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no);
 	if (!ctxt)
@@ -670,8 +714,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
 		goto out_overrun;
 	}
 
-	ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, offset, rkey,
-				   DMA_FROM_DEVICE);
+	ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, segment->rs_offset,
+				   segment->rs_handle, DMA_FROM_DEVICE);
 	if (ret < 0)
 		return -EIO;
 
@@ -684,54 +728,177 @@ out_overrun:
 	return -EINVAL;
 }
 
-/* Walk the segments in the Read chunk starting at @p and construct
- * RDMA Read operations to pull the chunk to the server.
+/**
+ * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk
+ * @info: context for ongoing I/O
+ * @chunk: Read chunk to pull
+ *
+ * Return values:
+ *   %0: the Read WR chain was constructed successfully
+ *   %-EINVAL: there were not enough resources to finish
+ *   %-ENOMEM: allocating local resources failed
+ *   %-EIO: a DMA mapping error occurred
 */
-static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
-				     struct svc_rdma_read_info *info,
-				     __be32 *p)
+static int svc_rdma_build_read_chunk(struct svc_rdma_read_info *info,
+				     const struct svc_rdma_chunk *chunk)
 {
+	const struct svc_rdma_segment *segment;
 	int ret;
 
 	ret = -EINVAL;
-	info->ri_chunklen = 0;
-	while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) {
-		u32 handle, length;
-		u64 offset;
-
-		p = xdr_decode_rdma_segment(p, &handle, &length, &offset);
-		ret = svc_rdma_build_read_segment(info, rqstp, handle, length,
-						  offset);
+	pcl_for_each_segment(segment, chunk) {
+		ret = svc_rdma_build_read_segment(info, segment);
 		if (ret < 0)
 			break;
-
-		trace_svcrdma_send_rseg(handle, length, offset);
-		info->ri_chunklen += length;
+		info->ri_totalbytes += segment->rs_length;
 	}
-
 	return ret;
 }
 
-/* Construct RDMA Reads to pull over a normal Read chunk.
- * The chunk data lands in the page list of head->rc_arg.pages.
+/**
+ * svc_rdma_copy_inline_range - Copy part of the inline content into pages
+ * @info: context for RDMA Reads
+ * @offset: offset into the Receive buffer of region to copy
+ * @remaining: length of region to copy
+ *
+ * Take a page at a time from rqstp->rq_pages and copy the inline
+ * content from the Receive buffer into that page. Update
+ * info->ri_pageno and info->ri_pageoff so that the next RDMA Read
+ * result will land contiguously with the copied content.
+ *
+ * Return values:
+ *   %0: Inline content was successfully copied
+ *   %-EINVAL: offset or length was incorrect
+ */
+static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
+				      unsigned int offset,
+				      unsigned int remaining)
+{
+	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+	unsigned char *dst, *src = head->rc_recv_buf;
+	struct svc_rqst *rqstp = info->ri_rqst;
+	unsigned int page_no, numpages;
+
+	numpages = PAGE_ALIGN(info->ri_pageoff + remaining) >> PAGE_SHIFT;
+	for (page_no = 0; page_no < numpages; page_no++) {
+		unsigned int page_len;
+
+		page_len = min_t(unsigned int, remaining,
+				 PAGE_SIZE - info->ri_pageoff);
+
+		head->rc_arg.pages[info->ri_pageno] =
+			rqstp->rq_pages[info->ri_pageno];
+		if (!info->ri_pageoff)
+			head->rc_page_count++;
+
+		dst = page_address(head->rc_arg.pages[info->ri_pageno]);
+		memcpy(dst + info->ri_pageoff, src + offset, page_len);
+
+		info->ri_totalbytes += page_len;
+		info->ri_pageoff += page_len;
+		if (info->ri_pageoff == PAGE_SIZE) {
+			info->ri_pageno++;
+			info->ri_pageoff = 0;
+		}
+		remaining -= page_len;
+		offset += page_len;
+	}
+
+	return 0;
+}
+
+/**
+ * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks
+ * @info: context for RDMA Reads
+ *
+ * The chunk data lands in head->rc_arg as a series of contiguous pages,
+ * like an incoming TCP call.
+ *
+ * Return values:
+ *   %0: RDMA Read WQEs were successfully built
+ *   %-EINVAL: client provided too many chunks or segments,
+ *   %-ENOMEM: rdma_rw context pool was exhausted,
+ *   %-ENOTCONN: posting failed (connection is lost),
+ *   %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */
+static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info)
+{
+	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+	const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
+	struct svc_rdma_chunk *chunk, *next;
+	struct xdr_buf *buf = &head->rc_arg;
+	unsigned int start, length;
+	int ret;
+
+	start = 0;
+	chunk = pcl_first_chunk(pcl);
+	length = chunk->ch_position;
+	ret = svc_rdma_copy_inline_range(info, start, length);
+	if (ret < 0)
+		return ret;
+
+	pcl_for_each_chunk(chunk, pcl) {
+		ret = svc_rdma_build_read_chunk(info, chunk);
+		if (ret < 0)
+			return ret;
+
+		next = pcl_next_chunk(pcl, chunk);
+		if (!next)
+			break;
+
+		start += length;
+		length = next->ch_position - info->ri_totalbytes;
+		ret = svc_rdma_copy_inline_range(info, start, length);
+		if (ret < 0)
+			return ret;
+	}
+
+	start += length;
+	length = head->rc_byte_len - start;
+	ret = svc_rdma_copy_inline_range(info, start, length);
+	if (ret < 0)
+		return ret;
+
+	buf->len += info->ri_totalbytes;
+	buf->buflen += info->ri_totalbytes;
+
+	head->rc_hdr_count = 1;
+	buf->head[0].iov_base = page_address(head->rc_pages[0]);
+	buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+	buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
+	return 0;
+}
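
[Editorial worked example with made-up values: suppose the Receive buffer holds 260
bytes of inline message and the Read list carries two chunks, at XDR positions 40
(1000 bytes) and 1240 (500 bytes). The function above first copies inline bytes
0-39, builds Reads for the first chunk (ri_totalbytes becomes 1040), copies inline
bytes 40-239 to fill the gap up to position 1240, builds Reads for the second chunk
(ri_totalbytes becomes 1740), then copies the trailing inline bytes 240-259. The
reassembled message is 1760 contiguous bytes, exactly as if it had arrived on TCP.]
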
+/**
+ * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks
+ * @info: context for RDMA Reads
+ *
+ * The chunk data lands in the page list of head->rc_arg.pages.
 *
 * Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
 * Therefore, XDR round-up of the Read chunk and trailing
 * inline content must both be added at the end of the pagelist.
+ *
+ * Return values:
+ *   %0: RDMA Read WQEs were successfully built
+ *   %-EINVAL: client provided too many chunks or segments,
+ *   %-ENOMEM: rdma_rw context pool was exhausted,
+ *   %-ENOTCONN: posting failed (connection is lost),
+ *   %-EIO: rdma_rw initialization failed (DMA mapping, etc).
 */
-static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
-					    struct svc_rdma_read_info *info,
-					    __be32 *p)
+static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
 {
 	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+	struct xdr_buf *buf = &head->rc_arg;
+	struct svc_rdma_chunk *chunk;
+	unsigned int length;
 	int ret;
 
-	ret = svc_rdma_build_read_chunk(rqstp, info, p);
+	chunk = pcl_first_chunk(&head->rc_read_pcl);
+	ret = svc_rdma_build_read_chunk(info, chunk);
 	if (ret < 0)
 		goto out;
 
-	trace_svcrdma_send_read_chunk(info->ri_chunklen, info->ri_position);
-
 	head->rc_hdr_count = 0;
 
 	/* Split the Receive buffer between the head and tail
@@ -739,11 +906,9 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
 	 * chunk is not included in either the pagelist or in
 	 * the tail.
 	 */
-	head->rc_arg.tail[0].iov_base =
-		head->rc_arg.head[0].iov_base + info->ri_position;
-	head->rc_arg.tail[0].iov_len =
-		head->rc_arg.head[0].iov_len - info->ri_position;
-	head->rc_arg.head[0].iov_len = info->ri_position;
+	buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position;
+	buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position;
+	buf->head[0].iov_len = chunk->ch_position;
 
 	/* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2).
 	 *
@@ -754,50 +919,149 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
 	 * Currently these chunks always start at page offset 0,
 	 * thus the rounded-up length never crosses a page boundary.
 	 */
-	info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2;
-
-	head->rc_arg.page_len = info->ri_chunklen;
-	head->rc_arg.len += info->ri_chunklen;
-	head->rc_arg.buflen += info->ri_chunklen;
+	length = XDR_QUADLEN(info->ri_totalbytes) << 2;
+	buf->page_len = length;
+	buf->len += length;
+	buf->buflen += length;
 
 out:
 	return ret;
 }
 
-/* Construct RDMA Reads to pull over a Position Zero Read chunk.
- * The start of the data lands in the first page just after
- * the Transport header, and the rest lands in the page list of
+/**
+ * svc_rdma_read_chunk_range - Build RDMA Read WQEs for portion of a chunk
+ * @info: context for RDMA Reads
+ * @chunk: parsed Call chunk to pull
+ * @offset: offset of region to pull
+ * @length: length of region to pull
+ *
+ * Return values:
+ *   %0: RDMA Read WQEs were successfully built
+ *   %-EINVAL: there were not enough resources to finish
+ *   %-ENOMEM: rdma_rw context pool was exhausted,
+ *   %-ENOTCONN: posting failed (connection is lost),
+ *   %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */
+static int svc_rdma_read_chunk_range(struct svc_rdma_read_info *info,
+				     const struct svc_rdma_chunk *chunk,
+				     unsigned int offset, unsigned int length)
+{
+	const struct svc_rdma_segment *segment;
+	int ret;
+
+	ret = -EINVAL;
+	pcl_for_each_segment(segment, chunk) {
+		struct svc_rdma_segment dummy;
+
+		if (offset > segment->rs_length) {
+			offset -= segment->rs_length;
+			continue;
+		}
+
+		dummy.rs_handle = segment->rs_handle;
+		dummy.rs_length = min_t(u32, length, segment->rs_length) - offset;
+		dummy.rs_offset = segment->rs_offset + offset;
+
+		ret = svc_rdma_build_read_segment(info, &dummy);
+		if (ret < 0)
+			break;
+
+		info->ri_totalbytes += dummy.rs_length;
+		length -= dummy.rs_length;
+		offset = 0;
+	}
+	return ret;
+}
+
+/**
+ * svc_rdma_read_call_chunk - Build RDMA Read WQEs to pull a Long Message
+ * @info: context for RDMA Reads
+ *
+ * Return values:
+ *   %0: RDMA Read WQEs were successfully built
+ *   %-EINVAL: there were not enough resources to finish
+ *   %-ENOMEM: rdma_rw context pool was exhausted,
+ *   %-ENOTCONN: posting failed (connection is lost),
+ *   %-EIO: rdma_rw initialization failed (DMA mapping, etc).
+ */
+static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
+{
+	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+	const struct svc_rdma_chunk *call_chunk =
+			pcl_first_chunk(&head->rc_call_pcl);
+	const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
+	struct svc_rdma_chunk *chunk, *next;
+	unsigned int start, length;
+	int ret;
+
+	if (pcl_is_empty(pcl))
+		return svc_rdma_build_read_chunk(info, call_chunk);
+
+	start = 0;
+	chunk = pcl_first_chunk(pcl);
+	length = chunk->ch_position;
+	ret = svc_rdma_read_chunk_range(info, call_chunk, start, length);
+	if (ret < 0)
+		return ret;
+
+	pcl_for_each_chunk(chunk, pcl) {
+		ret = svc_rdma_build_read_chunk(info, chunk);
+		if (ret < 0)
+			return ret;
+
+		next = pcl_next_chunk(pcl, chunk);
+		if (!next)
+			break;
+
+		start += length;
+		length = next->ch_position - info->ri_totalbytes;
+		ret = svc_rdma_read_chunk_range(info, call_chunk,
+						start, length);
+		if (ret < 0)
+			return ret;
+	}
+
+	start += length;
+	length = call_chunk->ch_length - start;
+	return svc_rdma_read_chunk_range(info, call_chunk, start, length);
+}
+
+/**
+ * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message
+ * @info: context for RDMA Reads
+ *
+ * The start of the data lands in the first page just after the
+ * Transport header, and the rest lands in the page list of
 * head->rc_arg.pages.
 *
 * Assumptions:
- *	- A PZRC has an XDR-aligned length (no implicit round-up).
- *	- There can be no trailing inline content (IOW, we assume
- *	  a PZRC is never sent in an RDMA_MSG message, though it's
- *	  allowed by spec).
+ *	- A PZRC is never sent in an RDMA_MSG message, though it's
+ *	  allowed by spec.
+ *
+ * Return values:
+ *   %0: RDMA Read WQEs were successfully built
+ *   %-EINVAL: client provided too many chunks or segments,
+ *   %-ENOMEM: rdma_rw context pool was exhausted,
+ *   %-ENOTCONN: posting failed (connection is lost),
+ *   %-EIO: rdma_rw initialization failed (DMA mapping, etc).
 */
-static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp,
-					struct svc_rdma_read_info *info,
-					__be32 *p)
+static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
 {
 	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+	struct xdr_buf *buf = &head->rc_arg;
 	int ret;
 
-	ret = svc_rdma_build_read_chunk(rqstp, info, p);
+	ret = svc_rdma_read_call_chunk(info);
 	if (ret < 0)
 		goto out;
 
-	trace_svcrdma_send_pzr(info->ri_chunklen);
-
-	head->rc_arg.len += info->ri_chunklen;
-	head->rc_arg.buflen += info->ri_chunklen;
+	buf->len += info->ri_totalbytes;
+	buf->buflen += info->ri_totalbytes;
 
 	head->rc_hdr_count = 1;
-	head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]);
-	head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE,
-					     info->ri_chunklen);
-
-	head->rc_arg.page_len = info->ri_chunklen -
-		head->rc_arg.head[0].iov_len;
+	buf->head[0].iov_base = page_address(head->rc_pages[0]);
+	buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+	buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
 
 out:
 	return ret;
@@ -824,26 +1088,34 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
 }
 
 /**
- * svc_rdma_recv_read_chunk - Pull a Read chunk from the client
+ * svc_rdma_process_read_list - Pull list of Read chunks from the client
 * @rdma: controlling RDMA transport
 * @rqstp: set of pages to use as Read sink buffers
 * @head: pages under I/O collect here
- * @p: pointer to start of Read chunk
 *
- * Returns:
- *	%0 if all needed RDMA Reads were posted successfully,
- *	%-EINVAL if client provided too many segments,
- *	%-ENOMEM if rdma_rw context pool was exhausted,
- *	%-ENOTCONN if posting failed (connection is lost),
- *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ * The RPC/RDMA protocol assumes that the upper layer's XDR decoders
+ * pull each Read chunk as they decode an incoming RPC message.
 *
- * Assumptions:
- * - All Read segments in @p have the same Position value.
+ * On Linux, however, the server needs to have a fully-constructed RPC
+ * message in rqstp->rq_arg when there is a positive return code from
+ * ->xpo_recvfrom. So the Read list is safety-checked immediately when
+ * it is received, then here the whole Read list is pulled all at once.
+ * The ingress RPC message is fully reconstructed once all associated
+ * RDMA Reads have completed.
+ *
+ * Return values:
+ *	%1: all needed RDMA Reads were posted successfully,
+ *	%-EINVAL: client provided too many chunks or segments,
+ *	%-ENOMEM: rdma_rw context pool was exhausted,
+ *	%-ENOTCONN: posting failed (connection is lost),
+ *	%-EIO: rdma_rw initialization failed (DMA mapping, etc).
 */
-int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
-			     struct svc_rdma_recv_ctxt *head, __be32 *p)
+int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
+			       struct svc_rqst *rqstp,
+			       struct svc_rdma_recv_ctxt *head)
 {
 	struct svc_rdma_read_info *info;
+	struct svc_rdma_chunk_ctxt *cc;
 	int ret;
 
 	/* The request (with page list) is constructed in
@@ -861,23 +1133,29 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
 	info = svc_rdma_read_info_alloc(rdma);
 	if (!info)
 		return -ENOMEM;
+	cc = &info->ri_cc;
+	info->ri_rqst = rqstp;
 	info->ri_readctxt = head;
 	info->ri_pageno = 0;
 	info->ri_pageoff = 0;
+	info->ri_totalbytes = 0;
 
-	info->ri_position = be32_to_cpup(p + 1);
-	if (info->ri_position)
-		ret = svc_rdma_build_normal_read_chunk(rqstp, info, p);
-	else
-		ret = svc_rdma_build_pz_read_chunk(rqstp, info, p);
+	if (pcl_is_empty(&head->rc_call_pcl)) {
+		if (head->rc_read_pcl.cl_count == 1)
+			ret = svc_rdma_read_data_item(info);
+		else
+			ret = svc_rdma_read_multiple_chunks(info);
+	} else
+		ret = svc_rdma_read_special(info);
 	if (ret < 0)
 		goto out_err;
 
-	ret = svc_rdma_post_chunk_ctxt(&info->ri_cc);
+	trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount);
+	ret = svc_rdma_post_chunk_ctxt(cc);
 	if (ret < 0)
 		goto out_err;
 
 	svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count);
-	return 0;
+	return 1;
 
 out_err:
 	svc_rdma_read_info_free(info);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index c3d588b149aa..68af79d4f04f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -358,49 +358,42 @@ static ssize_t svc_rdma_encode_read_list(struct svc_rdma_send_ctxt *sctxt)
 
 /**
  * svc_rdma_encode_write_segment - Encode one Write segment
- * @src: matching Write chunk in the RPC Call header
 * @sctxt: Send context for the RPC Reply
+ * @chunk: Write chunk to push
 * @remaining: remaining bytes of the payload left in the Write chunk
+ * @segno: which segment in the chunk
 *
 * Return values:
 *   On success, returns length in bytes of the Reply XDR buffer
- *   that was consumed by the Write segment
+ *   that was consumed by the Write segment, and updates @remaining
 *   %-EMSGSIZE on XDR buffer overflow
 */
-static ssize_t svc_rdma_encode_write_segment(__be32 *src,
-					     struct svc_rdma_send_ctxt *sctxt,
-					     unsigned int *remaining)
+static ssize_t svc_rdma_encode_write_segment(struct svc_rdma_send_ctxt *sctxt,
+					     const struct svc_rdma_chunk *chunk,
+					     u32 *remaining, unsigned int segno)
 {
+	const struct svc_rdma_segment *segment = &chunk->ch_segments[segno];
+	const size_t len = rpcrdma_segment_maxsz * sizeof(__be32);
+	u32 length;
 	__be32 *p;
-	const size_t len = rpcrdma_segment_maxsz * sizeof(*p);
-	u32 handle, length;
-	u64 offset;
 
 	p = xdr_reserve_space(&sctxt->sc_stream, len);
 	if (!p)
 		return -EMSGSIZE;
 
-	xdr_decode_rdma_segment(src, &handle, &length, &offset);
-
-	if (*remaining < length) {
-		/* segment only partly filled */
-		length = *remaining;
-		*remaining = 0;
-	} else {
-		/* entire segment was consumed */
-		*remaining -= length;
-	}
-	xdr_encode_rdma_segment(p, handle, length, offset);
-
-	trace_svcrdma_encode_wseg(handle, length, offset);
+	length = min_t(u32, *remaining, segment->rs_length);
+	*remaining -= length;
+	xdr_encode_rdma_segment(p, segment->rs_handle, length,
+				segment->rs_offset);
+	trace_svcrdma_encode_wseg(sctxt, segno, segment->rs_handle, length,
+				  segment->rs_offset);
 	return len;
 }
 
 /**
  * svc_rdma_encode_write_chunk - Encode one Write chunk
- * @src: matching Write chunk in the RPC Call header
 * @sctxt: Send context for the RPC Reply
- * @remaining: size in bytes of the payload in the Write chunk
+ * @chunk: Write chunk to push
 *
 * Copy a Write chunk from the Call transport header to the
 * Reply transport header. Update each segment's length field
@@ -411,33 +404,28 @@ static ssize_t svc_rdma_encode_write_segment(__be32 *src,
 *   that was consumed by the Write chunk
 *   %-EMSGSIZE on XDR buffer overflow
 */
-static ssize_t svc_rdma_encode_write_chunk(__be32 *src,
-					   struct svc_rdma_send_ctxt *sctxt,
-					   unsigned int remaining)
+static ssize_t svc_rdma_encode_write_chunk(struct svc_rdma_send_ctxt *sctxt,
+					   const struct svc_rdma_chunk *chunk)
 {
-	unsigned int i, nsegs;
+	u32 remaining = chunk->ch_payload_length;
+	unsigned int segno;
 	ssize_t len, ret;
 
 	len = 0;
-	trace_svcrdma_encode_write_chunk(remaining);
-
-	src++;
 	ret = xdr_stream_encode_item_present(&sctxt->sc_stream);
 	if (ret < 0)
-		return -EMSGSIZE;
+		return ret;
 	len += ret;
 
-	nsegs = be32_to_cpup(src++);
-	ret = xdr_stream_encode_u32(&sctxt->sc_stream, nsegs);
+	ret = xdr_stream_encode_u32(&sctxt->sc_stream, chunk->ch_segcount);
 	if (ret < 0)
-		return -EMSGSIZE;
+		return ret;
 	len += ret;
 
-	for (i = nsegs; i; i--) {
-		ret = svc_rdma_encode_write_segment(src, sctxt, &remaining);
+	for (segno = 0; segno < chunk->ch_segcount; segno++) {
+		ret = svc_rdma_encode_write_segment(sctxt, chunk, &remaining, segno);
 		if (ret < 0)
-			return -EMSGSIZE;
-		src += rpcrdma_segment_maxsz;
+			return ret;
 		len += ret;
 	}
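
[Editorial worked example with made-up values: a client-provided Write chunk has two
4096-byte segments and the server's result payload is 5120 bytes. With remaining
initialized to ch_payload_length (5120), the loop above encodes the first segment
with length min(5120, 4096) = 4096, leaving remaining = 1024, then encodes the
second segment with length min(1024, 4096) = 1024. The client thus learns exactly
how much of each registered segment was actually consumed.]
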
 
@@ -448,32 +436,25 @@ static ssize_t svc_rdma_encode_write_chunk(__be32 *src,
 * svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list
 * @rctxt: Reply context with information about the RPC Call
 * @sctxt: Send context for the RPC Reply
- * @length: size in bytes of the payload in the first Write chunk
- *
- * The client provides a Write chunk list in the Call message. Fill
- * in the segments in the first Write chunk in the Reply's transport
- * header with the number of bytes consumed in each segment.
- * Remaining chunks are returned unused.
- *
- * Assumptions:
- *  - Client has provided only one Write chunk
 *
 * Return values:
 *   On success, returns length in bytes of the Reply XDR buffer
 *   that was consumed by the Reply's Write list
 *   %-EMSGSIZE on XDR buffer overflow
 */
-static ssize_t
-svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt,
-			   struct svc_rdma_send_ctxt *sctxt,
-			   unsigned int length)
+static ssize_t svc_rdma_encode_write_list(struct svc_rdma_recv_ctxt *rctxt,
+					  struct svc_rdma_send_ctxt *sctxt)
 {
+	struct svc_rdma_chunk *chunk;
 	ssize_t len, ret;
 
-	ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, length);
-	if (ret < 0)
-		return ret;
-	len = ret;
+	len = 0;
+	pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) {
+		ret = svc_rdma_encode_write_chunk(sctxt, chunk);
+		if (ret < 0)
+			return ret;
+		len += ret;
+	}
 
 	/* Terminate the Write list */
 	ret = xdr_stream_encode_item_absent(&sctxt->sc_stream);
@@ -489,56 +470,174 @@ svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt,
 * @sctxt: Send context for the RPC Reply
 * @length: size in bytes of the payload in the Reply chunk
 *
- * Assumptions:
- * - Reply can always fit in the client-provided Reply chunk
- *
 * Return values:
 *   On success, returns length in bytes of the Reply XDR buffer
 *   that was consumed by the Reply's Reply chunk
 *   %-EMSGSIZE on XDR buffer overflow
+ *   %-E2BIG if the RPC message is larger than the Reply chunk
 */
 static ssize_t
-svc_rdma_encode_reply_chunk(const struct svc_rdma_recv_ctxt *rctxt,
+svc_rdma_encode_reply_chunk(struct svc_rdma_recv_ctxt *rctxt,
			    struct svc_rdma_send_ctxt *sctxt,
			    unsigned int length)
 {
-	return svc_rdma_encode_write_chunk(rctxt->rc_reply_chunk, sctxt,
-					   length);
+	struct svc_rdma_chunk *chunk;
+
+	if (pcl_is_empty(&rctxt->rc_reply_pcl))
+		return xdr_stream_encode_item_absent(&sctxt->sc_stream);
+
+	chunk = pcl_first_chunk(&rctxt->rc_reply_pcl);
+	if (length > chunk->ch_length)
+		return -E2BIG;
+
+	chunk->ch_payload_length = length;
+	return svc_rdma_encode_write_chunk(sctxt, chunk);
 }
 
-static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
-				 struct svc_rdma_send_ctxt *ctxt,
-				 struct page *page,
-				 unsigned long offset,
-				 unsigned int len)
+struct svc_rdma_map_data {
+	struct svcxprt_rdma		*md_rdma;
+	struct svc_rdma_send_ctxt	*md_ctxt;
+};
+
+/**
+ * svc_rdma_page_dma_map - DMA map one page
+ * @data: pointer to arguments
+ * @page: struct page to DMA map
+ * @offset: offset into the page
+ * @len: number of bytes to map
+ *
+ * Returns:
+ *   %0 if DMA mapping was successful
+ *   %-EIO if the page cannot be DMA mapped
+ */
+static int svc_rdma_page_dma_map(void *data, struct page *page,
				 unsigned long offset, unsigned int len)
 {
+	struct svc_rdma_map_data *args = data;
+	struct svcxprt_rdma *rdma = args->md_rdma;
+	struct svc_rdma_send_ctxt *ctxt = args->md_ctxt;
	struct ib_device *dev = rdma->sc_cm_id->device;
	dma_addr_t dma_addr;
 
+	++ctxt->sc_cur_sge_no;
+
	dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
-	trace_svcrdma_dma_map_page(rdma, dma_addr, len);
	if (ib_dma_mapping_error(dev, dma_addr))
		goto out_maperr;
 
+	trace_svcrdma_dma_map_page(rdma, dma_addr, len);
	ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr;
	ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len;
	ctxt->sc_send_wr.num_sge++;
	return 0;
 
 out_maperr:
+	trace_svcrdma_dma_map_err(rdma, dma_addr, len);
	return -EIO;
 }
 
-/* ib_dma_map_page() is used here because svc_rdma_dma_unmap()
+/**
 * svc_rdma_iov_dma_map - DMA map an iovec
 * @data: pointer to arguments
 * @iov: kvec to DMA map
 *
+
+/**
+ * svc_rdma_xb_dma_map - DMA map all segments of an xdr_buf
+ * @xdr: xdr_buf containing portion of an RPC message to transmit
+ * @data: pointer to arguments
+ *
+ * Returns:
+ *   %0 if DMA mapping was successful
+ *   %-EIO if DMA mapping failed
+ *
+ * On failure, any DMA mappings that have been already done must be
+ * unmapped by the caller.
+ */
+static int svc_rdma_xb_dma_map(const struct xdr_buf *xdr, void *data)
+{
+	unsigned int len, remaining;
+	unsigned long pageoff;
+	struct page **ppages;
+	int ret;
+
+	ret = svc_rdma_iov_dma_map(data, &xdr->head[0]);
+	if (ret < 0)
+		return ret;
+
+	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+	pageoff = offset_in_page(xdr->page_base);
+	remaining = xdr->page_len;
+	while (remaining) {
+		len = min_t(u32, PAGE_SIZE - pageoff, remaining);
+
+		ret = svc_rdma_page_dma_map(data, *ppages++, pageoff, len);
+		if (ret < 0)
+			return ret;
+
+		remaining -= len;
+		pageoff = 0;
+	}
+
+	ret = svc_rdma_iov_dma_map(data, &xdr->tail[0]);
+	if (ret < 0)
+		return ret;
+
+	return xdr->len;
+}
+
+struct svc_rdma_pullup_data {
+	u8		*pd_dest;
+	unsigned int	pd_length;
+	unsigned int	pd_num_sges;
+};
+
+/**
+ * svc_rdma_xb_count_sges - Count how many SGEs will be needed
+ * @xdr: xdr_buf containing portion of an RPC message to transmit
+ * @data: pointer to arguments
+ *
+ * Returns:
+ *   Number of SGEs needed to Send the contents of @xdr inline
+ */
+static int svc_rdma_xb_count_sges(const struct xdr_buf *xdr,
+				  void *data)
+{
+	struct svc_rdma_pullup_data *args = data;
+	unsigned int remaining;
+	unsigned long offset;
+
+	if (xdr->head[0].iov_len)
+		++args->pd_num_sges;
+
+	offset = offset_in_page(xdr->page_base);
+	remaining = xdr->page_len;
+	while (remaining) {
+		++args->pd_num_sges;
+		remaining -= min_t(u32, PAGE_SIZE - offset, remaining);
+		offset = 0;
+	}
+
+	if (xdr->tail[0].iov_len)
+		++args->pd_num_sges;
+
+	args->pd_length += xdr->len;
+	return 0;
 }
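svc_rdma_xb_count_sges() needs one SGE per discontiguous page segment, so a
page list that begins at a non-zero page offset can cost an extra SGE. A
standalone sketch that mirrors the same arithmetic, with assumed values and a
4KB page:

	/*
	 * Sketch, not part of the patch: mirrors the page-counting loop
	 * in svc_rdma_xb_count_sges(). All values are hypothetical.
	 */
	#include <stdio.h>

	#define EX_PAGE_SIZE 4096U	/* assumed page size */

	static unsigned int count_page_sges(unsigned long page_base,
					    unsigned int page_len)
	{
		unsigned int remaining = page_len;
		unsigned long offset = page_base % EX_PAGE_SIZE;
		unsigned int sges = 0;

		while (remaining) {
			unsigned int len = EX_PAGE_SIZE - offset;

			if (len > remaining)
				len = remaining;
			++sges;	/* one SGE per page segment */
			remaining -= len;
			offset = 0;
		}
		return sges;
	}

	int main(void)
	{
		/* 8000 bytes starting at page offset 3000: 3 pages */
		printf("SGEs: %u\n", count_page_sges(3000, 8000));
		/* the same 8000 bytes page-aligned: only 2 pages */
		printf("SGEs: %u\n", count_page_sges(0, 8000));
		return 0;
	}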
 
 /**
@@ -549,48 +648,71 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
  * @xdr: xdr_buf containing RPC message to transmit
  *
  * Returns:
- *   %true if pull-up must be used
- *   %false otherwise
+ *	%true if pull-up must be used
+ *	%false otherwise
  */
-static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
-				    struct svc_rdma_send_ctxt *sctxt,
+static bool svc_rdma_pull_up_needed(const struct svcxprt_rdma *rdma,
+				    const struct svc_rdma_send_ctxt *sctxt,
 				    const struct svc_rdma_recv_ctxt *rctxt,
-				    struct xdr_buf *xdr)
+				    const struct xdr_buf *xdr)
 {
-	int elements;
+	/* Resources needed for the transport header */
+	struct svc_rdma_pullup_data args = {
+		.pd_length	= sctxt->sc_hdrbuf.len,
+		.pd_num_sges	= 1,
+	};
+	int ret;
 
-	/* For small messages, copying bytes is cheaper than DMA mapping.
-	 */
-	if (sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH)
+	ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+				      svc_rdma_xb_count_sges, &args);
+	if (ret < 0)
+		return false;
+
+	if (args.pd_length < RPCRDMA_PULLUP_THRESH)
 		return true;
+	return args.pd_num_sges >= rdma->sc_max_send_sges;
+}
 
-	/* Check whether the xdr_buf has more elements than can
-	 * fit in a single RDMA Send.
-	 */
-	/* xdr->head */
-	elements = 1;
+/**
+ * svc_rdma_xb_linearize - Copy region of xdr_buf to flat buffer
+ * @xdr: xdr_buf containing portion of an RPC message to copy
+ * @data: pointer to arguments
+ *
+ * Returns:
+ *   Always zero.
+ */
+static int svc_rdma_xb_linearize(const struct xdr_buf *xdr,
+				 void *data)
+{
+	struct svc_rdma_pullup_data *args = data;
+	unsigned int len, remaining;
+	unsigned long pageoff;
+	struct page **ppages;
 
-	/* xdr->pages */
-	if (!rctxt || !rctxt->rc_write_list) {
-		unsigned int remaining;
-		unsigned long pageoff;
-
-		pageoff = xdr->page_base & ~PAGE_MASK;
-		remaining = xdr->page_len;
-		while (remaining) {
-			++elements;
-			remaining -= min_t(u32, PAGE_SIZE - pageoff,
-					   remaining);
-			pageoff = 0;
-		}
+	if (xdr->head[0].iov_len) {
+		memcpy(args->pd_dest, xdr->head[0].iov_base, xdr->head[0].iov_len);
+		args->pd_dest += xdr->head[0].iov_len;
 	}
 
-	/* xdr->tail */
-	if (xdr->tail[0].iov_len)
-		++elements;
+	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+	pageoff = offset_in_page(xdr->page_base);
+	remaining = xdr->page_len;
+	while (remaining) {
+		len = min_t(u32, PAGE_SIZE - pageoff, remaining);
+		memcpy(args->pd_dest, page_address(*ppages) + pageoff, len);
+		remaining -= len;
+		args->pd_dest += len;
+		pageoff = 0;
+		ppages++;
+	}
 
-	/* assume 1 SGE is needed for the transport header */
-	return elements >= rdma->sc_max_send_sges;
+	if (xdr->tail[0].iov_len) {
+		memcpy(args->pd_dest, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
+		args->pd_dest += xdr->tail[0].iov_len;
+	}
+
+	args->pd_length += xdr->len;
+	return 0;
 }
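Taken together, the two exit paths of the rewritten svc_rdma_pull_up_needed()
say: pull up when the message is small enough that copying beats DMA mapping,
or when the counted SGEs would exceed the device's Send SGE budget. A sketch
of just that decision, with stand-in values for RPCRDMA_PULLUP_THRESH and
sc_max_send_sges:

	/*
	 * Sketch, not part of the patch: models the two reasons
	 * svc_rdma_pull_up_needed() chooses pull-up. The threshold and
	 * SGE limit below are assumed example values.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	#define EX_PULLUP_THRESH	364U	/* assumed threshold */
	#define EX_MAX_SEND_SGES	4U	/* assumed device limit */

	static bool pull_up_needed(unsigned int total_len,
				   unsigned int num_sges)
	{
		/* Small message: copying is cheaper than DMA mapping. */
		if (total_len < EX_PULLUP_THRESH)
			return true;
		/* Too many segments for the device's Send SGE budget. */
		return num_sges >= EX_MAX_SEND_SGES;
	}

	int main(void)
	{
		printf("%d\n", pull_up_needed(128, 2));	/* 1: small */
		printf("%d\n", pull_up_needed(8192, 5));	/* 1: SGEs */
		printf("%d\n", pull_up_needed(8192, 3));	/* 0: direct */
		return 0;
	}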
 
 /**
@@ -603,54 +725,30 @@ static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
  * @xdr: prepared xdr_buf containing RPC message
  *
  * The device is not capable of sending the reply directly.
  * Assemble the elements of @xdr into the transport header buffer.
  *
- * Returns zero on success, or a negative errno on failure.
+ * Assumptions:
+ *   pull_up_needed has determined that @xdr will fit in the buffer.
+ *
+ * Returns:
+ *   %0 if pull-up was successful
+ *   %-EMSGSIZE if a buffer manipulation problem occurred
  */
-static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
+static int svc_rdma_pull_up_reply_msg(const struct svcxprt_rdma *rdma,
 				      struct svc_rdma_send_ctxt *sctxt,
 				      const struct svc_rdma_recv_ctxt *rctxt,
 				      const struct xdr_buf *xdr)
 {
-	unsigned char *dst, *tailbase;
-	unsigned int taillen;
+	struct svc_rdma_pullup_data args = {
+		.pd_dest = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len,
+	};
+	int ret;
 
-	dst = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len;
-	memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
-	dst += xdr->head[0].iov_len;
+	ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+				      svc_rdma_xb_linearize, &args);
+	if (ret < 0)
+		return ret;
 
-	tailbase = xdr->tail[0].iov_base;
-	taillen = xdr->tail[0].iov_len;
-	if (rctxt && rctxt->rc_write_list) {
-		u32 xdrpad;
-
-		xdrpad = xdr_pad_size(xdr->page_len);
-		if (taillen && xdrpad) {
-			tailbase += xdrpad;
-			taillen -= xdrpad;
-		}
-	} else {
-		unsigned int len, remaining;
-		unsigned long pageoff;
-		struct page **ppages;
-
-		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
-		pageoff = xdr->page_base & ~PAGE_MASK;
-		remaining = xdr->page_len;
-		while (remaining) {
-			len = min_t(u32, PAGE_SIZE - pageoff, remaining);
-
-			memcpy(dst, page_address(*ppages) + pageoff, len);
-			remaining -= len;
-			dst += len;
-			pageoff = 0;
-			ppages++;
-		}
-	}
-
-	if (taillen)
-		memcpy(dst, tailbase, taillen);
-
-	sctxt->sc_sges[0].length += xdr->len;
-	trace_svcrdma_send_pullup(sctxt->sc_sges[0].length);
+	sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len + args.pd_length;
+	trace_svcrdma_send_pullup(sctxt, args.pd_length);
 	return 0;
 }
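Pull-up linearizes the reply: head, page list, and tail are copied in order
into the part of the transport header buffer that follows the already-encoded
header. A simplified userspace model of that copy (the struct here is a
stand-in, not the kernel's xdr_buf, and the page list is modeled as one
contiguous region):

	/*
	 * Sketch, not part of the patch: linearizes an xdr_buf-like
	 * structure into one flat buffer, as svc_rdma_xb_linearize()
	 * does during pull-up.
	 */
	#include <stdio.h>
	#include <string.h>

	struct ex_buf {
		const char *head;  size_t head_len;
		const char *pages; size_t page_len;	/* contiguous model */
		const char *tail;  size_t tail_len;
	};

	static size_t linearize(const struct ex_buf *xb, char *dest)
	{
		char *dst = dest;

		memcpy(dst, xb->head, xb->head_len);   dst += xb->head_len;
		memcpy(dst, xb->pages, xb->page_len);  dst += xb->page_len;
		memcpy(dst, xb->tail, xb->tail_len);   dst += xb->tail_len;
		return dst - dest;
	}

	int main(void)
	{
		struct ex_buf xb = { "RPC", 3, "PAYLOAD", 7, "PAD", 3 };
		char flat[16];
		size_t n = linearize(&xb, flat);

		printf("%zu bytes: %.13s\n", n, flat);
		return 0;
	}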
 
@@ -660,22 +758,22 @@ static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
  * @rctxt: Write and Reply chunks provided by client
  * @xdr: prepared xdr_buf containing RPC message
  *
- * Load the xdr_buf into the ctxt's sge array, and DMA map each
- * element as it is added. The Send WR's num_sge field is set.
+ * Returns:
+ *   %0 if DMA mapping was successful.
+ *   %-EMSGSIZE if a buffer manipulation problem occurred
+ *   %-EIO if DMA mapping failed
  *
- * Returns zero on success, or a negative errno on failure.
+ * The Send WR's num_sge field is set in all cases.
  */
 int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 			   struct svc_rdma_send_ctxt *sctxt,
 			   const struct svc_rdma_recv_ctxt *rctxt,
-			   struct xdr_buf *xdr)
+			   const struct xdr_buf *xdr)
 {
-	unsigned int len, remaining;
-	unsigned long page_off;
-	struct page **ppages;
-	unsigned char *base;
-	u32 xdr_pad;
-	int ret;
+	struct svc_rdma_map_data args = {
+		.md_rdma	= rdma,
+		.md_ctxt	= sctxt,
+	};
 
 	/* Set up the (persistently-mapped) transport header SGE. */
 	sctxt->sc_send_wr.num_sge = 1;
@@ -684,7 +782,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 	/* If there is a Reply chunk, nothing follows the transport
 	 * header, and we're done here.
 	 */
-	if (rctxt && rctxt->rc_reply_chunk)
+	if (!pcl_is_empty(&rctxt->rc_reply_pcl))
 		return 0;
 
 	/* For pull-up, svc_rdma_send() will sync the transport header.
@@ -693,58 +791,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 	if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr))
 		return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr);
 
-	++sctxt->sc_cur_sge_no;
-	ret = svc_rdma_dma_map_buf(rdma, sctxt,
-				   xdr->head[0].iov_base,
-				   xdr->head[0].iov_len);
-	if (ret < 0)
-		return ret;
-
-	/* If a Write chunk is present, the xdr_buf's page list
-	 * is not included inline. However the Upper Layer may
-	 * have added XDR padding in the tail buffer, and that
-	 * should not be included inline.
-	 */
-	if (rctxt && rctxt->rc_write_list) {
-		base = xdr->tail[0].iov_base;
-		len = xdr->tail[0].iov_len;
-		xdr_pad = xdr_pad_size(xdr->page_len);
-
-		if (len && xdr_pad) {
-			base += xdr_pad;
-			len -= xdr_pad;
-		}
-
-		goto tail;
-	}
-
-	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
-	page_off = xdr->page_base & ~PAGE_MASK;
-	remaining = xdr->page_len;
-	while (remaining) {
-		len = min_t(u32, PAGE_SIZE - page_off, remaining);
-
-		++sctxt->sc_cur_sge_no;
-		ret = svc_rdma_dma_map_page(rdma, sctxt, *ppages++,
-					    page_off, len);
-		if (ret < 0)
-			return ret;
-
-		remaining -= len;
-		page_off = 0;
-	}
-
-	base = xdr->tail[0].iov_base;
-	len = xdr->tail[0].iov_len;
-tail:
-	if (len) {
-		++sctxt->sc_cur_sge_no;
-		ret = svc_rdma_dma_map_buf(rdma, sctxt, base, len);
-		if (ret < 0)
-			return ret;
-	}
-
-	return 0;
+	return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr,
+				       svc_rdma_xb_dma_map, &args);
 }
 
 /* The svc_rqst and all resources it owns are released as soon as
@@ -894,9 +942,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 		container_of(xprt, struct svcxprt_rdma, sc_xprt);
 	struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
 	__be32 *rdma_argp = rctxt->rc_recv_buf;
-	__be32 *wr_lst = rctxt->rc_write_list;
-	__be32 *rp_ch = rctxt->rc_reply_chunk;
-	struct xdr_buf *xdr = &rqstp->rq_res;
 	struct svc_rdma_send_ctxt *sctxt;
 	__be32 *p;
 	int ret;
@@ -914,45 +959,22 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 			 rpcrdma_fixed_maxsz * sizeof(*p));
 	if (!p)
 		goto err0;
+
+	ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
+	if (ret < 0)
+		goto err2;
+
 	*p++ = *rdma_argp;
 	*p++ = *(rdma_argp + 1);
 	*p++ = rdma->sc_fc_credits;
-	*p = rp_ch ? rdma_nomsg : rdma_msg;
+	*p = pcl_is_empty(&rctxt->rc_reply_pcl) ? rdma_msg : rdma_nomsg;
 
 	if (svc_rdma_encode_read_list(sctxt) < 0)
 		goto err0;
-	if (wr_lst) {
-		/* XXX: Presume the client sent only one Write chunk */
-		unsigned long offset;
-		unsigned int length;
-
-		if (rctxt->rc_read_payload_length) {
-			offset = rctxt->rc_read_payload_offset;
-			length = rctxt->rc_read_payload_length;
-		} else {
-			offset = xdr->head[0].iov_len;
-			length = xdr->page_len;
-		}
-		ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr, offset,
-						length);
-		if (ret < 0)
-			goto err2;
-		if (svc_rdma_encode_write_list(rctxt, sctxt, length) < 0)
-			goto err0;
-	} else {
-		if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
-			goto err0;
-	}
-	if (rp_ch) {
-		ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
-		if (ret < 0)
-			goto err2;
-		if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0)
-			goto err0;
-	} else {
-		if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
-			goto err0;
-	}
+	if (svc_rdma_encode_write_list(rctxt, sctxt) < 0)
+		goto err0;
+	if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0)
+		goto err0;
 
 	ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
 	if (ret < 0)
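Note the ordering change in svc_rdma_sendto(): the Reply chunk is now written
before the transport header is encoded, so ret carries the Reply chunk's
payload length into svc_rdma_encode_reply_chunk(). The four fixed header
words themselves are unchanged; a sketch of how they are laid out on the wire
(discriminator constants per RFC 8166; the XID and credit values are
hypothetical):

	/*
	 * Sketch, not part of the patch: builds the four fixed 32-bit
	 * words of an RPC-over-RDMA transport header, as the rewritten
	 * svc_rdma_sendto() does.
	 */
	#include <stdio.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	enum { EX_RDMA_MSG = 0, EX_RDMA_NOMSG = 1 };	/* RFC 8166 */

	int main(void)
	{
		uint32_t hdr[4];
		int reply_chunk_present = 1;

		hdr[0] = htonl(0x01020304);	/* XID copied from Call */
		hdr[1] = htonl(1);		/* RPC-over-RDMA version */
		hdr[2] = htonl(64);		/* credits granted */
		/* With a Reply chunk the message body moves via RDMA
		 * Write, so the header says rdma_nomsg. */
		hdr[3] = htonl(reply_chunk_present ?
			       EX_RDMA_NOMSG : EX_RDMA_MSG);

		printf("proc = %u\n", ntohl(hdr[3]));
		return 0;
	}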
@@ -979,28 +1001,46 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 }
 
 /**
- * svc_rdma_read_payload - special processing for a READ payload
+ * svc_rdma_result_payload - special processing for a result payload
  * @rqstp: svc_rqst to operate on
  * @offset: payload's byte offset in @xdr
  * @length: size of payload, in bytes
  *
- * Returns zero on success.
- *
- * For the moment, just record the xdr_buf location of the READ
- * payload. svc_rdma_sendto will use that location later when
- * we actually send the payload.
+ * Return values:
+ *   %0 if successful or nothing needed to be done
+ *   %-EMSGSIZE on XDR buffer overflow
+ *   %-E2BIG if the payload was larger than the Write chunk
+ *   %-EINVAL if client provided too many segments
+ *   %-ENOMEM if rdma_rw context pool was exhausted
+ *   %-ENOTCONN if posting failed (connection is lost)
+ *   %-EIO if rdma_rw initialization failed (DMA mapping, etc)
  */
-int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset,
-			  unsigned int length)
+int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset,
+			    unsigned int length)
 {
 	struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
+	struct svc_rdma_chunk *chunk;
+	struct svcxprt_rdma *rdma;
+	struct xdr_buf subbuf;
+	int ret;
 
-	/* XXX: Just one READ payload slot for now, since our
-	 * transport implementation currently supports only one
-	 * Write chunk.
-	 */
-	rctxt->rc_read_payload_offset = offset;
-	rctxt->rc_read_payload_length = length;
+	chunk = rctxt->rc_cur_result_payload;
+	if (!length || !chunk)
+		return 0;
+	rctxt->rc_cur_result_payload =
+		pcl_next_chunk(&rctxt->rc_write_pcl, chunk);
+	if (length > chunk->ch_length)
+		return -E2BIG;
+
+	chunk->ch_position = offset;
+	chunk->ch_payload_length = length;
+
+	if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length))
+		return -EMSGSIZE;
+
+	rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
+	ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf);
+	if (ret < 0)
+		return ret;
 	return 0;
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index fb044792b571..afba4e9d5425 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -80,7 +80,7 @@ static const struct svc_xprt_ops svc_rdma_ops = {
 	.xpo_create = svc_rdma_create,
 	.xpo_recvfrom = svc_rdma_recvfrom,
 	.xpo_sendto = svc_rdma_sendto,
-	.xpo_read_payload = svc_rdma_read_payload,
+	.xpo_result_payload = svc_rdma_result_payload,
 	.xpo_release_rqst = svc_rdma_release_rqst,
 	.xpo_detach = svc_rdma_detach,
 	.xpo_free = svc_rdma_free,
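With xpo_result_payload in place, each result payload consumes the next entry
in the parsed Write chunk list; when the client offered fewer chunks than the
reply has payloads, the remainder is sent inline. A standalone model of that
one-chunk-per-payload bookkeeping (the list type is a simplified stand-in for
the kernel's svc_rdma_pcl/svc_rdma_chunk structures):

	/*
	 * Sketch, not part of the patch: models how
	 * svc_rdma_result_payload() advances through the Write chunk
	 * list, one chunk per payload, and rejects oversized payloads.
	 */
	#include <stdio.h>
	#include <stddef.h>

	struct ex_chunk {
		unsigned int ch_length;	/* capacity offered by client */
		struct ex_chunk *next;
	};

	/* Return the chunk to use, or NULL when none remain. */
	static struct ex_chunk *next_result_chunk(struct ex_chunk **cur)
	{
		struct ex_chunk *chunk = *cur;

		if (chunk)
			*cur = chunk->next;	/* advance for next payload */
		return chunk;
	}

	int main(void)
	{
		struct ex_chunk c2 = { 8192, NULL }, c1 = { 4096, &c2 };
		struct ex_chunk *cur = &c1;
		unsigned int payloads[] = { 1024, 6000, 100 };

		for (size_t i = 0; i < 3; i++) {
			struct ex_chunk *chunk = next_result_chunk(&cur);

			if (!chunk) {
				printf("payload %zu: sent inline\n", i);
				continue;
			}
			printf("payload %zu: %s\n", i,
			       payloads[i] > chunk->ch_length ?
			       "-E2BIG" : "RDMA Write");
		}
		return 0;
	}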