// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright (c) 2016-2018 Oracle. All rights reserved.
 * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 *      Redistribution and use in source and binary forms, with or without
 *      modification, are permitted provided that the following conditions
 *      are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Tom Tucker <tom@opengridcomputing.com>
 */

/* Operation
 *
 * The main entry point is svc_rdma_sendto. This is called by the
 * RPC server when an RPC Reply is ready to be transmitted to a client.
 *
 * The passed-in svc_rqst contains a struct xdr_buf which holds an
 * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
 * transport header, post all Write WRs needed for this Reply, then post
 * a Send WR conveying the transport header and the RPC message itself to
 * the client.
 *
 * svc_rdma_sendto must fully transmit the Reply before returning, as
 * the svc_rqst will be recycled as soon as sendto returns. Remaining
 * resources referred to by the svc_rqst are also recycled at that time.
 * Therefore any resources that must remain longer must be detached
 * from the svc_rqst and released later.
 *
 * Page Management
 *
 * The I/O that performs Reply transmission is asynchronous, and may
 * complete well after sendto returns. Thus pages under I/O must be
 * removed from the svc_rqst before sendto returns.
 *
 * The logic here depends on Send Queue and completion ordering. Since
 * the Send WR is always posted last, it will always complete last. Thus
 * when it completes, it is guaranteed that all previous Write WRs have
 * also completed.
 *
 * Write WRs are constructed and posted. Each Write segment gets its own
 * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
 * DMA-unmap the pages under I/O for that Write segment. The Write
 * completion handler does not release any pages.
 *
 * When the Send WR is constructed, it also gets its own svc_rdma_send_ctxt.
 * The ownership of all of the Reply's pages is transferred into that
 * ctxt, the Send WR is posted, and sendto returns.
 *
 * The svc_rdma_send_ctxt is presented when the Send WR completes. The
 * Send completion handler finally releases the Reply's pages.
 *
 * This mechanism also assumes that completions on the transport's Send
 * Completion Queue do not run in parallel. Otherwise a Write completion
 * and Send completion running at the same time could release pages that
 * are still DMA-mapped.
 *
 * Error Handling
 *
 * - If the Send WR is posted successfully, it will either complete
 *   successfully, or get flushed. Either way, the Send completion
 *   handler releases the Reply's pages.
 * - If the Send WR cannot be posted, the forward path releases
 *   the Reply's pages.
 *
 * This handles the case, without the use of page reference counting,
 * where two different Write segments send portions of the same page.
 */
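
/* To make the ordering dependency above concrete, the posting sequence
 * that svc_rdma_sendto() drives for one Reply is:
 *
 *      svc_rdma_send_write_chunk()     zero or more Write WRs
 *      svc_rdma_send_reply_chunk()     zero or more Write WRs
 *      svc_rdma_post_send_wr()         one Send WR, always posted last
 *
 * Because all of these WRs go onto the same Send Queue, the Send WR's
 * completion implies that every earlier Write WR has also completed.
 */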

#include <linux/spinlock.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>

#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

#define RPCDBG_FACILITY		RPCDBG_SVCXPRT

static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc);

static inline struct svc_rdma_send_ctxt *
svc_rdma_next_send_ctxt(struct list_head *list)
{
        return list_first_entry_or_null(list, struct svc_rdma_send_ctxt,
                                        sc_list);
}
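
/* Allocate a Send completion context. The size of the sc_sges[] array
 * depends on the device's Send SGE limit, which is not known until the
 * transport is set up, so the ib_sge entries are allocated together
 * with the context itself, immediately after its fixed-size fields.
 */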
static struct svc_rdma_send_ctxt *
svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
{
        struct svc_rdma_send_ctxt *ctxt;
        size_t size;
        int i;

        size = sizeof(*ctxt);
        size += rdma->sc_max_send_sges * sizeof(struct ib_sge);
        ctxt = kmalloc(size, GFP_KERNEL);
        if (!ctxt)
                return NULL;

        ctxt->sc_cqe.done = svc_rdma_wc_send;
        ctxt->sc_send_wr.next = NULL;
        ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe;
        ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
        ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
        for (i = 0; i < rdma->sc_max_send_sges; i++)
                ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey;
        return ctxt;
}

/**
 * svc_rdma_send_ctxts_destroy - Release all send_ctxt's for an xprt
 * @rdma: svcxprt_rdma being torn down
 *
 */
void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma)
{
        struct svc_rdma_send_ctxt *ctxt;

        while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) {
                list_del(&ctxt->sc_list);
                kfree(ctxt);
        }
}

/**
 * svc_rdma_send_ctxt_get - Get a free send_ctxt
 * @rdma: controlling svcxprt_rdma
 *
 * Returns a ready-to-use send_ctxt, or NULL if none are
 * available and a fresh one cannot be allocated.
 */
struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
{
        struct svc_rdma_send_ctxt *ctxt;

        spin_lock(&rdma->sc_send_lock);
        ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts);
        if (!ctxt)
                goto out_empty;
        list_del(&ctxt->sc_list);
        spin_unlock(&rdma->sc_send_lock);

out:
        ctxt->sc_send_wr.num_sge = 0;
        ctxt->sc_page_count = 0;
        return ctxt;

out_empty:
        spin_unlock(&rdma->sc_send_lock);
        ctxt = svc_rdma_send_ctxt_alloc(rdma);
        if (!ctxt)
                return NULL;
        goto out;
}

/**
 * svc_rdma_send_ctxt_put - Return send_ctxt to free list
 * @rdma: controlling svcxprt_rdma
 * @ctxt: object to return to the free list
 *
 * Pages left in sc_pages are DMA unmapped and released.
 */
void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
                            struct svc_rdma_send_ctxt *ctxt)
{
        struct ib_device *device = rdma->sc_cm_id->device;
        unsigned int i;

        for (i = 0; i < ctxt->sc_send_wr.num_sge; i++)
                ib_dma_unmap_page(device,
                                  ctxt->sc_sges[i].addr,
                                  ctxt->sc_sges[i].length,
                                  DMA_TO_DEVICE);
        for (i = 0; i < ctxt->sc_page_count; ++i)
                put_page(ctxt->sc_pages[i]);

        spin_lock(&rdma->sc_send_lock);
        list_add(&ctxt->sc_list, &rdma->sc_send_ctxts);
        spin_unlock(&rdma->sc_send_lock);
}
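
/* A minimal sketch of how the send_ctxt API above is used by a sender
 * (svc_rdma_send_error_msg() below follows this shape):
 *
 *      ctxt = svc_rdma_send_ctxt_get(rdma);
 *      if (!ctxt)
 *              return -ENOMEM;
 *      ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, len);
 *      if (ret < 0)
 *              goto err;
 *      ret = svc_rdma_post_send_wr(rdma, ctxt, 0);
 *      if (ret)
 *              goto err;
 *      return 0;
 * err:
 *      svc_rdma_send_ctxt_put(rdma, ctxt);
 *      return ret;
 *
 * Note that after a successful post the caller does not put the ctxt;
 * the Send completion handler returns it to the free list.
 */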

/**
 * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC
 * @cq: Completion Queue context
 * @wc: Work Completion object
 *
 * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that
 * the Send completion handler could be running.
 */
static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
        struct svcxprt_rdma *rdma = cq->cq_context;
        struct ib_cqe *cqe = wc->wr_cqe;
        struct svc_rdma_send_ctxt *ctxt;

        trace_svcrdma_wc_send(wc);

        atomic_inc(&rdma->sc_sq_avail);
        wake_up(&rdma->sc_send_wait);

        ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
        svc_rdma_send_ctxt_put(rdma, ctxt);

        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
                svc_xprt_enqueue(&rdma->sc_xprt);
                if (wc->status != IB_WC_WR_FLUSH_ERR)
                        pr_err("svcrdma: Send: %s (%u/0x%x)\n",
                               ib_wc_status_msg(wc->status),
                               wc->status, wc->vendor_err);
        }

        svc_xprt_put(&rdma->sc_xprt);
}
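
/* Post a chain of Work Requests on this transport's Send Queue.
 *
 * Space for every WR in the chain is reserved against sc_sq_avail
 * before posting. If the Send Queue is currently full, the caller
 * sleeps until Send completions have freed enough space, or until
 * the transport has been marked closed.
 *
 * Returns zero on success, or a negative errno if the post failed.
 */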
int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)
{
        struct ib_send_wr *bad_wr, *n_wr;
        int wr_count;
        int i;
        int ret;

        wr_count = 1;
        for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
                wr_count++;

        /* If the SQ is full, wait until an SQ entry is available */
        while (1) {
                if ((atomic_sub_return(wr_count, &rdma->sc_sq_avail) < 0)) {
                        atomic_inc(&rdma_stat_sq_starve);
                        trace_svcrdma_sq_full(rdma);
                        atomic_add(wr_count, &rdma->sc_sq_avail);
                        wait_event(rdma->sc_send_wait,
                                   atomic_read(&rdma->sc_sq_avail) > wr_count);
                        if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
                                return -ENOTCONN;
                        trace_svcrdma_sq_retry(rdma);
                        continue;
                }
                /* Take a transport ref for each WR posted */
                for (i = 0; i < wr_count; i++)
                        svc_xprt_get(&rdma->sc_xprt);

                /* Bump used SQ WR count and post */
                ret = ib_post_send(rdma->sc_qp, wr, &bad_wr);
                trace_svcrdma_post_send(wr, ret);
                if (ret) {
                        set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
                        for (i = 0; i < wr_count; i++)
                                svc_xprt_put(&rdma->sc_xprt);
                        wake_up(&rdma->sc_send_wait);
                }
                break;
        }
        return ret;
}

static u32 xdr_padsize(u32 len)
{
        return (len & 3) ? (4 - (len & 3)) : 0;
}
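
/* For reference, an RPC-over-RDMA version 1 Reply transport header has
 * this on-the-wire layout, which svc_rdma_reply_hdr_len() below walks:
 *
 *      xid, vers, credits, proc        four fixed XDR words
 *      Read list                       always empty in a Reply
 *      Write list                      zero or more Write chunks
 *      Reply chunk                     present or absent
 */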

/* Returns length of transport header, in bytes.
 */
static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
{
        unsigned int nsegs;
        __be32 *p;

        p = rdma_resp;

        /* RPC-over-RDMA V1 replies never have a Read list. */
        p += rpcrdma_fixed_maxsz + 1;

        /* Skip Write list. */
        while (*p++ != xdr_zero) {
                nsegs = be32_to_cpup(p++);
                p += nsegs * rpcrdma_segment_maxsz;
        }

        /* Skip Reply chunk. */
        if (*p++ != xdr_zero) {
                nsegs = be32_to_cpup(p++);
                p += nsegs * rpcrdma_segment_maxsz;
        }

        return (unsigned long)p - (unsigned long)rdma_resp;
}

/* One Write chunk is copied from Call transport header to Reply
 * transport header. Each segment's length field is updated to
 * reflect the number of bytes consumed in the segment.
 *
 * Returns number of segments in this chunk.
 */
static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src,
                                           unsigned int remaining)
{
        unsigned int i, nsegs;
        u32 seg_len;

        /* Write list discriminator */
        *dst++ = *src++;

        /* number of segments in this chunk */
        nsegs = be32_to_cpup(src);
        *dst++ = *src++;

        for (i = nsegs; i; i--) {
                /* segment's RDMA handle */
                *dst++ = *src++;

                /* bytes returned in this segment */
                seg_len = be32_to_cpu(*src);
                if (remaining >= seg_len) {
                        /* entire segment was consumed */
                        *dst = *src;
                        remaining -= seg_len;
                } else {
                        /* segment only partly filled */
                        *dst = cpu_to_be32(remaining);
                        remaining = 0;
                }
                dst++; src++;

                /* segment's RDMA offset */
                *dst++ = *src++;
                *dst++ = *src++;
        }

        return nsegs;
}
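
/* A worked example (illustrative values only): suppose the Call's Write
 * chunk has two 4096-byte segments and the Reply consumed 5000 bytes of
 * that chunk. The first segment's length is copied through unchanged,
 * since all 4096 bytes were consumed, while the second is rewritten to
 * 904, the number of bytes actually written into it.
 */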

/* The client provided a Write list in the Call message. Fill in
 * the segments in the first Write chunk in the Reply's transport
 * header with the number of bytes consumed in each segment.
 * Remaining chunks are returned unused.
 *
 * Assumptions:
 *  - Client has provided only one Write chunk
 */
static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch,
                                           unsigned int consumed)
{
        unsigned int nsegs;
        __be32 *p, *q;

        /* RPC-over-RDMA V1 replies never have a Read list. */
        p = rdma_resp + rpcrdma_fixed_maxsz + 1;

        q = wr_ch;
        while (*q != xdr_zero) {
                nsegs = xdr_encode_write_chunk(p, q, consumed);
                q += 2 + nsegs * rpcrdma_segment_maxsz;
                p += 2 + nsegs * rpcrdma_segment_maxsz;
                consumed = 0;
        }

        /* Terminate Write list */
        *p++ = xdr_zero;

        /* Reply chunk discriminator; may be replaced later */
        *p = xdr_zero;
}

/* The client provided a Reply chunk in the Call message. Fill in
 * the segments in the Reply chunk in the Reply message with the
 * number of bytes consumed in each segment.
 *
 * Assumptions:
 * - Reply can always fit in the provided Reply chunk
 */
static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch,
                                            unsigned int consumed)
{
        __be32 *p;

        /* Find the Reply chunk in the Reply's xprt header.
         * RPC-over-RDMA V1 replies never have a Read list.
         */
        p = rdma_resp + rpcrdma_fixed_maxsz + 1;

        /* Skip past Write list */
        while (*p++ != xdr_zero)
                p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;

        xdr_encode_write_chunk(p, rp_ch, consumed);
}

/* Parse the RPC Call's transport header.
 */
static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
                                      __be32 **write, __be32 **reply)
{
        __be32 *p;

        p = rdma_argp + rpcrdma_fixed_maxsz;

        /* Read list */
        while (*p++ != xdr_zero)
                p += 5;

        /* Write list */
        if (*p != xdr_zero) {
                *write = p;
                while (*p++ != xdr_zero)
                        p += 1 + be32_to_cpu(*p) * 4;
        } else {
                *write = NULL;
                p++;
        }

        /* Reply chunk */
        if (*p != xdr_zero)
                *reply = p;
        else
                *reply = NULL;
}

/* RPC-over-RDMA Version One private extension: Remote Invalidation.
 * Responder's choice: requester signals it can handle Send With
 * Invalidate, and responder chooses one rkey to invalidate.
 *
 * Find a candidate rkey to invalidate when sending a reply. Picks the
 * first R_key it finds in the chunk lists.
 *
 * Returns zero if RPC's chunk lists are empty.
 */
static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp,
                                 __be32 *wr_lst, __be32 *rp_ch)
{
        __be32 *p;

        p = rdma_argp + rpcrdma_fixed_maxsz;
        if (*p != xdr_zero)
                p += 2;
        else if (wr_lst && be32_to_cpup(wr_lst + 1))
                p = wr_lst + 2;
        else if (rp_ch && be32_to_cpup(rp_ch + 1))
                p = rp_ch + 2;
        else
                return 0;
        return be32_to_cpup(p);
}
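
/* DMA map part of one page for the Send, and record the mapping in the
 * next free SGE in @ctxt. On success the Send WR's num_sge is bumped;
 * on a mapping failure no SGE is consumed and -EIO is returned, so the
 * caller's error path can unwind the SGEs that were already mapped via
 * svc_rdma_send_ctxt_put().
 */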
static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
                                 struct svc_rdma_send_ctxt *ctxt,
                                 struct page *page,
                                 unsigned long offset,
                                 unsigned int len)
{
        struct ib_device *dev = rdma->sc_cm_id->device;
        dma_addr_t dma_addr;

        dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
        if (ib_dma_mapping_error(dev, dma_addr))
                goto out_maperr;

        ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr;
        ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len;
        ctxt->sc_send_wr.num_sge++;
        return 0;

out_maperr:
        trace_svcrdma_dma_map_page(rdma, page);
        return -EIO;
}

/* ib_dma_map_page() is used here because svc_rdma_dma_unmap()
 * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively.
 */
static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
                                struct svc_rdma_send_ctxt *ctxt,
                                unsigned char *base,
                                unsigned int len)
{
        return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base),
                                     offset_in_page(base), len);
}

/**
 * svc_rdma_map_reply_hdr - DMA map the transport header buffer
 * @rdma: controlling transport
 * @ctxt: op_ctxt for the Send WR
 * @rdma_resp: buffer containing transport header
 * @len: length of transport header
 *
 * Returns:
 *      %0 if the header is DMA mapped,
 *      %-EIO if DMA mapping failed.
 */
int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma,
                           struct svc_rdma_send_ctxt *ctxt,
                           __be32 *rdma_resp,
                           unsigned int len)
{
        ctxt->sc_pages[0] = virt_to_page(rdma_resp);
        ctxt->sc_page_count++;
        ctxt->sc_cur_sge_no = 0;
        return svc_rdma_dma_map_page(rdma, ctxt, ctxt->sc_pages[0], 0, len);
}

/* Load the xdr_buf into the ctxt's sge array, and DMA map each
 * element as it is added.
 *
 * Returns zero on success, or a negative errno on failure.
 */
static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
                                  struct svc_rdma_send_ctxt *ctxt,
                                  struct xdr_buf *xdr, __be32 *wr_lst)
{
        unsigned int len, remaining;
        unsigned long page_off;
        struct page **ppages;
        unsigned char *base;
        u32 xdr_pad;
        int ret;

        if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
                return -EIO;
        ret = svc_rdma_dma_map_buf(rdma, ctxt,
                                   xdr->head[0].iov_base,
                                   xdr->head[0].iov_len);
        if (ret < 0)
                return ret;

        /* If a Write chunk is present, the xdr_buf's page list
         * is not included inline. However the Upper Layer may
         * have added XDR padding in the tail buffer, and that
         * should not be included inline.
         */
        if (wr_lst) {
                base = xdr->tail[0].iov_base;
                len = xdr->tail[0].iov_len;
                xdr_pad = xdr_padsize(xdr->page_len);

                if (len && xdr_pad) {
                        base += xdr_pad;
                        len -= xdr_pad;
                }

                goto tail;
        }

        ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
        page_off = xdr->page_base & ~PAGE_MASK;
        remaining = xdr->page_len;
        while (remaining) {
                len = min_t(u32, PAGE_SIZE - page_off, remaining);

                if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
                        return -EIO;
                ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++,
                                            page_off, len);
                if (ret < 0)
                        return ret;

                remaining -= len;
                page_off = 0;
        }

        base = xdr->tail[0].iov_base;
        len = xdr->tail[0].iov_len;
tail:
        if (len) {
                if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
                        return -EIO;
                ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len);
                if (ret < 0)
                        return ret;
        }

        return 0;
}

/* The svc_rqst and all resources it owns are released as soon as
 * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
 * so they are released by the Send completion handler.
 */
static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
                                   struct svc_rdma_send_ctxt *ctxt)
{
        int i, pages = rqstp->rq_next_page - rqstp->rq_respages;

        ctxt->sc_page_count += pages;
        for (i = 0; i < pages; i++) {
                ctxt->sc_pages[i + 1] = rqstp->rq_respages[i];
                rqstp->rq_respages[i] = NULL;
        }
        rqstp->rq_next_page = rqstp->rq_respages + 1;
}

/**
 * svc_rdma_post_send_wr - Set up and post one Send Work Request
 * @rdma: controlling transport
 * @ctxt: op_ctxt for transmitting the Send WR
 * @inv_rkey: R_key argument to Send With Invalidate, or zero
 *
 * Returns:
 *      %0 if the Send* was posted successfully,
 *      %-ENOTCONN if the connection was lost or dropped,
 *      %-EINVAL if there was a problem with the Send we built,
 *      %-ENOMEM if ib_post_send failed.
 */
int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma,
                          struct svc_rdma_send_ctxt *ctxt,
                          u32 inv_rkey)
{
        dprintk("svcrdma: posting Send WR with %u sge(s)\n",
                ctxt->sc_send_wr.num_sge);

        if (inv_rkey) {
                ctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
                ctxt->sc_send_wr.ex.invalidate_rkey = inv_rkey;
        } else {
                ctxt->sc_send_wr.opcode = IB_WR_SEND;
        }

        return svc_rdma_send(rdma, &ctxt->sc_send_wr);
}

/* Prepare the portion of the RPC Reply that will be transmitted
 * via RDMA Send. The RPC-over-RDMA transport header is prepared
 * in sc_sges[0], and the RPC xdr_buf is prepared in following sges.
 *
 * Depending on whether a Write list or Reply chunk is present,
 * the server may send all, a portion of, or none of the xdr_buf.
 * In the latter case, only the transport header (sc_sges[0]) is
 * transmitted.
 *
 * RDMA Send is the last step of transmitting an RPC reply. Pages
 * involved in the earlier RDMA Writes are here transferred out
 * of the rqstp and into the ctxt's page array. These pages are
 * DMA unmapped by each Write completion, but the subsequent Send
 * completion finally releases these pages.
 *
 * Assumptions:
 * - The Reply's transport header will never be larger than a page.
 */
static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
                                   __be32 *rdma_argp, __be32 *rdma_resp,
                                   struct svc_rqst *rqstp,
                                   __be32 *wr_lst, __be32 *rp_ch)
{
        struct svc_rdma_send_ctxt *ctxt;
        u32 inv_rkey;
        int ret;

        ctxt = svc_rdma_send_ctxt_get(rdma);
        if (!ctxt)
                return -ENOMEM;

        ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp,
                                     svc_rdma_reply_hdr_len(rdma_resp));
        if (ret < 0)
                goto err;

        if (!rp_ch) {
                ret = svc_rdma_map_reply_msg(rdma, ctxt,
                                             &rqstp->rq_res, wr_lst);
                if (ret < 0)
                        goto err;
        }

        svc_rdma_save_io_pages(rqstp, ctxt);

        inv_rkey = 0;
        if (rdma->sc_snd_w_inv)
                inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch);
        ret = svc_rdma_post_send_wr(rdma, ctxt, inv_rkey);
        if (ret)
                goto err;

        return 0;

err:
        svc_rdma_send_ctxt_put(rdma, ctxt);
        return ret;
}

/* Given the client-provided Write and Reply chunks, the server was not
 * able to form a complete reply. Return an RDMA_ERROR message so the
 * client can retire this RPC transaction. As above, the Send completion
 * routine releases payload pages that were part of a previous RDMA Write.
 *
 * Remote Invalidation is skipped for simplicity.
 */
static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
                                   __be32 *rdma_resp, struct svc_rqst *rqstp)
{
        struct svc_rdma_send_ctxt *ctxt;
        __be32 *p;
        int ret;

        ctxt = svc_rdma_send_ctxt_get(rdma);
        if (!ctxt)
                return -ENOMEM;

        /* Replace the original transport header with an
         * RDMA_ERROR response. XID etc are preserved.
         */
        trace_svcrdma_err_chunk(*rdma_resp);
        p = rdma_resp + 3;
        *p++ = rdma_error;
        *p = err_chunk;
        ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20);
        if (ret < 0)
                goto err;

        svc_rdma_save_io_pages(rqstp, ctxt);

        ret = svc_rdma_post_send_wr(rdma, ctxt, 0);
        if (ret)
                goto err;

        return 0;

err:
        svc_rdma_send_ctxt_put(rdma, ctxt);
        return ret;
}
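
/* This callback intentionally does nothing: the RPC-over-RDMA transport
 * header is constructed separately in svc_rdma_sendto(), so there is
 * nothing to reserve or prepend in rq_res here.
 */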
void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
{
}

/**
 * svc_rdma_sendto - Transmit an RPC reply
 * @rqstp: processed RPC request, reply XDR already in ::rq_res
 *
 * Any resources still associated with @rqstp are released upon return.
 * If no reply message was possible, the connection is closed.
 *
 * Returns:
 *      %0 if an RPC reply has been successfully posted,
 *      %-ENOMEM if a resource shortage occurred (connection is lost),
 *      %-ENOTCONN if posting failed (connection is lost).
 */
int svc_rdma_sendto(struct svc_rqst *rqstp)
{
        struct svc_xprt *xprt = rqstp->rq_xprt;
        struct svcxprt_rdma *rdma =
                container_of(xprt, struct svcxprt_rdma, sc_xprt);
        struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
        __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch;
        struct xdr_buf *xdr = &rqstp->rq_res;
        struct page *res_page;
        int ret;

        rdma_argp = rctxt->rc_recv_buf;
        svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch);

        /* Create the RDMA response header. xprt->xpt_mutex,
         * acquired in svc_send(), serializes RPC replies. The
         * code path below that inserts the credit grant value
         * into each transport header runs only inside this
         * critical section.
         */
        ret = -ENOMEM;
        res_page = alloc_page(GFP_KERNEL);
        if (!res_page)
                goto err0;
        rdma_resp = page_address(res_page);

        p = rdma_resp;
        *p++ = *rdma_argp;
        *p++ = *(rdma_argp + 1);
        *p++ = rdma->sc_fc_credits;
        *p++ = rp_ch ? rdma_nomsg : rdma_msg;

        /* Start with empty chunks */
        *p++ = xdr_zero;
        *p++ = xdr_zero;
        *p = xdr_zero;

        if (wr_lst) {
                /* XXX: Presume the client sent only one Write chunk */
                ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr);
                if (ret < 0)
                        goto err2;
                svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
        }
        if (rp_ch) {
                ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr);
                if (ret < 0)
                        goto err2;
                svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
        }

        ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp,
                                      wr_lst, rp_ch);
        if (ret < 0)
                goto err0;
        ret = 0;

out:
        rqstp->rq_xprt_ctxt = NULL;
        svc_rdma_recv_ctxt_put(rdma, rctxt);
        return ret;

err2:
        if (ret != -E2BIG && ret != -EINVAL)
                goto err1;

        ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp);
        if (ret < 0)
                goto err0;
        ret = 0;
        goto out;

err1:
        put_page(res_page);
err0:
        trace_svcrdma_send_failed(rqstp, ret);
        set_bit(XPT_CLOSE, &xprt->xpt_flags);
        ret = -ENOTCONN;
        goto out;
}