2006-05-11 10:00:44 +03:00
/*
* iSCSI Initiator over iSER Data - Path
*
* Copyright ( C ) 2004 Dmitry Yusupov
* Copyright ( C ) 2004 Alex Aizman
* Copyright ( C ) 2005 Mike Christie
* Copyright ( c ) 2005 , 2006 Voltaire , Inc . All rights reserved .
* maintained by openib - general @ openib . org
*
* This software is available to you under a choice of one of two
* licenses . You may choose to be licensed under the terms of the GNU
* General Public License ( GPL ) Version 2 , available from the file
* COPYING in the main directory of this source tree , or the
* OpenIB . org BSD license below :
*
* Redistribution and use in source and binary forms , with or
* without modification , are permitted provided that the following
* conditions are met :
*
* - Redistributions of source code must retain the above
* copyright notice , this list of conditions and the following
* disclaimer .
*
* - Redistributions in binary form must reproduce the above
* copyright notice , this list of conditions and the following
* disclaimer in the documentation and / or other materials
* provided with the distribution .
*
* THE SOFTWARE IS PROVIDED " AS IS " , WITHOUT WARRANTY OF ANY KIND ,
* EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY , WHETHER IN AN
* ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING FROM , OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE .
*
* Credits :
* Christoph Hellwig
* FUJITA Tomonori
* Arne Redlich
* Zhenyu Wang
* Modified by :
* Erez Zilber
*/
# include <linux/types.h>
# include <linux/list.h>
# include <linux/hardirq.h>
# include <linux/kfifo.h>
# include <linux/blkdev.h>
# include <linux/init.h>
# include <linux/ioctl.h>
# include <linux/cdev.h>
# include <linux/in.h>
# include <linux/net.h>
# include <linux/scatterlist.h>
# include <linux/delay.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
The percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities to include those
headers directly instead of assuming availability. As this conversion
needs to touch a large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surroundings. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 17:04:11 +09:00
# include <linux/slab.h>
2011-05-27 15:35:46 -04:00
# include <linux/module.h>
2006-05-11 10:00:44 +03:00
# include <net/sock.h>
# include <asm/uaccess.h>
# include <scsi/scsi_cmnd.h>
# include <scsi/scsi_device.h>
# include <scsi/scsi_eh.h>
# include <scsi/scsi_tcq.h>
# include <scsi/scsi_host.h>
# include <scsi/scsi.h>
# include <scsi/scsi_transport_iscsi.h>
# include "iscsi_iser.h"
2008-05-21 15:53:59 -05:00
/*
 * Forward declarations of the host template, the iSCSI transport and the
 * SCSI transport template; all three are referenced by
 * iscsi_iser_session_create() in this file.
 */
static struct scsi_host_template iscsi_iser_sht ;
static struct iscsi_transport iscsi_iser_transport ;
static struct scsi_transport_template * iscsi_iser_scsi_transport ;
2006-05-11 10:00:44 +03:00
/*
 * Value copied into shost->max_lun when a session host is created.
 * Exposed as the "max_lun" module parameter with S_IRUGO permissions.
 */
static unsigned int iscsi_max_lun = 512 ;
module_param_named ( max_lun , iscsi_max_lun , uint , S_IRUGO ) ;
/* Debug level, exposed below as the "debug_level" module parameter. */
int iser_debug_level = 0 ;
2013-05-01 13:25:24 +00:00
MODULE_DESCRIPTION ( " iSER (iSCSI Extensions for RDMA) Datamover " ) ;
2006-05-11 10:00:44 +03:00
MODULE_LICENSE ( " Dual BSD/GPL " ) ;
MODULE_AUTHOR ( " Alex Nezhinsky, Dan Bar Dov, Or Gerlitz " ) ;
2013-05-01 13:25:24 +00:00
MODULE_VERSION ( DRV_VER ) ;
2006-05-11 10:00:44 +03:00
/* "debug_level" is registered with mode 0644; tracing is enabled if > 0. */
module_param_named ( debug_level , iser_debug_level , int , 0644 ) ;
MODULE_PARM_DESC ( debug_level , " Enable debug tracing if > 0 (default:disabled) " ) ;
/* NOTE(review): presumably the driver-wide global state; not used in the code visible here. */
struct iser_global ig ;
void
iscsi_iser_recv ( struct iscsi_conn * conn ,
struct iscsi_hdr * hdr , char * rx_data , int rx_data_len )
{
int rc = 0 ;
int datalen ;
int ahslen ;
/* verify PDU length */
datalen = ntoh24 ( hdr - > dlength ) ;
2011-08-01 21:14:09 +00:00
if ( datalen > rx_data_len | | ( datalen + 4 ) < rx_data_len ) {
iser_err ( " wrong datalen %d (hdr), %d (IB) \n " ,
datalen , rx_data_len ) ;
2006-05-11 10:00:44 +03:00
rc = ISCSI_ERR_DATALEN ;
goto error ;
}
2011-08-01 21:14:09 +00:00
if ( datalen ! = rx_data_len )
iser_dbg ( " aligned datalen (%d) hdr, %d (IB) \n " ,
datalen , rx_data_len ) ;
2006-05-11 10:00:44 +03:00
/* read AHS */
ahslen = hdr - > hlength * 4 ;
2008-05-21 15:54:04 -05:00
rc = iscsi_complete_pdu ( conn , hdr , rx_data , rx_data_len ) ;
2006-05-11 10:00:44 +03:00
if ( rc & & rc ! = ISCSI_ERR_NO_SCSI_CMD )
goto error ;
return ;
error :
iscsi_conn_failure ( conn , rc ) ;
}
2008-12-02 00:32:14 -06:00
/**
 * iscsi_iser_pdu_alloc - point a task at its embedded iSCSI header
 * @task:   iscsi task
 * @opcode: PDU opcode (not used here)
 *
 * Nothing is allocated: the header lives in the descriptor embedded in the
 * task's private data. Always returns 0.
 */
static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
{
	struct iscsi_iser_task *priv = task->dd_data;

	task->hdr_max = sizeof(priv->desc.iscsi_header);
	task->hdr = (struct iscsi_hdr *)&priv->desc.iscsi_header;

	return 0;
}
2006-05-11 10:00:44 +03:00
2010-02-08 13:19:56 +00:00
int iser_initialize_task_headers ( struct iscsi_task * task ,
struct iser_tx_desc * tx_desc )
{
struct iscsi_iser_conn * iser_conn = task - > conn - > dd_data ;
struct iser_device * device = iser_conn - > ib_conn - > device ;
struct iscsi_iser_task * iser_task = task - > dd_data ;
u64 dma_addr ;
dma_addr = ib_dma_map_single ( device - > ib_device , ( void * ) tx_desc ,
ISER_HEADERS_LEN , DMA_TO_DEVICE ) ;
if ( ib_dma_mapping_error ( device - > ib_device , dma_addr ) )
return - ENOMEM ;
tx_desc - > dma_addr = dma_addr ;
tx_desc - > tx_sg [ 0 ] . addr = tx_desc - > dma_addr ;
tx_desc - > tx_sg [ 0 ] . length = ISER_HEADERS_LEN ;
tx_desc - > tx_sg [ 0 ] . lkey = device - > mr - > lkey ;
iser_task - > iser_conn = iser_conn ;
return 0 ;
}
2006-05-11 10:00:44 +03:00
/**
2008-05-21 15:54:11 -05:00
* iscsi_iser_task_init - Initialize task
* @ task : iscsi task
2006-05-11 10:00:44 +03:00
*
2008-05-21 15:54:11 -05:00
* Initialize the task for the scsi command or mgmt command .
2008-05-21 15:54:08 -05:00
*/
2007-12-13 12:43:35 -06:00
static int
2008-05-21 15:54:11 -05:00
iscsi_iser_task_init ( struct iscsi_task * task )
2006-05-11 10:00:44 +03:00
{
2008-05-21 15:54:11 -05:00
struct iscsi_iser_task * iser_task = task - > dd_data ;
2006-05-11 10:00:44 +03:00
2011-11-04 00:21:27 +02:00
if ( iser_initialize_task_headers ( task , & iser_task - > desc ) )
2010-02-08 13:19:56 +00:00
return - ENOMEM ;
2008-05-21 15:54:11 -05:00
/* mgmt task */
2010-02-08 13:19:56 +00:00
if ( ! task - > sc )
2008-05-21 15:54:08 -05:00
return 0 ;
2006-05-11 10:00:44 +03:00
2008-05-21 15:54:11 -05:00
iser_task - > command_sent = 0 ;
iser_task_rdma_init ( iser_task ) ;
2007-12-13 12:43:35 -06:00
return 0 ;
2006-05-11 10:00:44 +03:00
}
/**
2008-05-21 15:54:11 -05:00
* iscsi_iser_mtask_xmit - xmit management ( immediate ) task
2006-05-11 10:00:44 +03:00
* @ conn : iscsi connection
2008-05-21 15:54:11 -05:00
* @ task : task management task
2006-05-11 10:00:44 +03:00
*
* Notes :
* The function can return - EAGAIN in which case caller must
* call it again later , or recover . ' 0 ' return code means successful
* xmit .
*
* */
static int
2008-05-21 15:54:11 -05:00
iscsi_iser_mtask_xmit ( struct iscsi_conn * conn , struct iscsi_task * task )
2006-05-11 10:00:44 +03:00
{
int error = 0 ;
2010-02-08 13:22:34 +00:00
iser_dbg ( " mtask xmit [cid %d itt 0x%x] \n " , conn - > id , task - > itt ) ;
2006-05-11 10:00:44 +03:00
2008-05-21 15:54:11 -05:00
error = iser_send_control ( conn , task ) ;
2006-05-11 10:00:44 +03:00
2008-05-21 15:54:11 -05:00
/* since iser xmits control with zero copy, tasks can not be recycled
2006-05-11 10:00:44 +03:00
* right after sending them .
* The recycling scheme is based on whether a response is expected
2008-05-21 15:54:11 -05:00
* - if yes , the task is recycled at iscsi_complete_pdu
* - if no , the task is recycled at iser_snd_completion
2006-05-11 10:00:44 +03:00
*/
return error ;
}
static int
2008-05-21 15:54:08 -05:00
iscsi_iser_task_xmit_unsol_data ( struct iscsi_conn * conn ,
2008-05-21 15:54:11 -05:00
struct iscsi_task * task )
2006-05-11 10:00:44 +03:00
{
2008-12-02 00:32:06 -06:00
struct iscsi_r2t_info * r2t = & task - > unsol_r2t ;
struct iscsi_data hdr ;
2006-05-11 10:00:44 +03:00
int error = 0 ;
/* Send data-out PDUs while there's still unsolicited data to send */
2008-12-02 00:32:06 -06:00
while ( iscsi_task_has_unsol_data ( task ) ) {
iscsi_prep_data_out_pdu ( task , r2t , & hdr ) ;
2009-03-05 14:45:57 -06:00
iser_dbg ( " Sending data-out: itt 0x%x, data count %d \n " ,
2008-12-02 00:32:06 -06:00
hdr . itt , r2t - > data_count ) ;
2006-05-11 10:00:44 +03:00
/* the buffer description has been passed with the command */
/* Send the command */
2008-05-21 15:54:11 -05:00
error = iser_send_data_out ( conn , task , & hdr ) ;
2006-05-11 10:00:44 +03:00
if ( error ) {
2008-12-02 00:32:06 -06:00
r2t - > datasn - - ;
2008-05-21 15:54:08 -05:00
goto iscsi_iser_task_xmit_unsol_data_exit ;
2006-05-11 10:00:44 +03:00
}
2008-12-02 00:32:06 -06:00
r2t - > sent + = r2t - > data_count ;
2009-03-05 14:45:57 -06:00
iser_dbg ( " Need to send %d more as data-out PDUs \n " ,
2008-12-02 00:32:06 -06:00
r2t - > data_length - r2t - > sent ) ;
2006-05-11 10:00:44 +03:00
}
2008-05-21 15:54:08 -05:00
iscsi_iser_task_xmit_unsol_data_exit :
2006-05-11 10:00:44 +03:00
return error ;
}
static int
2008-05-21 15:54:11 -05:00
iscsi_iser_task_xmit ( struct iscsi_task * task )
2006-05-11 10:00:44 +03:00
{
2008-05-21 15:54:11 -05:00
struct iscsi_conn * conn = task - > conn ;
struct iscsi_iser_task * iser_task = task - > dd_data ;
2006-05-11 10:00:44 +03:00
int error = 0 ;
2008-05-21 15:54:11 -05:00
if ( ! task - > sc )
return iscsi_iser_mtask_xmit ( conn , task ) ;
2008-05-21 15:54:08 -05:00
2008-05-21 15:54:11 -05:00
if ( task - > sc - > sc_data_direction = = DMA_TO_DEVICE ) {
BUG_ON ( scsi_bufflen ( task - > sc ) = = 0 ) ;
2007-05-30 12:57:18 -05:00
2009-03-05 14:45:57 -06:00
iser_dbg ( " cmd [itt %x total %d imm %d unsol_data %d \n " ,
2008-05-21 15:54:11 -05:00
task - > itt , scsi_bufflen ( task - > sc ) ,
2008-12-02 00:32:06 -06:00
task - > imm_count , task - > unsol_r2t . data_length ) ;
2007-05-30 12:57:18 -05:00
}
2010-02-08 13:22:34 +00:00
iser_dbg ( " ctask xmit [cid %d itt 0x%x] \n " ,
2008-05-21 15:54:11 -05:00
conn - > id , task - > itt ) ;
2006-05-11 10:00:44 +03:00
/* Send the cmd PDU */
2008-05-21 15:54:11 -05:00
if ( ! iser_task - > command_sent ) {
error = iser_send_command ( conn , task ) ;
2006-05-11 10:00:44 +03:00
if ( error )
2008-05-21 15:54:08 -05:00
goto iscsi_iser_task_xmit_exit ;
2008-05-21 15:54:11 -05:00
iser_task - > command_sent = 1 ;
2006-05-11 10:00:44 +03:00
}
/* Send unsolicited data-out PDU(s) if necessary */
2008-12-02 00:32:06 -06:00
if ( iscsi_task_has_unsol_data ( task ) )
2008-05-21 15:54:11 -05:00
error = iscsi_iser_task_xmit_unsol_data ( conn , task ) ;
2006-05-11 10:00:44 +03:00
2008-05-21 15:54:08 -05:00
iscsi_iser_task_xmit_exit :
2006-05-11 10:00:44 +03:00
return error ;
}
2008-12-02 00:32:06 -06:00
static void iscsi_iser_cleanup_task ( struct iscsi_task * task )
2006-05-11 10:00:44 +03:00
{
2008-05-21 15:54:11 -05:00
struct iscsi_iser_task * iser_task = task - > dd_data ;
2011-11-04 00:21:27 +02:00
struct iser_tx_desc * tx_desc = & iser_task - > desc ;
struct iscsi_iser_conn * iser_conn = task - > conn - > dd_data ;
struct iser_device * device = iser_conn - > ib_conn - > device ;
ib_dma_unmap_single ( device - > ib_device ,
tx_desc - > dma_addr , ISER_HEADERS_LEN , DMA_TO_DEVICE ) ;
2006-05-11 10:00:44 +03:00
2009-05-13 17:57:49 -05:00
/* mgmt tasks do not need special cleanup */
if ( ! task - > sc )
2008-05-21 15:54:08 -05:00
return ;
2006-05-11 10:00:44 +03:00
2008-05-21 15:54:11 -05:00
if ( iser_task - > status = = ISER_TASK_STATUS_STARTED ) {
iser_task - > status = ISER_TASK_STATUS_COMPLETED ;
iser_task_rdma_finalize ( iser_task ) ;
2006-05-11 10:00:44 +03:00
}
}
/**
 * iscsi_iser_conn_create - create a new iscsi/iser connection
 * @cls_session: iscsi class session the connection belongs to
 * @conn_idx:    connection index
 *
 * Asks libiscsi to set up a connection with room for the iser private
 * connection data, sets the maximum receive data segment length and links
 * the private data back to the iscsi connection.
 *
 * Returns the new class connection, or NULL if iscsi_conn_setup() failed.
 */
static struct iscsi_cls_conn *
iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx)
{
	struct iscsi_conn *conn;
	struct iscsi_cls_conn *cls_conn;
	struct iscsi_iser_conn *iser_conn;

	cls_conn = iscsi_conn_setup(cls_session, sizeof(*iser_conn), conn_idx);
	if (!cls_conn)
		return NULL;
	conn = cls_conn->dd_data;

	/*
	 * due to issues with the login code re iser semantics
	 * this not set in iscsi_conn_setup - FIXME
	 */
	conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN;

	/* conn->dd_data already points at the iser private data */
	iser_conn = conn->dd_data;
	iser_conn->iscsi_conn = conn;

	return cls_conn;
}
static void
iscsi_iser_conn_destroy ( struct iscsi_cls_conn * cls_conn )
{
struct iscsi_conn * conn = cls_conn - > dd_data ;
struct iscsi_iser_conn * iser_conn = conn - > dd_data ;
2008-05-21 15:54:03 -05:00
struct iser_conn * ib_conn = iser_conn - > ib_conn ;
2006-05-11 10:00:44 +03:00
iscsi_conn_teardown ( cls_conn ) ;
2008-05-21 15:54:03 -05:00
/*
* Userspace will normally call the stop callback and
* already have freed the ib_conn , but if it goofed up then
* we free it here .
*/
if ( ib_conn ) {
ib_conn - > iser_conn = NULL ;
IB/iser: Enhance disconnection logic for multi-pathing
The iser connection teardown flow isn't over until the underlying
Connection Manager (e.g the IB CM) delivers a disconnected or timeout
event through the RDMA-CM. When the remote (target) side isn't
reachable, e.g when some HW e.g port/hca/switch isn't functioning or
taken down administratively, the CM timeout flow is used and the event
may be generated only after relatively long time -- on the order of
tens of seconds.
The current iser code exposes this possibly long delay to higher
layers, specifically to the iscsid daemon and iscsi kernel stack. As a
result, the iscsi stack doesn't respond well: this low-level CM delay
is added to the fail-over time under HA schemes such as the one
provided by DM multipath through the multipathd(8) service.
This patch enhances the reference counting scheme on iser's IB
connections so that the disconnect flow initiated by iscsid from user
space (ep_disconnect) doesn't wait for the CM to deliver the
disconnect/timeout event. (The connection teardown isn't done from
iser's view point until the event is delivered)
The iser ib (rdma) connection object is destroyed when its reference
count reaches zero. When this happens on the RDMA-CM callback
context, extra care is taken so that the RDMA-CM does the actual
destroying of the associated ID, since doing it in the callback is
prohibited.
The reference count of iser ib connection normally reaches three,
where the <ref, deref> relations are
1. conn <init, terminate>
2. conn <bind, stop/destroy>
3. cma id <create, disconnect/error/timeout callbacks>
With this patch, multipath fail-over time is about 30 seconds, while
without this patch, multipath fail-over time is about 130 seconds.
Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
2010-05-05 17:31:44 +03:00
iser_conn_put ( ib_conn , 1 ) ; /* deref iscsi/ib conn unbinding */
2008-05-21 15:54:03 -05:00
}
2006-05-11 10:00:44 +03:00
}
/**
 * iscsi_iser_conn_bind - bind an iscsi connection to a connected endpoint
 * @cls_session:   iscsi class session
 * @cls_conn:      iscsi class connection
 * @transport_eph: endpoint handle passed in from user space
 * @is_leading:    forwarded to iscsi_conn_bind()
 *
 * After the generic bind, the endpoint handle is looked up and the rx
 * descriptors of its IB connection are allocated. The iscsi and IB
 * connections then exchange pointers and a reference on the IB connection
 * is taken for the binding; the stop and destroy callbacks drop it.
 *
 * Returns 0 on success, the iscsi_conn_bind() error, -EINVAL for an
 * unknown endpoint handle, or -ENOMEM if the rx descriptors could not be
 * allocated.
 */
static int
iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
		     struct iscsi_cls_conn *cls_conn, uint64_t transport_eph,
		     int is_leading)
{
	struct iscsi_conn *conn = cls_conn->dd_data;
	struct iscsi_iser_conn *iser_conn;
	struct iscsi_endpoint *ep;
	struct iser_conn *ib_conn;
	int rc;

	rc = iscsi_conn_bind(cls_session, cls_conn, is_leading);
	if (rc)
		return rc;

	/*
	 * the transport ep handle comes from user space so it must be
	 * verified against the global ib connections list
	 */
	ep = iscsi_lookup_endpoint(transport_eph);
	if (!ep) {
		iser_err(" can't bind eph %llx \n ",
			 (unsigned long long)transport_eph);
		return -EINVAL;
	}
	ib_conn = ep->dd_data;

	if (iser_alloc_rx_descriptors(ib_conn))
		return -ENOMEM;

	/*
	 * binds the iSER connection retrieved from the previously
	 * connected ep_handle to the iSCSI layer connection. exchanges
	 * connection pointers
	 */
	iser_info(" binding iscsi/iser conn %p %p to ib_conn %p \n ",
		  conn, conn->dd_data, ib_conn);

	iser_conn = conn->dd_data;
	iser_conn->ib_conn = ib_conn;
	ib_conn->iser_conn = iser_conn;

	iser_conn_get(ib_conn); /* ref iscsi/ib conn binding */

	return 0;
}
2008-05-21 15:54:03 -05:00
static void
iscsi_iser_conn_stop ( struct iscsi_cls_conn * cls_conn , int flag )
{
struct iscsi_conn * conn = cls_conn - > dd_data ;
struct iscsi_iser_conn * iser_conn = conn - > dd_data ;
struct iser_conn * ib_conn = iser_conn - > ib_conn ;
2006-05-11 10:00:44 +03:00
2008-05-21 15:54:03 -05:00
/*
2008-05-21 15:54:18 -05:00
* Userspace may have goofed up and not bound the connection or
* might have only partially setup the connection .
2008-05-21 15:54:03 -05:00
*/
2008-05-21 15:54:18 -05:00
if ( ib_conn ) {
iscsi_conn_stop ( cls_conn , flag ) ;
/*
* There is no unbind event so the stop callback
* must release the ref from the bind .
*/
IB/iser: Enhance disconnection logic for multi-pathing
The iser connection teardown flow isn't over until the underlying
Connection Manager (e.g the IB CM) delivers a disconnected or timeout
event through the RDMA-CM. When the remote (target) side isn't
reachable, e.g when some HW e.g port/hca/switch isn't functioning or
taken down administratively, the CM timeout flow is used and the event
may be generated only after relatively long time -- on the order of
tens of seconds.
The current iser code exposes this possibly long delay to higher
layers, specifically to the iscsid daemon and iscsi kernel stack. As a
result, the iscsi stack doesn't respond well: this low-level CM delay
is added to the fail-over time under HA schemes such as the one
provided by DM multipath through the multipathd(8) service.
This patch enhances the reference counting scheme on iser's IB
connections so that the disconnect flow initiated by iscsid from user
space (ep_disconnect) doesn't wait for the CM to deliver the
disconnect/timeout event. (The connection teardown isn't done from
iser's view point until the event is delivered)
The iser ib (rdma) connection object is destroyed when its reference
count reaches zero. When this happens on the RDMA-CM callback
context, extra care is taken so that the RDMA-CM does the actual
destroying of the associated ID, since doing it in the callback is
prohibited.
The reference count of iser ib connection normally reaches three,
where the <ref, deref> relations are
1. conn <init, terminate>
2. conn <bind, stop/destroy>
3. cma id <create, disconnect/error/timeout callbacks>
With this patch, multipath fail-over time is about 30 seconds, while
without this patch, multipath fail-over time is about 130 seconds.
Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
2010-05-05 17:31:44 +03:00
iser_conn_put ( ib_conn , 1 ) ; /* deref iscsi/ib conn unbinding */
2008-05-21 15:54:18 -05:00
}
2008-05-21 15:54:03 -05:00
iser_conn - > ib_conn = NULL ;
2006-05-11 10:00:44 +03:00
}
2008-05-21 15:53:59 -05:00
/**
 * iscsi_iser_session_destroy - destroy a session and the host backing it
 * @cls_session: iscsi class session
 *
 * The SCSI host is looked up from the session before the session is torn
 * down; the host is then removed and freed.
 */
static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session)
{
	struct Scsi_Host *host = iscsi_session_to_shost(cls_session);

	iscsi_session_teardown(cls_session);

	iscsi_host_remove(host);
	iscsi_host_free(host);
}
2006-05-11 10:00:44 +03:00
static struct iscsi_cls_session *
2008-05-21 15:54:14 -05:00
iscsi_iser_session_create ( struct iscsi_endpoint * ep ,
2008-05-21 15:53:59 -05:00
uint16_t cmds_max , uint16_t qdepth ,
2009-03-05 14:46:06 -06:00
uint32_t initial_cmdsn )
2006-05-11 10:00:44 +03:00
{
struct iscsi_cls_session * cls_session ;
struct iscsi_session * session ;
2008-05-21 15:54:14 -05:00
struct Scsi_Host * shost ;
struct iser_conn * ib_conn ;
2008-05-21 15:53:59 -05:00
2010-02-08 13:22:34 +00:00
shost = iscsi_host_alloc ( & iscsi_iser_sht , 0 , 0 ) ;
2008-05-21 15:53:59 -05:00
if ( ! shost )
return NULL ;
shost - > transportt = iscsi_iser_scsi_transport ;
shost - > max_lun = iscsi_max_lun ;
shost - > max_id = 0 ;
shost - > max_channel = 0 ;
shost - > max_cmd_len = 16 ;
2008-05-21 15:54:14 -05:00
/*
* older userspace tools ( before 2.0 - 870 ) did not pass us
* the leading conn ' s ep so this will be NULL ;
*/
if ( ep )
ib_conn = ep - > dd_data ;
if ( iscsi_host_add ( shost ,
ep ? ib_conn - > device - > ib_device - > dma_device : NULL ) )
2008-05-21 15:53:59 -05:00
goto free_host ;
2006-05-11 10:00:44 +03:00
2007-05-30 12:57:19 -05:00
/*
* we do not support setting can_queue cmd_per_lun from userspace yet
* because we preallocate so many resources
*/
2008-05-21 15:53:59 -05:00
cls_session = iscsi_session_setup ( & iscsi_iser_transport , shost ,
2009-09-22 08:21:22 +05:30
ISCSI_DEF_XMIT_CMDS_MAX , 0 ,
2008-05-21 15:54:11 -05:00
sizeof ( struct iscsi_iser_task ) ,
2008-05-21 15:54:12 -05:00
initial_cmdsn , 0 ) ;
2006-05-11 10:00:44 +03:00
if ( ! cls_session )
2008-05-21 15:53:59 -05:00
goto remove_host ;
session = cls_session - > dd_data ;
2006-05-11 10:00:44 +03:00
2008-05-21 15:54:08 -05:00
shost - > can_queue = session - > scsi_cmds_max ;
2006-05-11 10:00:44 +03:00
return cls_session ;
2008-05-21 15:53:59 -05:00
remove_host :
2008-05-21 15:54:00 -05:00
iscsi_host_remove ( shost ) ;
2008-05-21 15:53:59 -05:00
free_host :
2008-05-21 15:54:00 -05:00
iscsi_host_free ( shost ) ;
2008-05-21 15:53:59 -05:00
return NULL ;
2006-05-11 10:00:44 +03:00
}
static int
2006-06-28 12:00:25 -05:00
iscsi_iser_set_param ( struct iscsi_cls_conn * cls_conn ,
enum iscsi_param param , char * buf , int buflen )
2006-05-11 10:00:44 +03:00
{
2006-06-28 12:00:25 -05:00
int value ;
2006-05-11 10:00:44 +03:00
switch ( param ) {
case ISCSI_PARAM_MAX_RECV_DLENGTH :
/* TBD */
break ;
case ISCSI_PARAM_HDRDGST_EN :
2006-06-28 12:00:25 -05:00
sscanf ( buf , " %d " , & value ) ;
2006-05-11 10:00:44 +03:00
if ( value ) {
2013-05-01 13:25:25 +00:00
iser_err ( " DataDigest wasn't negotiated to None " ) ;
2006-05-11 10:00:44 +03:00
return - EPROTO ;
}
break ;
case ISCSI_PARAM_DATADGST_EN :
2006-06-28 12:00:25 -05:00
sscanf ( buf , " %d " , & value ) ;
2006-05-11 10:00:44 +03:00
if ( value ) {
2013-05-01 13:25:25 +00:00
iser_err ( " DataDigest wasn't negotiated to None " ) ;
2006-05-11 10:00:44 +03:00
return - EPROTO ;
}
break ;
case ISCSI_PARAM_IFMARKER_EN :
2006-06-28 12:00:25 -05:00
sscanf ( buf , " %d " , & value ) ;
2006-05-11 10:00:44 +03:00
if ( value ) {
2013-05-01 13:25:25 +00:00
iser_err ( " IFMarker wasn't negotiated to No " ) ;
2006-05-11 10:00:44 +03:00
return - EPROTO ;
}
break ;
case ISCSI_PARAM_OFMARKER_EN :
2006-06-28 12:00:25 -05:00
sscanf ( buf , " %d " , & value ) ;
2006-05-11 10:00:44 +03:00
if ( value ) {
2013-05-01 13:25:25 +00:00
iser_err ( " OFMarker wasn't negotiated to No " ) ;
2006-05-11 10:00:44 +03:00
return - EPROTO ;
}
break ;
default :
2006-06-28 12:00:25 -05:00
return iscsi_set_param ( cls_conn , param , buf , buflen ) ;
2006-05-11 10:00:44 +03:00
}
return 0 ;
}
static void
iscsi_iser_conn_get_stats ( struct iscsi_cls_conn * cls_conn , struct iscsi_stats * stats )
{
struct iscsi_conn * conn = cls_conn - > dd_data ;
stats - > txdata_octets = conn - > txdata_octets ;
stats - > rxdata_octets = conn - > rxdata_octets ;
stats - > scsicmd_pdus = conn - > scsicmd_pdus_cnt ;
stats - > dataout_pdus = conn - > dataout_pdus_cnt ;
stats - > scsirsp_pdus = conn - > scsirsp_pdus_cnt ;
stats - > datain_pdus = conn - > datain_pdus_cnt ; /* always 0 */
stats - > r2t_pdus = conn - > r2t_pdus_cnt ; /* always 0 */
stats - > tmfcmd_pdus = conn - > tmfcmd_pdus_cnt ;
stats - > tmfrsp_pdus = conn - > tmfrsp_pdus_cnt ;
2008-04-29 13:46:52 -07:00
stats - > custom_length = 4 ;
2006-05-11 10:00:44 +03:00
strcpy ( stats - > custom [ 0 ] . desc , " qp_tx_queue_full " ) ;
stats - > custom [ 0 ] . value = 0 ; /* TB iser_conn->qp_tx_queue_full; */
strcpy ( stats - > custom [ 1 ] . desc , " fmr_map_not_avail " ) ;
stats - > custom [ 1 ] . value = 0 ; /* TB iser_conn->fmr_map_not_avail */ ;
strcpy ( stats - > custom [ 2 ] . desc , " eh_abort_cnt " ) ;
stats - > custom [ 2 ] . value = conn - > eh_abort_cnt ;
2008-04-29 13:46:52 -07:00
strcpy ( stats - > custom [ 3 ] . desc , " fmr_unalign_cnt " ) ;
stats - > custom [ 3 ] . value = conn - > fmr_unalign_cnt ;
2006-05-11 10:00:44 +03:00
}
2011-02-16 15:04:40 -06:00
static int iscsi_iser_get_ep_param ( struct iscsi_endpoint * ep ,
enum iscsi_param param , char * buf )
{
struct iser_conn * ib_conn = ep - > dd_data ;
int len ;
switch ( param ) {
case ISCSI_PARAM_CONN_PORT :
case ISCSI_PARAM_CONN_ADDRESS :
if ( ! ib_conn | | ! ib_conn - > cma_id )
return - ENOTCONN ;
return iscsi_conn_get_addr_param ( ( struct sockaddr_storage * )
& ib_conn - > cma_id - > route . addr . dst_addr ,
param , buf ) ;
break ;
default :
return - ENOSYS ;
}
return len ;
}
2008-05-21 15:54:14 -05:00
static struct iscsi_endpoint *
2009-05-13 17:57:38 -05:00
iscsi_iser_ep_connect ( struct Scsi_Host * shost , struct sockaddr * dst_addr ,
int non_blocking )
2006-05-11 10:00:44 +03:00
{
int err ;
struct iser_conn * ib_conn ;
2008-05-21 15:54:14 -05:00
struct iscsi_endpoint * ep ;
2006-05-11 10:00:44 +03:00
2008-05-21 15:54:14 -05:00
ep = iscsi_create_endpoint ( sizeof ( * ib_conn ) ) ;
if ( ! ep )
return ERR_PTR ( - ENOMEM ) ;
2006-05-11 10:00:44 +03:00
2008-05-21 15:54:14 -05:00
ib_conn = ep - > dd_data ;
ib_conn - > ep = ep ;
iser_conn_init ( ib_conn ) ;
2006-05-11 10:00:44 +03:00
2008-05-21 15:54:14 -05:00
err = iser_connect ( ib_conn , NULL , ( struct sockaddr_in * ) dst_addr ,
non_blocking ) ;
2012-04-29 17:04:21 +03:00
if ( err )
2008-05-21 15:54:14 -05:00
return ERR_PTR ( err ) ;
2012-04-29 17:04:21 +03:00
2008-05-21 15:54:14 -05:00
return ep ;
2006-05-11 10:00:44 +03:00
}
static int
2008-05-21 15:54:14 -05:00
iscsi_iser_ep_poll ( struct iscsi_endpoint * ep , int timeout_ms )
2006-05-11 10:00:44 +03:00
{
2008-05-21 15:54:14 -05:00
struct iser_conn * ib_conn ;
2006-05-11 10:00:44 +03:00
int rc ;
2008-05-21 15:54:14 -05:00
ib_conn = ep - > dd_data ;
2006-05-11 10:00:44 +03:00
rc = wait_event_interruptible_timeout ( ib_conn - > wait ,
ib_conn - > state = = ISER_CONN_UP ,
msecs_to_jiffies ( timeout_ms ) ) ;
/* if conn establishment failed, return error code to iscsi */
if ( ! rc & &
( ib_conn - > state = = ISER_CONN_TERMINATING | |
ib_conn - > state = = ISER_CONN_DOWN ) )
rc = - 1 ;
2013-05-01 13:25:25 +00:00
iser_info ( " ib conn %p rc = %d \n " , ib_conn , rc ) ;
2006-05-11 10:00:44 +03:00
if ( rc > 0 )
return 1 ; /* success, this is the equivalent of POLLOUT */
else if ( ! rc )
return 0 ; /* timeout */
else
return rc ; /* signal */
}
static void
2008-05-21 15:54:14 -05:00
iscsi_iser_ep_disconnect ( struct iscsi_endpoint * ep )
2006-05-11 10:00:44 +03:00
{
2006-07-24 15:47:26 -05:00
struct iser_conn * ib_conn ;
2006-05-11 10:00:44 +03:00
2008-05-21 15:54:14 -05:00
ib_conn = ep - > dd_data ;
2008-05-21 15:54:03 -05:00
if ( ib_conn - > iser_conn )
/*
* Must suspend xmit path if the ep is bound to the
* iscsi_conn , so we know we are not accessing the ib_conn
* when we free it .
*
* This may not be bound if the ep poll failed .
*/
iscsi_suspend_tx ( ib_conn - > iser_conn - > iscsi_conn ) ;
2006-05-11 10:00:44 +03:00
2013-05-01 13:25:25 +00:00
iser_info ( " ib conn %p state %d \n " , ib_conn , ib_conn - > state ) ;
2006-05-11 10:00:44 +03:00
iser_conn_terminate ( ib_conn ) ;
}
2011-07-23 23:11:19 -04:00
static umode_t iser_attr_is_visible ( int param_type , int param )
2011-07-25 13:48:42 -05:00
{
switch ( param_type ) {
2011-07-25 13:48:45 -05:00
case ISCSI_HOST_PARAM :
switch ( param ) {
case ISCSI_HOST_PARAM_NETDEV_NAME :
case ISCSI_HOST_PARAM_HWADDRESS :
case ISCSI_HOST_PARAM_INITIATOR_NAME :
return S_IRUGO ;
default :
return 0 ;
}
2011-07-25 13:48:42 -05:00
case ISCSI_PARAM :
switch ( param ) {
case ISCSI_PARAM_MAX_RECV_DLENGTH :
case ISCSI_PARAM_MAX_XMIT_DLENGTH :
case ISCSI_PARAM_HDRDGST_EN :
case ISCSI_PARAM_DATADGST_EN :
case ISCSI_PARAM_CONN_ADDRESS :
case ISCSI_PARAM_CONN_PORT :
case ISCSI_PARAM_EXP_STATSN :
case ISCSI_PARAM_PERSISTENT_ADDRESS :
case ISCSI_PARAM_PERSISTENT_PORT :
case ISCSI_PARAM_PING_TMO :
case ISCSI_PARAM_RECV_TMO :
2011-07-25 13:48:43 -05:00
case ISCSI_PARAM_INITIAL_R2T_EN :
case ISCSI_PARAM_MAX_R2T :
case ISCSI_PARAM_IMM_DATA_EN :
case ISCSI_PARAM_FIRST_BURST :
case ISCSI_PARAM_MAX_BURST :
case ISCSI_PARAM_PDU_INORDER_EN :
case ISCSI_PARAM_DATASEQ_INORDER_EN :
case ISCSI_PARAM_TARGET_NAME :
case ISCSI_PARAM_TPGT :
case ISCSI_PARAM_USERNAME :
case ISCSI_PARAM_PASSWORD :
case ISCSI_PARAM_USERNAME_IN :
case ISCSI_PARAM_PASSWORD_IN :
case ISCSI_PARAM_FAST_ABORT :
case ISCSI_PARAM_ABORT_TMO :
case ISCSI_PARAM_LU_RESET_TMO :
case ISCSI_PARAM_TGT_RESET_TMO :
case ISCSI_PARAM_IFACE_NAME :
case ISCSI_PARAM_INITIATOR_NAME :
2011-07-25 13:48:42 -05:00
return S_IRUGO ;
default :
return 0 ;
}
}
return 0 ;
}
2006-05-11 10:00:44 +03:00
static struct scsi_host_template iscsi_iser_sht = {
2007-07-26 12:46:46 -05:00
. module = THIS_MODULE ,
2013-05-01 13:25:24 +00:00
. name = " iSCSI Initiator over iSER " ,
2006-05-11 10:00:44 +03:00
. queuecommand = iscsi_queuecommand ,
2008-01-17 11:53:17 +02:00
. change_queue_depth = iscsi_change_queue_depth ,
2006-05-11 10:00:44 +03:00
. sg_tablesize = ISCSI_ISER_SG_TABLESIZE ,
2006-09-11 12:20:54 +03:00
. max_sectors = 1024 ,
2009-03-05 14:46:01 -06:00
. cmd_per_lun = ISER_DEF_CMD_PER_LUN ,
2006-05-11 10:00:44 +03:00
. eh_abort_handler = iscsi_eh_abort ,
2008-01-22 12:06:25 +02:00
. eh_device_reset_handler = iscsi_eh_device_reset ,
2010-02-20 08:02:10 +05:30
. eh_target_reset_handler = iscsi_eh_recover_target ,
2009-04-21 15:32:32 -05:00
. target_alloc = iscsi_target_alloc ,
2006-05-11 10:00:44 +03:00
. use_clustering = DISABLE_CLUSTERING ,
. proc_name = " iscsi_iser " ,
. this_id = - 1 ,
} ;
static struct iscsi_transport iscsi_iser_transport = {
. owner = THIS_MODULE ,
. name = " iser " ,
. caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T ,
/* session management */
. create_session = iscsi_iser_session_create ,
2008-05-21 15:53:59 -05:00
. destroy_session = iscsi_iser_session_destroy ,
2006-05-11 10:00:44 +03:00
/* connection management */
. create_conn = iscsi_iser_conn_create ,
. bind_conn = iscsi_iser_conn_bind ,
. destroy_conn = iscsi_iser_conn_destroy ,
2011-07-25 13:48:42 -05:00
. attr_is_visible = iser_attr_is_visible ,
2006-06-28 12:00:25 -05:00
. set_param = iscsi_iser_set_param ,
. get_conn_param = iscsi_conn_get_param ,
2011-02-16 15:04:40 -06:00
. get_ep_param = iscsi_iser_get_ep_param ,
2006-06-28 12:00:25 -05:00
. get_session_param = iscsi_session_get_param ,
2012-03-05 18:21:44 +02:00
. start_conn = iscsi_conn_start ,
2008-05-21 15:54:03 -05:00
. stop_conn = iscsi_iser_conn_stop ,
2007-05-30 12:57:12 -05:00
/* iscsi host params */
. get_host_param = iscsi_host_get_param ,
. set_host_param = iscsi_host_set_param ,
2006-05-11 10:00:44 +03:00
/* IO */
. send_pdu = iscsi_conn_send_pdu ,
. get_stats = iscsi_iser_conn_get_stats ,
2008-05-21 15:54:08 -05:00
. init_task = iscsi_iser_task_init ,
. xmit_task = iscsi_iser_task_xmit ,
. cleanup_task = iscsi_iser_cleanup_task ,
2008-12-02 00:32:06 -06:00
. alloc_pdu = iscsi_iser_pdu_alloc ,
2006-05-11 10:00:44 +03:00
/* recovery */
. session_recovery_timedout = iscsi_session_recovery_timedout ,
. ep_connect = iscsi_iser_ep_connect ,
. ep_poll = iscsi_iser_ep_poll ,
. ep_disconnect = iscsi_iser_ep_disconnect
} ;
/*
 * Module initialisation: check iscsi_max_lun, clear the iser global state,
 * create the descriptor cache, set up the device and connection lists and
 * register the iSER transport.
 *
 * Returns 0 on success, -EINVAL when iscsi_max_lun is below 1 or the
 * transport registration fails, and -ENOMEM when the descriptor cache
 * cannot be created.
 */
static int __init iser_init(void)
{
	iser_dbg(" Starting iSER datamover... \n ");

	if (iscsi_max_lun < 1) {
		iser_err(" Invalid max_lun value of %u \n ", iscsi_max_lun);
		return -EINVAL;
	}

	memset(&ig, 0, sizeof(struct iser_global));

	ig.desc_cache = kmem_cache_create(" iser_descriptors ",
					  sizeof(struct iser_tx_desc),
					  0, SLAB_HWCACHE_ALIGN,
					  NULL);
	if (!ig.desc_cache)
		return -ENOMEM;

	/* device init is called only after the first addr resolution */
	mutex_init(&ig.device_list_mutex);
	INIT_LIST_HEAD(&ig.device_list);
	mutex_init(&ig.connlist_mutex);
	INIT_LIST_HEAD(&ig.connlist);

	iscsi_iser_scsi_transport =
		iscsi_register_transport(&iscsi_iser_transport);
	if (iscsi_iser_scsi_transport)
		return 0;

	/* registration failed: release the cache created above */
	iser_err(" iscsi_register_transport failed \n ");
	kmem_cache_destroy(ig.desc_cache);
	return -EINVAL;
}
/*
 * Module teardown: unregister the iSER transport, then destroy the
 * descriptor cache that iser_init() created.
 */
static void __exit iser_exit(void)
{
	iser_dbg(" Removing iSER datamover... \n ");

	/* the transport is unregistered before the cache is destroyed */
	iscsi_unregister_transport(&iscsi_iser_transport);
	kmem_cache_destroy(ig.desc_cache);
}
/* Hook iser_init() and iser_exit() up as the module's init and exit routines. */
module_init ( iser_init ) ;
module_exit ( iser_exit ) ;