2009-02-24 18:30:30 +03:00
/*
* Copyright ( c ) 2006 Oracle . All rights reserved .
*
* This software is available to you under a choice of one of two
* licenses . You may choose to be licensed under the terms of the GNU
* General Public License ( GPL ) Version 2 , available from the file
* COPYING in the main directory of this source tree , or the
* OpenIB . org BSD license below :
*
* Redistribution and use in source and binary forms , with or
* without modification , are permitted provided that the following
* conditions are met :
*
* - Redistributions of source code must retain the above
* copyright notice , this list of conditions and the following
* disclaimer .
*
* - Redistributions in binary form must reproduce the above
* copyright notice , this list of conditions and the following
* disclaimer in the documentation and / or other materials
* provided with the distribution .
*
* THE SOFTWARE IS PROVIDED " AS IS " , WITHOUT WARRANTY OF ANY KIND ,
* EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY , WHETHER IN AN
* ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING FROM , OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE .
*
*/
# include <linux/kernel.h>
# include <linux/in.h>
# include <linux/if.h>
# include <linux/netdevice.h>
# include <linux/inetdevice.h>
# include <linux/if_arp.h>
# include <linux/delay.h>
# include "rds.h"
# include "ib.h"
unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE ;
unsigned int fmr_message_size = RDS_FMR_SIZE + 1 ; /* +1 allows for unaligned MRs */
2009-07-17 17:13:22 +04:00
unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT ;
2009-02-24 18:30:30 +03:00
module_param ( fmr_pool_size , int , 0444 ) ;
MODULE_PARM_DESC ( fmr_pool_size , " Max number of fmr per HCA " ) ;
module_param ( fmr_message_size , int , 0444 ) ;
MODULE_PARM_DESC ( fmr_message_size , " Max size of a RDMA transfer " ) ;
2009-07-17 17:13:22 +04:00
module_param ( rds_ib_retry_count , int , 0444 ) ;
MODULE_PARM_DESC ( rds_ib_retry_count , " Number of hw retries before reporting an error " ) ;
2009-02-24 18:30:30 +03:00
struct list_head rds_ib_devices ;
2009-04-01 12:20:19 +04:00
/* NOTE: if also grabbing ibdev lock, grab this first */
2009-02-24 18:30:30 +03:00
DEFINE_SPINLOCK ( ib_nodev_conns_lock ) ;
LIST_HEAD ( ib_nodev_conns ) ;
void rds_ib_add_one ( struct ib_device * device )
{
struct rds_ib_device * rds_ibdev ;
struct ib_device_attr * dev_attr ;
/* Only handle IB (no iWARP) devices */
if ( device - > node_type ! = RDMA_NODE_IB_CA )
return ;
dev_attr = kmalloc ( sizeof * dev_attr , GFP_KERNEL ) ;
if ( ! dev_attr )
return ;
if ( ib_query_device ( device , dev_attr ) ) {
rdsdebug ( " Query device failed for %s \n " , device - > name ) ;
goto free_attr ;
}
rds_ibdev = kmalloc ( sizeof * rds_ibdev , GFP_KERNEL ) ;
if ( ! rds_ibdev )
goto free_attr ;
spin_lock_init ( & rds_ibdev - > spinlock ) ;
rds_ibdev - > max_wrs = dev_attr - > max_qp_wr ;
rds_ibdev - > max_sge = min ( dev_attr - > max_sge , RDS_IB_MAX_SGE ) ;
rds_ibdev - > fmr_max_remaps = dev_attr - > max_map_per_fmr ? : 32 ;
rds_ibdev - > max_fmrs = dev_attr - > max_fmr ?
min_t ( unsigned int , dev_attr - > max_fmr , fmr_pool_size ) :
fmr_pool_size ;
rds_ibdev - > dev = device ;
rds_ibdev - > pd = ib_alloc_pd ( device ) ;
if ( IS_ERR ( rds_ibdev - > pd ) )
goto free_dev ;
rds_ibdev - > mr = ib_get_dma_mr ( rds_ibdev - > pd ,
IB_ACCESS_LOCAL_WRITE ) ;
if ( IS_ERR ( rds_ibdev - > mr ) )
goto err_pd ;
rds_ibdev - > mr_pool = rds_ib_create_mr_pool ( rds_ibdev ) ;
if ( IS_ERR ( rds_ibdev - > mr_pool ) ) {
rds_ibdev - > mr_pool = NULL ;
goto err_mr ;
}
INIT_LIST_HEAD ( & rds_ibdev - > ipaddr_list ) ;
INIT_LIST_HEAD ( & rds_ibdev - > conn_list ) ;
list_add_tail ( & rds_ibdev - > list , & rds_ib_devices ) ;
ib_set_client_data ( device , & rds_ib_client , rds_ibdev ) ;
goto free_attr ;
err_mr :
ib_dereg_mr ( rds_ibdev - > mr ) ;
err_pd :
ib_dealloc_pd ( rds_ibdev - > pd ) ;
free_dev :
kfree ( rds_ibdev ) ;
free_attr :
kfree ( dev_attr ) ;
}
void rds_ib_remove_one ( struct ib_device * device )
{
struct rds_ib_device * rds_ibdev ;
struct rds_ib_ipaddr * i_ipaddr , * i_next ;
rds_ibdev = ib_get_client_data ( device , & rds_ib_client ) ;
if ( ! rds_ibdev )
return ;
list_for_each_entry_safe ( i_ipaddr , i_next , & rds_ibdev - > ipaddr_list , list ) {
list_del ( & i_ipaddr - > list ) ;
kfree ( i_ipaddr ) ;
}
2009-04-01 12:20:19 +04:00
rds_ib_destroy_conns ( rds_ibdev ) ;
2009-02-24 18:30:30 +03:00
if ( rds_ibdev - > mr_pool )
rds_ib_destroy_mr_pool ( rds_ibdev - > mr_pool ) ;
ib_dereg_mr ( rds_ibdev - > mr ) ;
while ( ib_dealloc_pd ( rds_ibdev - > pd ) ) {
rdsdebug ( " Failed to dealloc pd %p \n " , rds_ibdev - > pd ) ;
msleep ( 1 ) ;
}
list_del ( & rds_ibdev - > list ) ;
kfree ( rds_ibdev ) ;
}
/*
 * IB client descriptor for RDS: names the client and wires up the per-device
 * add/remove callbacks defined in this file.  It is passed to
 * ib_register_client() by rds_ib_init().
 */
struct ib_client rds_ib_client = {
	.name	= " rds_ib ",
	.add	= rds_ib_add_one,
	.remove	= rds_ib_remove_one,
};
static int rds_ib_conn_info_visitor ( struct rds_connection * conn ,
void * buffer )
{
struct rds_info_rdma_connection * iinfo = buffer ;
struct rds_ib_connection * ic ;
/* We will only ever look at IB transports */
if ( conn - > c_trans ! = & rds_ib_transport )
return 0 ;
iinfo - > src_addr = conn - > c_laddr ;
iinfo - > dst_addr = conn - > c_faddr ;
memset ( & iinfo - > src_gid , 0 , sizeof ( iinfo - > src_gid ) ) ;
memset ( & iinfo - > dst_gid , 0 , sizeof ( iinfo - > dst_gid ) ) ;
if ( rds_conn_state ( conn ) = = RDS_CONN_UP ) {
struct rds_ib_device * rds_ibdev ;
struct rdma_dev_addr * dev_addr ;
ic = conn - > c_transport_data ;
dev_addr = & ic - > i_cm_id - > route . addr . dev_addr ;
ib_addr_get_sgid ( dev_addr , ( union ib_gid * ) & iinfo - > src_gid ) ;
ib_addr_get_dgid ( dev_addr , ( union ib_gid * ) & iinfo - > dst_gid ) ;
rds_ibdev = ib_get_client_data ( ic - > i_cm_id - > device , & rds_ib_client ) ;
iinfo - > max_send_wr = ic - > i_send_ring . w_nr ;
iinfo - > max_recv_wr = ic - > i_recv_ring . w_nr ;
iinfo - > max_send_sge = rds_ibdev - > max_sge ;
rds_ib_get_mr_info ( rds_ibdev , iinfo ) ;
}
return 1 ;
}
static void rds_ib_ic_info ( struct socket * sock , unsigned int len ,
struct rds_info_iterator * iter ,
struct rds_info_lengths * lens )
{
rds_for_each_conn_info ( sock , len , iter , lens ,
rds_ib_conn_info_visitor ,
sizeof ( struct rds_info_rdma_connection ) ) ;
}
/*
 * Early RDS/IB was built to only bind to an address if there is an IPoIB
 * device with that address set.
 *
 * If it were me, I'd advocate for something more flexible.  Sending and
 * receiving should be device-agnostic.  Transports would try and maintain
 * connections between peers who have messages queued.  Userspace would be
 * allowed to influence which paths have priority.  We could call userspace
 * asserting this policy "routing".
 *
 * rds_ib_laddr_check - decide whether @addr can be used with this transport.
 * @addr: IPv4 address in network byte order.
 *
 * Returns 0 when a temporary CM ID can be bound to @addr and ends up
 * attached to an IB channel adapter, -EADDRNOTAVAIL when it cannot, or the
 * error from rdma_create_id() if the temporary ID could not be created.
 */
static int rds_ib_laddr_check(__be32 addr)
{
	int ret;
	struct rdma_cm_id *cm_id;
	struct sockaddr_in sin;

	/* Create a CMA ID and try to bind it. This catches both
	 * IB and iWARP capable NICs.
	 */
	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
	if (IS_ERR(cm_id))
		return PTR_ERR(cm_id);

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = addr;

	/* rdma_bind_addr will only succeed for IB & iWARP devices */
	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);

	/*
	 * Due to this, we will claim to support iWARP devices unless we
	 * check node_type.
	 *
	 * A successful bind does not guarantee that a device was attached
	 * to the ID (the debug print below already allows for a NULL
	 * cm_id->device), so test the pointer before looking at node_type.
	 */
	if (ret || !cm_id->device ||
	    cm_id->device->node_type != RDMA_NODE_IB_CA)
		ret = -EADDRNOTAVAIL;

	rdsdebug(" addr %pI4 ret %d node type %d \n ",
		 &addr, ret,
		 cm_id->device ? cm_id->device->node_type : -1);

	rdma_destroy_id(cm_id);

	return ret;
}
void rds_ib_exit ( void )
{
rds_info_deregister_func ( RDS_INFO_IB_CONNECTIONS , rds_ib_ic_info ) ;
2009-04-01 12:20:19 +04:00
rds_ib_destroy_nodev_conns ( ) ;
2009-02-24 18:30:30 +03:00
ib_unregister_client ( & rds_ib_client ) ;
rds_ib_sysctl_exit ( ) ;
rds_ib_recv_exit ( ) ;
rds_trans_unregister ( & rds_ib_transport ) ;
}
/*
 * Transport operations table for RDS over InfiniBand.  It is handed to
 * rds_trans_register() by rds_ib_init() and is also what
 * rds_ib_conn_info_visitor() compares conn->c_trans against.
 */
struct rds_transport rds_ib_transport = {
	/* address validation */
	.laddr_check		= rds_ib_laddr_check,

	/* send path */
	.xmit_complete		= rds_ib_xmit_complete,
	.xmit			= rds_ib_xmit,
	.xmit_cong_map		= NULL,
	.xmit_rdma		= rds_ib_xmit_rdma,

	/* receive path */
	.recv			= rds_ib_recv,

	/* connection lifetime */
	.conn_alloc		= rds_ib_conn_alloc,
	.conn_free		= rds_ib_conn_free,
	.conn_connect		= rds_ib_conn_connect,
	.conn_shutdown		= rds_ib_conn_shutdown,

	/* incoming message handling */
	.inc_copy_to_user	= rds_ib_inc_copy_to_user,
	.inc_purge		= rds_ib_inc_purge,
	.inc_free		= rds_ib_inc_free,

	/* connection manager events */
	.cm_initiate_connect	= rds_ib_cm_initiate_connect,
	.cm_handle_connect	= rds_ib_cm_handle_connect,
	.cm_connect_complete	= rds_ib_cm_connect_complete,

	/* statistics and teardown */
	.stats_info_copy	= rds_ib_stats_info_copy,
	.exit			= rds_ib_exit,

	/* memory registration */
	.get_mr			= rds_ib_get_mr,
	.sync_mr		= rds_ib_sync_mr,
	.free_mr		= rds_ib_free_mr,
	.flush_mrs		= rds_ib_flush_mrs,

	/* identity */
	.t_owner		= THIS_MODULE,
	.t_name			= " infiniband ",
	.t_type			= RDS_TRANS_IB,
};
/*
 * rds_ib_init - bring up the IB transport.
 *
 * Initialises the device list, then registers the IB client, the sysctl
 * state, the receive-path state and the transport, in that order, and
 * finally installs the RDS_INFO_IB_CONNECTIONS info handler.
 *
 * Returns 0 on success.  If a step fails, its error code is returned and
 * the steps that had already succeeded are undone in reverse order.
 */
int __init rds_ib_init(void)
{
	int err;

	INIT_LIST_HEAD(&rds_ib_devices);

	err = ib_register_client(&rds_ib_client);
	if (err)
		return err;	/* nothing to undo yet */

	err = rds_ib_sysctl_init();
	if (err)
		goto undo_client;

	err = rds_ib_recv_init();
	if (err)
		goto undo_sysctl;

	err = rds_trans_register(&rds_ib_transport);
	if (err)
		goto undo_recv;

	rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);

	return 0;

undo_recv:
	rds_ib_recv_exit();
undo_sysctl:
	rds_ib_sysctl_exit();
undo_client:
	ib_unregister_client(&rds_ib_client);
	return err;
}
/* License tag recorded for this module via the MODULE_LICENSE macro. */
MODULE_LICENSE ( " GPL " ) ;