From f69b22e65ecfcb3648304e0e32a1df6f0d421375 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Wed, 4 Nov 2015 13:42:39 -0800 Subject: [PATCH 01/17] RDS: log the address on bind failure It's useful to know the IP address when RDS fails to bind a connection. Thus, adding it to the error message. Orabug: 21894138 Reviewed-by: Wei Lin Guay Signed-off-by: Santosh Shilimkar --- net/rds/bind.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rds/bind.c b/net/rds/bind.c index 095f6ce583fe..3a915bedb76c 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -176,8 +176,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); - printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, " - "load rds_tcp or rds_rdma?\n"); + pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n", + __func__, &sin->sin_addr.s_addr); goto out; } From bb7897631d2379ec198635cc24bf1e8c629d0bda Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sun, 4 Dec 2016 16:41:29 -0800 Subject: [PATCH 02/17] RDS: mark few internal functions static to make sparse build happy Fixes below warnings: warning: symbol 'rds_send_probe' was not declared. Should it be static? warning: symbol 'rds_send_ping' was not declared. Should it be static? warning: symbol 'rds_tcp_accept_one_path' was not declared. Should it be static? warning: symbol 'rds_walk_conn_path_info' was not declared. Should it be static? 
Signed-off-by: Santosh Shilimkar --- net/rds/connection.c | 10 +++++----- net/rds/send.c | 4 ++-- net/rds/tcp_listen.c | 1 + 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/net/rds/connection.c b/net/rds/connection.c index fe9d31c0b22d..0e04dcceb1d4 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -545,11 +545,11 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); -void rds_walk_conn_path_info(struct socket *sock, unsigned int len, - struct rds_info_iterator *iter, - struct rds_info_lengths *lens, - int (*visitor)(struct rds_conn_path *, void *), - size_t item_len) +static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_conn_path *, void *), + size_t item_len) { u64 buffer[(item_len + 7) / 8]; struct hlist_head *head; diff --git a/net/rds/send.c b/net/rds/send.c index 77c8c6e613ad..bb13c56fc2f8 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1169,7 +1169,7 @@ out: * or * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED */ -int +static int rds_send_probe(struct rds_conn_path *cp, __be16 sport, __be16 dport, u8 h_flags) { @@ -1238,7 +1238,7 @@ rds_send_pong(struct rds_conn_path *cp, __be16 dport) return rds_send_probe(cp, 0, dport, 0); } -void +static void rds_send_ping(struct rds_connection *conn) { unsigned long flags; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index f74bab3ecdca..67d0929c7d3d 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -79,6 +79,7 @@ bail: * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side * by moving them to CONNECTING in this function. 
*/ +static struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) { int i; From ff3f19a2f608ee406331e8c7b60d7376e75c2157 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Mon, 14 Mar 2016 07:43:55 -0700 Subject: [PATCH 03/17] RDS: IB: include faddr in connection log Also use pr_* for it. Signed-off-by: Santosh Shilimkar --- net/rds/ib_cm.c | 19 +++++++++---------- net/rds/ib_recv.c | 4 ++-- net/rds/ib_send.c | 4 ++-- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 5b2ab95afa07..b9da1e59ecc1 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -113,19 +113,18 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } if (conn->c_version < RDS_PROTOCOL(3, 1)) { - printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," - " no longer supported\n", - &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version)); + pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); rds_conn_destroy(conn); return; } else { - printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", - &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); + pr_notice("RDS/IB: connected <%pI4,%pI4> version %u.%u%s\n", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? 
", flow control" : ""); } /* diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 606a11f681d2..6803b75eb8bd 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -980,8 +980,8 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, } else { /* We expect errors as the qp is drained during shutdown */ if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", - &conn->c_faddr, + rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", + &conn->c_laddr, &conn->c_faddr, wc->status, ib_wc_status_msg(wc->status)); } diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 84d90c97332f..19eca5c4c00c 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -300,8 +300,8 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) /* We expect errors as the qp is drained during shutdown */ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", - &conn->c_faddr, wc->status, + rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", + &conn->c_laddr, &conn->c_faddr, wc->status, ib_wc_status_msg(wc->status)); } } From fab8688d7185a1fe01ee9e0930fc59c0f161ee93 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Mon, 4 Jul 2016 15:31:21 -0700 Subject: [PATCH 04/17] RDS: IB: make the transport retry count smallest Transport retry is not much useful since it indicate packet loss in fabric so its better to failover fast rather than longer retry. 
Signed-off-by: Santosh Shilimkar --- net/rds/ib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/ib.h b/net/rds/ib.h index 45ac8e8e58f4..f4e81214e70a 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -16,7 +16,7 @@ #define RDS_IB_DEFAULT_SEND_WR 256 #define RDS_IB_DEFAULT_FR_WR 512 -#define RDS_IB_DEFAULT_RETRY_COUNT 2 +#define RDS_IB_DEFAULT_RETRY_COUNT 1 #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ From 3e56c2f856d7aba6a03feea834d68f9c05f7d0b6 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sun, 4 Dec 2016 16:25:43 -0800 Subject: [PATCH 05/17] RDS: RDMA: fix the ib_map_mr_sg_zbva() argument Fixes warning: Using plain integer as NULL pointer Signed-off-by: Santosh Shilimkar --- net/rds/ib_frmr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index d921adc62765..66b3d6228a15 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -104,14 +104,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) struct rds_ib_frmr *frmr = &ibmr->u.frmr; struct ib_send_wr *failed_wr; struct ib_reg_wr reg_wr; - int ret; + int ret, off = 0; while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { atomic_inc(&ibmr->ic->i_fastreg_wrs); cpu_relax(); } - ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE); + ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, + &off, PAGE_SIZE); if (unlikely(ret != ibmr->sg_len)) return ret < 0 ? ret : -EINVAL; From 8d5d8a5fd7f9337b2eff689df14ff3ae617f3ae6 Mon Sep 17 00:00:00 2001 From: Qing Huang Date: Mon, 4 Jul 2016 16:29:13 -0700 Subject: [PATCH 06/17] RDS: RDMA: start rdma listening after init This prevents RDS from handling incoming rdma packets before RDS completes initializing its recv/send components. 
Signed-off-by: Qing Huang Signed-off-by: Santosh Shilimkar --- net/rds/rdma_transport.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index d5f311767157..fc59821f0a27 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -206,18 +206,13 @@ static int rds_rdma_init(void) { int ret; - ret = rds_rdma_listen_init(); + ret = rds_ib_init(); if (ret) goto out; - ret = rds_ib_init(); + ret = rds_rdma_listen_init(); if (ret) - goto err_ib_init; - - goto out; - -err_ib_init: - rds_rdma_listen_stop(); + rds_ib_exit(); out: return ret; } From 584a8279a44a800dea5a5c1e9d53a002e03016b4 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Mon, 4 Jul 2016 17:04:37 -0700 Subject: [PATCH 07/17] RDS: RDMA: return appropriate error on rdma map failures The first message to a remote node should prompt a new connection even if it is RDMA operation. For RDMA operation the MR mapping can fail because connections is not yet up. Since the connection establishment is asynchronous, we make sure the map failure because of unavailable connection reach to the user by appropriate error code. Before returning to the user, lets trigger the connection so that its ready for the next retry. Signed-off-by: Santosh Shilimkar --- net/rds/send.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/rds/send.c b/net/rds/send.c index bb13c56fc2f8..0a6f38b1c8a5 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -945,6 +945,11 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, ret = rds_cmsg_rdma_map(rs, rm, cmsg); if (!ret) *allocated_mr = 1; + else if (ret == -ENODEV) + /* Accommodate the get_mr() case which can fail + * if connection isn't established yet. 
+ */ + ret = -EAGAIN; break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: @@ -1082,8 +1087,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* Parse any control messages the user may have included. */ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); - if (ret) + if (ret) { + /* Trigger connection so that its ready for the next retry */ + if (ret == -EAGAIN) + rds_conn_connect_if_down(conn); goto out; + } if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", From 56012459310a1dbcc55c2dbf5500a9f7571402cb Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 8 Mar 2016 09:19:01 -0800 Subject: [PATCH 08/17] RDS: IB: split the mr registration and invalidation path MR invalidation in RDS is done in background thread and not in data path like registration. So break the dependency between them which helps to remove the performance bottleneck. Signed-off-by: Santosh Shilimkar --- net/rds/ib.h | 4 +++- net/rds/ib_cm.c | 9 +++++++-- net/rds/ib_frmr.c | 11 ++++++----- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/net/rds/ib.h b/net/rds/ib.h index f4e81214e70a..f14c26d22b27 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -14,7 +14,8 @@ #define RDS_IB_DEFAULT_RECV_WR 1024 #define RDS_IB_DEFAULT_SEND_WR 256 -#define RDS_IB_DEFAULT_FR_WR 512 +#define RDS_IB_DEFAULT_FR_WR 256 +#define RDS_IB_DEFAULT_FR_INV_WR 256 #define RDS_IB_DEFAULT_RETRY_COUNT 1 @@ -125,6 +126,7 @@ struct rds_ib_connection { /* To control the number of wrs from fastreg */ atomic_t i_fastreg_wrs; + atomic_t i_fastunreg_wrs; /* interrupt handling */ struct tasklet_struct i_send_tasklet; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index b9da1e59ecc1..3002acf75766 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -382,7 +382,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn) * completion queue and send queue. 
This extra space is used for FRMR * registration and invalidation work requests */ - fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0); + fr_queue_space = rds_ibdev->use_fastreg ? + (RDS_IB_DEFAULT_FR_WR + 1) + + (RDS_IB_DEFAULT_FR_INV_WR + 1) + : 0; /* add the conn now so that connection establishment has the dev */ rds_ib_add_conn(rds_ibdev, conn); @@ -444,6 +447,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) attr.send_cq = ic->i_send_cq; attr.recv_cq = ic->i_recv_cq; atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR); + atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR); /* * XXX this can fail if max_*_wr is too large? Are we supposed @@ -766,7 +770,8 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) wait_event(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && (atomic_read(&ic->i_signaled_sends) == 0) && - (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR)); + (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) && + (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR)); tasklet_kill(&ic->i_send_tasklet); tasklet_kill(&ic->i_recv_tasklet); diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 66b3d6228a15..48332a6ed738 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -241,8 +241,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr) if (frmr->fr_state != FRMR_IS_INUSE) goto out; - while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { - atomic_inc(&ibmr->ic->i_fastreg_wrs); + while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) { + atomic_inc(&ibmr->ic->i_fastunreg_wrs); cpu_relax(); } @@ -261,7 +261,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr) if (unlikely(ret)) { frmr->fr_state = FRMR_IS_STALE; frmr->fr_inv = false; - atomic_inc(&ibmr->ic->i_fastreg_wrs); + atomic_inc(&ibmr->ic->i_fastunreg_wrs); pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret); goto out; } @@ -289,9 +289,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection 
*ic, struct ib_wc *wc) if (frmr->fr_inv) { frmr->fr_state = FRMR_IS_FREE; frmr->fr_inv = false; + atomic_inc(&ic->i_fastreg_wrs); + } else { + atomic_inc(&ic->i_fastunreg_wrs); } - - atomic_inc(&ic->i_fastreg_wrs); } void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, From c536a068870a08fb7b35482e701a6b72e294b493 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sun, 3 Jul 2016 19:14:10 -0700 Subject: [PATCH 09/17] RDS: RDMA: silence the use_once mr log flood In absence of extension headers, message log will keep flooding the console. As such even without use_once we can clean up the MRs so its not really an error case message so make it debug message Signed-off-by: Santosh Shilimkar --- net/rds/rdma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index ea961144084f..4297f3f337d7 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -415,7 +415,8 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) { - printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); + pr_debug("rds: trying to unuse MR with unknown r_key %u!\n", + r_key); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); return; } From 581d53c91cbf7b31415a9ed5e9a8b89d6af609b3 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sat, 9 Jul 2016 18:31:38 -0700 Subject: [PATCH 10/17] RDS: IB: track and log active side endpoint in connection Useful to know the active and passive end points in a RDS IB connection. 
Signed-off-by: Santosh Shilimkar --- net/rds/ib.h | 3 +++ net/rds/ib_cm.c | 11 +++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/net/rds/ib.h b/net/rds/ib.h index f14c26d22b27..5f02b4d8f10c 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -181,6 +181,9 @@ struct rds_ib_connection { /* Batched completions */ unsigned int i_unsignaled_wrs; + + /* Endpoint role in connection */ + bool i_active_side; }; /* This assumes that atomic_t is at least 32 bits */ diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 3002acf75766..4d1bf04b06b5 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -120,16 +120,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even rds_conn_destroy(conn); return; } else { - pr_notice("RDS/IB: connected <%pI4,%pI4> version %u.%u%s\n", + pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n", + ic->i_active_side ? "Active" : "Passive", &conn->c_laddr, &conn->c_faddr, RDS_PROTOCOL_MAJOR(conn->c_version), RDS_PROTOCOL_MINOR(conn->c_version), ic->i_flowctl ? ", flow control" : ""); } - /* - * Init rings and fill recv. this needs to wait until protocol negotiation - * is complete, since ring layout is different from 3.0 to 3.1. + /* Init rings and fill recv. this needs to wait until protocol + * negotiation is complete, since ring layout is different + * from 3.1 to 4.1. 
*/ rds_ib_send_init_ring(ic); rds_ib_recv_init_ring(ic); @@ -685,6 +686,7 @@ out: if (ic->i_cm_id == cm_id) ret = 0; } + ic->i_active_side = true; return ret; } @@ -859,6 +861,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_sends = NULL; vfree(ic->i_recvs); ic->i_recvs = NULL; + ic->i_active_side = false; } int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) From 09b2b8f52895addd9bf28dc5ac98ff5cc750cf9a Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sat, 9 Jul 2016 17:14:02 -0700 Subject: [PATCH 11/17] RDS: IB: add few useful cache stasts Tracks the ib receive cache total, incoming and frag allocations. Signed-off-by: Santosh Shilimkar --- net/rds/ib.h | 7 +++++++ net/rds/ib_recv.c | 6 ++++++ net/rds/ib_stats.c | 2 ++ 3 files changed, 15 insertions(+) diff --git a/net/rds/ib.h b/net/rds/ib.h index 5f02b4d8f10c..c62e5513d306 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -151,6 +151,7 @@ struct rds_ib_connection { u64 i_ack_recv; /* last ACK received */ struct rds_ib_refill_cache i_cache_incs; struct rds_ib_refill_cache i_cache_frags; + atomic_t i_cache_allocs; /* sending acks */ unsigned long i_ack_flags; @@ -254,6 +255,8 @@ struct rds_ib_statistics { uint64_t s_ib_rx_refill_from_cq; uint64_t s_ib_rx_refill_from_thread; uint64_t s_ib_rx_alloc_limit; + uint64_t s_ib_rx_total_frags; + uint64_t s_ib_rx_total_incs; uint64_t s_ib_rx_credit_updates; uint64_t s_ib_ack_sent; uint64_t s_ib_ack_send_failure; @@ -276,6 +279,8 @@ struct rds_ib_statistics { uint64_t s_ib_rdma_mr_1m_reused; uint64_t s_ib_atomic_cswp; uint64_t s_ib_atomic_fadd; + uint64_t s_ib_recv_added_to_cache; + uint64_t s_ib_recv_removed_from_cache; }; extern struct workqueue_struct *rds_ib_wq; @@ -406,6 +411,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* ib_stats.c */ DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) +#define rds_ib_stats_add(member, 
count) \ + rds_stats_add_which(rds_ib_stats, member, count) unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 6803b75eb8bd..4b0f12679219 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -194,6 +194,8 @@ static void rds_ib_frag_free(struct rds_ib_connection *ic, rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); + atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs); + rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE); } /* Recycle inc after freeing attached frags */ @@ -261,6 +263,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i atomic_dec(&rds_ib_allocation); return NULL; } + rds_ib_stats_inc(s_ib_rx_total_incs); } INIT_LIST_HEAD(&ibinc->ii_frags); rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); @@ -278,6 +281,8 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); if (cache_item) { frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); + atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs); + rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE); } else { frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); if (!frag) @@ -290,6 +295,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic kmem_cache_free(rds_ib_frag_slab, frag); return NULL; } + rds_ib_stats_inc(s_ib_rx_total_frags); } INIT_LIST_HEAD(&frag->f_item); diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 7e78dca1f252..9252ad126335 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -55,6 +55,8 @@ static const char *const rds_ib_stat_names[] = { "ib_rx_refill_from_cq", "ib_rx_refill_from_thread", "ib_rx_alloc_limit", + "ib_rx_total_frags", + "ib_rx_total_incs", "ib_rx_credit_updates", "ib_ack_sent", 
"ib_ack_send_failure", From be2f76eacc278c272f26d46e4168efe5a55f5383 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Mon, 4 Jul 2016 16:16:36 -0700 Subject: [PATCH 12/17] RDS: IB: Add vector spreading for cqs Based on available device vectors, allocate cqs accordingly to get better spread of completion vectors, which helps performance a great deal. Signed-off-by: Santosh Shilimkar --- net/rds/ib.c | 11 +++++++++++ net/rds/ib.h | 5 +++++ net/rds/ib_cm.c | 40 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 53 insertions(+), 3 deletions(-) diff --git a/net/rds/ib.c b/net/rds/ib.c index 5680d90b0b77..8d70884d7bb6 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -111,6 +111,9 @@ static void rds_ib_dev_free(struct work_struct *work) kfree(i_ipaddr); } + if (rds_ibdev->vector_load) + kfree(rds_ibdev->vector_load); + kfree(rds_ibdev); } @@ -159,6 +162,14 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; + rds_ibdev->vector_load = kzalloc(sizeof(int) * device->num_comp_vectors, + GFP_KERNEL); + if (!rds_ibdev->vector_load) { + pr_err("RDS/IB: %s failed to allocate vector memory\n", + __func__); + goto put_dev; + } + rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device, 0); if (IS_ERR(rds_ibdev->pd)) { diff --git a/net/rds/ib.h b/net/rds/ib.h index c62e5513d306..1fe9f79fead5 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -185,6 +185,10 @@ struct rds_ib_connection { /* Endpoint role in connection */ bool i_active_side; + + /* Send/Recv vectors */ + int i_scq_vector; + int i_rcq_vector; }; /* This assumes that atomic_t is at least 32 bits */ @@ -227,6 +231,7 @@ struct rds_ib_device { spinlock_t spinlock; /* protect the above */ atomic_t refcount; struct work_struct free_work; + int *vector_load; }; #define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 
4d1bf04b06b5..33c8584ada1f 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -358,6 +358,28 @@ static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context) tasklet_schedule(&ic->i_send_tasklet); } +static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev) +{ + int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1]; + int index = rds_ibdev->dev->num_comp_vectors - 1; + int i; + + for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) { + if (rds_ibdev->vector_load[i] < min) { + index = i; + min = rds_ibdev->vector_load[i]; + } + } + + rds_ibdev->vector_load[index]++; + return index; +} + +static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index) +{ + rds_ibdev->vector_load[index]--; +} + /* * This needs to be very careful to not leave IS_ERR pointers around for * cleanup to trip over. @@ -399,25 +421,30 @@ static int rds_ib_setup_qp(struct rds_connection *conn) /* Protection domain and memory range */ ic->i_pd = rds_ibdev->pd; + ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev); cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1; - + cq_attr.comp_vector = ic->i_scq_vector; ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, rds_ib_cq_event_handler, conn, &cq_attr); if (IS_ERR(ic->i_send_cq)) { ret = PTR_ERR(ic->i_send_cq); ic->i_send_cq = NULL; + ibdev_put_vector(rds_ibdev, ic->i_scq_vector); rdsdebug("ib_create_cq send failed: %d\n", ret); goto out; } + ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev); cq_attr.cqe = ic->i_recv_ring.w_nr; + cq_attr.comp_vector = ic->i_rcq_vector; ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv, rds_ib_cq_event_handler, conn, &cq_attr); if (IS_ERR(ic->i_recv_cq)) { ret = PTR_ERR(ic->i_recv_cq); ic->i_recv_cq = NULL; + ibdev_put_vector(rds_ibdev, ic->i_rcq_vector); rdsdebug("ib_create_cq recv failed: %d\n", ret); goto out; } @@ -780,10 +807,17 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) /* first destroy 
the ib state that generates callbacks */ if (ic->i_cm_id->qp) rdma_destroy_qp(ic->i_cm_id); - if (ic->i_send_cq) + if (ic->i_send_cq) { + if (ic->rds_ibdev) + ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector); ib_destroy_cq(ic->i_send_cq); - if (ic->i_recv_cq) + } + + if (ic->i_recv_cq) { + if (ic->rds_ibdev) + ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector); ib_destroy_cq(ic->i_recv_cq); + } /* then free the resources that ib callbacks use */ if (ic->i_send_hdrs) From 941f8d55f6d613a460a5e080d25a38509f45eb75 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Thu, 18 Feb 2016 20:06:47 -0800 Subject: [PATCH 13/17] RDS: RDMA: Fix the composite message user notification When an application sends an RDS RDMA composite message consisting of an RDMA transfer followed by a non-RDMA payload, it expects to be notified *only* when the full message gets delivered. RDS RDMA notification doesn't behave this way though. Thanks to Venkat for debugging and root-causing the issue where only the first part of the message (RDMA) was successfully delivered but the remaining payload delivery failed. In that case, the application should not be notified with a false positive of message delivery success. Fix this case by making sure the user gets notified only after the full message delivery. 
Reviewed-by: Venkat Venkatsubra Signed-off-by: Santosh Shilimkar --- net/rds/ib_send.c | 25 +++++++++++++++---------- net/rds/rdma.c | 10 ++++++++++ net/rds/rds.h | 1 + net/rds/send.c | 4 +++- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 19eca5c4c00c..5e72de10c484 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -69,16 +69,6 @@ static void rds_ib_send_complete(struct rds_message *rm, complete(rm, notify_status); } -static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, - struct rm_data_op *op, - int wc_status) -{ - if (op->op_nents) - ib_dma_unmap_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, - DMA_TO_DEVICE); -} - static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, struct rm_rdma_op *op, int wc_status) @@ -139,6 +129,21 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, rds_ib_stats_inc(s_ib_atomic_fadd); } +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) +{ + struct rds_message *rm = container_of(op, struct rds_message, data); + + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); + + if (rm->rdma.op_active && rm->data.op_notify) + rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status); +} + /* * Unmap the resources associated with a struct send_work. * diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 4297f3f337d7..138aef644c56 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -627,6 +627,16 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; + + /* Enable rmda notification on data operation for composite + * rds messages and make sure notification is enabled only + * for the data operation which follows it so that application + * gets notified only after full message gets delivered. 
+ */ + if (rm->data.op_sg) { + rm->rdma.op_notify = 0; + rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + } } /* The cookie contains the R_Key of the remote memory region, and diff --git a/net/rds/rds.h b/net/rds/rds.h index ebbf909b87ec..0bb8213c7d0b 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -419,6 +419,7 @@ struct rds_message { } rdma; struct rm_data_op { unsigned int op_active:1; + unsigned int op_notify:1; unsigned int op_nents; unsigned int op_count; unsigned int op_dmasg; diff --git a/net/rds/send.c b/net/rds/send.c index 0a6f38b1c8a5..45e025b65d29 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -476,12 +476,14 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) struct rm_rdma_op *ro; struct rds_notifier *notifier; unsigned long flags; + unsigned int notify = 0; spin_lock_irqsave(&rm->m_rs_lock, flags); + notify = rm->rdma.op_notify | rm->data.op_notify; ro = &rm->rdma; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && - ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_active && notify && ro->op_notifier) { notifier = ro->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); From cf657269d311d575eb196c7045579b3443631b8b Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Thu, 29 Sep 2016 11:07:11 -0700 Subject: [PATCH 14/17] RDS: IB: fix panic due to handlers running post teardown The shutdown code's reaping loop takes care of emptying the CQs before they are destroyed, and once the tasklets are killed, the handlers are not expected to run. But because of core tasklet code issues, a tasklet handler could still run even after tasklet_kill. The RDS IB shutdown code already reaps the CQs before freeing cq/qp resources, so the handlers have nothing left to do post shutdown. On the other hand, any handler running after teardown and trying to access already freed qp/cq resources causes issues. This patch fixes the race by making sure that the handlers return without any action post teardown. 
Reviewed-by: Wengang Signed-off-by: Santosh Shilimkar --- net/rds/ib.h | 1 + net/rds/ib_cm.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/net/rds/ib.h b/net/rds/ib.h index 1fe9f79fead5..540458928f3c 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -185,6 +185,7 @@ struct rds_ib_connection { /* Endpoint role in connection */ bool i_active_side; + atomic_t i_cq_quiesce; /* Send/Recv vectors */ int i_scq_vector; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 33c8584ada1f..ce3775abc6e7 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -128,6 +128,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even ic->i_flowctl ? ", flow control" : ""); } + atomic_set(&ic->i_cq_quiesce, 0); + /* Init rings and fill recv. this needs to wait until protocol * negotiation is complete, since ring layout is different * from 3.1 to 4.1. @@ -267,6 +269,10 @@ static void rds_ib_tasklet_fn_send(unsigned long data) rds_ib_stats_inc(s_ib_tasklet_call); + /* if cq has been already reaped, ignore incoming cq event */ + if (atomic_read(&ic->i_cq_quiesce)) + return; + poll_scq(ic, ic->i_send_cq, ic->i_send_wc); ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); poll_scq(ic, ic->i_send_cq, ic->i_send_wc); @@ -308,6 +314,10 @@ static void rds_ib_tasklet_fn_recv(unsigned long data) rds_ib_stats_inc(s_ib_tasklet_call); + /* if cq has been already reaped, ignore incoming cq event */ + if (atomic_read(&ic->i_cq_quiesce)) + return; + memset(&state, 0, sizeof(state)); poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); @@ -804,6 +814,8 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) tasklet_kill(&ic->i_send_tasklet); tasklet_kill(&ic->i_recv_tasklet); + atomic_set(&ic->i_cq_quiesce, 1); + /* first destroy the ib state that generates callbacks */ if (ic->i_cm_id->qp) rdma_destroy_qp(ic->i_cm_id); From 192a798f52998a643cef84fce0204be56666b0bf Mon Sep 17 00:00:00 2001 From: Venkat 
Venkatsubra Date: Sat, 9 Jul 2016 17:36:20 -0700 Subject: [PATCH 15/17] RDS: add stat for socket recv memory usage Tracks the receive side memory added to sockets and removed from sockets. Signed-off-by: Venkat Venkatsubra Signed-off-by: Santosh Shilimkar --- net/rds/rds.h | 3 +++ net/rds/recv.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/net/rds/rds.h b/net/rds/rds.h index 0bb8213c7d0b..8ccd5a93e56c 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -631,6 +631,9 @@ struct rds_statistics { uint64_t s_cong_update_received; uint64_t s_cong_send_error; uint64_t s_cong_send_blocked; + uint64_t s_recv_bytes_added_to_socket; + uint64_t s_recv_bytes_removed_from_socket; + }; /* af_rds.c */ diff --git a/net/rds/recv.c b/net/rds/recv.c index 9d0666e5fe35..ba19eeeae85a 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -94,6 +94,10 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, return; rs->rs_rcv_bytes += delta; + if (delta > 0) + rds_stats_add(s_recv_bytes_added_to_socket, delta); + else + rds_stats_add(s_recv_bytes_removed_from_socket, -delta); now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " From f9fb69adb6c7acca60977a4db5a5f95b8e66c041 Mon Sep 17 00:00:00 2001 From: Avinash Repaka Date: Mon, 29 Feb 2016 15:30:57 -0800 Subject: [PATCH 16/17] RDS: make message size limit compliant with spec RDS supports a maximum message size of 1M, but the code doesn't check this in all cases. This patch fixes it for RDMA & non-RDMA and RDS MR size, and it is enforced irrespective of the underlying transport. 
Signed-off-by: Avinash Repaka Signed-off-by: Santosh Shilimkar --- net/rds/rdma.c | 9 ++++++++- net/rds/rds.h | 3 +++ net/rds/send.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 138aef644c56..f06fac4886b0 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -40,7 +40,6 @@ /* * XXX * - build with sparse - * - should we limit the size of a mr region? let transport return failure? * - should we detect duplicate keys on a socket? hmm. * - an rdma is an mlock, apply rlimit? */ @@ -200,6 +199,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } + /* Restrict the size of mr irrespective of underlying transport + * To account for unaligned mr regions, subtract one from nr_pages + */ + if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) { + ret = -EMSGSIZE; + goto out; + } + rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", args->vec.addr, args->vec.bytes, nr_pages); diff --git a/net/rds/rds.h b/net/rds/rds.h index 8ccd5a93e56c..f713194e4620 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -50,6 +50,9 @@ void rdsdebug(char *fmt, ...) 
#define RDS_FRAG_SHIFT 12 #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) +/* Used to limit both RDMA and non-RDMA RDS message to 1MB */ +#define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20)) + #define RDS_CONG_MAP_BYTES (65536 / 8) #define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) diff --git a/net/rds/send.c b/net/rds/send.c index 45e025b65d29..5cc64039caf7 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -994,6 +994,26 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn) return hash; } +static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) +{ + struct rds_rdma_args *args; + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) { + args = CMSG_DATA(cmsg); + *rdma_bytes += args->remote_vec.bytes; + } + } + return 0; +} + int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) { struct sock *sk = sock->sk; @@ -1008,6 +1028,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) int nonblock = msg->msg_flags & MSG_DONTWAIT; long timeo = sock_sndtimeo(sk, nonblock); struct rds_conn_path *cpath; + size_t total_payload_len = payload_len, rdma_payload_len = 0; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ @@ -1040,6 +1061,16 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) } release_sock(sk); + ret = rds_rdma_bytes(msg, &rdma_payload_len); + if (ret) + goto out; + + total_payload_len += rdma_payload_len; + if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) { + ret = -EMSGSIZE; + goto out; + } + if (payload_len > rds_sk_sndbuf(rs)) { ret = -EMSGSIZE; goto out; From 3289025aedc018f8fd9d0e37fb9efa0c6d531ffa Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: 
Mon, 4 Jul 2016 22:35:15 -0700 Subject: [PATCH 17/17] RDS: add receive message trace used by application Socket option to tap receive path latency in various stages in nanoseconds. It can be enabled on selective sockets using the SO_RDS_MSG_RXPATH_LATENCY socket option. RDS will return the data to application with RDS_CMSG_RXPATH_LATENCY in defined format. Scope is left to add more trace points for future without need of change in the interface. Reviewed-by: Sowmini Varadhan Signed-off-by: Santosh Shilimkar --- include/uapi/linux/rds.h | 33 +++++++++++++++++++++++++++++++++ net/rds/af_rds.c | 28 ++++++++++++++++++++++++++++ net/rds/ib_recv.c | 4 ++++ net/rds/rds.h | 10 ++++++++++ net/rds/recv.c | 32 +++++++++++++++++++++++++++++--- net/rds/tcp_recv.c | 5 +++++ 6 files changed, 109 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 0f9265cb2a96..3833113ab2c0 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -52,6 +52,13 @@ #define RDS_GET_MR_FOR_DEST 7 #define SO_RDS_TRANSPORT 8 +/* Socket option to tap receive path latency + * SO_RDS: SO_RDS_MSG_RXPATH_LATENCY + * Format used struct rds_rx_trace_so + */ +#define SO_RDS_MSG_RXPATH_LATENCY 10 + + /* supported values for SO_RDS_TRANSPORT */ #define RDS_TRANS_IB 0 #define RDS_TRANS_IWARP 1 @@ -77,6 +84,12 @@ * the same as for the GET_MR setsockopt. * RDS_CMSG_RDMA_STATUS (recvmsg) * Returns the status of a completed RDMA operation. + * RDS_CMSG_RXPATH_LATENCY(recvmsg) + * Returns rds message latencies in various stages of receive + * path in nS. Its set per socket using SO_RDS_MSG_RXPATH_LATENCY + * socket option. Legitimate points are defined in + * enum rds_message_rxpath_latency. More points can be added in + * future. CSMG format is struct rds_cmsg_rx_trace.
*/ #define RDS_CMSG_RDMA_ARGS 1 #define RDS_CMSG_RDMA_DEST 2 @@ -87,6 +100,7 @@ #define RDS_CMSG_ATOMIC_CSWP 7 #define RDS_CMSG_MASKED_ATOMIC_FADD 8 #define RDS_CMSG_MASKED_ATOMIC_CSWP 9 +#define RDS_CMSG_RXPATH_LATENCY 11 #define RDS_INFO_FIRST 10000 #define RDS_INFO_COUNTERS 10000 @@ -171,6 +185,25 @@ struct rds_info_rdma_connection { uint32_t rdma_mr_size; }; +/* RDS message Receive Path Latency points */ +enum rds_message_rxpath_latency { + RDS_MSG_RX_HDR_TO_DGRAM_START = 0, + RDS_MSG_RX_DGRAM_REASSEMBLE, + RDS_MSG_RX_DGRAM_DELIVERED, + RDS_MSG_RX_DGRAM_TRACE_MAX +}; + +struct rds_rx_trace_so { + u8 rx_traces; + u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX]; +}; + +struct rds_cmsg_rx_trace { + u8 rx_traces; + u8 rx_trace_pos[RDS_MSG_RX_DGRAM_TRACE_MAX]; + u64 rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; +}; + /* * Congestion monitoring. * Congestion control in RDS happens at the host connection diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 2ac1e6194be3..fd8217404162 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -298,6 +298,30 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, return 0; } +static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, + int optlen) +{ + struct rds_rx_trace_so trace; + int i; + + if (optlen != sizeof(struct rds_rx_trace_so)) + return -EFAULT; + + if (copy_from_user(&trace, optval, sizeof(trace))) + return -EFAULT; + + rs->rs_rx_traces = trace.rx_traces; + for (i = 0; i < rs->rs_rx_traces; i++) { + if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) { + rs->rs_rx_traces = 0; + return -EFAULT; + } + rs->rs_rx_trace[i] = trace.rx_trace_pos[i]; + } + + return 0; +} + static int rds_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -338,6 +362,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, ret = rds_enable_recvtstamp(sock->sk, optval, optlen); release_sock(sock->sk); break; + case 
SO_RDS_MSG_RXPATH_LATENCY: + ret = rds_recv_track_latency(rs, optval, optlen); + break; default: ret = -ENOPROTOOPT; } @@ -484,6 +511,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) INIT_LIST_HEAD(&rs->rs_cong_list); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; + rs->rs_rx_traces = 0; spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 4b0f12679219..e10624aa6959 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -911,8 +911,12 @@ static void rds_ib_process_recv(struct rds_connection *conn, ic->i_ibinc = ibinc; hdr = &ibinc->ii_inc.i_hdr; + ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = + local_clock(); memcpy(hdr, ihdr, sizeof(*hdr)); ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] = + local_clock(); rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, ic->i_recv_data_rem, hdr->h_flags); diff --git a/net/rds/rds.h b/net/rds/rds.h index f713194e4620..07fff73dd4f3 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -253,6 +253,11 @@ struct rds_ext_header_rdma_dest { #define RDS_EXTHDR_GEN_NUM 6 #define __RDS_EXTHDR_MAX 16 /* for now */ +#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1) +#define RDS_MSG_RX_HDR 0 +#define RDS_MSG_RX_START 1 +#define RDS_MSG_RX_END 2 +#define RDS_MSG_RX_CMSG 3 struct rds_incoming { atomic_t i_refcount; @@ -265,6 +270,7 @@ struct rds_incoming { rds_rdma_cookie_t i_rdma_cookie; struct timeval i_rx_tstamp; + u64 i_rx_lat_trace[RDS_RX_MAX_TRACES]; }; struct rds_mr { @@ -575,6 +581,10 @@ struct rds_sock { unsigned char rs_recverr, rs_cong_monitor; u32 rs_hash_initval; + + /* Socket receive path trace points*/ + u8 rs_rx_traces; + u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) diff --git a/net/rds/recv.c b/net/rds/recv.c index ba19eeeae85a..8b7e7b7f2c2d 100644 --- a/net/rds/recv.c 
+++ b/net/rds/recv.c @@ -43,6 +43,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr) { + int i; + atomic_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; @@ -50,6 +52,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, inc->i_rdma_cookie = 0; inc->i_rx_tstamp.tv_sec = 0; inc->i_rx_tstamp.tv_usec = 0; + + for (i = 0; i < RDS_RX_MAX_TRACES; i++) + inc->i_rx_lat_trace[i] = 0; } EXPORT_SYMBOL_GPL(rds_inc_init); @@ -373,6 +378,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, if (sock_flag(sk, SOCK_RCVTSTAMP)) do_gettimeofday(&inc->i_rx_tstamp); rds_inc_addref(inc); + inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock(); list_add_tail(&inc->i_item, &rs->rs_recv_queue); __rds_wake_sk_sleep(sk); } else { @@ -534,7 +540,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); if (ret) - return ret; + goto out; } if ((inc->i_rx_tstamp.tv_sec != 0) && @@ -543,10 +549,30 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, sizeof(struct timeval), &inc->i_rx_tstamp); if (ret) - return ret; + goto out; } - return 0; + if (rs->rs_rx_traces) { + struct rds_cmsg_rx_trace t; + int i, j; + + inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock(); + t.rx_traces = rs->rs_rx_traces; + for (i = 0; i < rs->rs_rx_traces; i++) { + j = rs->rs_rx_trace[i]; + t.rx_trace_pos[i] = j; + t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] - + inc->i_rx_lat_trace[j]; + } + + ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY, + sizeof(t), &t); + if (ret) + goto out; + } + +out: + return ret; } int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index ad4892e97f91..e006ef8e6d40 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -180,6 +180,9 @@ static int 
rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, rdsdebug("alloced tinc %p\n", tinc); rds_inc_path_init(&tinc->ti_inc, cp, cp->cp_conn->c_faddr); + tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = + local_clock(); + /* * XXX * we might be able to use the __ variants when * we've already serialized at a higher level. @@ -204,6 +207,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, /* could be 0 for a 0 len message */ tc->t_tinc_data_rem = be32_to_cpu(tinc->ti_inc.i_hdr.h_len); + tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] = + local_clock(); } }