From 2c1619edef61a03cb516efaa81750784c3071d10 Mon Sep 17 00:00:00 2001 From: Danit Goldberg Date: Thu, 24 Jan 2019 14:18:15 +0200 Subject: [PATCH 01/38] IB/cma: Define option to set ack timeout and pack tos_set Define new option in 'rdma_set_option' to override calculated QP timeout when requested to provide QP attributes to modify a QP. At the same time, pack tos_set to be bitfield. Signed-off-by: Danit Goldberg Reviewed-by: Moni Shoua Signed-off-by: Leon Romanovsky Reviewed-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 32 ++++++++++++++++++++++++++++++ drivers/infiniband/core/cma_priv.h | 4 +++- drivers/infiniband/core/ucma.c | 7 +++++++ include/rdma/rdma_cm.h | 1 + include/uapi/rdma/rdma_user_cm.h | 4 ++++ 5 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index e15546ae4d0f..83aa2ad0c27e 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -888,6 +888,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; + id_priv->timeout_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); @@ -1130,6 +1131,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, } else ret = -ENOSYS; + if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) + qp_attr->timeout = id_priv->timeout; + return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); @@ -2490,6 +2494,34 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos) } EXPORT_SYMBOL(rdma_set_service_type); +/** + * rdma_set_ack_timeout() - Set the ack timeout of QP associated + * with a connection identifier. + * @id: Communication identifier to associated with service type. + * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. + * + * This function should be called before rdma_connect() on active side, + * and on passive side before rdma_accept(). It is applicable to primary + * path only. The timeout will affect the local side of the QP, it is not + * negotiated with remote side and zero disables the timer. + * + * Return: 0 for success + */ +int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) +{ + struct rdma_id_private *id_priv; + + if (id->qp_type != IB_QPT_RC) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->timeout = timeout; + id_priv->timeout_set = true; + + return 0; +} +EXPORT_SYMBOL(rdma_set_ack_timeout); + static void cma_query_handler(int status, struct sa_path_rec *path_rec, void *context) { diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index cf47c69436a7..ca7307277518 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -84,9 +84,11 @@ struct rdma_id_private { u32 options; u8 srq; u8 tos; - bool tos_set; + u8 tos_set:1; + u8 timeout_set:1; u8 reuseaddr; u8 afonly; + u8 timeout; enum ib_gid_type gid_type; /* diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 01d68ed46c1b..7468b26b8a01 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1236,6 +1236,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, } ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 
1 : 0); break; + case RDMA_OPTION_ID_ACK_TIMEOUT: + if (optlen != sizeof(u8)) { + ret = -EINVAL; + break; + } + ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval)); + break; default: ret = -ENOSYS; } diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 60987a5903b7..71f48cfdc24c 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -374,6 +374,7 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse); */ int rdma_set_afonly(struct rdma_cm_id *id, int afonly); +int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout); /** * rdma_get_service_id - Return the IB service ID for a specified address. * @id: Communication identifier associated with the address. diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 0d1e78ebad05..e42940a215a3 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -300,6 +300,10 @@ enum { RDMA_OPTION_ID_TOS = 0, RDMA_OPTION_ID_REUSEADDR = 1, RDMA_OPTION_ID_AFONLY = 2, + RDMA_OPTION_ID_ACK_TIMEOUT = 3 +}; + +enum { RDMA_OPTION_IB_PATH = 1 }; From 9491128f780e5be382a5b22990439d017f6dfc59 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 1 Feb 2019 12:44:27 -0800 Subject: [PATCH 02/38] RDMA/cma: listening device cm_ids should inherit tos If a user binds to INADDR_ANY and sets the service id, then the device-specific cm_ids should also use this tos. This allows an app to do: rdma_bind_addr(INADDR_ANY) set_service_type() rdma_listen() And connections setup via this listening endpoint will use the correct tos. Signed-off-by: Steve Wise Reviewed-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 83aa2ad0c27e..e761ddd09aed 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2466,6 +2466,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; + dev_id_priv->tos_set = id_priv->tos_set; + dev_id_priv->tos = id_priv->tos; ret = rdma_listen(id, id_priv->backlog); if (ret) From 926ba19b3574f6a80823a42484877ed65e91da9c Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 1 Feb 2019 12:44:32 -0800 Subject: [PATCH 03/38] RDMA/iwcm: add tos_set bool to iw_cm struct This allows drivers to know the tos was actively set by the application. 
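To make the application-facing side of the three patches above concrete, a hedged userspace sketch follows (assumptions: librdmacm/rdma-core headers that expose RDMA_OPTION_ID_TOS and the new RDMA_OPTION_ID_ACK_TIMEOUT added in patch 01; error handling omitted). A server binds to INADDR_ANY, sets the service type so the per-device listeners inherit it (patch 02), and an active side sets the QP ack timeout before rdma_connect() (patch 01). On the driver side, the tos_set bit added here lets iWARP providers tell an explicitly configured TOS apart from the default, as the cxgb4 change in patch 04 below does.

/*
 * Illustrative userspace sketch only, not part of this series.
 * Assumes librdmacm/rdma-core headers exposing RDMA_OPTION_ID_TOS and
 * the new RDMA_OPTION_ID_ACK_TIMEOUT; error handling omitted.
 */
#include <stdint.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <rdma/rdma_cma.h>

static int listen_with_tos(struct rdma_cm_id **listen_id, uint16_t port)
{
	struct sockaddr_in any = {
		.sin_family = AF_INET,
		.sin_port = htons(port),
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	uint8_t tos = 0x28;	/* e.g. DSCP AF11 in the TOS byte */

	rdma_create_id(NULL, listen_id, NULL, RDMA_PS_TCP);
	rdma_bind_addr(*listen_id, (struct sockaddr *)&any);

	/* Patch 02: this TOS is now inherited by the per-device cm_ids. */
	rdma_set_option(*listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_TOS,
			&tos, sizeof(tos));

	return rdma_listen(*listen_id, 16);
}

static void set_ack_timeout(struct rdma_cm_id *id)
{
	/* Patch 01: 4.096 us * 2^14 ~= 67 ms; call before rdma_connect(). */
	uint8_t timeout = 14;

	rdma_set_option(id, RDMA_OPTION_ID, RDMA_OPTION_ID_ACK_TIMEOUT,
			&timeout, sizeof(timeout));
}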
Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 2 ++ include/rdma/iw_cm.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index e761ddd09aed..c43512752b8a 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2414,6 +2414,7 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) return PTR_ERR(id); id->tos = id_priv->tos; + id->tos_set = id_priv->tos_set; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), @@ -3843,6 +3844,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, return PTR_ERR(cm_id); cm_id->tos = id_priv->tos; + cm_id->tos_set = id_priv->tos_set; id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 48512abd3162..0e1f02815643 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -94,7 +94,8 @@ struct iw_cm_id { void (*add_ref)(struct iw_cm_id *); void (*rem_ref)(struct iw_cm_id *); u8 tos; - bool mapped; + bool tos_set:1; + bool mapped:1; }; struct iw_cm_conn_param { From 7235ea227e19100ca748ccd6279322a166a90953 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 1 Feb 2019 12:44:37 -0800 Subject: [PATCH 04/38] iw_cxgb4: use listening ep tos when accepting new connections If the parent listening endpoint has a service type set, then use that when setting up the connection. This allows server-side applications to mandate the tos for passive side connections via rdma_set_service_type() on the listening endpoints. Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/cm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 59917eb124da..c4e4085430bf 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2476,7 +2476,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) u16 peer_mss = ntohs(req->tcpopt.mss); int iptype; unsigned short hdrs; - u8 tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid)); + u8 tos; parent_ep = (struct c4iw_ep *)get_ep_from_stid(dev, stid); if (!parent_ep) { @@ -2490,6 +2490,11 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) goto reject; } + if (parent_ep->com.cm_id->tos_set) + tos = parent_ep->com.cm_id->tos; + else + tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid)); + cxgb_get_4tuple(req, parent_ep->com.dev->rdev.lldi.adapter_type, &iptype, local_ip, peer_ip, &local_port, &peer_port); @@ -2509,7 +2514,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) ntohs(peer_port), peer_mss); dst = cxgb_find_route6(&dev->rdev.lldi, get_real_dev, local_ip, peer_ip, local_port, peer_port, - PASS_OPEN_TOS_G(ntohl(req->tos_stid)), + tos, ((struct sockaddr_in6 *) &parent_ep->com.local_addr)->sin6_scope_id); } From cb3ba0bde881f0cb7e3945d2a266901e2bd18c92 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 1 Feb 2019 12:44:41 -0800 Subject: [PATCH 05/38] iw_cxgb4: use tos when importing the endpoint import_ep() is passed the correct tos, but doesn't use it correctly. 
Fixes: ac8e4c69a021 ("cxgb4/iw_cxgb4: TOS support") Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index c4e4085430bf..0259198d594d 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2072,7 +2072,7 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, } else { pdev = get_real_dev(n->dev); ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, - n, pdev, 0); + n, pdev, rt_tos2priority(tos)); if (!ep->l2t) goto out; ep->mtu = dst_mtu(dst); From c8a7eb554a83214c3d8ee5cb322da8c72810d2dc Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 1 Feb 2019 12:44:53 -0800 Subject: [PATCH 06/38] iw_cxgb4: use tos when finding ipv6 routes When IPv6 support was added, the correct tos was not passed to cxgb_find_route6(). This potentially results in the wrong route entry. Fixes: 830662f6f032 ("RDMA/cxgb4: Add support for active and passive open connection with IPv6 address") Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/cm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 0259198d594d..77efd4ae8e10 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2161,7 +2161,8 @@ static int c4iw_reconnect(struct c4iw_ep *ep) laddr6->sin6_addr.s6_addr, raddr6->sin6_addr.s6_addr, laddr6->sin6_port, - raddr6->sin6_port, 0, + raddr6->sin6_port, + ep->com.cm_id->tos, raddr6->sin6_scope_id); iptype = 6; ra = (__u8 *)&raddr6->sin6_addr; @@ -3326,7 +3327,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) laddr6->sin6_addr.s6_addr, raddr6->sin6_addr.s6_addr, laddr6->sin6_port, - raddr6->sin6_port, 0, + raddr6->sin6_port, cm_id->tos, raddr6->sin6_scope_id); } if (!ep->dst) { From 0c236606490b5b356ac3bb885b6417ce09bce63f Mon Sep 17 00:00:00 2001 From: Parvi Kaustubhi Date: Fri, 8 Feb 2019 13:53:43 -0800 Subject: [PATCH 07/38] IB/usnic: Fix locking when unregistering Move the call to usnic_ib_device_remove after usnic_ib_ibdev_list_lock has been released. Signed-off-by: Parvi Kaustubhi Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/usnic/usnic_ib_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 577d9301251a..1ec155823716 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -470,15 +470,17 @@ static void usnic_ib_undiscover_pf(struct kref *kref) &usnic_ib_ibdev_list, ib_dev_link) { if (us_ibdev->pdev == dev) { list_del(&us_ibdev->ib_dev_link); - usnic_ib_device_remove(us_ibdev); found = true; break; } } - WARN(!found, "Failed to remove PF %s\n", pci_name(dev)); mutex_unlock(&usnic_ib_ibdev_list_lock); + if (found) + usnic_ib_device_remove(us_ibdev); + else + WARN(1, "Failed to remove PF %s\n", pci_name(dev)); } static struct usnic_ib_dev *usnic_ib_discover_pf(struct usnic_vnic *vnic) From d60667fc398ed34b3c7456b020481c55c760e503 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Sat, 2 Feb 2019 11:09:42 +0200 Subject: [PATCH 08/38] IB/core: Unregister notifier before freeing MAD security If the notifier runs after the security context is freed an access of freed memory can occur. 
Fixes: 47a2b338fe63 ("IB/core: Enforce security on management datagrams") Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/security.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 1efadbccf394..402449d4a888 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -727,9 +727,10 @@ void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) if (!rdma_protocol_ib(agent->device, agent->port_num)) return; - security_ib_free_security(agent->security); if (agent->lsm_nb_reg) unregister_lsm_notifier(&agent->lsm_nb); + + security_ib_free_security(agent->security); } int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) From 6e88e672b69f0e627acdae74a527b730ea224b6b Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Sat, 2 Feb 2019 11:09:43 +0200 Subject: [PATCH 09/38] IB/core: Fix potential memory leak while creating MAD agents If the MAD agents isn't allowed to manage the subnet, or fails to register for the LSM notifier, the security context is leaked. Free the context in these cases. Fixes: 47a2b338fe63 ("IB/core: Enforce security on management datagrams") Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Reported-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/security.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 402449d4a888..7662e9347238 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -710,16 +710,20 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, dev_name(&agent->device->dev), agent->port_num); if (ret) - return ret; + goto free_security; agent->lsm_nb.notifier_call = ib_mad_agent_security_change; ret = register_lsm_notifier(&agent->lsm_nb); if (ret) - return ret; + goto free_security; agent->smp_allowed = true; agent->lsm_nb_reg = true; return 0; + +free_security: + security_ib_free_security(agent->security); + return ret; } void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) From 805b754d492f6227e1646001bdf85ad4bb819e55 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Sat, 2 Feb 2019 11:09:44 +0200 Subject: [PATCH 10/38] IB/core: Eliminate a hole in MAD agent struct Move the security related fields above the u8s to eliminate a hole in the struct. pahole before: struct ib_mad_agent { ... u32 hi_tid; /* 48 4 */ u32 flags; /* 52 4 */ u8 port_num; /* 56 1 */ u8 rmpp_version; /* 57 1 */ /* XXX 6 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ void * security; /* 64 8 */ bool smp_allowed; /* 72 1 */ bool lsm_nb_reg; /* 73 1 */ /* XXX 6 bytes hole, try to pack */ struct notifier_block lsm_nb; /* 80 24 */ /* XXX last struct has 4 bytes of padding */ /* size: 104, cachelines: 2, members: 14 */ ... }; pahole after: struct ib_mad_agent { ... u32 hi_tid; /* 48 4 */ u32 flags; /* 52 4 */ void * security; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct notifier_block lsm_nb; /* 64 24 */ /* XXX last struct has 4 bytes of padding */ u8 port_num; /* 88 1 */ u8 rmpp_version; /* 89 1 */ bool smp_allowed; /* 90 1 */ bool lsm_nb_reg; /* 91 1 */ /* size: 96, cachelines: 2, members: 14 */ ... 
}; Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_mad.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index fdef558e3a2d..1c0b914f199d 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -616,12 +616,12 @@ struct ib_mad_agent { void *context; u32 hi_tid; u32 flags; + void *security; + struct notifier_block lsm_nb; u8 port_num; u8 rmpp_version; - void *security; bool smp_allowed; bool lsm_nb_reg; - struct notifier_block lsm_nb; }; /** From c66f67414c1f88554485bb2a0abf8b5c0d741de7 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Sat, 2 Feb 2019 11:09:45 +0200 Subject: [PATCH 11/38] IB/core: Don't register each MAD agent for LSM notifier When creating many MAD agents in a short period of time, receive packet processing can be delayed long enough to cause timeouts while new agents are being added to the atomic notifier chain with IRQs disabled. Notifier chain registration and unregstration is an O(n) operation. With large numbers of MAD agents being created and destroyed simultaneously the CPUs spend too much time with interrupts disabled. Instead of each MAD agent registering for it's own LSM notification, maintain a list of agents internally and register once, this registration already existed for handling the PKeys. This list is write mostly, so a normal spin lock is used vs a read/write lock. All MAD agents must be checked, so a single list is used instead of breaking them down per device. Notifier calls are done under rcu_read_lock, so there isn't a risk of similar packet timeouts while checking the MAD agents security settings when notified. Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Acked-by: Paul Moore Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/core_priv.h | 5 +++ drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/security.c | 48 ++++++++++++++++------------- include/rdma/ib_mad.h | 3 +- 4 files changed, 34 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index bcb3e3029a9b..d053110207eb 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -202,6 +202,7 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, enum ib_qp_type qp_type); void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); +void ib_mad_agent_security_change(void); #else static inline void ib_security_destroy_port_pkey_list(struct ib_device *device) { @@ -267,6 +268,10 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, { return 0; } + +static inline void ib_mad_agent_security_change(void) +{ +} #endif struct ib_device *ib_device_get_by_index(u32 ifindex); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 55221990d946..32cd35c9b21e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -452,6 +452,7 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, return NOTIFY_DONE; schedule_work(&ib_policy_change_work); + ib_mad_agent_security_change(); return NOTIFY_OK; } diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 7662e9347238..a70d2ba312ed 100644 --- a/drivers/infiniband/core/security.c +++ 
b/drivers/infiniband/core/security.c @@ -39,6 +39,10 @@ #include "core_priv.h" #include "mad_priv.h" +static LIST_HEAD(mad_agent_list); +/* Lock to protect mad_agent_list */ +static DEFINE_SPINLOCK(mad_agent_list_lock); + static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp) { struct pkey_index_qp_list *pkey = NULL; @@ -676,19 +680,18 @@ static int ib_security_pkey_access(struct ib_device *dev, return security_ib_pkey_access(sec, subnet_prefix, pkey); } -static int ib_mad_agent_security_change(struct notifier_block *nb, - unsigned long event, - void *data) +void ib_mad_agent_security_change(void) { - struct ib_mad_agent *ag = container_of(nb, struct ib_mad_agent, lsm_nb); + struct ib_mad_agent *ag; - if (event != LSM_POLICY_CHANGE) - return NOTIFY_DONE; - - ag->smp_allowed = !security_ib_endport_manage_subnet( - ag->security, dev_name(&ag->device->dev), ag->port_num); - - return NOTIFY_OK; + spin_lock(&mad_agent_list_lock); + list_for_each_entry(ag, + &mad_agent_list, + mad_agent_sec_list) + WRITE_ONCE(ag->smp_allowed, + !security_ib_endport_manage_subnet(ag->security, + dev_name(&ag->device->dev), ag->port_num)); + spin_unlock(&mad_agent_list_lock); } int ib_mad_agent_security_setup(struct ib_mad_agent *agent, @@ -699,6 +702,8 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, if (!rdma_protocol_ib(agent->device, agent->port_num)) return 0; + INIT_LIST_HEAD(&agent->mad_agent_sec_list); + ret = security_ib_alloc_security(&agent->security); if (ret) return ret; @@ -706,22 +711,20 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, if (qp_type != IB_QPT_SMI) return 0; + spin_lock(&mad_agent_list_lock); ret = security_ib_endport_manage_subnet(agent->security, dev_name(&agent->device->dev), agent->port_num); if (ret) goto free_security; - agent->lsm_nb.notifier_call = ib_mad_agent_security_change; - ret = register_lsm_notifier(&agent->lsm_nb); - if (ret) - goto free_security; - - agent->smp_allowed = true; - agent->lsm_nb_reg = true; + WRITE_ONCE(agent->smp_allowed, true); + list_add(&agent->mad_agent_sec_list, &mad_agent_list); + spin_unlock(&mad_agent_list_lock); return 0; free_security: + spin_unlock(&mad_agent_list_lock); security_ib_free_security(agent->security); return ret; } @@ -731,8 +734,11 @@ void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) if (!rdma_protocol_ib(agent->device, agent->port_num)) return; - if (agent->lsm_nb_reg) - unregister_lsm_notifier(&agent->lsm_nb); + if (agent->qp->qp_type == IB_QPT_SMI) { + spin_lock(&mad_agent_list_lock); + list_del(&agent->mad_agent_sec_list); + spin_unlock(&mad_agent_list_lock); + } security_ib_free_security(agent->security); } @@ -743,7 +749,7 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) return 0; if (map->agent.qp->qp_type == IB_QPT_SMI) { - if (!map->agent.smp_allowed) + if (!READ_ONCE(map->agent.smp_allowed)) return -EACCES; return 0; } diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 1c0b914f199d..79ba8219e7dc 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -617,11 +617,10 @@ struct ib_mad_agent { u32 hi_tid; u32 flags; void *security; - struct notifier_block lsm_nb; + struct list_head mad_agent_sec_list; u8 port_num; u8 rmpp_version; bool smp_allowed; - bool lsm_nb_reg; }; /** From 30471d4b20335d9bd9ae9b2382a1e1e97d18d86d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 3 Feb 2019 14:55:50 +0200 Subject: [PATCH 12/38] RDMA/core: Share driver structure size with core Add new macros to be used in 
drivers while registering ops structure and IB/core while calling allocation routines, so drivers won't need to perform kzalloc/kfree in their paths. The change in allocation stage allows us to initialize common fields prior to calling to drivers (e.g. restrack). Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 ++ include/rdma/ib_verbs.h | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 32cd35c9b21e..d806a5c7b202 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1228,6 +1228,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) (ptr)->name = ops->name; \ } while (0) +#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) + SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 2e1f1e885ee5..e29eae4aec84 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2264,6 +2264,19 @@ struct ib_counters_read_attr { struct uverbs_attr_bundle; +#define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member) \ + .size_##ib_struct = \ + (sizeof(struct drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) + \ + BUILD_BUG_ON_ZERO( \ + !__same_type(((struct drv_struct *)NULL)->member, \ + struct ib_struct))) + +#define rdma_zalloc_drv_obj(ib_dev, ib_type) \ + ((struct ib_type *)kzalloc(ib_dev->ops.size_##ib_type, GFP_KERNEL)) + +#define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct + /** * struct ib_device_ops - InfiniBand device operations * This structure defines all the InfiniBand device operations, providers will From 21a428a019c9a6d133e745b529b9bf18c1187e70 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 3 Feb 2019 14:55:51 +0200 Subject: [PATCH 13/38] RDMA: Handle PD allocations by IB/core The PD allocations in IB/core allows us to simplify drivers and their error flows in their .alloc_pd() paths. The changes in .alloc_pd() go hand in had with relevant update in .dealloc_pd(). We will use this opportunity and convert .dealloc_pd() to don't fail, as it was suggested a long time ago, failures are not happening as we have never seen a WARN_ON print. 
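A minimal sketch of the driver-side pattern that patches 12 and 13 establish, using a hypothetical driver "foo" (no such driver exists in this series): struct ib_pd is embedded as the first member of the driver's PD, its size is advertised with INIT_RDMA_OBJ_SIZE(), and .alloc_pd()/.dealloc_pd() no longer allocate or free memory, since the core now does that via rdma_zalloc_drv_obj() and kfree().

#include <rdma/ib_verbs.h>

/* Hypothetical driver struct; the ib_pd member must come first. */
struct foo_pd {
	struct ib_pd ibpd;	/* named in INIT_RDMA_OBJ_SIZE() below */
	u32 pdn;
};

static int foo_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
			struct ib_udata *udata)
{
	struct foo_pd *pd = container_of(ibpd, struct foo_pd, ibpd);

	/* Memory is already allocated and zeroed by the core via
	 * rdma_zalloc_drv_obj(); only set up HW state here.
	 */
	pd->pdn = 0;
	return 0;	/* plain errno on failure, no ERR_PTR() */
}

static void foo_dealloc_pd(struct ib_pd *ibpd)
{
	/* Release HW resources only; the core kfree()s the object,
	 * and the callback can no longer fail.
	 */
}

static const struct ib_device_ops foo_dev_ops = {
	.alloc_pd = foo_alloc_pd,
	.dealloc_pd = foo_dealloc_pd,
	/* Tells the core how much memory to allocate for an ib_pd. */
	INIT_RDMA_OBJ_SIZE(ib_pd, foo_pd, ibpd),
};

The offsetof() check inside INIT_RDMA_OBJ_SIZE() enforces that the embedded ib_pd sits at offset zero, which is why some driver structs in the diff below also move it to the front (e.g. bnxt_re_pd).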
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 + drivers/infiniband/core/uverbs_cmd.c | 15 ++- drivers/infiniband/core/uverbs_std_types.c | 2 +- drivers/infiniband/core/verbs.c | 27 +++--- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 37 +++----- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 9 +- drivers/infiniband/hw/bnxt_re/main.c | 1 + drivers/infiniband/hw/cxgb3/iwch_provider.c | 25 ++--- drivers/infiniband/hw/cxgb4/provider.c | 25 ++--- drivers/infiniband/hw/hns/hns_roce_device.h | 7 +- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 27 +++--- drivers/infiniband/hw/hns/hns_roce_main.c | 1 + drivers/infiniband/hw/hns/hns_roce_pd.c | 25 ++--- drivers/infiniband/hw/i40iw/i40iw_utils.c | 1 - drivers/infiniband/hw/i40iw/i40iw_verbs.c | 32 +++---- drivers/infiniband/hw/mlx4/main.c | 36 +++----- drivers/infiniband/hw/mlx5/main.c | 48 +++++----- drivers/infiniband/hw/mthca/mthca_provider.c | 29 ++---- drivers/infiniband/hw/nes/nes_verbs.c | 32 +++---- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 1 + drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 92 ++++++++----------- drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 6 +- drivers/infiniband/hw/qedr/main.c | 1 + drivers/infiniband/hw/qedr/verbs.c | 34 ++----- drivers/infiniband/hw/qedr/verbs.h | 6 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 1 + drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 26 ++---- drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 7 +- .../infiniband/hw/vmw_pvrdma/pvrdma_main.c | 1 + .../infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 43 +++------ .../infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 7 +- drivers/infiniband/sw/rdmavt/pd.c | 29 ++---- drivers/infiniband/sw/rdmavt/pd.h | 7 +- drivers/infiniband/sw/rdmavt/vt.c | 1 + drivers/infiniband/sw/rxe/rxe_pool.c | 60 +++++++++--- drivers/infiniband/sw/rxe/rxe_pool.h | 4 + drivers/infiniband/sw/rxe/rxe_verbs.c | 16 ++-- drivers/infiniband/sw/rxe/rxe_verbs.h | 2 +- include/rdma/ib_verbs.h | 9 +- 39 files changed, 325 insertions(+), 409 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index d806a5c7b202..57e1e177921e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1319,6 +1319,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_DEVICE_OP(dev_ops, unmap_fmr); + + SET_OBJ_SIZE(dev_ops, ib_pd); } EXPORT_SYMBOL(ib_set_device_ops); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index aa260cafbd85..5ac143f22df0 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -407,9 +407,9 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = ib_dev->ops.alloc_pd(ib_dev, uobj->context, &attrs->driver_udata); - if (IS_ERR(pd)) { - ret = PTR_ERR(pd); + pd = rdma_zalloc_drv_obj(ib_dev, ib_pd); + if (!pd) { + ret = -ENOMEM; goto err; } @@ -417,11 +417,15 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) pd->uobject = uobj; pd->__internal_mr = NULL; atomic_set(&pd->usecnt, 0); + pd->res.type = RDMA_RESTRACK_PD; + + ret = ib_dev->ops.alloc_pd(pd, uobj->context, &attrs->driver_udata); + if (ret) + goto err_alloc; uobj->object = pd; memset(&resp, 0, sizeof resp); resp.pd_handle = uobj->id; - pd->res.type = RDMA_RESTRACK_PD; rdma_restrack_uadd(&pd->res); ret = uverbs_response(attrs, &resp, sizeof(resp)); @@ -432,7 
+436,8 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) err_copy: ib_dealloc_pd(pd); - +err_alloc: + kfree(pd); err: uobj_alloc_abort(uobj); return ret; diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index cbc72312eb41..f224cb727224 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -188,7 +188,7 @@ static int uverbs_free_pd(struct ib_uobject *uobject, if (ret) return ret; - ib_dealloc_pd((struct ib_pd *)uobject->object); + ib_dealloc_pd(pd); return 0; } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 3220fb42ecce..de5d895a5054 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -254,10 +254,11 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, { struct ib_pd *pd; int mr_access_flags = 0; + int ret; - pd = device->ops.alloc_pd(device, NULL, NULL); - if (IS_ERR(pd)) - return pd; + pd = rdma_zalloc_drv_obj(device, ib_pd); + if (!pd) + return ERR_PTR(-ENOMEM); pd->device = device; pd->uobject = NULL; @@ -265,6 +266,16 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, atomic_set(&pd->usecnt, 0); pd->flags = flags; + pd->res.type = RDMA_RESTRACK_PD; + rdma_restrack_set_task(&pd->res, caller); + + ret = device->ops.alloc_pd(pd, NULL, NULL); + if (ret) { + kfree(pd); + return ERR_PTR(ret); + } + rdma_restrack_kadd(&pd->res); + if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else @@ -275,10 +286,6 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; } - pd->res.type = RDMA_RESTRACK_PD; - rdma_restrack_set_task(&pd->res, caller); - rdma_restrack_kadd(&pd->res); - if (mr_access_flags) { struct ib_mr *mr; @@ -329,10 +336,8 @@ void ib_dealloc_pd(struct ib_pd *pd) WARN_ON(atomic_read(&pd->usecnt)); rdma_restrack_del(&pd->res); - /* Making delalloc_pd a void return is a WIP, no driver should return - an error here. 
*/ - ret = pd->device->ops.dealloc_pd(pd); - WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); + pd->device->ops.dealloc_pd(pd); + kfree(pd); } EXPORT_SYMBOL(ib_dealloc_pd); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 1d7469e23cde..1606571af63d 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -563,41 +563,29 @@ fail: } /* Protection Domains */ -int bnxt_re_dealloc_pd(struct ib_pd *ib_pd) +void bnxt_re_dealloc_pd(struct ib_pd *ib_pd) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); struct bnxt_re_dev *rdev = pd->rdev; - int rc; bnxt_re_destroy_fence_mr(pd); - if (pd->qplib_pd.id) { - rc = bnxt_qplib_dealloc_pd(&rdev->qplib_res, - &rdev->qplib_res.pd_tbl, - &pd->qplib_pd); - if (rc) - dev_err(rdev_to_dev(rdev), "Failed to deallocate HW PD"); - } - - kfree(pd); - return 0; + if (pd->qplib_pd.id) + bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl, + &pd->qplib_pd); } -struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *ucontext, - struct ib_udata *udata) +int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *ucontext, + struct ib_udata *udata) { + struct ib_device *ibdev = ibpd->device; struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct bnxt_re_ucontext *ucntx = container_of(ucontext, struct bnxt_re_ucontext, ib_uctx); - struct bnxt_re_pd *pd; + struct bnxt_re_pd *pd = container_of(ibpd, struct bnxt_re_pd, ib_pd); int rc; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - pd->rdev = rdev; if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) { dev_err(rdev_to_dev(rdev), "Failed to allocate HW PD"); @@ -637,13 +625,12 @@ struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, if (bnxt_re_create_fence_mr(pd)) dev_warn(rdev_to_dev(rdev), "Failed to create Fence-MR\n"); - return &pd->ib_pd; + return 0; dbfail: - (void)bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl, - &pd->qplib_pd); + bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl, + &pd->qplib_pd); fail: - kfree(pd); - return ERR_PTR(rc); + return rc; } /* Address Handles */ diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index c4af72604b4f..c7cca803cfa3 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -56,8 +56,8 @@ struct bnxt_re_fence_data { }; struct bnxt_re_pd { + struct ib_pd ib_pd; struct bnxt_re_dev *rdev; - struct ib_pd ib_pd; struct bnxt_qplib_pd qplib_pd; struct bnxt_re_fence_data fence; }; @@ -163,10 +163,9 @@ int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num, int index, union ib_gid *gid); enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev, u8 port_num); -struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata); -int bnxt_re_dealloc_pd(struct ib_pd *pd); +int bnxt_re_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata); +void bnxt_re_dealloc_pd(struct ib_pd *pd); struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags, diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 0d40a930c192..0a89ef6e5754 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -637,6 +637,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .query_srq 
= bnxt_re_query_srq, .reg_user_mr = bnxt_re_reg_user_mr, .req_notify_cq = bnxt_re_req_notify_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd), }; static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 804c1fc7bfc1..4cc9a6ae2139 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -370,7 +370,7 @@ static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) return ret; } -static int iwch_deallocate_pd(struct ib_pd *pd) +static void iwch_deallocate_pd(struct ib_pd *pd) { struct iwch_dev *rhp; struct iwch_pd *php; @@ -379,15 +379,13 @@ static int iwch_deallocate_pd(struct ib_pd *pd) rhp = php->rhp; pr_debug("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid); cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid); - kfree(php); - return 0; } -static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int iwch_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct iwch_pd *php; + struct iwch_pd *php = to_iwch_pd(pd); + struct ib_device *ibdev = pd->device; u32 pdid; struct iwch_dev *rhp; @@ -395,12 +393,8 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, rhp = (struct iwch_dev *) ibdev; pdid = cxio_hal_get_pdid(rhp->rdev.rscp); if (!pdid) - return ERR_PTR(-EINVAL); - php = kzalloc(sizeof(*php), GFP_KERNEL); - if (!php) { - cxio_hal_put_pdid(rhp->rdev.rscp, pdid); - return ERR_PTR(-ENOMEM); - } + return -EINVAL; + php->pdid = pdid; php->rhp = rhp; if (context) { @@ -408,11 +402,11 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { iwch_deallocate_pd(&php->ibpd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } pr_debug("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php); - return &php->ibpd; + return 0; } static int iwch_dereg_mr(struct ib_mr *ib_mr) @@ -1350,6 +1344,7 @@ static const struct ib_device_ops iwch_dev_ops = { .reg_user_mr = iwch_reg_user_mr, .req_notify_cq = iwch_arm_cq, .resize_cq = iwch_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd), }; int iwch_register_device(struct iwch_dev *dev) diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index f59bf7e5a589..680b5e98491d 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -209,7 +209,7 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) return ret; } -static int c4iw_deallocate_pd(struct ib_pd *pd) +static void c4iw_deallocate_pd(struct ib_pd *pd) { struct c4iw_dev *rhp; struct c4iw_pd *php; @@ -221,15 +221,13 @@ static int c4iw_deallocate_pd(struct ib_pd *pd) mutex_lock(&rhp->rdev.stats.lock); rhp->rdev.stats.pd.cur--; mutex_unlock(&rhp->rdev.stats.lock); - kfree(php); - return 0; } -static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct c4iw_pd *php; + struct c4iw_pd *php = to_c4iw_pd(pd); + struct ib_device *ibdev = pd->device; u32 pdid; struct c4iw_dev *rhp; @@ -237,12 +235,8 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, rhp = (struct c4iw_dev *) ibdev; pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table); if (!pdid) - return ERR_PTR(-EINVAL); - php = 
kzalloc(sizeof(*php), GFP_KERNEL); - if (!php) { - c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid); - return ERR_PTR(-ENOMEM); - } + return -EINVAL; + php->pdid = pdid; php->rhp = rhp; if (context) { @@ -250,7 +244,7 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { c4iw_deallocate_pd(&php->ibpd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } mutex_lock(&rhp->rdev.stats.lock); @@ -259,7 +253,7 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur; mutex_unlock(&rhp->rdev.stats.lock); pr_debug("pdid 0x%0x ptr 0x%p\n", pdid, php); - return &php->ibpd; + return 0; } static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index, @@ -570,6 +564,7 @@ static const struct ib_device_ops c4iw_dev_ops = { .query_qp = c4iw_ib_query_qp, .reg_user_mr = c4iw_reg_user_mr, .req_notify_cq = c4iw_arm_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd), }; void c4iw_register_device(struct work_struct *work) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 8ca8d74dfb6a..9ee86daf1700 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1114,10 +1114,9 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *pd, int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); int hns_roce_destroy_ah(struct ib_ah *ah, u32 flags); -struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev, - struct ib_ucontext *context, - struct ib_udata *udata); -int hns_roce_dealloc_pd(struct ib_pd *pd); +int hns_roce_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata); +void hns_roce_dealloc_pd(struct ib_pd *pd); struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index fa08c22aad66..a18b88c95995 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -711,13 +711,14 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) struct ib_qp_attr attr = { 0 }; struct hns_roce_v1_priv *priv; struct hns_roce_qp *hr_qp; + struct ib_device *ibdev; struct ib_cq *cq; struct ib_pd *pd; union ib_gid dgid; u64 subnet_prefix; int attr_mask = 0; + int ret = -ENOMEM; int i, j; - int ret; u8 queue_en[HNS_ROCE_V1_RESV_QP] = { 0 }; u8 phy_port; u8 port = 0; @@ -742,12 +743,16 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) free_mr->mr_free_cq->ib_cq.cq_context = NULL; atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0); - pd = hns_roce_alloc_pd(&hr_dev->ib_dev, NULL, NULL); - if (IS_ERR(pd)) { - dev_err(dev, "Create pd for reserved loop qp failed!"); - ret = -ENOMEM; + ibdev = &hr_dev->ib_dev; + pd = rdma_zalloc_drv_obj(ibdev, ib_pd); + if (pd) + goto alloc_mem_failed; + + pd->device = ibdev; + ret = hns_roce_alloc_pd(pd, NULL, NULL); + if (ret) goto alloc_pd_failed; - } + free_mr->mr_free_pd = to_hr_pd(pd); free_mr->mr_free_pd->ibpd.device = &hr_dev->ib_dev; free_mr->mr_free_pd->ibpd.uobject = NULL; @@ -854,10 +859,12 @@ create_lp_qp_failed: dev_err(dev, "Destroy qp %d for mr free failed!\n", i); } - if (hns_roce_dealloc_pd(pd)) - dev_err(dev, "Destroy pd for create_lp_qp failed!\n"); + hns_roce_dealloc_pd(pd); alloc_pd_failed: + kfree(pd); + +alloc_mem_failed: if (hns_roce_ib_destroy_cq(cq)) 
dev_err(dev, "Destroy cq for create_lp_qp failed!\n"); @@ -891,9 +898,7 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev) if (ret) dev_err(dev, "Destroy cq for mr_free failed(%d)!\n", ret); - ret = hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd); - if (ret) - dev_err(dev, "Destroy pd for mr_free failed(%d)!\n", ret); + hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd); } static int hns_roce_db_init(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 67a8c4333f4f..ccf10622586c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -472,6 +472,7 @@ static const struct ib_device_ops hns_roce_dev_ops = { .query_pkey = hns_roce_query_pkey, .query_port = hns_roce_query_port, .reg_user_mr = hns_roce_reg_user_mr, + INIT_RDMA_OBJ_SIZE(ib_pd, hns_roce_pd, ibpd), }; static const struct ib_device_ops hns_roce_dev_mr_ops = { diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c index 4a29b2cb9bab..b9b97c5e97e6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_pd.c +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -57,24 +57,19 @@ void hns_roce_cleanup_pd_table(struct hns_roce_dev *hr_dev) hns_roce_bitmap_cleanup(&hr_dev->pd_bitmap); } -struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev, - struct ib_ucontext *context, - struct ib_udata *udata) +int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { + struct ib_device *ib_dev = ibpd->device; struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); struct device *dev = hr_dev->dev; - struct hns_roce_pd *pd; + struct hns_roce_pd *pd = to_hr_pd(ibpd); int ret; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - ret = hns_roce_pd_alloc(to_hr_dev(ib_dev), &pd->pdn); if (ret) { - kfree(pd); dev_err(dev, "[alloc_pd]hns_roce_pd_alloc failed!\n"); - return ERR_PTR(ret); + return ret; } if (context) { @@ -83,21 +78,17 @@ struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev, if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { hns_roce_pd_free(to_hr_dev(ib_dev), pd->pdn); dev_err(dev, "[alloc_pd]ib_copy_to_udata failed!\n"); - kfree(pd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } - return &pd->ibpd; + return 0; } EXPORT_SYMBOL_GPL(hns_roce_alloc_pd); -int hns_roce_dealloc_pd(struct ib_pd *pd) +void hns_roce_dealloc_pd(struct ib_pd *pd) { hns_roce_pd_free(to_hr_dev(pd->device), to_hr_pd(pd)->pdn); - kfree(to_hr_pd(pd)); - - return 0; } EXPORT_SYMBOL_GPL(hns_roce_dealloc_pd); diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c index 59e978141ad4..c5a881172524 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -601,7 +601,6 @@ void i40iw_rem_pdusecount(struct i40iw_pd *iwpd, struct i40iw_device *iwdev) if (!atomic_dec_and_test(&iwpd->usecount)) return; i40iw_free_resource(iwdev, iwdev->allocated_pds, iwpd->sc_pd.pd_id); - kfree(iwpd); } /** diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index d4ab46dd9e6c..28449ad57b37 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -312,16 +312,15 @@ static void i40iw_dealloc_push_page(struct i40iw_device *iwdev, struct i40iw_sc_ /** * i40iw_alloc_pd - allocate protection domain - * @ibdev: device pointer from stack + * @pd: PD pointer * @context: user context 
created during alloc * @udata: user data */ -static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int i40iw_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct i40iw_pd *iwpd; - struct i40iw_device *iwdev = to_iwdev(ibdev); + struct i40iw_pd *iwpd = to_iwpd(pd); + struct i40iw_device *iwdev = to_iwdev(pd->device); struct i40iw_sc_dev *dev = &iwdev->sc_dev; struct i40iw_alloc_pd_resp uresp; struct i40iw_sc_pd *sc_pd; @@ -330,19 +329,13 @@ static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev, int err; if (iwdev->closing) - return ERR_PTR(-ENODEV); + return -ENODEV; err = i40iw_alloc_resource(iwdev, iwdev->allocated_pds, iwdev->max_pd, &pd_id, &iwdev->next_pd); if (err) { i40iw_pr_err("alloc resource failed\n"); - return ERR_PTR(err); - } - - iwpd = kzalloc(sizeof(*iwpd), GFP_KERNEL); - if (!iwpd) { - err = -ENOMEM; - goto free_res; + return err; } sc_pd = &iwpd->sc_pd; @@ -361,25 +354,23 @@ static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev, } i40iw_add_pdusecount(iwpd); - return &iwpd->ibpd; + return 0; + error: - kfree(iwpd); -free_res: i40iw_free_resource(iwdev, iwdev->allocated_pds, pd_id); - return ERR_PTR(err); + return err; } /** * i40iw_dealloc_pd - deallocate pd * @ibpd: ptr of pd to be deallocated */ -static int i40iw_dealloc_pd(struct ib_pd *ibpd) +static void i40iw_dealloc_pd(struct ib_pd *ibpd) { struct i40iw_pd *iwpd = to_iwpd(ibpd); struct i40iw_device *iwdev = to_iwdev(ibpd->device); i40iw_rem_pdusecount(iwpd, iwdev); - return 0; } /** @@ -2750,6 +2741,7 @@ static const struct ib_device_ops i40iw_dev_ops = { .query_qp = i40iw_query_qp, .reg_user_mr = i40iw_reg_user_mr, .req_notify_cq = i40iw_req_notify_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, i40iw_pd, ibpd), }; /** diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d66002a31000..c0f6aea7ed7c 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1186,38 +1186,27 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) } } -static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int mlx4_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct mlx4_ib_pd *pd; + struct mlx4_ib_pd *pd = to_mpd(ibpd); + struct ib_device *ibdev = ibpd->device; int err; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); - if (err) { - kfree(pd); - return ERR_PTR(err); - } + if (err) + return err; - if (context) - if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { - mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); - kfree(pd); - return ERR_PTR(-EFAULT); - } - return &pd->ibpd; + if (context && ib_copy_to_udata(udata, &pd->pdn, sizeof(__u32))) { + mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); + return -EFAULT; + } + return 0; } -static int mlx4_ib_dealloc_pd(struct ib_pd *pd) +static void mlx4_ib_dealloc_pd(struct ib_pd *pd) { mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); - kfree(pd); - - return 0; } static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, @@ -2580,6 +2569,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = { .req_notify_cq = mlx4_ib_arm_cq, .rereg_user_mr = mlx4_ib_rereg_user_mr, .resize_cq = mlx4_ib_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd), }; static const struct ib_device_ops 
mlx4_ib_dev_wq_ops = { diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 76d6c2557d0c..f9cddc6f2ab6 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2280,30 +2280,24 @@ int mlx5_ib_dealloc_dm(struct ib_dm *ibdm) return 0; } -static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { + struct mlx5_ib_pd *pd = to_mpd(ibpd); + struct ib_device *ibdev = ibpd->device; struct mlx5_ib_alloc_pd_resp resp; - struct mlx5_ib_pd *pd; int err; u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {}; u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {}; u16 uid = 0; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - uid = context ? to_mucontext(context)->devx_uid : 0; MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); MLX5_SET(alloc_pd_in, in, uid, uid); err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in), out, sizeof(out)); - if (err) { - kfree(pd); - return ERR_PTR(err); - } + if (err) + return err; pd->pdn = MLX5_GET(alloc_pd_out, out, pd); pd->uid = uid; @@ -2311,23 +2305,19 @@ static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, resp.pdn = pd->pdn; if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid); - kfree(pd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } - return &pd->ibpd; + return 0; } -static int mlx5_ib_dealloc_pd(struct ib_pd *pd) +static void mlx5_ib_dealloc_pd(struct ib_pd *pd) { struct mlx5_ib_dev *mdev = to_mdev(pd->device); struct mlx5_ib_pd *mpd = to_mpd(pd); mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid); - kfree(mpd); - - return 0; } enum { @@ -4680,23 +4670,28 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) { struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; + struct ib_device *ibdev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; int port; int ret = 0; dev = container_of(devr, struct mlx5_ib_dev, devr); + ibdev = &dev->ib_dev; mutex_init(&devr->mutex); - devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); - if (IS_ERR(devr->p0)) { - ret = PTR_ERR(devr->p0); - goto error0; - } - devr->p0->device = &dev->ib_dev; + devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd); + if (!devr->p0) + return -ENOMEM; + + devr->p0->device = ibdev; devr->p0->uobject = NULL; atomic_set(&devr->p0->usecnt, 0); + ret = mlx5_ib_alloc_pd(devr->p0, NULL, NULL); + if (ret) + goto error0; + devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL); if (IS_ERR(devr->c0)) { ret = PTR_ERR(devr->c0); @@ -4794,6 +4789,7 @@ error2: error1: mlx5_ib_dealloc_pd(devr->p0); error0: + kfree(devr->p0); return ret; } @@ -4809,6 +4805,7 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr) mlx5_ib_dealloc_xrcd(devr->x1); mlx5_ib_destroy_cq(devr->c0); mlx5_ib_dealloc_pd(devr->p0); + kfree(devr->p0); /* Make sure no change P_Key work items are still executing */ for (port = 0; port < dev->num_ports; ++port) @@ -5938,6 +5935,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .req_notify_cq = mlx5_ib_arm_cq, .rereg_user_mr = mlx5_ib_rereg_user_mr, .resize_cq = mlx5_ib_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd), }; static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = { diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 1bb67562c8c8..2c754bc226f3 100644 --- 
a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -374,40 +374,30 @@ static int mthca_mmap_uar(struct ib_ucontext *context, return 0; } -static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int mthca_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct mthca_pd *pd; + struct ib_device *ibdev = ibpd->device; + struct mthca_pd *pd = to_mpd(ibpd); int err; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - err = mthca_pd_alloc(to_mdev(ibdev), !context, pd); - if (err) { - kfree(pd); - return ERR_PTR(err); - } + if (err) + return err; if (context) { if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) { mthca_pd_free(to_mdev(ibdev), pd); - kfree(pd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } - return &pd->ibpd; + return 0; } -static int mthca_dealloc_pd(struct ib_pd *pd) +static void mthca_dealloc_pd(struct ib_pd *pd) { mthca_pd_free(to_mdev(pd->device), to_mpd(pd)); - kfree(pd); - - return 0; } static struct ib_ah *mthca_ah_create(struct ib_pd *pd, @@ -1228,6 +1218,7 @@ static const struct ib_device_ops mthca_dev_ops = { .query_qp = mthca_query_qp, .reg_user_mr = mthca_reg_user_mr, .resize_cq = mthca_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, mthca_pd, ibpd), }; static const struct ib_device_ops mthca_dev_arbel_srq_ops = { diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 6eb991d40035..f18b28ae4bd9 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -658,10 +658,11 @@ static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) /** * nes_alloc_pd */ -static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, struct ib_udata *udata) +static int nes_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct nes_pd *nespd; + struct ib_device *ibdev = pd->device; + struct nes_pd *nespd = to_nespd(pd); struct nes_vnic *nesvnic = to_nesvnic(ibdev); struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; @@ -676,15 +677,8 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD); - if (err) { - return ERR_PTR(err); - } - - nespd = kzalloc(sizeof (struct nes_pd), GFP_KERNEL); - if (!nespd) { - nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); - return ERR_PTR(-ENOMEM); - } + if (err) + return err; nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n", nespd, dev_name(&nesvnic->nesibdev->ibdev.dev)); @@ -700,16 +694,14 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, if (nespd->mmap_db_index >= NES_MAX_USER_DB_REGIONS) { nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n"); nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); - kfree(nespd); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } uresp.pd_id = nespd->pd_id; uresp.mmap_db_index = nespd->mmap_db_index; if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) { nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); - kfree(nespd); - return ERR_PTR(-EFAULT); + return -EFAULT; } set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells); @@ -718,14 +710,14 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, } 
nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd); - return &nespd->ibpd; + return 0; } /** * nes_dealloc_pd */ -static int nes_dealloc_pd(struct ib_pd *ibpd) +static void nes_dealloc_pd(struct ib_pd *ibpd) { struct nes_ucontext *nesucontext; struct nes_pd *nespd = to_nespd(ibpd); @@ -748,9 +740,6 @@ static int nes_dealloc_pd(struct ib_pd *ibpd) nespd->pd_id, nespd); nes_free_resource(nesadapter, nesadapter->allocated_pds, (nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12)); - kfree(nespd); - - return 0; } @@ -3658,6 +3647,7 @@ static const struct ib_device_ops nes_dev_ops = { .query_qp = nes_query_qp, .reg_user_mr = nes_reg_user_mr, .req_notify_cq = nes_req_notify_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, nes_pd, ibpd), }; /** diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 88970a6bb555..0de83c92691f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -179,6 +179,7 @@ static const struct ib_device_ops ocrdma_dev_ops = { .reg_user_mr = ocrdma_reg_user_mr, .req_notify_cq = ocrdma_arm_cq, .resize_cq = ocrdma_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, ocrdma_pd, ibpd), }; static const struct ib_device_ops ocrdma_dev_srq_ops = { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 2a62936bef4d..980ba97188ff 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -367,17 +367,12 @@ static int ocrdma_get_pd_num(struct ocrdma_dev *dev, struct ocrdma_pd *pd) return status; } -static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, - struct ocrdma_ucontext *uctx, - struct ib_udata *udata) +static int _ocrdma_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd, + struct ocrdma_ucontext *uctx, + struct ib_udata *udata) { - struct ocrdma_pd *pd = NULL; int status; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - if (udata && uctx && dev->attr.max_dpp_pds) { pd->dpp_enabled = ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R; @@ -386,15 +381,8 @@ static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, dev->attr.wqe_size) : 0; } - if (dev->pd_mgr->pd_prealloc_valid) { - status = ocrdma_get_pd_num(dev, pd); - if (status == 0) { - return pd; - } else { - kfree(pd); - return ERR_PTR(status); - } - } + if (dev->pd_mgr->pd_prealloc_valid) + return ocrdma_get_pd_num(dev, pd); retry: status = ocrdma_mbx_alloc_pd(dev, pd); @@ -403,13 +391,11 @@ retry: pd->dpp_enabled = false; pd->num_dpp_qp = 0; goto retry; - } else { - kfree(pd); - return ERR_PTR(status); } + return status; } - return pd; + return 0; } static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx, @@ -418,30 +404,33 @@ static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx, return (uctx->cntxt_pd == pd); } -static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev, +static void _ocrdma_dealloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd) { - int status; - if (dev->pd_mgr->pd_prealloc_valid) - status = ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled); + ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled); else - status = ocrdma_mbx_dealloc_pd(dev, pd); - - kfree(pd); - return status; + ocrdma_mbx_dealloc_pd(dev, pd); } static int ocrdma_alloc_ucontext_pd(struct ocrdma_dev *dev, struct ocrdma_ucontext *uctx, struct ib_udata *udata) { - int status = 0; + struct ib_device *ibdev = &dev->ibdev; + struct ib_pd *pd; + int status; - uctx->cntxt_pd = 
_ocrdma_alloc_pd(dev, uctx, udata); - if (IS_ERR(uctx->cntxt_pd)) { - status = PTR_ERR(uctx->cntxt_pd); - uctx->cntxt_pd = NULL; + pd = rdma_zalloc_drv_obj(ibdev, ib_pd); + if (!pd) + return -ENOMEM; + + pd->device = ibdev; + uctx->cntxt_pd = get_ocrdma_pd(pd); + + status = _ocrdma_alloc_pd(dev, uctx->cntxt_pd, uctx, udata); + if (status) { + kfree(uctx->cntxt_pd); goto err; } @@ -460,6 +449,7 @@ static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx) pr_err("%s(%d) Freeing in use pdid=0x%x.\n", __func__, dev->id, pd->id); } + kfree(uctx->cntxt_pd); uctx->cntxt_pd = NULL; (void)_ocrdma_dealloc_pd(dev, pd); return 0; @@ -537,6 +527,7 @@ struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, return &ctx->ibucontext; cpy_err: + ocrdma_dealloc_ucontext_pd(ctx); pd_err: ocrdma_del_mmap(ctx, ctx->ah_tbl.pa, ctx->ah_tbl.len); map_err: @@ -658,10 +649,10 @@ dpp_map_err: return status; } -struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +int ocrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { + struct ib_device *ibdev = ibpd->device; struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); struct ocrdma_pd *pd; struct ocrdma_ucontext *uctx = NULL; @@ -677,11 +668,10 @@ struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev, } } - pd = _ocrdma_alloc_pd(dev, uctx, udata); - if (IS_ERR(pd)) { - status = PTR_ERR(pd); + pd = get_ocrdma_pd(ibpd); + status = _ocrdma_alloc_pd(dev, pd, uctx, udata); + if (status) goto exit; - } pd_mapping: if (udata && context) { @@ -689,25 +679,22 @@ pd_mapping: if (status) goto err; } - return &pd->ibpd; + return 0; err: - if (is_uctx_pd) { + if (is_uctx_pd) ocrdma_release_ucontext_pd(uctx); - } else { - if (_ocrdma_dealloc_pd(dev, pd)) - pr_err("%s: _ocrdma_dealloc_pd() failed\n", __func__); - } + else + _ocrdma_dealloc_pd(dev, pd); exit: - return ERR_PTR(status); + return status; } -int ocrdma_dealloc_pd(struct ib_pd *ibpd) +void ocrdma_dealloc_pd(struct ib_pd *ibpd) { struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); struct ocrdma_ucontext *uctx = NULL; - int status = 0; u64 usr_db; uctx = pd->uctx; @@ -721,11 +708,10 @@ int ocrdma_dealloc_pd(struct ib_pd *ibpd) if (is_ucontext_pd(uctx, pd)) { ocrdma_release_ucontext_pd(uctx); - return status; + return; } } - status = _ocrdma_dealloc_pd(dev, pd); - return status; + _ocrdma_dealloc_pd(dev, pd); } static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index b69cfdce7970..1fd66721c930 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -70,9 +70,9 @@ int ocrdma_dealloc_ucontext(struct ib_ucontext *); int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma); -struct ib_pd *ocrdma_alloc_pd(struct ib_device *, - struct ib_ucontext *, struct ib_udata *); -int ocrdma_dealloc_pd(struct ib_pd *pd); +int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_ucontext *uctx, + struct ib_udata *udata); +void ocrdma_dealloc_pd(struct ib_pd *pd); struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 878e9e23652b..44ce4989dcef 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -239,6 +239,7 @@ static const struct ib_device_ops 
qedr_dev_ops = { .reg_user_mr = qedr_reg_user_mr, .req_notify_cq = qedr_arm_cq, .resize_cq = qedr_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd), }; static int qedr_register_device(struct qedr_dev *dev) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 989f08633fbe..a06d2258394a 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -450,11 +450,12 @@ int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) vma->vm_page_prot); } -struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, struct ib_udata *udata) +int qedr_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { + struct ib_device *ibdev = ibpd->device; struct qedr_dev *dev = get_qedr_dev(ibdev); - struct qedr_pd *pd; + struct qedr_pd *pd = get_qedr_pd(ibpd); u16 pd_id; int rc; @@ -463,16 +464,12 @@ struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev, if (!dev->rdma_ctx) { DP_ERR(dev, "invalid RDMA context\n"); - return ERR_PTR(-EINVAL); + return -EINVAL; } - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - rc = dev->ops->rdma_alloc_pd(dev->rdma_ctx, &pd_id); if (rc) - goto err; + return rc; pd->pd_id = pd_id; @@ -485,36 +482,23 @@ struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev, if (rc) { DP_ERR(dev, "copy error pd_id=0x%x.\n", pd_id); dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd_id); - goto err; + return rc; } pd->uctx = get_qedr_ucontext(context); pd->uctx->pd = pd; } - return &pd->ibpd; - -err: - kfree(pd); - return ERR_PTR(rc); + return 0; } -int qedr_dealloc_pd(struct ib_pd *ibpd) +void qedr_dealloc_pd(struct ib_pd *ibpd) { struct qedr_dev *dev = get_qedr_dev(ibpd->device); struct qedr_pd *pd = get_qedr_pd(ibpd); - if (!pd) { - pr_err("Invalid PD received in dealloc_pd\n"); - return -EINVAL; - } - DP_DEBUG(dev, QEDR_MSG_INIT, "Deallocating PD %d\n", pd->pd_id); dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd->pd_id); - - kfree(pd); - - return 0; } static void qedr_free_pbl(struct qedr_dev *dev, diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 1852b7012bf4..97a6ff3f9afb 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -47,9 +47,9 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *, struct ib_udata *); int qedr_dealloc_ucontext(struct ib_ucontext *); int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma); -struct ib_pd *qedr_alloc_pd(struct ib_device *, - struct ib_ucontext *, struct ib_udata *); -int qedr_dealloc_pd(struct ib_pd *pd); +int qedr_alloc_pd(struct ib_pd *pd, struct ib_ucontext *uctx, + struct ib_udata *udata); +void qedr_dealloc_pd(struct ib_pd *pd); struct ib_cq *qedr_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 1ec155823716..256ad2f236c8 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -352,6 +352,7 @@ static const struct ib_device_ops usnic_dev_ops = { .query_port = usnic_ib_query_port, .query_qp = usnic_ib_query_qp, .reg_user_mr = usnic_ib_reg_mr, + INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd), }; /* Start of PF discovery section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 9dea18106247..0ced89b51448 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ 
b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -456,37 +456,23 @@ int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, return 0; } -struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct usnic_ib_pd *pd; + struct usnic_ib_pd *pd = to_upd(ibpd); void *umem_pd; - usnic_dbg("\n"); - - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - umem_pd = pd->umem_pd = usnic_uiom_alloc_pd(); if (IS_ERR_OR_NULL(umem_pd)) { - kfree(pd); - return ERR_PTR(umem_pd ? PTR_ERR(umem_pd) : -ENOMEM); + return umem_pd ? PTR_ERR(umem_pd) : -ENOMEM; } - usnic_info("domain 0x%p allocated for context 0x%p and device %s\n", - pd, context, dev_name(&ibdev->dev)); - return &pd->ibpd; + return 0; } -int usnic_ib_dealloc_pd(struct ib_pd *pd) +void usnic_ib_dealloc_pd(struct ib_pd *pd) { - usnic_info("freeing domain 0x%p\n", pd); - usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd); - kfree(pd); - return 0; } struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 99a6d81c2bcd..44a9d2f82bf5 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -51,10 +51,9 @@ int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, struct net_device *usnic_get_netdev(struct ib_device *device, u8 port_num); int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey); -struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata); -int usnic_ib_dealloc_pd(struct ib_pd *pd); +int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata); +void usnic_ib_dealloc_pd(struct ib_pd *pd); struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index e582beaf9430..47e653d2495c 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -195,6 +195,7 @@ static const struct ib_device_ops pvrdma_dev_ops = { .query_qp = pvrdma_query_qp, .reg_user_mr = pvrdma_reg_user_mr, .req_notify_cq = pvrdma_req_notify_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, pvrdma_pd, ibpd), }; static const struct ib_device_ops pvrdma_dev_srq_ops = { diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index fafb2add3b44..f44220f72e05 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -438,37 +438,29 @@ int pvrdma_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) /** * pvrdma_alloc_pd - allocate protection domain - * @ibdev: the IB device + * @ibpd: PD pointer * @context: user context * @udata: user data * * @return: the ib_pd protection domain pointer on success, otherwise errno. 
*/ -struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +int pvrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct pvrdma_pd *pd; + struct ib_device *ibdev = ibpd->device; + struct pvrdma_pd *pd = to_vpd(ibpd); struct pvrdma_dev *dev = to_vdev(ibdev); - union pvrdma_cmd_req req; - union pvrdma_cmd_resp rsp; + union pvrdma_cmd_req req = {}; + union pvrdma_cmd_resp rsp = {}; struct pvrdma_cmd_create_pd *cmd = &req.create_pd; struct pvrdma_cmd_create_pd_resp *resp = &rsp.create_pd_resp; struct pvrdma_alloc_pd_resp pd_resp = {0}; int ret; - void *ptr; /* Check allowed max pds */ if (!atomic_add_unless(&dev->num_pds, 1, dev->dsr->caps.max_pd)) - return ERR_PTR(-ENOMEM); + return -ENOMEM; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) { - ptr = ERR_PTR(-ENOMEM); - goto err; - } - - memset(cmd, 0, sizeof(*cmd)); cmd->hdr.cmd = PVRDMA_CMD_CREATE_PD; cmd->ctx_handle = (context) ? to_vucontext(context)->ctx_handle : 0; ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_PD_RESP); @@ -476,8 +468,7 @@ struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev, dev_warn(&dev->pdev->dev, "failed to allocate protection domain, error: %d\n", ret); - ptr = ERR_PTR(ret); - goto freepd; + goto err; } pd->privileged = !context; @@ -490,18 +481,16 @@ struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev, dev_warn(&dev->pdev->dev, "failed to copy back protection domain\n"); pvrdma_dealloc_pd(&pd->ibpd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } /* u32 pd handle */ - return &pd->ibpd; + return 0; -freepd: - kfree(pd); err: atomic_dec(&dev->num_pds); - return ptr; + return ret; } /** @@ -510,14 +499,13 @@ err: * * @return: 0 on success, otherwise errno. */ -int pvrdma_dealloc_pd(struct ib_pd *pd) +void pvrdma_dealloc_pd(struct ib_pd *pd) { struct pvrdma_dev *dev = to_vdev(pd->device); - union pvrdma_cmd_req req; + union pvrdma_cmd_req req = {}; struct pvrdma_cmd_destroy_pd *cmd = &req.destroy_pd; int ret; - memset(cmd, 0, sizeof(*cmd)); cmd->hdr.cmd = PVRDMA_CMD_DESTROY_PD; cmd->pd_handle = to_vpd(pd)->pd_handle; @@ -527,10 +515,7 @@ int pvrdma_dealloc_pd(struct ib_pd *pd) "could not dealloc protection domain, error: %d\n", ret); - kfree(to_vpd(pd)); atomic_dec(&dev->num_pds); - - return 0; } /** diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index f7f758d60110..ed91baad1ffa 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -399,10 +399,9 @@ int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata); int pvrdma_dealloc_ucontext(struct ib_ucontext *context); -struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata); -int pvrdma_dealloc_pd(struct ib_pd *ibpd); +int pvrdma_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata); +void pvrdma_dealloc_pd(struct ib_pd *ibpd); struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, diff --git a/drivers/infiniband/sw/rdmavt/pd.c b/drivers/infiniband/sw/rdmavt/pd.c index dcc1870b8d23..6033054b22fa 100644 --- a/drivers/infiniband/sw/rdmavt/pd.c +++ b/drivers/infiniband/sw/rdmavt/pd.c @@ -50,7 +50,7 @@ /** * rvt_alloc_pd - allocate a 
protection domain - * @ibdev: ib device + * @ibpd: PD * @context: optional user context * @udata: optional user data * @@ -58,19 +58,14 @@ * * Return: 0 on success */ -struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +int rvt_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { + struct ib_device *ibdev = ibpd->device; struct rvt_dev_info *dev = ib_to_rvt(ibdev); - struct rvt_pd *pd; - struct ib_pd *ret; + struct rvt_pd *pd = ibpd_to_rvtpd(ibpd); + int ret = 0; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } /* * While we could continue allocating protecetion domains, being * constrained only by system resources. The IBTA spec defines that @@ -81,8 +76,7 @@ struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev, spin_lock(&dev->n_pds_lock); if (dev->n_pds_allocated == dev->dparms.props.max_pd) { spin_unlock(&dev->n_pds_lock); - kfree(pd); - ret = ERR_PTR(-ENOMEM); + ret = -ENOMEM; goto bail; } @@ -92,8 +86,6 @@ struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev, /* ib_alloc_pd() will initialize pd->ibpd. */ pd->user = !!udata; - ret = &pd->ibpd; - bail: return ret; } @@ -104,16 +96,11 @@ bail: * * Return: always 0 */ -int rvt_dealloc_pd(struct ib_pd *ibpd) +void rvt_dealloc_pd(struct ib_pd *ibpd) { - struct rvt_pd *pd = ibpd_to_rvtpd(ibpd); struct rvt_dev_info *dev = ib_to_rvt(ibpd->device); spin_lock(&dev->n_pds_lock); dev->n_pds_allocated--; spin_unlock(&dev->n_pds_lock); - - kfree(pd); - - return 0; } diff --git a/drivers/infiniband/sw/rdmavt/pd.h b/drivers/infiniband/sw/rdmavt/pd.h index 1892ca4a9746..7a887e4a45e7 100644 --- a/drivers/infiniband/sw/rdmavt/pd.h +++ b/drivers/infiniband/sw/rdmavt/pd.h @@ -50,9 +50,8 @@ #include -struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata); -int rvt_dealloc_pd(struct ib_pd *ibpd); +int rvt_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata); +void rvt_dealloc_pd(struct ib_pd *ibpd); #endif /* DEF_RDMAVTPD_H */ diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index b3f0c5578925..a19832c73d5a 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -436,6 +436,7 @@ static const struct ib_device_ops rvt_dev_ops = { .req_notify_cq = rvt_req_notify_cq, .resize_cq = rvt_resize_cq, .unmap_fmr = rvt_unmap_fmr, + INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd), }; static noinline int check_support(struct rvt_dev_info *rdi, int verb) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index b5c91df22047..cd3f14629ba8 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -46,6 +46,7 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { [RXE_TYPE_PD] = { .name = "rxe-pd", .size = sizeof(struct rxe_pd), + .flags = RXE_POOL_NO_ALLOC, }, [RXE_TYPE_AH] = { .name = "rxe-ah", @@ -119,8 +120,10 @@ static void rxe_cache_clean(size_t cnt) for (i = 0; i < cnt; i++) { type = &rxe_type_info[i]; - kmem_cache_destroy(type->cache); - type->cache = NULL; + if (!(type->flags & RXE_POOL_NO_ALLOC)) { + kmem_cache_destroy(type->cache); + type->cache = NULL; + } } } @@ -134,14 +137,17 @@ int rxe_cache_init(void) for (i = 0; i < RXE_NUM_TYPES; i++) { type = &rxe_type_info[i]; size = ALIGN(type->size, RXE_POOL_ALIGN); - type->cache = kmem_cache_create(type->name, size, - RXE_POOL_ALIGN, - RXE_POOL_CACHE_FLAGS, NULL); - if 
(!type->cache) { - pr_err("Unable to init kmem cache for %s\n", - type->name); - err = -ENOMEM; - goto err1; + if (!(type->flags & RXE_POOL_NO_ALLOC)) { + type->cache = + kmem_cache_create(type->name, size, + RXE_POOL_ALIGN, + RXE_POOL_CACHE_FLAGS, NULL); + if (!type->cache) { + pr_err("Unable to init kmem cache for %s\n", + type->name); + err = -ENOMEM; + goto err1; + } } } @@ -415,6 +421,37 @@ out_put_pool: return NULL; } +int rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem) +{ + unsigned long flags; + + might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); + + read_lock_irqsave(&pool->pool_lock, flags); + if (pool->state != RXE_POOL_STATE_VALID) { + read_unlock_irqrestore(&pool->pool_lock, flags); + return -EINVAL; + } + kref_get(&pool->ref_cnt); + read_unlock_irqrestore(&pool->pool_lock, flags); + + kref_get(&pool->rxe->ref_cnt); + + if (atomic_inc_return(&pool->num_elem) > pool->max_elem) + goto out_put_pool; + + elem->pool = pool; + kref_init(&elem->ref_cnt); + + return 0; + +out_put_pool: + atomic_dec(&pool->num_elem); + rxe_dev_put(pool->rxe); + rxe_pool_put(pool); + return -EINVAL; +} + void rxe_elem_release(struct kref *kref) { struct rxe_pool_entry *elem = @@ -424,7 +461,8 @@ void rxe_elem_release(struct kref *kref) if (pool->cleanup) pool->cleanup(elem); - kmem_cache_free(pool_cache(pool), elem); + if (!(pool->flags & RXE_POOL_NO_ALLOC)) + kmem_cache_free(pool_cache(pool), elem); atomic_dec(&pool->num_elem); rxe_dev_put(pool->rxe); rxe_pool_put(pool); diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 72968c29e01f..2f2cff1cbe43 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -41,6 +41,7 @@ enum rxe_pool_flags { RXE_POOL_ATOMIC = BIT(0), RXE_POOL_INDEX = BIT(1), RXE_POOL_KEY = BIT(2), + RXE_POOL_NO_ALLOC = BIT(4), }; enum rxe_elem_type { @@ -131,6 +132,9 @@ void rxe_pool_cleanup(struct rxe_pool *pool); /* allocate an object from pool */ void *rxe_alloc(struct rxe_pool *pool); +/* connect already allocated object to pool */ +int rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem); + /* assign an index to an indexed object and insert object into * pool's rb tree */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index cc5a05124ece..051c3930e808 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -191,23 +191,20 @@ static int rxe_port_immutable(struct ib_device *dev, u8 port_num, return 0; } -static struct ib_pd *rxe_alloc_pd(struct ib_device *dev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct rxe_dev *rxe = to_rdev(dev); - struct rxe_pd *pd; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); - pd = rxe_alloc(&rxe->pd_pool); - return pd ? 
&pd->ibpd : ERR_PTR(-ENOMEM); + return rxe_add_to_pool(&rxe->pd_pool, &pd->pelem); } -static int rxe_dealloc_pd(struct ib_pd *ibpd) +static void rxe_dealloc_pd(struct ib_pd *ibpd) { struct rxe_pd *pd = to_rpd(ibpd); rxe_drop_ref(pd); - return 0; } static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd, @@ -1183,6 +1180,7 @@ static const struct ib_device_ops rxe_dev_ops = { .reg_user_mr = rxe_reg_user_mr, .req_notify_cq = rxe_req_notify_cq, .resize_cq = rxe_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd), }; int rxe_register_device(struct rxe_dev *rxe) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 74e04801d34d..70839d3f55d9 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -66,8 +66,8 @@ struct rxe_ucontext { }; struct rxe_pd { + struct ib_pd ibpd; struct rxe_pool_entry pelem; - struct ib_pd ibpd; }; struct rxe_ah { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e29eae4aec84..854d7816787c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2385,10 +2385,9 @@ struct ib_device_ops { int (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); - struct ib_pd *(*alloc_pd)(struct ib_device *device, - struct ib_ucontext *context, - struct ib_udata *udata); - int (*dealloc_pd)(struct ib_pd *pd); + int (*alloc_pd)(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata); + void (*dealloc_pd)(struct ib_pd *pd); struct ib_ah *(*create_ah)(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags, struct ib_udata *udata); @@ -2530,6 +2529,8 @@ struct ib_device_ops { */ int (*fill_res_entry)(struct sk_buff *msg, struct rdma_restrack_entry *entry); + + DECLARE_RDMA_OBJ_SIZE(ib_pd); }; struct ib_device { From e3593b568a68b0e1a434b80fd6eaebfb655e839d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:47 -0700 Subject: [PATCH 14/38] RDMA/device: Check that the rename is nop under the lock Since another rename could be running in parallel it is safer to check that the name is not changing inside the lock, where we already know the device name will not change. Fixes: d21943dd19b5 ("RDMA/core: Implement IB device rename function") Signed-off-by: Jason Gunthorpe Reviewed-by: Parav Pandit --- drivers/infiniband/core/device.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 57e1e177921e..60083bde3e39 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -189,12 +189,14 @@ static struct ib_device *__ib_device_get_by_name(const char *name) int ib_device_rename(struct ib_device *ibdev, const char *name) { - int ret = 0; - - if (!strcmp(name, dev_name(&ibdev->dev))) - return ret; + int ret; mutex_lock(&device_mutex); + if (!strcmp(name, dev_name(&ibdev->dev))) { + ret = 0; + goto out; + } + if (__ib_device_get_by_name(name)) { ret = -EEXIST; goto out; From b34b269ad85d7dd4a512487f2395c3be3e40f76a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:48 -0700 Subject: [PATCH 15/38] RDMA/device: Ensure that security memory is always freed Since this only frees memory it should be done during the release callback. Otherwise there are possible error flows where it might not get called if registration aborts. 
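A rough sketch of the pattern this change moves to, using hypothetical names (struct foo_dev, foo_release) rather than the real core structures: once the memory is freed unconditionally in the struct device ->release() callback, every error path that ends in put_device() cleans it up with no explicit unwind code.

#include <linux/device.h>
#include <linux/slab.h>

struct foo_dev {
	struct device dev;
	void *port_pkey_list;	/* may still be NULL if setup never ran */
};

static void foo_release(struct device *dev)
{
	struct foo_dev *fdev = container_of(dev, struct foo_dev, dev);

	kfree(fdev->port_pkey_list);	/* kfree(NULL) is a no-op, so this is always safe */
	kfree(fdev);
}
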
Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/core_priv.h | 4 ++-- drivers/infiniband/core/device.c | 10 +++------- drivers/infiniband/core/security.c | 4 +--- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index d053110207eb..a1826f4c2e23 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -181,7 +181,7 @@ int ib_get_cached_subnet_prefix(struct ib_device *device, u64 *sn_pfx); #ifdef CONFIG_SECURITY_INFINIBAND -void ib_security_destroy_port_pkey_list(struct ib_device *device); +void ib_security_release_port_pkey_list(struct ib_device *device); void ib_security_cache_change(struct ib_device *device, u8 port_num, @@ -204,7 +204,7 @@ void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); void ib_mad_agent_security_change(void); #else -static inline void ib_security_destroy_port_pkey_list(struct ib_device *device) +static inline void ib_security_release_port_pkey_list(struct ib_device *device) { } diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 60083bde3e39..b997feac2c63 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -253,6 +253,8 @@ static void ib_device_release(struct device *device) ib_cache_release_one(dev); kfree(dev->port_immutable); } + ib_security_release_port_pkey_list(dev); + kfree(dev->port_pkey_list); kfree(dev); } @@ -522,7 +524,6 @@ static void cleanup_device(struct ib_device *device) { ib_cache_cleanup_one(device); ib_cache_release_one(device); - kfree(device->port_pkey_list); kfree(device->port_immutable); } @@ -560,12 +561,10 @@ static int setup_device(struct ib_device *device) if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); - goto pkey_cleanup; + return ret; } return 0; -pkey_cleanup: - kfree(device->port_pkey_list); port_cleanup: kfree(device->port_immutable); return ret; @@ -682,9 +681,6 @@ void ib_unregister_device(struct ib_device *device) ib_cache_cleanup_one(device); - ib_security_destroy_port_pkey_list(device); - kfree(device->port_pkey_list); - down_write(&lists_rwsem); write_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index a70d2ba312ed..dad6a94a43f3 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -558,13 +558,12 @@ void ib_security_cache_change(struct ib_device *device, } } -void ib_security_destroy_port_pkey_list(struct ib_device *device) +void ib_security_release_port_pkey_list(struct ib_device *device) { struct pkey_index_qp_list *pkey, *tmp_pkey; int i; for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { - spin_lock(&device->port_pkey_list[i].list_lock); list_for_each_entry_safe(pkey, tmp_pkey, &device->port_pkey_list[i].pkey_list, @@ -572,7 +571,6 @@ void ib_security_destroy_port_pkey_list(struct ib_device *device) list_del(&pkey->pkey_index_list); kfree(pkey); } - spin_unlock(&device->port_pkey_list[i].list_lock); } } From d45f89d59bcd42d6b8575d0af69d7a3a98e73bb6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:49 -0700 Subject: [PATCH 16/38] RDMA/device: Call ib_cache_release_one() only from ib_device_release() Instead of complicated logic about when this memory is freed, always 
free it during device release(). All the cache pointers start out as NULL, so it is safe to call this before the cache is initialized. This makes for a simpler error unwind flow, and a simpler understanding of the lifetime of the memory allocations inside the struct ib_device. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 3 +++ drivers/infiniband/core/device.c | 41 ++++++++++---------------------- 2 files changed, 15 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 7b04590f307f..2338d0b3a0ca 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1460,6 +1460,9 @@ void ib_cache_release_one(struct ib_device *device) { int p; + if (!device->cache.ports) + return; + /* * The release function frees all the cache elements. * This function should be called as part of freeing diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index b997feac2c63..872662a84b16 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -244,17 +244,10 @@ static void ib_device_release(struct device *device) struct ib_device *dev = container_of(device, struct ib_device, dev); WARN_ON(dev->reg_state == IB_DEV_REGISTERED); - if (dev->reg_state == IB_DEV_UNREGISTERED) { - /* - * In IB_DEV_UNINITIALIZED state, cache or port table - * is not even created. Free cache and port table only when - * device reaches UNREGISTERED state. - */ - ib_cache_release_one(dev); - kfree(dev->port_immutable); - } + ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); kfree(dev->port_pkey_list); + kfree(dev->port_immutable); kfree(dev); } @@ -520,13 +513,6 @@ static void setup_dma_device(struct ib_device *device) } } -static void cleanup_device(struct ib_device *device) -{ - ib_cache_cleanup_one(device); - ib_cache_release_one(device); - kfree(device->port_immutable); -} - static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; @@ -548,26 +534,16 @@ static int setup_device(struct ib_device *device) if (ret) { dev_warn(&device->dev, "Couldn't query the device attributes\n"); - goto port_cleanup; + return ret; } ret = setup_port_pkey_list(device); if (ret) { dev_warn(&device->dev, "Couldn't create per port_pkey_list\n"); - goto port_cleanup; - } - - ret = ib_cache_setup_one(device); - if (ret) { - dev_warn(&device->dev, - "Couldn't set up InfiniBand P_Key/GID cache\n"); return ret; } - return 0; -port_cleanup: - kfree(device->port_immutable); - return ret; + return 0; } /** @@ -607,6 +583,13 @@ int ib_register_device(struct ib_device *device, const char *name) if (ret) goto out; + ret = ib_cache_setup_one(device); + if (ret) { + dev_warn(&device->dev, + "Couldn't set up InfiniBand P_Key/GID cache\n"); + goto out; + } + device->index = __dev_new_index(); ib_device_register_rdmacg(device); @@ -633,7 +616,7 @@ int ib_register_device(struct ib_device *device, const char *name) cg_cleanup: ib_device_unregister_rdmacg(device); - cleanup_device(device); + ib_cache_cleanup_one(device); out: mutex_unlock(&device_mutex); return ret; From 652432f33c01b2edaa5b2550b423cd894b1c7b9a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:50 -0700 Subject: [PATCH 17/38] RDMA/device: Get rid of reg_state This really has no purpose anymore, refcount can be used to tell if the device is still registered. Keeping it around just invites mis-use. 
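A minimal sketch, not taken from the patch, of what callers rely on instead: the existing ib_device_try_get()/ib_device_put() pair is driven by the same refcount and answers the "is it still registered" question that reg_state used to; the helper name below is made up for illustration.

#include <rdma/ib_verbs.h>

static int do_work_if_registered(struct ib_device *ibdev)
{
	if (!ib_device_try_get(ibdev))	/* refcount already zero: device is going away */
		return -ENODEV;

	/* ... the held reference keeps the device from completing unregistration ... */

	ib_device_put(ibdev);
	return 0;
}
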
Signed-off-by: Jason Gunthorpe Reviewed-by: Parav Pandit --- drivers/infiniband/core/device.c | 8 ++------ include/rdma/ib_verbs.h | 6 ------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 872662a84b16..1c54ded776d0 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -243,7 +243,7 @@ static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); - WARN_ON(dev->reg_state == IB_DEV_REGISTERED); + WARN_ON(refcount_read(&dev->refcount)); ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); kfree(dev->port_pkey_list); @@ -316,8 +316,7 @@ EXPORT_SYMBOL(_ib_alloc_device); void ib_dealloc_device(struct ib_device *device) { WARN_ON(!list_empty(&device->client_data_list)); - WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && - device->reg_state != IB_DEV_UNINITIALIZED); + WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); put_device(&device->dev); } @@ -602,7 +601,6 @@ int ib_register_device(struct ib_device *device, const char *name) } refcount_set(&device->refcount, 1); - device->reg_state = IB_DEV_REGISTERED; list_for_each_entry(client, &client_list, list) if (!add_client_context(device, client) && client->add) @@ -673,8 +671,6 @@ void ib_unregister_device(struct ib_device *device) } write_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); - - device->reg_state = IB_DEV_UNREGISTERED; } EXPORT_SYMBOL(ib_unregister_device); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 854d7816787c..d8ba987e8b29 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2572,12 +2572,6 @@ struct ib_device { struct kobject *ports_kobj; struct list_head port_list; - enum { - IB_DEV_UNINITIALIZED, - IB_DEV_REGISTERED, - IB_DEV_UNREGISTERED - } reg_state; - int uverbs_abi_ver; u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; From 3b88afd38e88d1bb2e900204ff0af7301a379a09 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:51 -0700 Subject: [PATCH 18/38] RDMA/device: Use an ida instead of a free page in alloc_name ida is the proper data structure to hold list of clustered small integers and then allocate an unused integer. Get rid of the convoluted and limited open-coded bitmap. 
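A stand-alone sketch of the ida pattern being adopted, with a made-up helper name and the indices assumed unique (the real code derives them by matching the printf-style device name): reserve each index that is already taken, then let the ida hand back the lowest free one, with no fixed upper bound from a single-page bitmap.

#include <linux/idr.h>
#include <linux/gfp.h>

static int lowest_unused_index(const unsigned int *used, unsigned int nr_used)
{
	struct ida inuse;
	unsigned int i;
	int id;

	ida_init(&inuse);
	for (i = 0; i < nr_used; i++) {
		/* reserve exactly used[i] so the final ida_alloc() skips it */
		id = ida_alloc_range(&inuse, used[i], used[i], GFP_KERNEL);
		if (id < 0)
			goto out;
	}
	id = ida_alloc(&inuse, GFP_KERNEL);	/* smallest id not reserved above */
out:
	ida_destroy(&inuse);	/* releases everything the ida allocated */
	return id;		/* negative errno on failure */
}
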
Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 1c54ded776d0..3a80f96c2919 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -213,30 +213,36 @@ out: static int alloc_name(struct ib_device *ibdev, const char *name) { - unsigned long *inuse; struct ib_device *device; + struct ida inuse; + int rc; int i; - inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); - if (!inuse) - return -ENOMEM; - + ida_init(&inuse); list_for_each_entry(device, &device_list, core_list) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; - if (i < 0 || i >= PAGE_SIZE * 8) + if (i < 0 || i >= INT_MAX) continue; snprintf(buf, sizeof buf, name, i); - if (!strcmp(buf, dev_name(&device->dev))) - set_bit(i, inuse); + if (strcmp(buf, dev_name(&device->dev)) != 0) + continue; + + rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); + if (rc < 0) + goto out; } - i = find_first_zero_bit(inuse, PAGE_SIZE * 8); - free_page((unsigned long) inuse); + rc = ida_alloc(&inuse, GFP_KERNEL); + if (rc < 0) + goto out; - return dev_set_name(&ibdev->dev, name, i); + rc = dev_set_name(&ibdev->dev, name, rc); +out: + ida_destroy(&inuse); + return rc; } static void ib_device_release(struct device *device) From e59178d895afa29b671323f8265a1e50afe989e5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:52 -0700 Subject: [PATCH 19/38] RDMA/devices: Use xarray to store the clients This gives each client a unique ID and will let us move client_data to use xarray, and revise the locking scheme. clients have to be add/removed in strict FIFO/LIFO order as they interdepend. To support this the client_ids are assigned to increase in FIFO order. The existing linked list is kept to support reverse iteration until xarray can get a reverse iteration API. Signed-off-by: Jason Gunthorpe Reviewed-by: Parav Pandit --- drivers/infiniband/core/device.c | 50 ++++++++++++++++++++++++++++---- include/rdma/ib_verbs.h | 3 +- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 3a80f96c2919..f87d85659359 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -65,15 +65,17 @@ struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); -/* The device_list and client_list contain devices and clients after their +/* The device_list and clients contain devices and clients after their * registration has completed, and the devices and clients are removed * during unregistration. */ static LIST_HEAD(device_list); static LIST_HEAD(client_list); +#define CLIENT_REGISTERED XA_MARK_1 +static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); /* * device_mutex and lists_rwsem protect access to both device_list and - * client_list. device_mutex protects writer access by device and client + * clients. device_mutex protects writer access by device and client * registration / de-registration. lists_rwsem protects reader access to * these lists. Iterators of these lists must lock it for read, while updates * to the lists must be done with a write lock. 
A special case is when the @@ -564,6 +566,7 @@ int ib_register_device(struct ib_device *device, const char *name) { int ret; struct ib_client *client; + unsigned long index; setup_dma_device(device); @@ -608,7 +611,7 @@ int ib_register_device(struct ib_device *device, const char *name) refcount_set(&device->refcount, 1); - list_for_each_entry(client, &client_list, list) + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) if (!add_client_context(device, client) && client->add) client->add(device); @@ -680,6 +683,32 @@ void ib_unregister_device(struct ib_device *device) } EXPORT_SYMBOL(ib_unregister_device); +static int assign_client_id(struct ib_client *client) +{ + int ret; + + /* + * The add/remove callbacks must be called in FIFO/LIFO order. To + * achieve this we assign client_ids so they are sorted in + * registration order, and retain a linked list we can reverse iterate + * to get the LIFO order. The extra linked list can go away if xarray + * learns to reverse iterate. + */ + if (list_empty(&client_list)) + client->client_id = 0; + else + client->client_id = + list_last_entry(&client_list, struct ib_client, list) + ->client_id; + ret = xa_alloc(&clients, &client->client_id, INT_MAX, client, + GFP_KERNEL); + if (ret) + goto out; + +out: + return ret; +} + /** * ib_register_client - Register an IB client * @client:Client to register @@ -696,15 +725,21 @@ EXPORT_SYMBOL(ib_unregister_device); int ib_register_client(struct ib_client *client) { struct ib_device *device; + int ret; mutex_lock(&device_mutex); + ret = assign_client_id(client); + if (ret) { + mutex_unlock(&device_mutex); + return ret; + } list_for_each_entry(device, &device_list, core_list) if (!add_client_context(device, client) && client->add) client->add(device); down_write(&lists_rwsem); - list_add_tail(&client->list, &client_list); + xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&lists_rwsem); mutex_unlock(&device_mutex); @@ -729,7 +764,7 @@ void ib_unregister_client(struct ib_client *client) mutex_lock(&device_mutex); down_write(&lists_rwsem); - list_del(&client->list); + xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&lists_rwsem); list_for_each_entry(device, &device_list, core_list) { @@ -765,6 +800,10 @@ void ib_unregister_client(struct ib_client *client) kfree(found_context); } + down_write(&lists_rwsem); + list_del(&client->list); + xa_erase(&clients, client->client_id); + up_write(&lists_rwsem); mutex_unlock(&device_mutex); } EXPORT_SYMBOL(ib_unregister_client); @@ -1422,6 +1461,7 @@ static void __exit ib_core_cleanup(void) destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
*/ destroy_workqueue(ib_wq); + WARN_ON(!xa_empty(&clients)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index d8ba987e8b29..cc15820513cd 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2610,7 +2610,7 @@ struct ib_device { }; struct ib_client { - char *name; + const char *name; void (*add) (struct ib_device *); void (*remove)(struct ib_device *, void *client_data); @@ -2637,6 +2637,7 @@ struct ib_client { const struct sockaddr *addr, void *client_data); struct list_head list; + u32 client_id; /* kverbs are not required by the client */ u8 no_kverbs_req:1; From 0df91bb67334eebaf73d4ba32567e16d55f4f116 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:53 -0700 Subject: [PATCH 20/38] RDMA/devices: Use xarray to store the client_data Now that we have a small ID for each client we can use xarray instead of linearly searching linked lists for client data. This will give much faster and scalable client data lookup, and will lets us revise the locking scheme. Since xarray can store 'going_down' using a mark just entirely eliminate the struct ib_client_data and directly store the client_data value in the xarray. However this does require a special iterator as we must still iterate over any NULL client_data values. Also eliminate the client_data_lock in favour of internal xarray locking. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 348 +++++++++++++++---------------- include/rdma/ib_verbs.h | 23 +- 2 files changed, 186 insertions(+), 185 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index f87d85659359..5096593b99e9 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -51,30 +51,72 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); -struct ib_client_data { - struct list_head list; - struct ib_client *client; - void * data; - /* The device or client is going down. Do not call client or device - * callbacks other than remove(). */ - bool going_down; -}; - struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); -/* The device_list and clients contain devices and clients after their - * registration has completed, and the devices and clients are removed - * during unregistration. */ -static LIST_HEAD(device_list); +/* + * devices contains devices that have had their names assigned. The + * devices may not be registered. Users that care about the registration + * status need to call ib_device_try_get() on the device to ensure it is + * registered, and keep it registered, for the required duration. + * + */ +static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); + +/* + * Note that if the *rwsem is held and the *_REGISTERED mark is seen then the + * object is guaranteed to be and remain registered for the duration of the + * lock. + */ +#define DEVICE_REGISTERED XA_MARK_1 + static LIST_HEAD(client_list); #define CLIENT_REGISTERED XA_MARK_1 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); /* - * device_mutex and lists_rwsem protect access to both device_list and + * If client_data is registered then the corresponding client must also still + * be registered. + */ +#define CLIENT_DATA_REGISTERED XA_MARK_1 +/* + * xarray has this behavior where it won't iterate over NULL values stored in + * allocated arrays. 
So we need our own iterator to see all values stored in + * the array. This does the same thing as xa_for_each except that it also + * returns NULL valued entries if the array is allocating. Simplified to only + * work on simple xarrays. + */ +static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, + xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp); + void *entry; + + rcu_read_lock(); + do { + entry = xas_find_marked(&xas, ULONG_MAX, filter); + if (xa_is_zero(entry)) + break; + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + if (entry) { + *indexp = xas.xa_index; + if (xa_is_zero(entry)) + return NULL; + return entry; + } + return XA_ERROR(-ENOENT); +} +#define xan_for_each_marked(xa, index, entry, filter) \ + for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ + !xa_is_err(entry); \ + (index)++, entry = xan_find_marked(xa, &(index), filter)) + +/* + * device_mutex and lists_rwsem protect access to both devices and * clients. device_mutex protects writer access by device and client * registration / de-registration. lists_rwsem protects reader access to * these lists. Iterators of these lists must lock it for read, while updates @@ -135,17 +177,6 @@ static int ib_device_check_mandatory(struct ib_device *device) return 0; } -static struct ib_device *__ib_device_get_by_index(u32 index) -{ - struct ib_device *device; - - list_for_each_entry(device, &device_list, core_list) - if (device->index == index) - return device; - - return NULL; -} - /* * Caller must perform ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. @@ -155,7 +186,7 @@ struct ib_device *ib_device_get_by_index(u32 index) struct ib_device *device; down_read(&lists_rwsem); - device = __ib_device_get_by_index(index); + device = xa_load(&devices, index); if (device) { if (!ib_device_try_get(device)) device = NULL; @@ -181,8 +212,9 @@ EXPORT_SYMBOL(ib_device_put); static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; + unsigned long index; - list_for_each_entry(device, &device_list, core_list) + xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device; @@ -216,12 +248,13 @@ out: static int alloc_name(struct ib_device *ibdev, const char *name) { struct ib_device *device; + unsigned long index; struct ida inuse; int rc; int i; ida_init(&inuse); - list_for_each_entry(device, &device_list, core_list) { + xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) != 1) @@ -256,6 +289,7 @@ static void ib_device_release(struct device *device) ib_security_release_port_pkey_list(dev); kfree(dev->port_pkey_list); kfree(dev->port_immutable); + xa_destroy(&dev->client_data); kfree(dev); } @@ -306,8 +340,11 @@ struct ib_device *_ib_alloc_device(size_t size) INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); - rwlock_init(&device->client_data_lock); - INIT_LIST_HEAD(&device->client_data_list); + /* + * client_data needs to be alloc because we don't want our mark to be + * destroyed if the user stores NULL in the client data. 
+ */ + xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); INIT_LIST_HEAD(&device->port_list); init_completion(&device->unreg_completion); @@ -323,7 +360,7 @@ EXPORT_SYMBOL(_ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { - WARN_ON(!list_empty(&device->client_data_list)); + WARN_ON(!xa_empty(&device->client_data)); WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); put_device(&device->dev); @@ -332,26 +369,20 @@ EXPORT_SYMBOL(ib_dealloc_device); static int add_client_context(struct ib_device *device, struct ib_client *client) { - struct ib_client_data *context; + void *entry; if (!device->kverbs_provider && !client->no_kverbs_req) return -EOPNOTSUPP; - context = kmalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return -ENOMEM; - - context->client = client; - context->data = NULL; - context->going_down = false; - down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_add(&context->list, &device->client_data_list); - write_unlock_irq(&device->client_data_lock); + entry = xa_store(&device->client_data, client->client_id, NULL, + GFP_KERNEL); + if (!xa_is_err(entry)) + xa_set_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED); up_write(&lists_rwsem); - return 0; + return xa_err(entry); } static int verify_immutable(const struct ib_device *dev, u8 port) @@ -428,9 +459,10 @@ static int setup_port_pkey_list(struct ib_device *device) static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; + unsigned long index; down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) { + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { int i; for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) { @@ -461,28 +493,48 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, return NOTIFY_OK; } -/** - * __dev_new_index - allocate an device index - * - * Returns a suitable unique value for a new device interface - * number. It assumes that there are less than 2^32-1 ib devices - * will be present in the system. +/* + * Assign the unique string device name and the unique device index. */ -static u32 __dev_new_index(void) +static int assign_name(struct ib_device *device, const char *name) { - /* - * The device index to allow stable naming. - * Similar to struct net -> ifindex. 
- */ - static u32 index; + static u32 last_id; + int ret; - for (;;) { - if (!(++index)) - index = 1; + /* Assign a unique name to the device */ + if (strchr(name, '%')) + ret = alloc_name(device, name); + else + ret = dev_set_name(&device->dev, name); + if (ret) + goto out; - if (!__ib_device_get_by_index(index)) - return index; + if (__ib_device_get_by_name(dev_name(&device->dev))) { + ret = -ENFILE; + goto out; } + strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); + + /* Cyclically allocate a user visible ID for the device */ + device->index = last_id; + ret = xa_alloc(&devices, &device->index, INT_MAX, device, GFP_KERNEL); + if (ret == -ENOSPC) { + device->index = 0; + ret = xa_alloc(&devices, &device->index, INT_MAX, device, + GFP_KERNEL); + } + if (ret) + goto out; + last_id = device->index + 1; + + ret = 0; +out: + return ret; +} + +static void release_name(struct ib_device *device) +{ + xa_erase(&devices, device->index); } static void setup_dma_device(struct ib_device *device) @@ -572,34 +624,21 @@ int ib_register_device(struct ib_device *device, const char *name) mutex_lock(&device_mutex); - if (strchr(name, '%')) { - ret = alloc_name(device, name); - if (ret) - goto out; - } else { - ret = dev_set_name(&device->dev, name); - if (ret) - goto out; - } - if (__ib_device_get_by_name(dev_name(&device->dev))) { - ret = -ENFILE; + ret = assign_name(device, name); + if (ret) goto out; - } - strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); ret = setup_device(device); if (ret) - goto out; + goto out_name; ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); - goto out; + goto out_name; } - device->index = __dev_new_index(); - ib_device_register_rdmacg(device); ret = ib_device_register_sysfs(device); @@ -616,7 +655,7 @@ int ib_register_device(struct ib_device *device, const char *name) client->add(device); down_write(&lists_rwsem); - list_add_tail(&device->core_list, &device_list); + xa_set_mark(&devices, device->index, DEVICE_REGISTERED); up_write(&lists_rwsem); mutex_unlock(&device_mutex); return 0; @@ -624,6 +663,8 @@ int ib_register_device(struct ib_device *device, const char *name) cg_cleanup: ib_device_unregister_rdmacg(device); ib_cache_cleanup_one(device); +out_name: + release_name(device); out: mutex_unlock(&device_mutex); return ret; @@ -638,8 +679,8 @@ EXPORT_SYMBOL(ib_register_device); */ void ib_unregister_device(struct ib_device *device) { - struct ib_client_data *context, *tmp; - unsigned long flags; + struct ib_client *client; + unsigned long index; /* * Wait for all netlink command callers to finish working on the @@ -651,34 +692,31 @@ void ib_unregister_device(struct ib_device *device) mutex_lock(&device_mutex); down_write(&lists_rwsem); - list_del(&device->core_list); - write_lock_irq(&device->client_data_lock); - list_for_each_entry(context, &device->client_data_list, list) - context->going_down = true; - write_unlock_irq(&device->client_data_lock); + xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); + xa_for_each (&clients, index, client) + xa_clear_mark(&device->client_data, index, + CLIENT_DATA_REGISTERED); downgrade_write(&lists_rwsem); - list_for_each_entry(context, &device->client_data_list, list) { - if (context->client->remove) - context->client->remove(device, context->data); - } + list_for_each_entry_reverse(client, &client_list, list) + if (xa_get_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED) && + client->remove) + 
client->remove(device, xa_load(&device->client_data, + client->client_id)); up_read(&lists_rwsem); ib_device_unregister_sysfs(device); ib_device_unregister_rdmacg(device); + release_name(device); + mutex_unlock(&device_mutex); ib_cache_cleanup_one(device); down_write(&lists_rwsem); - write_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, - list) { - list_del(&context->list); - kfree(context); - } - write_unlock_irqrestore(&device->client_data_lock, flags); + xa_destroy(&device->client_data); up_write(&lists_rwsem); } EXPORT_SYMBOL(ib_unregister_device); @@ -725,6 +763,7 @@ out: int ib_register_client(struct ib_client *client) { struct ib_device *device; + unsigned long index; int ret; mutex_lock(&device_mutex); @@ -734,7 +773,7 @@ int ib_register_client(struct ib_client *client) return ret; } - list_for_each_entry(device, &device_list, core_list) + xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) if (!add_client_context(device, client) && client->add) client->add(device); @@ -758,8 +797,8 @@ EXPORT_SYMBOL(ib_register_client); */ void ib_unregister_client(struct ib_client *client) { - struct ib_client_data *context; struct ib_device *device; + unsigned long index; mutex_lock(&device_mutex); @@ -767,37 +806,19 @@ void ib_unregister_client(struct ib_client *client) xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&lists_rwsem); - list_for_each_entry(device, &device_list, core_list) { - struct ib_client_data *found_context = NULL; - + xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - context->going_down = true; - found_context = context; - break; - } - write_unlock_irq(&device->client_data_lock); + xa_clear_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED); up_write(&lists_rwsem); if (client->remove) - client->remove(device, found_context ? - found_context->data : NULL); - - if (!found_context) { - dev_warn(&device->dev, - "No client context found for %s\n", - client->name); - continue; - } + client->remove(device, xa_load(&device->client_data, + client->client_id)); down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_del(&found_context->list); - write_unlock_irq(&device->client_data_lock); + xa_erase(&device->client_data, client->client_id); up_write(&lists_rwsem); - kfree(found_context); } down_write(&lists_rwsem); @@ -808,59 +829,28 @@ void ib_unregister_client(struct ib_client *client) } EXPORT_SYMBOL(ib_unregister_client); -/** - * ib_get_client_data - Get IB client context - * @device:Device to get context for - * @client:Client to get context for - * - * ib_get_client_data() returns client context set with - * ib_set_client_data(). 
- */ -void *ib_get_client_data(struct ib_device *device, struct ib_client *client) -{ - struct ib_client_data *context; - void *ret = NULL; - unsigned long flags; - - read_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - ret = context->data; - break; - } - read_unlock_irqrestore(&device->client_data_lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_get_client_data); - /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * - * ib_set_client_data() sets client context that can be retrieved with - * ib_get_client_data(). + * ib_set_client_data() sets client context data that can be retrieved with + * ib_get_client_data(). This can only be called while the client is + * registered to the device, once the ib_client remove() callback returns this + * cannot be called. */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { - struct ib_client_data *context; - unsigned long flags; + void *rc; - write_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - context->data = data; - goto out; - } + if (WARN_ON(IS_ERR(data))) + data = NULL; - dev_warn(&device->dev, "No client context found for %s\n", - client->name); - -out: - write_unlock_irqrestore(&device->client_data_lock, flags); + rc = xa_store(&device->client_data, client->client_id, data, + GFP_KERNEL); + WARN_ON(xa_is_err(rc)); } EXPORT_SYMBOL(ib_set_client_data); @@ -1018,9 +1008,10 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *cookie) { struct ib_device *dev; + unsigned long index; down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); up_read(&lists_rwsem); } @@ -1034,12 +1025,13 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb) { + unsigned long index; struct ib_device *dev; unsigned int idx = 0; int ret = 0; down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) { + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { ret = nldev_cb(dev, skb, cb, idx); if (ret) break; @@ -1212,26 +1204,25 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, const struct sockaddr *addr) { struct net_device *net_dev = NULL; - struct ib_client_data *context; + unsigned long index; + void *client_data; if (!rdma_protocol_ib(dev, port)) return NULL; down_read(&lists_rwsem); - list_for_each_entry(context, &dev->client_data_list, list) { - struct ib_client *client = context->client; + xan_for_each_marked (&dev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); - if (context->going_down) + if (!client || !client->get_net_dev_by_params) continue; - if (client->get_net_dev_by_params) { - net_dev = client->get_net_dev_by_params(dev, port, pkey, - gid, addr, - context->data); - if (net_dev) - break; - } + net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, + addr, client_data); + if (net_dev) + break; } up_read(&lists_rwsem); @@ -1462,6 +1453,7 @@ static void __exit ib_core_cleanup(void) /* Make sure that any pending umem accounting work is done. 
*/ destroy_workqueue(ib_wq); WARN_ON(!xa_empty(&clients)); + WARN_ON(!xa_empty(&devices)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index cc15820513cd..8558f31ca46f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2542,12 +2542,7 @@ struct ib_device { struct list_head event_handler_list; spinlock_t event_handler_lock; - rwlock_t client_data_lock; - struct list_head core_list; - /* Access to the client_data_list is protected by the client_data_lock - * rwlock and the lists_rwsem read-write semaphore - */ - struct list_head client_data_list; + struct xarray client_data; struct ib_cache cache; /** @@ -2660,7 +2655,21 @@ void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); -void *ib_get_client_data(struct ib_device *device, struct ib_client *client); +/** + * ib_get_client_data - Get IB client context + * @device:Device to get context for + * @client:Client to get context for + * + * ib_get_client_data() returns the client context data set with + * ib_set_client_data(). This can only be called while the client is + * registered to the device, once the ib_client remove() callback returns this + * cannot be called. + */ +static inline void *ib_get_client_data(struct ib_device *device, + struct ib_client *client) +{ + return xa_load(&device->client_data, client->client_id); +} void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); void ib_set_device_ops(struct ib_device *device, From 921eab1143aadf976a42cac4605b4d35159b355d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:54 -0700 Subject: [PATCH 21/38] RDMA/devices: Re-organize device.c locking The locking here started out with a single lock that covered everything and then has lately veered into crazy town. The fundamental problem is that several places need to iterate over a linked list, but also need to drop their locks to avoid deadlock during client callbacks. xarray's restartable iteration offers a simple solution to the problem. Once all the lists are xarrays we can drop locks in the places that need that and rely on xarray to provide consistency and locking for the data structure. The resulting simplification is that each of the three lists has a dedicated rwsem that must be held when working with the list it covers. One data structure is no longer covered by multiple locks. The sleeping semaphore is selected because the read side generally needs to be held over something sleeping, and using RCU reader locking in those cases is overkill. In the process this simplifies the entire registration/unregistration flow to be the expected list of setups and the reversed list of matching teardowns, and the registration lock 'refcount' can now be revised to be released after the ULPs are removed, providing a very sane semantic for this feature. 
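As a rough sketch of the pattern described above (not part of the patch; the example_* names are placeholders, only the xarray/rwsem primitives and the ib_device index field are taken from the real code), registration becomes "store the entry, then flip a mark under the write side of a dedicated rwsem", while iteration holds only the read side:

#include <linux/xarray.h>
#include <linux/rwsem.h>
#include <rdma/ib_verbs.h>

static DEFINE_XARRAY_FLAGS(example_devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(example_devices_rwsem);
#define EXAMPLE_REGISTERED XA_MARK_1

/* Second stage of registration: make the already-stored entry visible. */
static void example_enable(struct ib_device *dev)
{
	down_write(&example_devices_rwsem);
	xa_set_mark(&example_devices, dev->index, EXAMPLE_REGISTERED);
	up_write(&example_devices_rwsem);
}

/*
 * Readers take only the read side; holding it guarantees the mark (and
 * therefore the registered state) cannot change underneath them, while the
 * xarray itself provides the restartable iteration mentioned above.
 */
static void example_for_each_registered(void (*fn)(struct ib_device *dev))
{
	struct ib_device *dev;
	unsigned long index;

	down_read(&example_devices_rwsem);
	xa_for_each_marked(&example_devices, index, dev, EXAMPLE_REGISTERED)
		fn(dev);
	up_read(&example_devices_rwsem);
}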
Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 361 +++++++++++++++++++------------ include/rdma/ib_verbs.h | 1 + 2 files changed, 222 insertions(+), 140 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 5096593b99e9..3325be4f91a5 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -56,6 +55,29 @@ struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); +/* + * Each of the three rwsem locks (devices, clients, client_data) protects the + * xarray of the same name. Specifically it allows the caller to assert that + * the MARK will/will not be changing under the lock, and for devices and + * clients, that the value in the xarray is still a valid pointer. Change of + * the MARK is linked to the object state, so holding the lock and testing the + * MARK also asserts that the contained object is in a certain state. + * + * This is used to build a two stage register/unregister flow where objects + * can continue to be in the xarray even though they are still in progress to + * register/unregister. + * + * The xarray itself provides additional locking, and restartable iteration, + * which is also relied on. + * + * Locks should not be nested, with the exception of client_data, which is + * allowed to nest under the read side of the other two locks. + * + * The devices_rwsem also protects the device name list, any change or + * assignment of device name must also hold the write side to guarantee unique + * names. + */ + /* * devices contains devices that have had their names assigned. The * devices may not be registered. Users that care about the registration @@ -64,17 +86,13 @@ EXPORT_SYMBOL_GPL(ib_wq); * */ static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); - -/* - * Note that if the *rwsem is held and the *_REGISTERED mark is seen then the - * object is guaranteed to be and remain registered for the duration of the - * lock. - */ +static DECLARE_RWSEM(devices_rwsem); #define DEVICE_REGISTERED XA_MARK_1 static LIST_HEAD(client_list); #define CLIENT_REGISTERED XA_MARK_1 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); +static DECLARE_RWSEM(clients_rwsem); /* * If client_data is registered then the corresponding client must also still @@ -115,20 +133,6 @@ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, !xa_is_err(entry); \ (index)++, entry = xan_find_marked(xa, &(index), filter)) -/* - * device_mutex and lists_rwsem protect access to both devices and - * clients. device_mutex protects writer access by device and client - * registration / de-registration. lists_rwsem protects reader access to - * these lists. Iterators of these lists must lock it for read, while updates - * to the lists must be done with a write lock. A special case is when the - * device_mutex is locked. In this case locking the lists for read access is - * not necessary as the device_mutex implies it. - * - * lists_rwsem also protects access to the client data list. 
- */ -static DEFINE_MUTEX(device_mutex); -static DECLARE_RWSEM(lists_rwsem); - static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); @@ -185,13 +189,13 @@ struct ib_device *ib_device_get_by_index(u32 index) { struct ib_device *device; - down_read(&lists_rwsem); + down_read(&devices_rwsem); device = xa_load(&devices, index); if (device) { if (!ib_device_try_get(device)) device = NULL; } - up_read(&lists_rwsem); + up_read(&devices_rwsem); return device; } @@ -225,7 +229,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) { int ret; - mutex_lock(&device_mutex); + down_write(&devices_rwsem); if (!strcmp(name, dev_name(&ibdev->dev))) { ret = 0; goto out; @@ -241,7 +245,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) goto out; strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); out: - mutex_unlock(&device_mutex); + up_write(&devices_rwsem); return ret; } @@ -253,6 +257,7 @@ static int alloc_name(struct ib_device *ibdev, const char *name) int rc; int i; + lockdep_assert_held_exclusive(&devices_rwsem); ida_init(&inuse); xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; @@ -345,6 +350,7 @@ struct ib_device *_ib_alloc_device(size_t size) * destroyed if the user stores NULL in the client data. */ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); + init_rwsem(&device->client_data_rwsem); INIT_LIST_HEAD(&device->port_list); init_completion(&device->unreg_completion); @@ -367,22 +373,86 @@ void ib_dealloc_device(struct ib_device *device) } EXPORT_SYMBOL(ib_dealloc_device); -static int add_client_context(struct ib_device *device, struct ib_client *client) +/* + * add_client_context() and remove_client_context() must be safe against + * parallel calls on the same device - registration/unregistration of both the + * device and client can be occurring in parallel. + * + * The routines need to be a fence, any caller must not return until the add + * or remove is fully completed. + */ +static int add_client_context(struct ib_device *device, + struct ib_client *client) { - void *entry; + int ret = 0; if (!device->kverbs_provider && !client->no_kverbs_req) - return -EOPNOTSUPP; + return 0; - down_write(&lists_rwsem); - entry = xa_store(&device->client_data, client->client_id, NULL, - GFP_KERNEL); - if (!xa_is_err(entry)) - xa_set_mark(&device->client_data, client->client_id, - CLIENT_DATA_REGISTERED); - up_write(&lists_rwsem); + down_write(&device->client_data_rwsem); + /* + * Another caller to add_client_context got here first and has already + * completely initialized context. 
+ */ + if (xa_get_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED)) + goto out; - return xa_err(entry); + ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, + GFP_KERNEL)); + if (ret) + goto out; + downgrade_write(&device->client_data_rwsem); + if (client->add) + client->add(device); + + /* Readers shall not see a client until add has been completed */ + xa_set_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED); + up_read(&device->client_data_rwsem); + return 0; + +out: + up_write(&device->client_data_rwsem); + return ret; +} + +static void remove_client_context(struct ib_device *device, + unsigned int client_id) +{ + struct ib_client *client; + void *client_data; + + down_write(&device->client_data_rwsem); + if (!xa_get_mark(&device->client_data, client_id, + CLIENT_DATA_REGISTERED)) { + up_write(&device->client_data_rwsem); + return; + } + client_data = xa_load(&device->client_data, client_id); + xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); + client = xa_load(&clients, client_id); + downgrade_write(&device->client_data_rwsem); + + /* + * Notice we cannot be holding any exclusive locks when calling the + * remove callback as the remove callback can recurse back into any + * public functions in this module and thus try for any locks those + * functions take. + * + * For this reason clients and drivers should not call the + * unregistration functions while holding any locks. + * + * It is tempting to drop the client_data_rwsem too, but this is required + * to ensure that unregister_client does not return until all clients + * are completely unregistered, which is required to avoid module + * unloading races. + */ + if (client->remove) + client->remove(device, client_data); + + xa_erase(&device->client_data, client_id); + up_read(&device->client_data_rwsem); } static int verify_immutable(const struct ib_device *dev, u8 port) @@ -461,7 +531,7 @@ static void ib_policy_change_task(struct work_struct *work) struct ib_device *dev; unsigned long index; - down_read(&lists_rwsem); + down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { int i; @@ -478,7 +548,7 @@ static void ib_policy_change_task(struct work_struct *work) ib_security_cache_change(dev, i, sp); } } - up_read(&lists_rwsem); + up_read(&devices_rwsem); } static int ib_security_change(struct notifier_block *nb, unsigned long event, @@ -501,6 +571,7 @@ static int assign_name(struct ib_device *device, const char *name) static u32 last_id; int ret; + down_write(&devices_rwsem); /* Assign a unique name to the device */ if (strchr(name, '%')) ret = alloc_name(device, name); @@ -528,13 +599,17 @@ static int assign_name(struct ib_device *device, const char *name) last_id = device->index + 1; ret = 0; + out: + up_write(&devices_rwsem); return ret; } static void release_name(struct ib_device *device) { + down_write(&devices_rwsem); xa_erase(&devices, device->index); + up_write(&devices_rwsem); } static void setup_dma_device(struct ib_device *device) @@ -572,11 +647,18 @@ static void setup_dma_device(struct ib_device *device) } } +/* + * setup_device() allocates memory and sets up data that requires calling the + * device ops, this is the only reason these actions are not done during + * ib_alloc_device. It is undone by ib_dealloc_device().
+ */ static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; + setup_dma_device(device); + ret = ib_device_check_mandatory(device); if (ret) return ret; @@ -605,6 +687,54 @@ static int setup_device(struct ib_device *device) return 0; } +static void disable_device(struct ib_device *device) +{ + struct ib_client *client; + + WARN_ON(!refcount_read(&device->refcount)); + + down_write(&devices_rwsem); + xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); + up_write(&devices_rwsem); + + down_read(&clients_rwsem); + list_for_each_entry_reverse(client, &client_list, list) + remove_client_context(device, client->client_id); + up_read(&clients_rwsem); + + /* Pairs with refcount_set in enable_device */ + ib_device_put(device); + wait_for_completion(&device->unreg_completion); +} + +/* + * An enabled device is visible to all clients and to all the public facing + * APIs that return a device pointer. + */ +static int enable_device(struct ib_device *device) +{ + struct ib_client *client; + unsigned long index; + int ret; + + refcount_set(&device->refcount, 1); + down_write(&devices_rwsem); + xa_set_mark(&devices, device->index, DEVICE_REGISTERED); + up_write(&devices_rwsem); + + down_read(&clients_rwsem); + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { + ret = add_client_context(device, client); + if (ret) { + up_read(&clients_rwsem); + disable_device(device); + return ret; + } + } + up_read(&clients_rwsem); + return 0; +} + /** * ib_register_device - Register an IB device with IB core * @device:Device to register @@ -617,26 +747,20 @@ static int setup_device(struct ib_device *device) int ib_register_device(struct ib_device *device, const char *name) { int ret; - struct ib_client *client; - unsigned long index; - - setup_dma_device(device); - - mutex_lock(&device_mutex); ret = assign_name(device, name); if (ret) - goto out; + return ret; ret = setup_device(device); if (ret) - goto out_name; + goto out; ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); - goto out_name; + goto out; } ib_device_register_rdmacg(device); @@ -648,25 +772,19 @@ int ib_register_device(struct ib_device *device, const char *name) goto cg_cleanup; } - refcount_set(&device->refcount, 1); + ret = enable_device(device); + if (ret) + goto sysfs_cleanup; - xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) - if (!add_client_context(device, client) && client->add) - client->add(device); - - down_write(&lists_rwsem); - xa_set_mark(&devices, device->index, DEVICE_REGISTERED); - up_write(&lists_rwsem); - mutex_unlock(&device_mutex); return 0; +sysfs_cleanup: + ib_device_unregister_sysfs(device); cg_cleanup: ib_device_unregister_rdmacg(device); ib_cache_cleanup_one(device); -out_name: - release_name(device); out: - mutex_unlock(&device_mutex); + release_name(device); return ret; } EXPORT_SYMBOL(ib_register_device); @@ -679,45 +797,11 @@ EXPORT_SYMBOL(ib_register_device); */ void ib_unregister_device(struct ib_device *device) { - struct ib_client *client; - unsigned long index; - - /* - * Wait for all netlink command callers to finish working on the - * device. 
- */ - ib_device_put(device); - wait_for_completion(&device->unreg_completion); - - mutex_lock(&device_mutex); - - down_write(&lists_rwsem); - xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); - xa_for_each (&clients, index, client) - xa_clear_mark(&device->client_data, index, - CLIENT_DATA_REGISTERED); - downgrade_write(&lists_rwsem); - - list_for_each_entry_reverse(client, &client_list, list) - if (xa_get_mark(&device->client_data, client->client_id, - CLIENT_DATA_REGISTERED) && - client->remove) - client->remove(device, xa_load(&device->client_data, - client->client_id)); - up_read(&lists_rwsem); - + disable_device(device); ib_device_unregister_sysfs(device); ib_device_unregister_rdmacg(device); - - release_name(device); - - mutex_unlock(&device_mutex); - ib_cache_cleanup_one(device); - - down_write(&lists_rwsem); - xa_destroy(&device->client_data); - up_write(&lists_rwsem); + release_name(device); } EXPORT_SYMBOL(ib_unregister_device); @@ -725,6 +809,7 @@ static int assign_client_id(struct ib_client *client) { int ret; + down_write(&clients_rwsem); /* * The add/remove callbacks must be called in FIFO/LIFO order. To * achieve this we assign client_ids so they are sorted in @@ -743,7 +828,11 @@ static int assign_client_id(struct ib_client *client) if (ret) goto out; + xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); + list_add_tail(&client->list, &client_list); + out: + up_write(&clients_rwsem); return ret; } @@ -766,23 +855,20 @@ int ib_register_client(struct ib_client *client) unsigned long index; int ret; - mutex_lock(&device_mutex); ret = assign_client_id(client); - if (ret) { - mutex_unlock(&device_mutex); + if (ret) return ret; + + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { + ret = add_client_context(device, client); + if (ret) { + up_read(&devices_rwsem); + ib_unregister_client(client); + return ret; + } } - - xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) - if (!add_client_context(device, client) && client->add) - client->add(device); - - down_write(&lists_rwsem); - xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); - up_write(&lists_rwsem); - - mutex_unlock(&device_mutex); - + up_read(&devices_rwsem); return 0; } EXPORT_SYMBOL(ib_register_client); @@ -794,38 +880,31 @@ EXPORT_SYMBOL(ib_register_client); * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. + * + * This is a full fence, once it returns no client callbacks will be called, + * or are running in another thread. */ void ib_unregister_client(struct ib_client *client) { struct ib_device *device; unsigned long index; - mutex_lock(&device_mutex); - - down_write(&lists_rwsem); + down_write(&clients_rwsem); xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); - up_write(&lists_rwsem); + up_write(&clients_rwsem); + /* + * Every device still known must be serialized to make sure we are + * done with the client callbacks before we return. 
+ */ + down_read(&devices_rwsem); + xa_for_each (&devices, index, device) + remove_client_context(device, client->client_id); + up_read(&devices_rwsem); - xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { - down_write(&lists_rwsem); - xa_clear_mark(&device->client_data, client->client_id, - CLIENT_DATA_REGISTERED); - up_write(&lists_rwsem); - - if (client->remove) - client->remove(device, xa_load(&device->client_data, - client->client_id)); - - down_write(&lists_rwsem); - xa_erase(&device->client_data, client->client_id); - up_write(&lists_rwsem); - } - - down_write(&lists_rwsem); + down_write(&clients_rwsem); list_del(&client->list); xa_erase(&clients, client->client_id); - up_write(&lists_rwsem); - mutex_unlock(&device_mutex); + up_write(&clients_rwsem); } EXPORT_SYMBOL(ib_unregister_client); @@ -1010,10 +1089,10 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, struct ib_device *dev; unsigned long index; - down_read(&lists_rwsem); + down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); - up_read(&lists_rwsem); + up_read(&devices_rwsem); } /** @@ -1030,15 +1109,14 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, unsigned int idx = 0; int ret = 0; - down_read(&lists_rwsem); + down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { ret = nldev_cb(dev, skb, cb, idx); if (ret) break; idx++; } - - up_read(&lists_rwsem); + up_read(&devices_rwsem); return ret; } @@ -1196,6 +1274,7 @@ EXPORT_SYMBOL(ib_find_pkey); * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. + * */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, @@ -1210,8 +1289,11 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, if (!rdma_protocol_ib(dev, port)) return NULL; - down_read(&lists_rwsem); - + /* + * Holding the read side guarantees that the client will not become + * unregistered while we are calling get_net_dev_by_params() + */ + down_read(&dev->client_data_rwsem); xan_for_each_marked (&dev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); @@ -1224,8 +1306,7 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, if (net_dev) break; } - - up_read(&lists_rwsem); + up_read(&dev->client_data_rwsem); return net_dev; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 8558f31ca46f..135fab2c016c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2542,6 +2542,7 @@ struct ib_device { struct list_head event_handler_list; spinlock_t event_handler_lock; + struct rw_semaphore client_data_rwsem; struct xarray client_data; struct ib_cache cache; From e381a1cb650d97e213b5943c81bbcadf8f480962 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Wed, 6 Feb 2019 22:54:42 +0530 Subject: [PATCH 22/38] cxgb4: add tcb flags and tcb rpl struct This patch adds the tcb flags and structures needed for querying tcb information. 
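To illustrate the naming convention only (the helper below is hypothetical and not part of the patch; the macros are the ones added here): each field is described by the 32-bit TCB word it lives in (_W), its bit offset within that word (_S), its mask (_M) and a value placer (_V), so a caller that has fetched word 30 of a TCB could recover the cached RQ start index like this:

static inline u32 example_tcb_rq_start(u32 tcb_word30)
{
	/* TCB_RQ_START_S is 0 and TCB_RQ_START_M is 0x3ffffff (26 bits wide) */
	return (tcb_word30 >> TCB_RQ_START_S) & TCB_RQ_START_M;
}

The next patch adds t4_tcb_get_field32()/t4_tcb_get_field64() helpers that perform the same kind of extraction on the big-endian 64-bit words returned in a CPL_GET_TCB_RPL.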
Signed-off-by: Raju Rangoju Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/chelsio/cxgb4/t4_msg.h | 8 ++++++++ drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h | 12 ++++++++++++ 2 files changed, 20 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h index c62a0c830705..38dd41eb959e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h @@ -56,6 +56,7 @@ enum { CPL_TX_DATA_ISO = 0x1F, CPL_CLOSE_LISTSRV_RPL = 0x20, + CPL_GET_TCB_RPL = 0x22, CPL_L2T_WRITE_RPL = 0x23, CPL_PASS_OPEN_RPL = 0x24, CPL_ACT_OPEN_RPL = 0x25, @@ -688,6 +689,13 @@ struct cpl_get_tcb { #define NO_REPLY_V(x) ((x) << NO_REPLY_S) #define NO_REPLY_F NO_REPLY_V(1U) +struct cpl_get_tcb_rpl { + union opcode_tid ot; + __u8 cookie; + __u8 status; + __be16 len; +}; + struct cpl_set_tcb_field { WR_HDR; union opcode_tid ot; diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h b/drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h index 3297ce025e8b..1b9afb192f7f 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h @@ -41,6 +41,14 @@ #define TCB_SMAC_SEL_V(x) ((x) << TCB_SMAC_SEL_S) #define TCB_T_FLAGS_W 1 +#define TCB_T_FLAGS_S 0 +#define TCB_T_FLAGS_M 0xffffffffffffffffULL +#define TCB_T_FLAGS_V(x) ((__u64)(x) << TCB_T_FLAGS_S) + +#define TCB_RQ_START_W 30 +#define TCB_RQ_START_S 0 +#define TCB_RQ_START_M 0x3ffffffULL +#define TCB_RQ_START_V(x) ((x) << TCB_RQ_START_S) #define TF_CCTRL_ECE_S 60 #define TF_CCTRL_CWR_S 61 @@ -66,4 +74,8 @@ #define TCB_RX_FRAG3_LEN_RAW_W 29 #define TCB_RX_FRAG3_START_IDX_OFFSET_RAW_W 30 #define TCB_PDU_HDR_LEN_W 31 + +#define TF_RX_PDU_OUT_S 49 +#define TF_RX_PDU_OUT_V(x) ((__u64)(x) << TF_RX_PDU_OUT_S) + #endif /* __T4_TCB_H */ From 11a27e2121a544cae2dde62df9218b3d5d888a02 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Wed, 6 Feb 2019 22:54:43 +0530 Subject: [PATCH 23/38] iw_cxgb4: complete the cached SRQ buffers If TP fetches an SRQ buffer but ends up not using it before the connection is aborted, then it passes the index of that SRQ buffer to the host in ABORT_REQ_RSS or ABORT_RPL CPL message. But, if the srqidx field is zero in the received ABORT_RPL or ABORT_REQ_RSS CPL, then we need to read the tcb.rq_start field to see if it really did have an RQE cached. This works around a case where HW does not include the srqidx in the ABORT_RPL/ABORT_REQ_RSS CPL. The final value of rq_start is the one present in TCB with the TF_RX_PDU_OUT bit cleared. So, we need to read the TCB, examine the TF_RX_PDU_OUT (bit 49 of t_flags) in order to determine if there's a rx PDU feedback event pending. 
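A minimal sketch of that check (illustrative only; example_rx_pdu_out_pending() is not a function in the driver, but TF_RX_PDU_OUT_S/TF_RX_PDU_OUT_V come from t4_tcb.h as added in the previous patch):

static inline bool example_rx_pdu_out_pending(u64 t_flags)
{
	/* TF_RX_PDU_OUT is bit 49 of the 64-bit t_flags TCB field */
	return (t_flags & TF_RX_PDU_OUT_V(1ULL)) != 0;
}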
Signed-off-by: Raju Rangoju Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/cm.c | 161 +++++++++++++++++++++++-- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 3 + drivers/infiniband/hw/cxgb4/t4.h | 1 + 3 files changed, 157 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 77efd4ae8e10..b188d89674f1 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -655,7 +655,33 @@ static int send_halfclose(struct c4iw_ep *ep) return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } -static int send_abort(struct c4iw_ep *ep) +void read_tcb(struct c4iw_ep *ep) +{ + struct sk_buff *skb; + struct cpl_get_tcb *req; + int wrlen = roundup(sizeof(*req), 16); + + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (WARN_ON(!skb)) + return; + + set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx); + req = (struct cpl_get_tcb *) skb_put(skb, wrlen); + memset(req, 0, wrlen); + INIT_TP_WR(req, ep->hwtid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_GET_TCB, ep->hwtid)); + req->reply_ctrl = htons(REPLY_CHAN_V(0) | QUEUENO_V(ep->rss_qid)); + + /* + * keep a ref on the ep so the tcb is not unlocked before this + * cpl completes. The ref is released in read_tcb_rpl(). + */ + c4iw_get_ep(&ep->com); + if (WARN_ON(c4iw_ofld_send(&ep->com.dev->rdev, skb))) + c4iw_put_ep(&ep->com); +} + +static int send_abort_req(struct c4iw_ep *ep) { u32 wrlen = roundup(sizeof(struct cpl_abort_req), 16); struct sk_buff *req_skb = skb_dequeue(&ep->com.ep_skb_list); @@ -670,6 +696,17 @@ static int send_abort(struct c4iw_ep *ep) return c4iw_l2t_send(&ep->com.dev->rdev, req_skb, ep->l2t); } +static int send_abort(struct c4iw_ep *ep) +{ + if (!ep->com.qp || !ep->com.qp->srq) { + send_abort_req(ep); + return 0; + } + set_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags); + read_tcb(ep); + return 0; +} + static int send_connect(struct c4iw_ep *ep) { struct cpl_act_open_req *req = NULL; @@ -1851,14 +1888,11 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } -static void complete_cached_srq_buffers(struct c4iw_ep *ep, - __be32 srqidx_status) +static void complete_cached_srq_buffers(struct c4iw_ep *ep, u32 srqidx) { enum chip_type adapter_type; - u32 srqidx; adapter_type = ep->com.dev->rdev.lldi.adapter_type; - srqidx = ABORT_RSS_SRQIDX_G(be32_to_cpu(srqidx_status)); /* * If this TCB had a srq buffer cached, then we must complete @@ -1876,6 +1910,7 @@ static void complete_cached_srq_buffers(struct c4iw_ep *ep, static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) { + u32 srqidx; struct c4iw_ep *ep; struct cpl_abort_rpl_rss6 *rpl = cplhdr(skb); int release = 0; @@ -1887,7 +1922,10 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } - complete_cached_srq_buffers(ep, rpl->srqidx_status); + if (ep->com.qp && ep->com.qp->srq) { + srqidx = ABORT_RSS_SRQIDX_G(be32_to_cpu(rpl->srqidx_status)); + complete_cached_srq_buffers(ep, srqidx ? 
srqidx : ep->srqe_idx); + } pr_debug("ep %p tid %u\n", ep, ep->hwtid); mutex_lock(&ep->com.mutex); @@ -2746,6 +2784,21 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } +static void finish_peer_abort(struct c4iw_dev *dev, struct c4iw_ep *ep) +{ + complete_cached_srq_buffers(ep, ep->srqe_idx); + if (ep->com.cm_id && ep->com.qp) { + struct c4iw_qp_attributes attrs; + + attrs.next_state = C4IW_QP_STATE_ERROR; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + } + peer_abort_upcall(ep); + release_ep_resources(ep); + c4iw_put_ep(&ep->com); +} + static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_abort_req_rss6 *req = cplhdr(skb); @@ -2756,6 +2809,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) int release = 0; unsigned int tid = GET_TID(req); u8 status; + u32 srqidx; u32 len = roundup(sizeof(struct cpl_abort_rpl), 16); @@ -2775,8 +2829,6 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) goto deref_ep; } - complete_cached_srq_buffers(ep, req->srqidx_status); - pr_debug("ep %p tid %u state %u\n", ep, ep->hwtid, ep->com.state); set_bit(PEER_ABORT, &ep->com.history); @@ -2825,6 +2877,23 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) stop_ep_timer(ep); /*FALLTHROUGH*/ case FPDU_MODE: + if (ep->com.qp && ep->com.qp->srq) { + srqidx = ABORT_RSS_SRQIDX_G( + be32_to_cpu(req->srqidx_status)); + if (srqidx) { + complete_cached_srq_buffers(ep, + req->srqidx_status); + } else { + /* Hold ep ref until finish_peer_abort() */ + c4iw_get_ep(&ep->com); + __state_set(&ep->com, ABORTING); + set_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags); + read_tcb(ep); + break; + + } + } + if (ep->com.cm_id && ep->com.qp) { attrs.next_state = C4IW_QP_STATE_ERROR; ret = c4iw_modify_qp(ep->com.qp->rhp, @@ -3726,6 +3795,80 @@ static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb, return; } +static inline u64 t4_tcb_get_field64(__be64 *tcb, u16 word) +{ + u64 tlo = be64_to_cpu(tcb[((31 - word) / 2)]); + u64 thi = be64_to_cpu(tcb[((31 - word) / 2) - 1]); + u64 t; + u32 shift = 32; + + t = (thi << shift) | (tlo >> shift); + + return t; +} + +static inline u32 t4_tcb_get_field32(__be64 *tcb, u16 word, u32 mask, u32 shift) +{ + u32 v; + u64 t = be64_to_cpu(tcb[(31 - word) / 2]); + + if (word & 0x1) + shift += 32; + v = (t >> shift) & mask; + return v; +} + +static int read_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_get_tcb_rpl *rpl = cplhdr(skb); + __be64 *tcb = (__be64 *)(rpl + 1); + unsigned int tid = GET_TID(rpl); + struct c4iw_ep *ep; + u64 t_flags_64; + u32 rx_pdu_out; + + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; + /* Examine the TF_RX_PDU_OUT (bit 49 of the t_flags) in order to + * determine if there's a rx PDU feedback event pending. + * + * If that bit is set, it means we'll need to re-read the TCB's + * rq_start value. The final value is the one present in a TCB + * with the TF_RX_PDU_OUT bit cleared. 
+ */ + + t_flags_64 = t4_tcb_get_field64(tcb, TCB_T_FLAGS_W); + rx_pdu_out = (t_flags_64 & TF_RX_PDU_OUT_V(1)) >> TF_RX_PDU_OUT_S; + + c4iw_put_ep(&ep->com); /* from get_ep_from_tid() */ + c4iw_put_ep(&ep->com); /* from read_tcb() */ + + /* If TF_RX_PDU_OUT bit is set, re-read the TCB */ + if (rx_pdu_out) { + if (++ep->rx_pdu_out_cnt >= 2) { + WARN_ONCE(1, "tcb re-read() reached the guard limit, finishing the cleanup\n"); + goto cleanup; + } + read_tcb(ep); + return 0; + } + + ep->srqe_idx = t4_tcb_get_field32(tcb, TCB_RQ_START_W, TCB_RQ_START_M, + TCB_RQ_START_S); +cleanup: + pr_debug("ep %p tid %u %016x\n", ep, ep->hwtid, ep->srqe_idx); + + if (test_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags)) + finish_peer_abort(dev, ep); + else if (test_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags)) + send_abort_req(ep); + else + WARN_ONCE(1, "unexpected state!"); + + return 0; +} + static int deferred_fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_fw6_msg *rpl = cplhdr(skb); @@ -4046,6 +4189,7 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS + NUM_FAKE_CPLS] = { [CPL_CLOSE_CON_RPL] = close_con_rpl, [CPL_RDMA_TERMINATE] = terminate, [CPL_FW4_ACK] = fw4_ack, + [CPL_GET_TCB_RPL] = read_tcb_rpl, [CPL_FW6_MSG] = deferred_fw6_msg, [CPL_RX_PKT] = rx_pkt, [FAKE_CPL_PUT_EP_SAFE] = _put_ep_safe, @@ -4277,6 +4421,7 @@ c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = { [CPL_RDMA_TERMINATE] = sched, [CPL_FW4_ACK] = sched, [CPL_SET_TCB_RPL] = set_tcb_rpl, + [CPL_GET_TCB_RPL] = sched, [CPL_FW6_MSG] = fw6_msg, [CPL_RX_PKT] = sched }; diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index f0fceadd0d12..3a0923f7c60e 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -982,6 +982,9 @@ struct c4iw_ep { int rcv_win; u32 snd_wscale; struct c4iw_ep_stats stats; + u32 srqe_idx; + u32 rx_pdu_out_cnt; + struct sk_buff *peer_abort_skb; }; static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id) diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index fff6d48d262f..b170817b2741 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -35,6 +35,7 @@ #include "t4_regs.h" #include "t4_values.h" #include "t4_msg.h" +#include "t4_tcb.h" #include "t4fw_ri_api.h" #define T4_MAX_NUM_PD 65536 From f368ff188ae4b3ef6f740a15999ea0373261b619 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Wed, 6 Feb 2019 22:54:44 +0530 Subject: [PATCH 24/38] iw_cxgb4: fix srqidx leak during connection abort When an application aborts the connection by moving QP from RTS to ERROR, then iw_cxgb4's modify_rc_qp() RTS->ERROR logic sets the *srqidxp to 0 via t4_set_wq_in_error(&qhp->wq, 0), and aborts the connection by calling c4iw_ep_disconnect(). c4iw_ep_disconnect() does the following: 1. sends up a close_complete_upcall(ep, -ECONNRESET) to libcxgb4. 2. sends abort request CPL to hw. But, since the close_complete_upcall() is sent before sending the ABORT_REQ to hw, libcxgb4 would fail to release the srqidx if the connection holds one, because the srqidx is passed up to libcxgb4 only after the corresponding ABORT_RPL is processed by the kernel in abort_rpl(). This patch handles the corner-case by moving the call to close_complete_upcall() from c4iw_ep_disconnect() to abort_rpl(), so that libcxgb4 is notified about the -ECONNRESET only after abort_rpl(), and libcxgb4 can relinquish the srqidx properly.
Signed-off-by: Raju Rangoju Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/cm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index b188d89674f1..ae90b2932bd2 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1941,8 +1941,10 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) } mutex_unlock(&ep->com.mutex); - if (release) + if (release) { + close_complete_upcall(ep, -ECONNRESET); release_ep_resources(ep); + } c4iw_put_ep(&ep->com); return 0; } @@ -3684,7 +3686,6 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) if (close) { if (abrupt) { set_bit(EP_DISC_ABORT, &ep->com.history); - close_complete_upcall(ep, -ECONNRESET); ret = send_abort(ep); } else { set_bit(EP_DISC_CLOSE, &ep->com.history); From d901b2760dc6cd5fbbf2eac31d71d94baa6c4aef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 4 Jan 2019 11:40:21 -0700 Subject: [PATCH 25/38] lib/scatterlist: Provide a DMA page iterator Commit 2db76d7c3c6d ("lib/scatterlist: sg_page_iter: support sg lists w/o backing pages") introduced the sg_page_iter_dma_address() function without providing a way to use it in the general case. If the sg_dma_len() is not equal to the sg length callers cannot safely use the for_each_sg_page/sg_page_iter_dma_address combination. Resolve this API mistake by providing a DMA specific iterator, for_each_sg_dma_page(), that uses the right length so sg_page_iter_dma_address() works as expected with all sglists. A new iterator type is introduced to provide compile-time safety against wrongly mixing accessors and iterators. Acked-by: Christoph Hellwig (for scatterlist) Acked-by: Thomas Hellstrom Acked-by: Sakari Ailus (ipu3-cio2) Signed-off-by: Jason Gunthorpe --- .clang-format | 1 + drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 8 +++- drivers/media/pci/intel/ipu3/ipu3-cio2.c | 4 +- include/linux/scatterlist.h | 49 ++++++++++++++++++---- lib/scatterlist.c | 26 ++++++++++++ 5 files changed, 76 insertions(+), 12 deletions(-) diff --git a/.clang-format b/.clang-format index bc2ffb2a0b53..335ce29ab813 100644 --- a/.clang-format +++ b/.clang-format @@ -240,6 +240,7 @@ ForEachMacros: - 'for_each_set_bit' - 'for_each_set_bit_from' - 'for_each_sg' + - 'for_each_sg_dma_page' - 'for_each_sg_page' - 'for_each_sibling_event' - '__for_each_thread' diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c index 31786b200afc..a3357ff7540d 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c @@ -311,7 +311,13 @@ static dma_addr_t __vmw_piter_dma_addr(struct vmw_piter *viter) static dma_addr_t __vmw_piter_sg_addr(struct vmw_piter *viter) { - return sg_page_iter_dma_address(&viter->iter); + /* + * FIXME: This driver wrongly mixes DMA and CPU SG list iteration and + * needs revision. 
See + * https://lore.kernel.org/lkml/20190104223531.GA1705@ziepe.ca/ + */ + return sg_page_iter_dma_address( + container_of(&viter->iter, struct sg_dma_page_iter, base)); } diff --git a/drivers/media/pci/intel/ipu3/ipu3-cio2.c b/drivers/media/pci/intel/ipu3/ipu3-cio2.c index cdb79ae2d8dc..9fbfbda74171 100644 --- a/drivers/media/pci/intel/ipu3/ipu3-cio2.c +++ b/drivers/media/pci/intel/ipu3/ipu3-cio2.c @@ -846,7 +846,7 @@ static int cio2_vb2_buf_init(struct vb2_buffer *vb) unsigned int pages = DIV_ROUND_UP(vb->planes[0].length, CIO2_PAGE_SIZE); unsigned int lops = DIV_ROUND_UP(pages + 1, entries_per_page); struct sg_table *sg; - struct sg_page_iter sg_iter; + struct sg_dma_page_iter sg_iter; int i, j; if (lops <= 0 || lops > CIO2_MAX_LOPS) { @@ -873,7 +873,7 @@ static int cio2_vb2_buf_init(struct vb2_buffer *vb) b->offset = sg->sgl->offset; i = j = 0; - for_each_sg_page(sg->sgl, &sg_iter, sg->nents, 0) { + for_each_sg_dma_page (sg->sgl, &sg_iter, sg->nents, 0) { if (!pages--) break; b->lop[i][j] = sg_page_iter_dma_address(&sg_iter) >> PAGE_SHIFT; diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index b96f0d0b5b8f..b4be960c7e5d 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -339,12 +339,12 @@ int sg_alloc_table_chained(struct sg_table *table, int nents, /* * sg page iterator * - * Iterates over sg entries page-by-page. On each successful iteration, - * you can call sg_page_iter_page(@piter) and sg_page_iter_dma_address(@piter) - * to get the current page and its dma address. @piter->sg will point to the - * sg holding this page and @piter->sg_pgoffset to the page's page offset - * within the sg. The iteration will stop either when a maximum number of sg - * entries was reached or a terminating sg (sg_last(sg) == true) was reached. + * Iterates over sg entries page-by-page. On each successful iteration, you + * can call sg_page_iter_page(@piter) to get the current page and its dma + * address. @piter->sg will point to the sg holding this page and + * @piter->sg_pgoffset to the page's page offset within the sg. The iteration + * will stop either when a maximum number of sg entries was reached or a + * terminating sg (sg_last(sg) == true) was reached. */ struct sg_page_iter { struct scatterlist *sg; /* sg holding the page */ @@ -356,7 +356,19 @@ struct sg_page_iter { * next step */ }; +/* + * sg page iterator for DMA addresses + * + * This is the same as sg_page_iter however you can call + * sg_page_iter_dma_address(@dma_iter) to get the page's DMA + * address. sg_page_iter_page() cannot be called on this iterator. + */ +struct sg_dma_page_iter { + struct sg_page_iter base; +}; + bool __sg_page_iter_next(struct sg_page_iter *piter); +bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter); void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset); @@ -372,11 +384,13 @@ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) /** * sg_page_iter_dma_address - get the dma address of the current page held by * the page iterator. 
- * @piter: page iterator holding the page + * @dma_iter: page iterator holding the page */ -static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter) +static inline dma_addr_t +sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) { - return sg_dma_address(piter->sg) + (piter->sg_pgoffset << PAGE_SHIFT); + return sg_dma_address(dma_iter->base.sg) + + (dma_iter->base.sg_pgoffset << PAGE_SHIFT); } /** @@ -385,11 +399,28 @@ static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter) * @piter: page iterator to hold current page, sg, sg_pgoffset * @nents: maximum number of sg entries to iterate over * @pgoffset: starting page offset + * + * Callers may use sg_page_iter_page() to get each page pointer. */ #define for_each_sg_page(sglist, piter, nents, pgoffset) \ for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ __sg_page_iter_next(piter);) +/** + * for_each_sg_dma_page - iterate over the pages of the given sg list + * @sglist: sglist to iterate over + * @dma_iter: page iterator to hold current page + * @dma_nents: maximum number of sg entries to iterate over, this is the value + * returned from dma_map_sg + * @pgoffset: starting page offset + * + * Callers may use sg_page_iter_dma_address() to get each page's DMA address. + */ +#define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \ + for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \ + pgoffset); \ + __sg_page_iter_dma_next(dma_iter);) + /* * Mapping sg iterator * diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 9ba349e775ef..739dc9fe2c55 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -625,6 +625,32 @@ bool __sg_page_iter_next(struct sg_page_iter *piter) } EXPORT_SYMBOL(__sg_page_iter_next); +static int sg_dma_page_count(struct scatterlist *sg) +{ + return PAGE_ALIGN(sg->offset + sg_dma_len(sg)) >> PAGE_SHIFT; +} + +bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter) +{ + struct sg_page_iter *piter = &dma_iter->base; + + if (!piter->__nents || !piter->sg) + return false; + + piter->sg_pgoffset += piter->__pg_advance; + piter->__pg_advance = 1; + + while (piter->sg_pgoffset >= sg_dma_page_count(piter->sg)) { + piter->sg_pgoffset -= sg_dma_page_count(piter->sg); + piter->sg = sg_next(piter->sg); + if (!--piter->__nents || !piter->sg) + return false; + } + + return true; +} +EXPORT_SYMBOL(__sg_page_iter_dma_next); + /** * sg_miter_start - start mapping iteration over a sg list * @miter: sg mapping iter to be started From 161ebe2498d4108c5e80dc3b47736ecda32cd816 Mon Sep 17 00:00:00 2001 From: "Shiraz, Saleem" Date: Mon, 11 Feb 2019 09:24:57 -0600 Subject: [PATCH 26/38] RDMA/bnxt_re: Use for_each_sg_dma_page iterator on umem SGL Use the for_each_sg_dma_page iterator variant to walk the umem DMA-mapped SGL and get the page DMA address. This avoids the extra loop to iterate pages in the SGE when for_each_sg iterator is used. Additionally, purge umem->page_shift usage in the driver as its only relevant for ODP MRs. Use system page size and shift instead. 
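The conversion pattern used here and in the remaining for_each_sg_dma_page patches of this series reduces to the sketch below (example_walk_umem() and the emit callback are placeholders for the driver-specific PBL/MTT writers; the iterator API itself is the one added by the lib/scatterlist patch earlier in the series):

#include <linux/scatterlist.h>
#include <rdma/ib_umem.h>

static void example_walk_umem(struct ib_umem *umem,
			      void (*emit)(dma_addr_t dma_addr))
{
	struct sg_dma_page_iter sg_iter;

	/* Walk PAGE_SIZE chunks of the DMA-mapped SGL, not CPU pages */
	for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0)
		emit(sg_page_iter_dma_address(&sg_iter));
}

Because the DMA iterator already advances in PAGE_SIZE steps, the per-SGE inner loop and the umem->page_shift arithmetic both disappear, which is what each of the driver diffs that follow does.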
Signed-off-by: Shiraz, Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 21 ++++++++------------- drivers/infiniband/hw/bnxt_re/qplib_res.c | 9 +++++---- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 1606571af63d..bff9320a968e 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3553,19 +3553,14 @@ static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig, u64 *pbl_tbl = pbl_tbl_orig; u64 paddr; u64 page_mask = (1ULL << page_shift) - 1; - int i, pages; - struct scatterlist *sg; - int entry; + struct sg_dma_page_iter sg_iter; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - pages = sg_dma_len(sg) >> PAGE_SHIFT; - for (i = 0; i < pages; i++) { - paddr = sg_dma_address(sg) + (i << PAGE_SHIFT); - if (pbl_tbl == pbl_tbl_orig) - *pbl_tbl++ = paddr & ~page_mask; - else if ((paddr & page_mask) == 0) - *pbl_tbl++ = paddr; - } + for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { + paddr = sg_page_iter_dma_address(&sg_iter); + if (pbl_tbl == pbl_tbl_orig) + *pbl_tbl++ = paddr & ~page_mask; + else if ((paddr & page_mask) == 0) + *pbl_tbl++ = paddr; } return pbl_tbl - pbl_tbl_orig; } @@ -3628,7 +3623,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, goto free_umem; } - page_shift = umem->page_shift; + page_shift = PAGE_SHIFT; if (!bnxt_re_page_size_ok(page_shift)) { dev_err(rdev_to_dev(rdev), "umem page size unsupported!"); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index c8502c2844a2..d08b9d9948fd 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, struct scatterlist *sghead, u32 pages, u32 pg_size) { - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; bool is_umem = false; int i; @@ -116,12 +116,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, } else { i = 0; is_umem = true; - for_each_sg(sghead, sg, pages, i) { - pbl->pg_map_arr[i] = sg_dma_address(sg); - pbl->pg_arr[i] = sg_virt(sg); + for_each_sg_dma_page (sghead, &sg_iter, pages, 0) { + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); + pbl->pg_arr[i] = NULL; if (!pbl->pg_arr[i]) goto fail; + i++; pbl->pg_count++; } } From 8d249af3e6d70db08b61449fe6ad1da000d625d1 Mon Sep 17 00:00:00 2001 From: "Shiraz, Saleem" Date: Mon, 11 Feb 2019 09:24:58 -0600 Subject: [PATCH 27/38] RDMA/mthca: Use for_each_sg_dma_page iterator on umem SGL Use the for_each_sg_dma_page iterator variant to walk the umem DMA-mapped SGL and get the page DMA address. This avoids the extra loop to iterate pages in the SGE when for_each_sg iterator is used. Additionally, purge umem->page_shift usage in the driver as its only relevant for ODP MRs. Use system page size and shift instead. 
Signed-off-by: Shiraz, Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mthca/mthca_provider.c | 36 +++++++++----------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 2c754bc226f3..516c8cf9c0fd 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -897,12 +897,11 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { struct mthca_dev *dev = to_mdev(pd->device); - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; struct mthca_mr *mr; struct mthca_reg_mr ucmd; u64 *pages; - int shift, n, len; - int i, k, entry; + int n, i; int err = 0; int write_mtt_size; @@ -929,7 +928,6 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, goto err; } - shift = mr->umem->page_shift; n = mr->umem->nmap; mr->mtt = mthca_alloc_mtt(dev, n); @@ -948,21 +946,19 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages)); - for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) { - len = sg_dma_len(sg) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = sg_dma_address(sg) + (k << shift); - /* - * Be friendly to write_mtt and pass it chunks - * of appropriate size. - */ - if (i == write_mtt_size) { - err = mthca_write_mtt(dev, mr->mtt, n, pages, i); - if (err) - goto mtt_done; - n += i; - i = 0; - } + for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) { + pages[i++] = sg_page_iter_dma_address(&sg_iter); + + /* + * Be friendly to write_mtt and pass it chunks + * of appropriate size. + */ + if (i == write_mtt_size) { + err = mthca_write_mtt(dev, mr->mtt, n, pages, i); + if (err) + goto mtt_done; + n += i; + i = 0; } } @@ -973,7 +969,7 @@ mtt_done: if (err) goto err_mtt; - err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length, + err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, PAGE_SHIFT, virt, length, convert_access(acc), mr); if (err) From 43fae91276a543e430649bc88284bd7be475dfba Mon Sep 17 00:00:00 2001 From: "Shiraz, Saleem" Date: Mon, 11 Feb 2019 09:24:59 -0600 Subject: [PATCH 28/38] RDMA/i40iw: Use for_each_sg_dma_page iterator on umem SGL Use the for_each_sg_dma_page iterator variant to walk the umem DMA-mapped SGL and get the page DMA address. This avoids the extra loop to iterate pages in the SGE when for_each_sg iterator is used. Additionally, purge umem->page_shift usage in the driver as its only relevant for ODP MRs. Use system page size and shift instead. 
Signed-off-by: Shiraz, Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 33 +++++++++++------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 28449ad57b37..d5fb2b927587 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1360,32 +1360,29 @@ static void i40iw_copy_user_pgaddrs(struct i40iw_mr *iwmr, { struct ib_umem *region = iwmr->region; struct i40iw_pbl *iwpbl = &iwmr->iwpbl; - int chunk_pages, entry, i; struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc; struct i40iw_pble_info *pinfo; - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; u64 pg_addr = 0; u32 idx = 0; + bool first_pg = true; pinfo = (level == I40IW_LEVEL_1) ? NULL : palloc->level2.leaf; - for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) { - chunk_pages = sg_dma_len(sg) >> region->page_shift; - if ((iwmr->type == IW_MEMREG_TYPE_QP) && - !iwpbl->qp_mr.sq_page) - iwpbl->qp_mr.sq_page = sg_page(sg); - for (i = 0; i < chunk_pages; i++) { - pg_addr = sg_dma_address(sg) + - (i << region->page_shift); + if (iwmr->type == IW_MEMREG_TYPE_QP) + iwpbl->qp_mr.sq_page = sg_page(region->sg_head.sgl); - if ((entry + i) == 0) - *pbl = cpu_to_le64(pg_addr & iwmr->page_msk); - else if (!(pg_addr & ~iwmr->page_msk)) - *pbl = cpu_to_le64(pg_addr); - else - continue; - pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx); - } + for_each_sg_dma_page (region->sg_head.sgl, &sg_iter, region->nmap, 0) { + pg_addr = sg_page_iter_dma_address(&sg_iter); + if (first_pg) + *pbl = cpu_to_le64(pg_addr & iwmr->page_msk); + else if (!(pg_addr & ~iwmr->page_msk)) + *pbl = cpu_to_le64(pg_addr); + else + continue; + + first_pg = false; + pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx); } } From 3856ec55270099494afa0cabba020365a38430a2 Mon Sep 17 00:00:00 2001 From: "Shiraz, Saleem" Date: Mon, 11 Feb 2019 09:25:00 -0600 Subject: [PATCH 29/38] RDMA/hns: Use for_each_sg_dma_page iterator on umem SGL Use the for_each_sg_dma_page iterator variant to walk the umem DMA-mapped SGL and get the page DMA address. This avoids the extra loop to iterate pages in the SGE when for_each_sg iterator is used. Additionally, purge umem->page_shift usage in the driver as its only relevant for ODP MRs. Use system page size and shift instead. 
Signed-off-by: Shiraz, Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 7 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 25 +++---- drivers/infiniband/hw/hns/hns_roce_mr.c | 86 ++++++++++------------ drivers/infiniband/hw/hns/hns_roce_qp.c | 10 +-- 4 files changed, 55 insertions(+), 73 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index a18b88c95995..c8c90072badd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -1871,9 +1871,8 @@ static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, unsigned long mtpt_idx) { struct hns_roce_v1_mpt_entry *mpt_entry; - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; u64 *pages; - int entry; int i; /* MPT filled into mailbox buf */ @@ -1928,8 +1927,8 @@ static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, return -ENOMEM; i = 0; - for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) { - pages[i] = ((u64)sg_dma_address(sg)) >> 12; + for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) { + pages[i] = ((u64)sg_page_iter_dma_address(&sg_iter)) >> 12; /* Directly record to MTPT table firstly 7 entry */ if (i >= HNS_ROCE_MAX_INNER_MTPT_NUM) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 19fefff4f699..c648ee825852 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2084,12 +2084,10 @@ static int hns_roce_v2_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry, struct hns_roce_mr *mr) { - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; u64 page_addr; u64 *pages; - int i, j; - int len; - int entry; + int i; mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size); mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3)); @@ -2102,17 +2100,14 @@ static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry, return -ENOMEM; i = 0; - for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) { - len = sg_dma_len(sg) >> PAGE_SHIFT; - for (j = 0; j < len; ++j) { - page_addr = sg_dma_address(sg) + - (j << mr->umem->page_shift); - pages[i] = page_addr >> 6; - /* Record the first 2 entry directly to MTPT table */ - if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1) - goto found; - i++; - } + for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) { + page_addr = sg_page_iter_dma_address(&sg_iter); + pages[i] = page_addr >> 6; + + /* Record the first 2 entry directly to MTPT table */ + if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1) + goto found; + i++; } found: mpt_entry->pa0_l = cpu_to_le32(lower_32_bits(pages[0])); diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index da4fffedb879..b09f1cde2ff5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -976,12 +976,11 @@ int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_mtt *mtt, struct ib_umem *umem) { struct device *dev = hr_dev->dev; - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; unsigned int order; - int i, k, entry; int npage = 0; int ret = 0; - int len; + int i; u64 page_addr; u64 *pages; u32 bt_page_size; @@ -1014,29 +1013,25 @@ int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev, i = n = 0; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) 
{ - len = sg_dma_len(sg) >> PAGE_SHIFT; - for (k = 0; k < len; ++k) { - page_addr = - sg_dma_address(sg) + (k << umem->page_shift); - if (!(npage % (1 << (mtt->page_shift - PAGE_SHIFT)))) { - if (page_addr & ((1 << mtt->page_shift) - 1)) { - dev_err(dev, "page_addr 0x%llx is not page_shift %d alignment!\n", - page_addr, mtt->page_shift); - ret = -EINVAL; - goto out; - } - pages[i++] = page_addr; - } - npage++; - if (i == bt_page_size / sizeof(u64)) { - ret = hns_roce_write_mtt(hr_dev, mtt, n, i, - pages); - if (ret) - goto out; - n += i; - i = 0; + for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { + page_addr = sg_page_iter_dma_address(&sg_iter); + if (!(npage % (1 << (mtt->page_shift - PAGE_SHIFT)))) { + if (page_addr & ((1 << mtt->page_shift) - 1)) { + dev_err(dev, + "page_addr 0x%llx is not page_shift %d alignment!\n", + page_addr, mtt->page_shift); + ret = -EINVAL; + goto out; } + pages[i++] = page_addr; + } + npage++; + if (i == bt_page_size / sizeof(u64)) { + ret = hns_roce_write_mtt(hr_dev, mtt, n, i, pages); + if (ret) + goto out; + n += i; + i = 0; } } @@ -1052,10 +1047,8 @@ static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, struct ib_umem *umem) { - struct scatterlist *sg; - int i = 0, j = 0, k; - int entry; - int len; + struct sg_dma_page_iter sg_iter; + int i = 0, j = 0; u64 page_addr; u32 pbl_bt_sz; @@ -1063,27 +1056,22 @@ static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev, return 0; pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - len = sg_dma_len(sg) >> PAGE_SHIFT; - for (k = 0; k < len; ++k) { - page_addr = sg_dma_address(sg) + - (k << umem->page_shift); + for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { + page_addr = sg_page_iter_dma_address(&sg_iter); + if (!hr_dev->caps.pbl_hop_num) { + mr->pbl_buf[i++] = page_addr >> 12; + } else if (hr_dev->caps.pbl_hop_num == 1) { + mr->pbl_buf[i++] = page_addr; + } else { + if (hr_dev->caps.pbl_hop_num == 2) + mr->pbl_bt_l1[i][j] = page_addr; + else if (hr_dev->caps.pbl_hop_num == 3) + mr->pbl_bt_l2[i][j] = page_addr; - if (!hr_dev->caps.pbl_hop_num) { - mr->pbl_buf[i++] = page_addr >> 12; - } else if (hr_dev->caps.pbl_hop_num == 1) { - mr->pbl_buf[i++] = page_addr; - } else { - if (hr_dev->caps.pbl_hop_num == 2) - mr->pbl_bt_l1[i][j] = page_addr; - else if (hr_dev->caps.pbl_hop_num == 3) - mr->pbl_bt_l2[i][j] = page_addr; - - j++; - if (j >= (pbl_bt_sz / 8)) { - i++; - j = 0; - } + j++; + if (j >= (pbl_bt_sz / 8)) { + i++; + j = 0; } } } diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 73066bf38e47..51ca22b9f960 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -640,19 +640,19 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, } hr_qp->mtt.mtt_type = MTT_TYPE_WQE; + page_shift = PAGE_SHIFT; if (hr_dev->caps.mtt_buf_pg_sz) { npages = (ib_umem_page_count(hr_qp->umem) + (1 << hr_dev->caps.mtt_buf_pg_sz) - 1) / - (1 << hr_dev->caps.mtt_buf_pg_sz); - page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz; + (1 << hr_dev->caps.mtt_buf_pg_sz); + page_shift += hr_dev->caps.mtt_buf_pg_sz; ret = hns_roce_mtt_init(hr_dev, npages, page_shift, &hr_qp->mtt); } else { ret = hns_roce_mtt_init(hr_dev, - ib_umem_page_count(hr_qp->umem), - hr_qp->umem->page_shift, - &hr_qp->mtt); + ib_umem_page_count(hr_qp->umem), + page_shift, &hr_qp->mtt); } if (ret) { dev_err(dev, 
"hns_roce_mtt_init error for create qp\n"); From 48b586ac36fc4922331a50043058464cc1306aed Mon Sep 17 00:00:00 2001 From: "Shiraz, Saleem" Date: Mon, 11 Feb 2019 09:25:01 -0600 Subject: [PATCH 30/38] RDMA/cxgb4: Use for_each_sg_dma_page iterator on umem SGL Use the for_each_sg_dma_page iterator variant to walk the umem DMA-mapped SGL and get the page DMA address. This avoids the extra loop to iterate pages in the SGE when for_each_sg iterator is used. Additionally, purge umem->page_shift usage in the driver as its only relevant for ODP MRs. Use system page size and shift instead. Signed-off-by: Shiraz, Saleem Acked-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/mem.c | 32 +++++++++++++------------------ 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 96760a36b9fc..8d1ab8273c2b 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -502,10 +502,9 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { __be64 *pages; - int shift, n, len; - int i, k, entry; + int shift, n, i; int err = -ENOMEM; - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; struct c4iw_dev *rhp; struct c4iw_pd *php; struct c4iw_mr *mhp; @@ -541,7 +540,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ERR(mhp->umem)) goto err_free_skb; - shift = mhp->umem->page_shift; + shift = PAGE_SHIFT; n = mhp->umem->nmap; err = alloc_pbl(mhp, n); @@ -556,21 +555,16 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, i = n = 0; - for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { - len = sg_dma_len(sg) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = cpu_to_be64(sg_dma_address(sg) + - (k << shift)); - if (i == PAGE_SIZE / sizeof *pages) { - err = write_pbl(&mhp->rhp->rdev, - pages, - mhp->attr.pbl_addr + (n << 3), i, - mhp->wr_waitp); - if (err) - goto pbl_done; - n += i; - i = 0; - } + for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) { + pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter)); + if (i == PAGE_SIZE / sizeof(*pages)) { + err = write_pbl(&mhp->rhp->rdev, pages, + mhp->attr.pbl_addr + (n << 3), i, + mhp->wr_waitp); + if (err) + goto pbl_done; + n += i; + i = 0; } } From b44e47eb065b65aa7cb2bd9c8d8d5c0009cd1393 Mon Sep 17 00:00:00 2001 From: "Shiraz, Saleem" Date: Mon, 11 Feb 2019 09:25:02 -0600 Subject: [PATCH 31/38] RDMA/cxgb3: Use for_each_sg_dma_page iterator on umem SGL Use the for_each_sg_dma_page iterator variant to walk the umem DMA-mapped SGL and get the page DMA address. This avoids the extra loop to iterate pages in the SGE when for_each_sg iterator is used. Additionally, purge umem->page_shift usage in the driver as its only relevant for ODP MRs. Use system page size and shift instead. 
Signed-off-by: Shiraz, Saleem Acked-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb3/iwch_provider.c | 29 +++++++++------------ 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 4cc9a6ae2139..80dff6804e48 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -516,14 +516,13 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { __be64 *pages; - int shift, n, len; - int i, k, entry; + int shift, n, i; int err = 0; struct iwch_dev *rhp; struct iwch_pd *php; struct iwch_mr *mhp; struct iwch_reg_user_mr_resp uresp; - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; pr_debug("%s ib_pd %p\n", __func__, pd); php = to_iwch_pd(pd); @@ -541,7 +540,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_PTR(err); } - shift = mhp->umem->page_shift; + shift = PAGE_SHIFT; n = mhp->umem->nmap; @@ -557,19 +556,15 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, i = n = 0; - for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { - len = sg_dma_len(sg) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = cpu_to_be64(sg_dma_address(sg) + - (k << shift)); - if (i == PAGE_SIZE / sizeof *pages) { - err = iwch_write_pbl(mhp, pages, i, n); - if (err) - goto pbl_done; - n += i; - i = 0; - } - } + for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) { + pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter)); + if (i == PAGE_SIZE / sizeof *pages) { + err = iwch_write_pbl(mhp, pages, i, n); + if (err) + goto pbl_done; + n += i; + i = 0; + } } if (i) From f3e6d3117939e982462fe1539f1ae44b1ed57f09 Mon Sep 17 00:00:00 2001 From: "Shiraz, Saleem" Date: Mon, 11 Feb 2019 09:25:03 -0600 Subject: [PATCH 32/38] RDMA/vmw_pvrdma: Use for_each_sg_dma_page iterator on umem SGL Use the for_each_sg_dma_page iterator variant to walk the umem DMA-mapped SGL and get the page DMA address. This avoids the extra loop to iterate pages in the SGE when for_each_sg iterator is used. Additionally, purge umem->page_shift usage in the driver as its only relevant for ODP MRs. Use system page size and shift instead. 
Signed-off-by: Shiraz, Saleem Signed-off-by: Jason Gunthorpe --- .../infiniband/hw/vmw_pvrdma/pvrdma_misc.c | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c index fb0c5c0976b3..7944c58ded0e 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c @@ -183,25 +183,20 @@ int pvrdma_page_dir_insert_umem(struct pvrdma_page_dir *pdir, struct ib_umem *umem, u64 offset) { u64 i = offset; - int j, entry; - int ret = 0, len = 0; - struct scatterlist *sg; + int ret = 0; + struct sg_dma_page_iter sg_iter; if (offset >= pdir->npages) return -EINVAL; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - len = sg_dma_len(sg) >> PAGE_SHIFT; - for (j = 0; j < len; j++) { - dma_addr_t addr = sg_dma_address(sg) + - (j << umem->page_shift); + for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { + dma_addr_t addr = sg_page_iter_dma_address(&sg_iter); - ret = pvrdma_page_dir_insert_dma(pdir, i, addr); - if (ret) - goto exit; + ret = pvrdma_page_dir_insert_dma(pdir, i, addr); + if (ret) + goto exit; - i++; - } + i++; } exit: From 95ad233ffbed0c87925483bd902b5ec5fa8ed4cd Mon Sep 17 00:00:00 2001 From: "Shiraz, Saleem" Date: Mon, 11 Feb 2019 09:25:04 -0600 Subject: [PATCH 33/38] RDMA/qedr: Use for_each_sg_dma_page iterator on umem SGL Use the for_each_sg_dma_page iterator variant to walk the umem DMA-mapped SGL and get the page DMA address. This avoids the extra loop to iterate pages in the SGE when for_each_sg iterator is used. Additionally, purge umem->page_shift usage in the driver as its only relevant for ODP MRs. Use system page size and shift instead. Signed-off-by: Shiraz, Saleem Acked-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/verbs.c | 61 +++++++++++++----------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index a06d2258394a..a613ebde322f 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -620,13 +620,12 @@ static void qedr_populate_pbls(struct qedr_dev *dev, struct ib_umem *umem, struct qedr_pbl *pbl, struct qedr_pbl_info *pbl_info, u32 pg_shift) { - int shift, pg_cnt, pages, pbe_cnt, total_num_pbes = 0; + int pbe_cnt, total_num_pbes = 0; u32 fw_pg_cnt, fw_pg_per_umem_pg; struct qedr_pbl *pbl_tbl; - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; struct regpair *pbe; u64 pg_addr; - int entry; if (!pbl_info->num_pbes) return; @@ -647,38 +646,32 @@ static void qedr_populate_pbls(struct qedr_dev *dev, struct ib_umem *umem, pbe_cnt = 0; - shift = umem->page_shift; + fw_pg_per_umem_pg = BIT(PAGE_SHIFT - pg_shift); - fw_pg_per_umem_pg = BIT(umem->page_shift - pg_shift); + for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { + pg_addr = sg_page_iter_dma_address(&sg_iter); + for (fw_pg_cnt = 0; fw_pg_cnt < fw_pg_per_umem_pg;) { + pbe->lo = cpu_to_le32(pg_addr); + pbe->hi = cpu_to_le32(upper_32_bits(pg_addr)); - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - pages = sg_dma_len(sg) >> shift; - pg_addr = sg_dma_address(sg); - for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { - for (fw_pg_cnt = 0; fw_pg_cnt < fw_pg_per_umem_pg;) { - pbe->lo = cpu_to_le32(pg_addr); - pbe->hi = cpu_to_le32(upper_32_bits(pg_addr)); + pg_addr += BIT(pg_shift); + pbe_cnt++; + total_num_pbes++; + pbe++; - pg_addr += 
BIT(pg_shift); - pbe_cnt++; - total_num_pbes++; - pbe++; + if (total_num_pbes == pbl_info->num_pbes) + return; - if (total_num_pbes == pbl_info->num_pbes) - return; - - /* If the given pbl is full storing the pbes, - * move to next pbl. - */ - if (pbe_cnt == - (pbl_info->pbl_size / sizeof(u64))) { - pbl_tbl++; - pbe = (struct regpair *)pbl_tbl->va; - pbe_cnt = 0; - } - - fw_pg_cnt++; + /* If the given pbl is full storing the pbes, + * move to next pbl. + */ + if (pbe_cnt == (pbl_info->pbl_size / sizeof(u64))) { + pbl_tbl++; + pbe = (struct regpair *)pbl_tbl->va; + pbe_cnt = 0; } + + fw_pg_cnt++; } } } @@ -739,7 +732,7 @@ static inline int qedr_init_user_queue(struct ib_udata *udata, } fw_pages = ib_umem_page_count(q->umem) << - (q->umem->page_shift - FW_PAGE_SHIFT); + (PAGE_SHIFT - FW_PAGE_SHIFT); rc = qedr_prepare_pbl_tbl(dev, &q->pbl_info, fw_pages, 0); if (rc) @@ -1455,7 +1448,7 @@ struct ib_srq *qedr_create_srq(struct ib_pd *ibpd, page_cnt = srq->usrq.pbl_info.num_pbes; pbl_base_addr = srq->usrq.pbl_tbl->pa; phy_prod_pair_addr = hw_srq->phy_prod_pair_addr; - page_size = BIT(srq->usrq.umem->page_shift); + page_size = PAGE_SIZE; } else { struct qed_chain *pbl; @@ -2707,7 +2700,7 @@ struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, goto err1; qedr_populate_pbls(dev, mr->umem, mr->info.pbl_table, - &mr->info.pbl_info, mr->umem->page_shift); + &mr->info.pbl_info, PAGE_SHIFT); rc = dev->ops->rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid); if (rc) { @@ -2728,7 +2721,7 @@ struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, mr->hw_mr.pbl_ptr = mr->info.pbl_table[0].pa; mr->hw_mr.pbl_two_level = mr->info.pbl_info.two_layered; mr->hw_mr.pbl_page_size_log = ilog2(mr->info.pbl_info.pbl_size); - mr->hw_mr.page_size_log = mr->umem->page_shift; + mr->hw_mr.page_size_log = PAGE_SHIFT; mr->hw_mr.fbo = ib_umem_offset(mr->umem); mr->hw_mr.length = len; mr->hw_mr.vaddr = usr_addr; From be8c456abfbd3ae753521e438dc2319fb1dbb8a3 Mon Sep 17 00:00:00 2001 From: "Shiraz, Saleem" Date: Mon, 11 Feb 2019 09:25:05 -0600 Subject: [PATCH 34/38] RDMA/ocrdma: Use for_each_sg_dma_page iterator on umem SGL Use the for_each_sg_dma_page iterator variant to walk the umem DMA-mapped SGL and get the page DMA address. This avoids the extra loop to iterate pages in the SGE when for_each_sg iterator is used. Additionally, purge umem->page_shift usage in the driver as its only relevant for ODP MRs. Use system page size and shift instead. 
Signed-off-by: Shiraz, Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 53 +++++++++------------ 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 980ba97188ff..ed5da67b693d 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -840,10 +840,11 @@ static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr, u32 num_pbes) { struct ocrdma_pbe *pbe; - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table; struct ib_umem *umem = mr->umem; - int shift, pg_cnt, pages, pbe_cnt, entry, total_num_pbes = 0; + int pbe_cnt, total_num_pbes = 0; + u64 pg_addr; if (!mr->hwmr.num_pbes) return; @@ -851,36 +852,26 @@ static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr, pbe = (struct ocrdma_pbe *)pbl_tbl->va; pbe_cnt = 0; - shift = umem->page_shift; + for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { + /* store the page address in pbe */ + pg_addr = sg_page_iter_dma_address(&sg_iter); + pbe->pa_lo = cpu_to_le32(pg_addr); + pbe->pa_hi = cpu_to_le32(upper_32_bits(pg_addr)); + pbe_cnt += 1; + total_num_pbes += 1; + pbe++; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - pages = sg_dma_len(sg) >> shift; - for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) { - /* store the page address in pbe */ - pbe->pa_lo = - cpu_to_le32(sg_dma_address(sg) + - (pg_cnt << shift)); - pbe->pa_hi = - cpu_to_le32(upper_32_bits(sg_dma_address(sg) + - (pg_cnt << shift))); - pbe_cnt += 1; - total_num_pbes += 1; - pbe++; - - /* if done building pbes, issue the mbx cmd. */ - if (total_num_pbes == num_pbes) - return; - - /* if the given pbl is full storing the pbes, - * move to next pbl. - */ - if (pbe_cnt == - (mr->hwmr.pbl_size / sizeof(u64))) { - pbl_tbl++; - pbe = (struct ocrdma_pbe *)pbl_tbl->va; - pbe_cnt = 0; - } + /* if done building pbes, issue the mbx cmd. */ + if (total_num_pbes == num_pbes) + return; + /* if the given pbl is full storing the pbes, + * move to next pbl. + */ + if (pbe_cnt == (mr->hwmr.pbl_size / sizeof(u64))) { + pbl_tbl++; + pbe = (struct ocrdma_pbe *)pbl_tbl->va; + pbe_cnt = 0; } } } @@ -912,7 +903,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, if (status) goto umem_err; - mr->hwmr.pbe_size = BIT(mr->umem->page_shift); + mr->hwmr.pbe_size = PAGE_SIZE; mr->hwmr.fbo = ib_umem_offset(mr->umem); mr->hwmr.va = usr_addr; mr->hwmr.len = len; From 8317d6cdc1c7b8add6f2e5b915b5b5f80fdbfa5b Mon Sep 17 00:00:00 2001 From: "Shiraz, Saleem" Date: Mon, 11 Feb 2019 09:25:07 -0600 Subject: [PATCH 35/38] RDMA/rxe: Use for_each_sg_page iterator on umem SGL The driver walks the umem SGL assuming a 1:1 mapping between SGE and system page. Update to use the for_each_sg_page iterator to get individual pages contained in the SGEs. This is a pre-requisite before adding page combining into SGEs while building the scatter table in IB core. Additionally, purge umem->page_shift usage in the driver as its only relevant for ODP MRs. Use system page size and shift instead. 
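
As a sketch of the new walk (consume_page() is a placeholder, not an rxe
function), the iterator hands back one system page at a time even when a
single SGE covers several pages:

	struct sg_page_iter sg_iter;
	void *vaddr;

	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
		/* one PAGE_SIZE page per iteration */
		vaddr = page_address(sg_page_iter_page(&sg_iter));
		consume_page(vaddr, PAGE_SIZE);
	}
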
Signed-off-by: Shiraz, Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mr.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 2438093776a0..42f0f25e396c 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -162,11 +162,10 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova, int access, struct ib_udata *udata, struct rxe_mem *mem) { - int entry; struct rxe_map **map; struct rxe_phys_buf *buf = NULL; struct ib_umem *umem; - struct scatterlist *sg; + struct sg_page_iter sg_iter; int num_buf; void *vaddr; int err; @@ -191,16 +190,16 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start, goto err1; } - mem->page_shift = umem->page_shift; - mem->page_mask = BIT(umem->page_shift) - 1; + mem->page_shift = PAGE_SHIFT; + mem->page_mask = PAGE_SIZE - 1; num_buf = 0; map = mem->map; if (length > 0) { buf = map[0]->buf; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - vaddr = page_address(sg_page(sg)); + for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { + vaddr = page_address(sg_page_iter_page(&sg_iter)); if (!vaddr) { pr_warn("null vaddr\n"); err = -ENOMEM; @@ -208,7 +207,7 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start, } buf->addr = (uintptr_t)vaddr; - buf->size = BIT(umem->page_shift); + buf->size = PAGE_SIZE; num_buf++; buf++; From 0da4d48d99dfdb2a69172079f6a56e22689d16ce Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 11 Feb 2019 17:40:53 +0200 Subject: [PATCH 36/38] IB/mlx5: Fix bad flow upon DEVX mkey creation Fix bad flow upon DEVX mkey creation to prevent deleting the indirect mkey from the radix tree in case there was a previous failure to insert it. Fixes: 534fd7aac56a ("IB/mlx5: Manage indirection mkey upon DEVX flow for ODP") Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index cd43e39ced87..8e6d23d6859f 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1204,14 +1204,15 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len); if (err) - goto obj_destroy; + goto err_copy; obj->obj_id = get_enc_obj_id(opcode, obj_id); return 0; -obj_destroy: +err_copy: if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) devx_cleanup_mkey(obj); +obj_destroy: mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); obj_free: kfree(obj); From fc9e4477f924e84d7798f7a1d41401d699de1219 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 11 Feb 2019 17:40:54 +0200 Subject: [PATCH 37/38] RDMA/mlx5: Fix memory leak in case we fail to add an IB device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure the IB device is freed on failure. 
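
In outline (a simplified sketch of the intended flow, not the literal hunk),
the rep load path must drop the ib_device it allocated once profile
initialization fails:

	struct mlx5_ib_dev *ibdev;

	ibdev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*ibdev));
	if (!ibdev)
		return -ENOMEM;

	/* ... fill in mdev, num_ports ... */

	if (!__mlx5_ib_add(ibdev, &rep_profile)) {
		/* profile init failed: free the device allocated above */
		ib_dealloc_device(&ibdev->ib_dev);
		return -EINVAL;
	}
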
Fixes: b5ca15ad7e61 ("IB/mlx5: Add proper representors support") Signed-off-by: Mark Bloch Reviewed-by: Bodong Wang Reviewed-by: HÃ¥kon Bugge Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/ib_rep.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 6d7b8bad4b61..95ac97af6166 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -78,8 +78,10 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) ibdev->mdev = dev; ibdev->num_ports = max(MLX5_CAP_GEN(dev, num_ports), MLX5_CAP_GEN(dev, num_vhca_ports)); - if (!__mlx5_ib_add(ibdev, &rep_profile)) + if (!__mlx5_ib_add(ibdev, &rep_profile)) { + ib_dealloc_device(&ibdev->ib_dev); return -EINVAL; + } rep->rep_if[REP_IB].priv = ibdev; From a87145957eb9c474559b3acd2cfc6e8914b0e08f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 11 Feb 2019 13:34:15 +0000 Subject: [PATCH 38/38] RDMA/bnxt_re: fix or'ing of data into an uninitialized struct member The struct member comp_mask has not been initialized however a bit pattern is being bitwise or'd into the member and hence other bit fields in comp_mask may contain any garbage from the stack. Fix this by making the bitwise or into an assignment. Fixes: 95b86d1c91ad ("RDMA/bnxt_re: Update kernel user abi to pass chip context") Signed-off-by: Colin Ian King Acked-by: Devesh Sharma Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index bff9320a968e..2ed778683c6b 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3702,7 +3702,7 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, } spin_lock_init(&uctx->sh_lock); - resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_CCTX; + resp.comp_mask = BNXT_RE_UCNTX_CMASK_HAVE_CCTX; chip_met_rev_num = rdev->chip_ctx.chip_num; chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_rev & 0xFF) << BNXT_RE_CHIP_ID0_CHIP_REV_SFT;