From 54e02162d4454a99227f520948bf4494c3d972d0 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Sun, 11 Feb 2018 11:28:12 +0800 Subject: [PATCH 01/56] ptr_ring: prevent integer overflow when calculating size Switch to use dividing to prevent integer overflow when size is too big to calculate allocation size properly. Reported-by: Eric Biggers Fixes: 6e6e41c31122 ("ptr_ring: fail early if queue occupies more than KMALLOC_MAX_SIZE") Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index b884b7794187..e6335227b844 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -469,7 +469,7 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, */ static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp) { - if (size * sizeof(void *) > KMALLOC_MAX_SIZE) + if (size > KMALLOC_MAX_SIZE / sizeof(void *)) return NULL; return kvmalloc_array(size, sizeof(void *), gfp | __GFP_ZERO); } From 7ac8ff95f48cbfa609a060fd6a1e361dd62feeb3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sun, 11 Feb 2018 18:10:28 -0500 Subject: [PATCH 02/56] mvpp2: fix multicast address filter IPv6 doesn't work on the MacchiatoBIN board. It is caused by broken multicast address filter in the mvpp2 driver. The driver loads doesn't load any multicast entries if "allmulti" is not set. This condition should be reversed. The condition !netdev_mc_empty(dev) is useless (because netdev_for_each_mc_addr is nop if the list is empty). This patch also fixes a possible overflow of the multicast list - if mvpp2_prs_mac_da_accept fails, we set the allmulti flag and retry. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index a1d7b88cf083..5a1668cdb461 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -7137,6 +7137,7 @@ static void mvpp2_set_rx_mode(struct net_device *dev) int id = port->id; bool allmulti = dev->flags & IFF_ALLMULTI; +retry: mvpp2_prs_mac_promisc_set(priv, id, dev->flags & IFF_PROMISC); mvpp2_prs_mac_multi_set(priv, id, MVPP2_PE_MAC_MC_ALL, allmulti); mvpp2_prs_mac_multi_set(priv, id, MVPP2_PE_MAC_MC_IP6, allmulti); @@ -7144,9 +7145,13 @@ static void mvpp2_set_rx_mode(struct net_device *dev) /* Remove all port->id's mcast enries */ mvpp2_prs_mcast_del_all(priv, id); - if (allmulti && !netdev_mc_empty(dev)) { - netdev_for_each_mc_addr(ha, dev) - mvpp2_prs_mac_da_accept(priv, id, ha->addr, true); + if (!allmulti) { + netdev_for_each_mc_addr(ha, dev) { + if (mvpp2_prs_mac_da_accept(priv, id, ha->addr, true)) { + allmulti = true; + goto retry; + } + } } } From 1b12580af1d0677c3c3a19e35bfe5d59b03f737f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 12 Feb 2018 17:15:40 +0800 Subject: [PATCH 03/56] bridge: check brport attr show in brport_show Now br_sysfs_if file flush doesn't have attr show. To read it will cause kernel panic after users chmod u+r this file. Xiong found this issue when running the commands: ip link add br0 type bridge ip link add type veth ip link set veth0 master br0 chmod u+r /sys/devices/virtual/net/veth0/brport/flush timeout 3 cat /sys/devices/virtual/net/veth0/brport/flush kernel crashed with NULL a pointer dereference call trace. This patch is to fix it by return -EINVAL when brport_attr->show is null, just the same as the check for brport_attr->store in brport_store(). Fixes: 9cf637473c85 ("bridge: add sysfs hook to flush forwarding table") Reported-by: Xiong Zhou Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/bridge/br_sysfs_if.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 0254c35b2bf0..126a8ea73c96 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -255,6 +255,9 @@ static ssize_t brport_show(struct kobject *kobj, struct brport_attribute *brport_attr = to_brport_attr(attr); struct net_bridge_port *p = to_brport(kobj); + if (!brport_attr->show) + return -EINVAL; + return brport_attr->show(p, buf); } From 27af86bb038d9c8b8066cd17854ddaf2ea92bce1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 12 Feb 2018 18:29:06 +0800 Subject: [PATCH 04/56] sctp: do not pr_err for the duplicated node in transport rhlist The pr_err in sctp_hash_transport was supposed to report a sctp bug for using rhashtable/rhlist. The err '-EEXIST' introduced in Commit cd2b70875058 ("sctp: check duplicate node before inserting a new transport") doesn't belong to that case. So just return -EEXIST back without pr_err any kmsg. Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new transport") Reported-by: Wei Chen Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/input.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/sctp/input.c b/net/sctp/input.c index 141c9c466ec1..0247cc432e02 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -897,15 +897,12 @@ int sctp_hash_transport(struct sctp_transport *t) rhl_for_each_entry_rcu(transport, tmp, list, node) if (transport->asoc->ep == t->asoc->ep) { rcu_read_unlock(); - err = -EEXIST; - goto out; + return -EEXIST; } rcu_read_unlock(); err = rhltable_insert_key(&sctp_transport_hashtable, &arg, &t->node, sctp_hash_params); - -out: if (err) pr_err_once("insert transport fail, errno %d\n", err); From 947820b9595aa99f73de033ddcfe4c729c903c75 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 12 Feb 2018 18:29:51 +0800 Subject: [PATCH 05/56] sctp: add SCTP_CID_I_DATA and SCTP_CID_I_FWD_TSN conversion in sctp_cname After the support for SCTP_CID_I_DATA and SCTP_CID_I_FWD_TSN chunks, the corresp conversion in sctp_cname should also be added. Otherwise, in some places, pr_debug will print them as "unknown chunk". Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/debug.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/sctp/debug.c b/net/sctp/debug.c index 291c97b07058..8f6c2e8c0953 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -81,6 +81,12 @@ const char *sctp_cname(const union sctp_subtype cid) case SCTP_CID_RECONF: return "RECONF"; + case SCTP_CID_I_DATA: + return "I_DATA"; + + case SCTP_CID_I_FWD_TSN: + return "I_FWD_TSN"; + default: break; } From fb23403536eabe81ee90d32cb3051030b871d988 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 12 Feb 2018 18:31:24 +0800 Subject: [PATCH 06/56] sctp: remove the useless check in sctp_renege_events Remove the 'if (chunk)' check in sctp_renege_events for idata process, as all renege commands are generated in sctp_eat_data and it can't be NULL. The same thing we already did for common data in sctp_ulpq_renege. Fixes: 94014e8d871a ("sctp: implement renege_events for sctp_stream_interleave") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/stream_interleave.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 8c7cf8f08711..86c26ec42979 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -954,12 +954,8 @@ static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, __u32 freed = 0; __u16 needed; - if (chunk) { - needed = ntohs(chunk->chunk_hdr->length); - needed -= sizeof(struct sctp_idata_chunk); - } else { - needed = SCTP_DEFAULT_MAXWINDOW; - } + needed = ntohs(chunk->chunk_hdr->length) - + sizeof(struct sctp_idata_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); From 808cf9e38cd7923036a99f459ccc8cf2955e47af Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 12 Feb 2018 12:57:04 +0200 Subject: [PATCH 07/56] tcp: Honor the eor bit in tcp_mtu_probe Avoid SKB coalescing if eor bit is set in one of the relevant SKBs. Fixes: c134ecb87817 ("tcp: Make use of MSG_EOR in tcp_sendmsg") Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e9f985e42405..b2bca373f8be 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2027,6 +2027,24 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk) } } +static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) +{ + struct sk_buff *skb, *next; + + skb = tcp_send_head(sk); + tcp_for_write_queue_from_safe(skb, next, sk) { + if (len <= skb->len) + break; + + if (unlikely(TCP_SKB_CB(skb)->eor)) + return false; + + len -= skb->len; + } + + return true; +} + /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing @@ -2099,6 +2117,9 @@ static int tcp_mtu_probe(struct sock *sk) return 0; } + if (!tcp_can_coalesce_send_queue_head(sk, probe_size)) + return -1; + /* We're allowed to probe. Build it now. */ nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); if (!nskb) @@ -2134,6 +2155,10 @@ static int tcp_mtu_probe(struct sock *sk) /* We've eaten all the data from this skb. * Throw it away. */ TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + /* If this is the last SKB we copy and eor is set + * we need to propagate it to the new skb. + */ + TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor; tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); } else { From 18a5b052bb1ae77453c5e50fffe3470ced9ed82f Mon Sep 17 00:00:00 2001 From: Ingo van Lil Date: Mon, 12 Feb 2018 12:02:52 +0100 Subject: [PATCH 08/56] net: phy: fix wrong mask to phy_modify() When forcing a specific link mode, the PHY driver must clear the existing speed and duplex bits in BMCR while preserving some other control bits. This logic was accidentally inverted with the introduction of phy_modify(). Fixes: fea23fb591cc ("net: phy: convert read-modify-write to phy_modify()") Signed-off-by: Ingo van Lil Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index b13eed21c87d..d39ae77707ef 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1382,7 +1382,7 @@ int genphy_setup_forced(struct phy_device *phydev) ctl |= BMCR_FULLDPLX; return phy_modify(phydev, MII_BMCR, - BMCR_LOOPBACK | BMCR_ISOLATE | BMCR_PDOWN, ctl); + ~(BMCR_LOOPBACK | BMCR_ISOLATE | BMCR_PDOWN), ctl); } EXPORT_SYMBOL(genphy_setup_forced); From dd62c236c0fe1166d037485494ec5ff6545480eb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 12 Feb 2018 14:40:00 +0100 Subject: [PATCH 09/56] ravb: Remove obsolete explicit clock handling for WoL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if Wake-on-LAN is enabled, the EtherAVB device's module clock is manually kept running during system suspend, to make sure the device stays active. Since commit 91c719f5ec6671f7 ("soc: renesas: rcar-sysc: Keep wakeup sources active during system suspend") , this workaround is no longer needed. Hence remove all explicit clock handling to keep the device active. Signed-off-by: Geert Uytterhoeven Reviewed-by: Niklas Söderlund Reviewed-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/ravb_main.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index c87f57ca4437..a95fbd5510d9 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -2255,9 +2255,6 @@ static int ravb_wol_setup(struct net_device *ndev) /* Enable MagicPacket */ ravb_modify(ndev, ECMR, ECMR_MPDE, ECMR_MPDE); - /* Increased clock usage so device won't be suspended */ - clk_enable(priv->clk); - return enable_irq_wake(priv->emac_irq); } @@ -2276,9 +2273,6 @@ static int ravb_wol_restore(struct net_device *ndev) if (ret < 0) return ret; - /* Restore clock usage count */ - clk_disable(priv->clk); - return disable_irq_wake(priv->emac_irq); } From b4580c952e89a332f077038ef19a7582950c082d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 12 Feb 2018 14:42:36 +0100 Subject: [PATCH 10/56] sh_eth: Remove obsolete explicit clock handling for WoL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if Wake-on-LAN is enabled, the SH-ETH device's module clock is manually kept running during system suspend, to make sure the device stays active. Since commits 91c719f5ec6671f7 ("soc: renesas: rcar-sysc: Keep wakeup sources active during system suspend") and 744dddcae84441b1 ("clk: renesas: mstp: Keep wakeup sources active during system suspend"), this workaround is no longer needed. Hence remove all explicit clock handling to keep the device active. Signed-off-by: Geert Uytterhoeven Reviewed-by: Niklas Söderlund Reviewed-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index a197e11f3a56..92dcf8717fc6 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include @@ -2304,7 +2303,7 @@ static void sh_eth_get_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) wol->supported = 0; wol->wolopts = 0; - if (mdp->cd->magic && mdp->clk) { + if (mdp->cd->magic) { wol->supported = WAKE_MAGIC; wol->wolopts = mdp->wol_enabled ? WAKE_MAGIC : 0; } @@ -2314,7 +2313,7 @@ static int sh_eth_set_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) { struct sh_eth_private *mdp = netdev_priv(ndev); - if (!mdp->cd->magic || !mdp->clk || wol->wolopts & ~WAKE_MAGIC) + if (!mdp->cd->magic || wol->wolopts & ~WAKE_MAGIC) return -EOPNOTSUPP; mdp->wol_enabled = !!(wol->wolopts & WAKE_MAGIC); @@ -3153,11 +3152,6 @@ static int sh_eth_drv_probe(struct platform_device *pdev) goto out_release; } - /* Get clock, if not found that's OK but Wake-On-Lan is unavailable */ - mdp->clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(mdp->clk)) - mdp->clk = NULL; - ndev->base_addr = res->start; spin_lock_init(&mdp->lock); @@ -3278,7 +3272,7 @@ static int sh_eth_drv_probe(struct platform_device *pdev) if (ret) goto out_napi_del; - if (mdp->cd->magic && mdp->clk) + if (mdp->cd->magic) device_set_wakeup_capable(&pdev->dev, 1); /* print device information */ @@ -3331,9 +3325,6 @@ static int sh_eth_wol_setup(struct net_device *ndev) /* Enable MagicPacket */ sh_eth_modify(ndev, ECMR, ECMR_MPDE, ECMR_MPDE); - /* Increased clock usage so device won't be suspended */ - clk_enable(mdp->clk); - return enable_irq_wake(ndev->irq); } @@ -3359,9 +3350,6 @@ static int sh_eth_wol_restore(struct net_device *ndev) if (ret < 0) return ret; - /* Restore clock usage count */ - clk_disable(mdp->clk); - return disable_irq_wake(ndev->irq); } From 8e021a14d908475fea89ef85b5421865f7ad650d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 12 Feb 2018 17:10:19 +0300 Subject: [PATCH 11/56] net: thunderbolt: Tear down connection properly on suspend When suspending to mem or disk the Thunderbolt controller typically goes down as well tearing down the connection automatically. However, when suspend to idle is used this does not happen so we need to make sure the connection is properly disconnected before it can be re-established during resume. Fixes: e69b6c02b4c3 ("net: Add support for networking over Thunderbolt cable") Signed-off-by: Mika Westerberg Cc: stable@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/thunderbolt.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/thunderbolt.c b/drivers/net/thunderbolt.c index ca5e375de27c..71cf9ab72fbc 100644 --- a/drivers/net/thunderbolt.c +++ b/drivers/net/thunderbolt.c @@ -1270,10 +1270,7 @@ static int __maybe_unused tbnet_suspend(struct device *dev) stop_login(net); if (netif_running(net->dev)) { netif_device_detach(net->dev); - tb_ring_stop(net->rx_ring.ring); - tb_ring_stop(net->tx_ring.ring); - tbnet_free_buffers(&net->rx_ring); - tbnet_free_buffers(&net->tx_ring); + tbnet_tear_down(net, true); } return 0; From 027d351c541744c0c780dd5801c63e4b90750b90 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 12 Feb 2018 17:10:20 +0300 Subject: [PATCH 12/56] net: thunderbolt: Run disconnect flow asynchronously when logout is received The control channel calls registered callbacks when control messages such as XDomain protocol messages are received. The control channel handling is done in a worker running on system workqueue which means the networking driver can't run tear down flow which includes sending disconnect request and waiting for a reply in the same worker. Otherwise reply is never received (as the work is already running) and the operation times out. To fix this run disconnect ThunderboltIP flow asynchronously once ThunderboltIP logout message is received. Fixes: e69b6c02b4c3 ("net: Add support for networking over Thunderbolt cable") Signed-off-by: Mika Westerberg Cc: stable@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/thunderbolt.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/thunderbolt.c b/drivers/net/thunderbolt.c index 71cf9ab72fbc..e0d6760f3219 100644 --- a/drivers/net/thunderbolt.c +++ b/drivers/net/thunderbolt.c @@ -166,6 +166,8 @@ struct tbnet_ring { * @connected_work: Worker that finalizes the ThunderboltIP connection * setup and enables DMA paths for high speed data * transfers + * @disconnect_work: Worker that handles tearing down the ThunderboltIP + * connection * @rx_hdr: Copy of the currently processed Rx frame. Used when a * network packet consists of multiple Thunderbolt frames. * In host byte order. @@ -190,6 +192,7 @@ struct tbnet { int login_retries; struct delayed_work login_work; struct work_struct connected_work; + struct work_struct disconnect_work; struct thunderbolt_ip_frame_header rx_hdr; struct tbnet_ring rx_ring; atomic_t frame_id; @@ -445,7 +448,7 @@ static int tbnet_handle_packet(const void *buf, size_t size, void *data) case TBIP_LOGOUT: ret = tbnet_logout_response(net, route, sequence, command_id); if (!ret) - tbnet_tear_down(net, false); + queue_work(system_long_wq, &net->disconnect_work); break; default: @@ -659,6 +662,13 @@ static void tbnet_login_work(struct work_struct *work) } } +static void tbnet_disconnect_work(struct work_struct *work) +{ + struct tbnet *net = container_of(work, typeof(*net), disconnect_work); + + tbnet_tear_down(net, false); +} + static bool tbnet_check_frame(struct tbnet *net, const struct tbnet_frame *tf, const struct thunderbolt_ip_frame_header *hdr) { @@ -881,6 +891,7 @@ static int tbnet_stop(struct net_device *dev) napi_disable(&net->napi); + cancel_work_sync(&net->disconnect_work); tbnet_tear_down(net, true); tb_ring_free(net->rx_ring.ring); @@ -1195,6 +1206,7 @@ static int tbnet_probe(struct tb_service *svc, const struct tb_service_id *id) net = netdev_priv(dev); INIT_DELAYED_WORK(&net->login_work, tbnet_login_work); INIT_WORK(&net->connected_work, tbnet_connected_work); + INIT_WORK(&net->disconnect_work, tbnet_disconnect_work); mutex_init(&net->connection_lock); atomic_set(&net->command_id, 0); atomic_set(&net->frame_id, 0); From 07a2e1cf398187814b405665b19d36425ec7a962 Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Mon, 12 Feb 2018 18:20:11 +0100 Subject: [PATCH 13/56] net: cavium: fix NULL pointer dereference in cavium_ptp_put Prevent a kernel panic on reboot if ptp_clock is NULL by checking the ptp pointer before using it. Signed-off-by: Jan Glauber Fixes: 8c56df372bc1 ("net: add support for Cavium PTP coprocessor") Cc: Radoslaw Biernacki Cc: Aleksey Makarov Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/common/cavium_ptp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/cavium/common/cavium_ptp.c b/drivers/net/ethernet/cavium/common/cavium_ptp.c index c87c9c684a33..d59497a7bdce 100644 --- a/drivers/net/ethernet/cavium/common/cavium_ptp.c +++ b/drivers/net/ethernet/cavium/common/cavium_ptp.c @@ -75,6 +75,8 @@ EXPORT_SYMBOL(cavium_ptp_get); void cavium_ptp_put(struct cavium_ptp *ptp) { + if (!ptp) + return; pci_dev_put(ptp->pdev); } EXPORT_SYMBOL(cavium_ptp_put); From da360299b6734135a5f66d7db458dcc7801c826a Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Mon, 12 Feb 2018 23:59:51 +0100 Subject: [PATCH 14/56] uapi/if_ether.h: move __UAPI_DEF_ETHHDR libc define This fixes a compile problem of some user space applications by not including linux/libc-compat.h in uapi/if_ether.h. linux/libc-compat.h checks which "features" the header files, included from the libc, provide to make the Linux kernel uapi header files only provide no conflicting structures and enums. If a user application mixes kernel headers and libc headers it could happen that linux/libc-compat.h gets included too early where not all other libc headers are included yet. Then the linux/libc-compat.h would not prevent all the redefinitions and we run into compile problems. This patch removes the include of linux/libc-compat.h from uapi/if_ether.h to fix the recently introduced case, but not all as this is more or less impossible. It is no problem to do the check directly in the if_ether.h file and not in libc-compat.h as this does not need any fancy glibc header detection as glibc never provided struct ethhdr and should define __UAPI_DEF_ETHHDR by them self when they will provide this. The following test program did not compile correctly any more: #include #include #include int main(void) { return 0; } Fixes: 6926e041a892 ("uapi/if_ether.h: prevent redefinition of struct ethhdr") Reported-by: Guillaume Nault Cc: # 4.15 Signed-off-by: Hauke Mehrtens Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 6 +++++- include/uapi/linux/libc-compat.h | 6 ------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index f8cb5760ea4f..8bbbcb5cd94b 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -23,7 +23,6 @@ #define _UAPI_LINUX_IF_ETHER_H #include -#include /* * IEEE 802.3 Ethernet magic constants. The frame sizes omit the preamble @@ -151,6 +150,11 @@ * This is an Ethernet frame header. */ +/* allow libcs like musl to deactivate this, glibc does not implement this. */ +#ifndef __UAPI_DEF_ETHHDR +#define __UAPI_DEF_ETHHDR 1 +#endif + #if __UAPI_DEF_ETHHDR struct ethhdr { unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h index fc29efaa918c..8254c937c9f4 100644 --- a/include/uapi/linux/libc-compat.h +++ b/include/uapi/linux/libc-compat.h @@ -264,10 +264,4 @@ #endif /* __GLIBC__ */ -/* Definitions for if_ether.h */ -/* allow libcs like musl to deactivate this, glibc does not implement this. */ -#ifndef __UAPI_DEF_ETHHDR -#define __UAPI_DEF_ETHHDR 1 -#endif - #endif /* _UAPI_LIBC_COMPAT_H */ From d4e9a408ef5de35dd82c1337b9fe48348b70047c Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 13 Feb 2018 11:11:30 +0100 Subject: [PATCH 15/56] net: af_unix: fix typo in UNIX_SKB_FRAGS_SZ comment Change "minimun" to "minimum". Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d545e1d0dea2..2d465bdeccbc 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1825,7 +1825,7 @@ out: } /* We use paged skbs for stream sockets, and limit occupancy to 32768 - * bytes, and a minimun of a full page. + * bytes, and a minimum of a full page. */ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) From 0f2d2b2736b08dafa3bde31d048750fbc8df3a31 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 13 Feb 2018 11:22:42 +0100 Subject: [PATCH 16/56] mlxsw: spectrum_router: Fix error path in mlxsw_sp_vr_create Since mlxsw_sp_fib_create() and mlxsw_sp_mr_table_create() use ERR_PTR macro to propagate int err through return of a pointer, the return value is not NULL in case of failure. So if one of the calls fails, one of vr->fib4, vr->fib6 or vr->mr4_table is not NULL and mlxsw_sp_vr_is_used wrongly assumes that vr is in use which leads to crash like following one: [ 1293.949291] BUG: unable to handle kernel NULL pointer dereference at 00000000000006c9 [ 1293.952729] IP: mlxsw_sp_mr_table_flush+0x15/0x70 [mlxsw_spectrum] Fix this by using local variables to hold the pointers and set vr->* only in case everything went fine. Fixes: 76610ebbde18 ("mlxsw: spectrum_router: Refactor virtual router handling") Fixes: a3d9bc506d64 ("mlxsw: spectrum_router: Extend virtual routers with IPv6 support") Fixes: d42b0965b1d4 ("mlxsw: spectrum_router: Add multicast routes notification handling functionality") Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_router.c | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index f0b25baba09a..dcc6305f7c22 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -788,6 +788,9 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_create(struct mlxsw_sp *mlxsw_sp, u32 tb_id, struct netlink_ext_ack *extack) { + struct mlxsw_sp_mr_table *mr4_table; + struct mlxsw_sp_fib *fib4; + struct mlxsw_sp_fib *fib6; struct mlxsw_sp_vr *vr; int err; @@ -796,29 +799,30 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_create(struct mlxsw_sp *mlxsw_sp, NL_SET_ERR_MSG(extack, "spectrum: Exceeded number of supported virtual routers"); return ERR_PTR(-EBUSY); } - vr->fib4 = mlxsw_sp_fib_create(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV4); - if (IS_ERR(vr->fib4)) - return ERR_CAST(vr->fib4); - vr->fib6 = mlxsw_sp_fib_create(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV6); - if (IS_ERR(vr->fib6)) { - err = PTR_ERR(vr->fib6); + fib4 = mlxsw_sp_fib_create(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV4); + if (IS_ERR(fib4)) + return ERR_CAST(fib4); + fib6 = mlxsw_sp_fib_create(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV6); + if (IS_ERR(fib6)) { + err = PTR_ERR(fib6); goto err_fib6_create; } - vr->mr4_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id, - MLXSW_SP_L3_PROTO_IPV4); - if (IS_ERR(vr->mr4_table)) { - err = PTR_ERR(vr->mr4_table); + mr4_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id, + MLXSW_SP_L3_PROTO_IPV4); + if (IS_ERR(mr4_table)) { + err = PTR_ERR(mr4_table); goto err_mr_table_create; } + vr->fib4 = fib4; + vr->fib6 = fib6; + vr->mr4_table = mr4_table; vr->tb_id = tb_id; return vr; err_mr_table_create: - mlxsw_sp_fib_destroy(mlxsw_sp, vr->fib6); - vr->fib6 = NULL; + mlxsw_sp_fib_destroy(mlxsw_sp, fib6); err_fib6_create: - mlxsw_sp_fib_destroy(mlxsw_sp, vr->fib4); - vr->fib4 = NULL; + mlxsw_sp_fib_destroy(mlxsw_sp, fib4); return ERR_PTR(err); } From bb047ddd145860ff24820320a21f03cf8c071b22 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 13 Feb 2018 12:00:16 +0100 Subject: [PATCH 17/56] net: sched: don't set q pointer for shared blocks It is pointless to set block->q for block which are shared among multiple qdiscs. So remove the assignment in that case. Do a bit of code reshuffle to make block->index initialized at that point so we can use tcf_block_shared() helper. Reported-by: Cong Wang Fixes: 4861738775d7 ("net: sched: introduce shared filter blocks infrastructure") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2bc1bc23d42e..a7dc7271042a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -376,17 +376,12 @@ struct tcf_net { static unsigned int tcf_net_id; static int tcf_block_insert(struct tcf_block *block, struct net *net, - u32 block_index, struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack) { struct tcf_net *tn = net_generic(net, tcf_net_id); - int err; - err = idr_alloc_u32(&tn->idr, block, &block_index, block_index, - GFP_KERNEL); - if (err) - return err; - block->index = block_index; - return 0; + return idr_alloc_u32(&tn->idr, block, &block->index, block->index, + GFP_KERNEL); } static void tcf_block_remove(struct tcf_block *block, struct net *net) @@ -397,6 +392,7 @@ static void tcf_block_remove(struct tcf_block *block, struct net *net) } static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, + u32 block_index, struct netlink_ext_ack *extack) { struct tcf_block *block; @@ -419,10 +415,13 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, err = -ENOMEM; goto err_chain_create; } - block->net = qdisc_net(q); block->refcnt = 1; block->net = net; - block->q = q; + block->index = block_index; + + /* Don't store q pointer for blocks which are shared */ + if (!tcf_block_shared(block)) + block->q = q; return block; err_chain_create: @@ -518,13 +517,12 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, } if (!block) { - block = tcf_block_create(net, q, extack); + block = tcf_block_create(net, q, ei->block_index, extack); if (IS_ERR(block)) return PTR_ERR(block); created = true; - if (ei->block_index) { - err = tcf_block_insert(block, net, - ei->block_index, extack); + if (tcf_block_shared(block)) { + err = tcf_block_insert(block, net, extack); if (err) goto err_block_insert; } From 339c21d7c459238135d87da8fefbfd25d98bc375 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 13 Feb 2018 12:00:17 +0100 Subject: [PATCH 18/56] net: sched: fix tc_u_common lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The offending commit wrongly assumes 1:1 mapping between block and q. However, there are multiple blocks for a single q for classful qdiscs. Since the obscure tc_u_common sharing mechanism expects it to be shared among a qdisc, fix it by storing q pointer in case the block is not shared. Reported-by: Paweł Staszewski Reported-by: Cong Wang Fixes: 7fa9d974f3c2 ("net: sched: cls_u32: use block instead of q in tc_u_common") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 6c7601a530e3..ed8b6a24b9e9 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -96,7 +96,7 @@ struct tc_u_hnode { struct tc_u_common { struct tc_u_hnode __rcu *hlist; - struct tcf_block *block; + void *ptr; int refcnt; struct idr handle_idr; struct hlist_node hnode; @@ -330,9 +330,25 @@ static struct hlist_head *tc_u_common_hash; #define U32_HASH_SHIFT 10 #define U32_HASH_SIZE (1 << U32_HASH_SHIFT) +static void *tc_u_common_ptr(const struct tcf_proto *tp) +{ + struct tcf_block *block = tp->chain->block; + + /* The block sharing is currently supported only + * for classless qdiscs. In that case we use block + * for tc_u_common identification. In case the + * block is not shared, block->q is a valid pointer + * and we can use that. That works for classful qdiscs. + */ + if (tcf_block_shared(block)) + return block; + else + return block->q; +} + static unsigned int tc_u_hash(const struct tcf_proto *tp) { - return hash_ptr(tp->chain->block, U32_HASH_SHIFT); + return hash_ptr(tc_u_common_ptr(tp), U32_HASH_SHIFT); } static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) @@ -342,7 +358,7 @@ static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) h = tc_u_hash(tp); hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) { - if (tc->block == tp->chain->block) + if (tc->ptr == tc_u_common_ptr(tp)) return tc; } return NULL; @@ -371,7 +387,7 @@ static int u32_init(struct tcf_proto *tp) kfree(root_ht); return -ENOBUFS; } - tp_c->block = tp->chain->block; + tp_c->ptr = tc_u_common_ptr(tp); INIT_HLIST_NODE(&tp_c->hnode); idr_init(&tp_c->handle_idr); From d4014d8cc6dfa964e3e66df525de2384e3583018 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Tue, 13 Feb 2018 09:46:16 -0800 Subject: [PATCH 19/56] rds: do not call ->conn_alloc with GFP_KERNEL Commit ebeeb1ad9b8a ("rds: tcp: use rds_destroy_pending() to synchronize netns/module teardown and rds connection/workq management") adds an rcu read critical section to __rd_conn_create. The memory allocations in that critcal section need to use GFP_ATOMIC to avoid sleeping. This patch was verified with syzkaller reproducer. Reported-by: syzbot+a0564419941aaae3fe3c@syzkaller.appspotmail.com Fixes: ebeeb1ad9b8a ("rds: tcp: use rds_destroy_pending() to synchronize netns/module teardown and rds connection/workq management") Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/connection.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/connection.c b/net/rds/connection.c index 94e190febfdd..2da3176bf792 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -224,7 +224,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, if (rds_destroy_pending(conn)) ret = -ENETDOWN; else - ret = trans->conn_alloc(conn, gfp); + ret = trans->conn_alloc(conn, GFP_ATOMIC); if (ret) { rcu_read_unlock(); kfree(conn->c_path); From ac5b70198adc25c73fba28de4f78adcee8f6be0b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Feb 2018 21:35:31 -0800 Subject: [PATCH 20/56] net: fix race on decreasing number of TX queues netif_set_real_num_tx_queues() can be called when netdev is up. That usually happens when user requests change of number of channels/rings with ethtool -L. The procedure for changing the number of queues involves resetting the qdiscs and setting dev->num_tx_queues to the new value. When the new value is lower than the old one, extra care has to be taken to ensure ordering of accesses to the number of queues vs qdisc reset. Currently the queues are reset before new dev->num_tx_queues is assigned, leaving a window of time where packets can be enqueued onto the queues going down, leading to a likely crash in the drivers, since most drivers don't check if TX skbs are assigned to an active queue. Fixes: e6484930d7c7 ("net: allocate tx queues in register_netdevice") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/dev.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index dda9d7b9a840..d4362befe7e2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2382,8 +2382,11 @@ EXPORT_SYMBOL(netdev_set_num_tc); */ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { + bool disabling; int rc; + disabling = txq < dev->real_num_tx_queues; + if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; @@ -2399,15 +2402,19 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->num_tc) netif_setup_tc(dev, txq); - if (txq < dev->real_num_tx_queues) { + dev->real_num_tx_queues = txq; + + if (disabling) { + synchronize_net(); qdisc_reset_all_tx_gt(dev, txq); #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, txq); #endif } + } else { + dev->real_num_tx_queues = txq; } - dev->real_num_tx_queues = txq; return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); From fae8b6f4a6be42372f8b7ffda39c3ca2cd951dc1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 13 Feb 2018 19:29:13 +0800 Subject: [PATCH 21/56] sctp: fix some copy-paste errors for file comments This patch is to fix the file comments in stream.c and stream_interleave.c v1->v2: rephrase the comment for stream.c according to Neil's suggestion. Fixes: a83863174a61 ("sctp: prepare asoc stream for stream reconf") Fixes: 0c3f6f655487 ("sctp: implement make_datafrag for sctp_stream_interleave") Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/stream.c | 2 +- net/sctp/stream_interleave.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index cedf672487f9..f799043abec9 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -6,7 +6,7 @@ * * This file is part of the SCTP kernel implementation * - * These functions manipulate sctp tsn mapping array. + * This file contains sctp stream maniuplation primitives and helpers. * * This SCTP implementation is free software; * you can redistribute it and/or modify it under the terms of diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 86c26ec42979..65ac03b44df8 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -3,7 +3,8 @@ * * This file is part of the SCTP kernel implementation * - * These functions manipulate sctp stream queue/scheduling. + * These functions implement sctp stream message interleaving, mostly + * including I-DATA and I-FORWARD-TSN chunks process. * * This SCTP implementation is free software; * you can redistribute it and/or modify it under the terms of From e6dbe9397ea754e80f59d852a74fc289fa8b0f3a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 13 Feb 2018 17:59:22 +0100 Subject: [PATCH 22/56] Revert "net: thunderx: Add support for xdp redirect" This reverts commit aa136d0c82fcd6af14535853c30e219e02b2692d. As I previously[1] pointed out this implementation of XDP_REDIRECT is wrong. XDP_REDIRECT is a facility that must work between different NIC drivers. Another NIC driver can call ndo_xdp_xmit/nicvf_xdp_xmit, but your driver patch assumes payload data (at top of page) will contain a queue index and a DMA addr, this is not true and worse will likely contain garbage. Given you have not fixed this in due time (just reached v4.16-rc1), the only option I see is a revert. [1] http://lkml.kernel.org/r/20171211130902.482513d3@redhat.com Cc: Sunil Goutham Cc: Christina Jacob Cc: Aleksey Makarov Fixes: aa136d0c82fc ("net: thunderx: Add support for xdp redirect") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- .../net/ethernet/cavium/thunder/nicvf_main.c | 110 +++++------------- .../ethernet/cavium/thunder/nicvf_queues.c | 11 +- .../ethernet/cavium/thunder/nicvf_queues.h | 4 - 3 files changed, 31 insertions(+), 94 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index b68cde9f17d2..7d9c5ffbd041 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -67,11 +67,6 @@ module_param(cpi_alg, int, S_IRUGO); MODULE_PARM_DESC(cpi_alg, "PFC algorithm (0=none, 1=VLAN, 2=VLAN16, 3=IP Diffserv)"); -struct nicvf_xdp_tx { - u64 dma_addr; - u8 qidx; -}; - static inline u8 nicvf_netdev_qidx(struct nicvf *nic, u8 qidx) { if (nic->sqs_mode) @@ -507,29 +502,14 @@ static int nicvf_init_resources(struct nicvf *nic) return 0; } -static void nicvf_unmap_page(struct nicvf *nic, struct page *page, u64 dma_addr) -{ - /* Check if it's a recycled page, if not unmap the DMA mapping. - * Recycled page holds an extra reference. - */ - if (page_ref_count(page) == 1) { - dma_addr &= PAGE_MASK; - dma_unmap_page_attrs(&nic->pdev->dev, dma_addr, - RCV_FRAG_LEN + XDP_HEADROOM, - DMA_FROM_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); - } -} - static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, struct cqe_rx_t *cqe_rx, struct snd_queue *sq, struct rcv_queue *rq, struct sk_buff **skb) { struct xdp_buff xdp; struct page *page; - struct nicvf_xdp_tx *xdp_tx = NULL; u32 action; - u16 len, err, offset = 0; + u16 len, offset = 0; u64 dma_addr, cpu_addr; void *orig_data; @@ -543,7 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, cpu_addr = (u64)phys_to_virt(cpu_addr); page = virt_to_page((void *)cpu_addr); - xdp.data_hard_start = page_address(page) + RCV_BUF_HEADROOM; + xdp.data_hard_start = page_address(page); xdp.data = (void *)cpu_addr; xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; @@ -563,7 +543,18 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, switch (action) { case XDP_PASS: - nicvf_unmap_page(nic, page, dma_addr); + /* Check if it's a recycled page, if not + * unmap the DMA mapping. + * + * Recycled page holds an extra reference. + */ + if (page_ref_count(page) == 1) { + dma_addr &= PAGE_MASK; + dma_unmap_page_attrs(&nic->pdev->dev, dma_addr, + RCV_FRAG_LEN + XDP_PACKET_HEADROOM, + DMA_FROM_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); + } /* Build SKB and pass on packet to network stack */ *skb = build_skb(xdp.data, @@ -576,20 +567,6 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, case XDP_TX: nicvf_xdp_sq_append_pkt(nic, sq, (u64)xdp.data, dma_addr, len); return true; - case XDP_REDIRECT: - /* Save DMA address for use while transmitting */ - xdp_tx = (struct nicvf_xdp_tx *)page_address(page); - xdp_tx->dma_addr = dma_addr; - xdp_tx->qidx = nicvf_netdev_qidx(nic, cqe_rx->rq_idx); - - err = xdp_do_redirect(nic->pnicvf->netdev, &xdp, prog); - if (!err) - return true; - - /* Free the page on error */ - nicvf_unmap_page(nic, page, dma_addr); - put_page(page); - break; default: bpf_warn_invalid_xdp_action(action); /* fall through */ @@ -597,7 +574,18 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, trace_xdp_exception(nic->netdev, prog, action); /* fall through */ case XDP_DROP: - nicvf_unmap_page(nic, page, dma_addr); + /* Check if it's a recycled page, if not + * unmap the DMA mapping. + * + * Recycled page holds an extra reference. + */ + if (page_ref_count(page) == 1) { + dma_addr &= PAGE_MASK; + dma_unmap_page_attrs(&nic->pdev->dev, dma_addr, + RCV_FRAG_LEN + XDP_PACKET_HEADROOM, + DMA_FROM_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); + } put_page(page); return true; } @@ -1864,50 +1852,6 @@ static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) } } -static int nicvf_xdp_xmit(struct net_device *netdev, struct xdp_buff *xdp) -{ - struct nicvf *nic = netdev_priv(netdev); - struct nicvf *snic = nic; - struct nicvf_xdp_tx *xdp_tx; - struct snd_queue *sq; - struct page *page; - int err, qidx; - - if (!netif_running(netdev) || !nic->xdp_prog) - return -EINVAL; - - page = virt_to_page(xdp->data); - xdp_tx = (struct nicvf_xdp_tx *)page_address(page); - qidx = xdp_tx->qidx; - - if (xdp_tx->qidx >= nic->xdp_tx_queues) - return -EINVAL; - - /* Get secondary Qset's info */ - if (xdp_tx->qidx >= MAX_SND_QUEUES_PER_QS) { - qidx = xdp_tx->qidx / MAX_SND_QUEUES_PER_QS; - snic = (struct nicvf *)nic->snicvf[qidx - 1]; - if (!snic) - return -EINVAL; - qidx = xdp_tx->qidx % MAX_SND_QUEUES_PER_QS; - } - - sq = &snic->qs->sq[qidx]; - err = nicvf_xdp_sq_append_pkt(snic, sq, (u64)xdp->data, - xdp_tx->dma_addr, - xdp->data_end - xdp->data); - if (err) - return -ENOMEM; - - nicvf_xdp_sq_doorbell(snic, sq, qidx); - return 0; -} - -static void nicvf_xdp_flush(struct net_device *dev) -{ - return; -} - static int nicvf_config_hwtstamp(struct net_device *netdev, struct ifreq *ifr) { struct hwtstamp_config config; @@ -1986,8 +1930,6 @@ static const struct net_device_ops nicvf_netdev_ops = { .ndo_fix_features = nicvf_fix_features, .ndo_set_features = nicvf_set_features, .ndo_bpf = nicvf_xdp, - .ndo_xdp_xmit = nicvf_xdp_xmit, - .ndo_xdp_flush = nicvf_xdp_flush, .ndo_do_ioctl = nicvf_ioctl, }; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c index 3eae9ff9b53a..d42704d07484 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c @@ -204,7 +204,7 @@ static inline int nicvf_alloc_rcv_buffer(struct nicvf *nic, struct rbdr *rbdr, /* Reserve space for header modifications by BPF program */ if (rbdr->is_xdp) - buf_len += XDP_HEADROOM; + buf_len += XDP_PACKET_HEADROOM; /* Check if it's recycled */ if (pgcache) @@ -224,9 +224,8 @@ ret: nic->rb_page = NULL; return -ENOMEM; } - if (pgcache) - pgcache->dma_addr = *rbuf + XDP_HEADROOM; + pgcache->dma_addr = *rbuf + XDP_PACKET_HEADROOM; nic->rb_page_offset += buf_len; } @@ -1244,7 +1243,7 @@ int nicvf_xdp_sq_append_pkt(struct nicvf *nic, struct snd_queue *sq, int qentry; if (subdesc_cnt > sq->xdp_free_cnt) - return -1; + return 0; qentry = nicvf_get_sq_desc(sq, subdesc_cnt); @@ -1255,7 +1254,7 @@ int nicvf_xdp_sq_append_pkt(struct nicvf *nic, struct snd_queue *sq, sq->xdp_desc_cnt += subdesc_cnt; - return 0; + return 1; } /* Calculate no of SQ subdescriptors needed to transmit all @@ -1656,7 +1655,7 @@ static void nicvf_unmap_rcv_buffer(struct nicvf *nic, u64 dma_addr, if (page_ref_count(page) != 1) return; - len += XDP_HEADROOM; + len += XDP_PACKET_HEADROOM; /* Receive buffers in XDP mode are mapped from page start */ dma_addr &= PAGE_MASK; } diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h index ce1eed7a6d63..5e9a03cf1b4d 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h @@ -11,7 +11,6 @@ #include #include -#include #include #include "q_struct.h" @@ -94,9 +93,6 @@ #define RCV_FRAG_LEN (SKB_DATA_ALIGN(DMA_BUFFER_LEN + NET_SKB_PAD) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) -#define RCV_BUF_HEADROOM 128 /* To store dma address for XDP redirect */ -#define XDP_HEADROOM (XDP_PACKET_HEADROOM + RCV_BUF_HEADROOM) - #define MAX_CQES_FOR_TX ((SND_QUEUE_LEN / MIN_SQ_DESC_PER_PKT_XMIT) * \ MAX_CQE_PER_PKT_XMIT) From cc85c02edfe48a34865ae00f7d22298a3fdd17aa Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Tue, 13 Feb 2018 15:32:50 -0600 Subject: [PATCH 23/56] ibmvnic: Wait until reset is complete to set carrier on Pushes back setting the carrier on until the end of the reset code. This resolves a bug where a watchdog timer was detecting that a TX queue had stalled before the adapter reset was complete. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 27447260215d..1a2d8d66f527 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1670,8 +1670,6 @@ static int do_reset(struct ibmvnic_adapter *adapter, return 0; } - netif_carrier_on(netdev); - /* kick napi */ for (i = 0; i < adapter->req_rx_queues; i++) napi_schedule(&adapter->napi[i]); @@ -1679,6 +1677,8 @@ static int do_reset(struct ibmvnic_adapter *adapter, if (adapter->reset_reason != VNIC_RESET_FAILOVER) netdev_notify_peers(netdev); + netif_carrier_on(netdev); + return 0; } From 34f0f4e3f48810b0ba080bf2a65370b0cc179c51 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Tue, 13 Feb 2018 18:23:40 -0600 Subject: [PATCH 24/56] ibmvnic: Fix login buffer memory leaks During device bringup, the driver exchanges login buffers with firmware. These buffers contain information such number of TX and RX queues alloted to the device, RX buffer size, etc. These buffers weren't being properly freed on device reset or close. We can free the buffer we send to firmware as soon as we get a response. There is information in the response buffer that the driver needs for normal operation so retain it until the next reset or removal. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 1a2d8d66f527..8625f5e5b6d4 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -791,6 +791,18 @@ static int ibmvnic_login(struct net_device *netdev) return 0; } +static void release_login_buffer(struct ibmvnic_adapter *adapter) +{ + kfree(adapter->login_buf); + adapter->login_buf = NULL; +} + +static void release_login_rsp_buffer(struct ibmvnic_adapter *adapter) +{ + kfree(adapter->login_rsp_buf); + adapter->login_rsp_buf = NULL; +} + static void release_resources(struct ibmvnic_adapter *adapter) { int i; @@ -813,6 +825,8 @@ static void release_resources(struct ibmvnic_adapter *adapter) } } } + + release_login_rsp_buffer(adapter); } static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state) @@ -3013,6 +3027,7 @@ static void send_login(struct ibmvnic_adapter *adapter) struct vnic_login_client_data *vlcd; int i; + release_login_rsp_buffer(adapter); client_data_len = vnic_client_data_len(adapter); buffer_size = @@ -3708,6 +3723,7 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq, dma_unmap_single(dev, adapter->login_buf_token, adapter->login_buf_sz, DMA_BIDIRECTIONAL); + release_login_buffer(adapter); dma_unmap_single(dev, adapter->login_rsp_buf_token, adapter->login_rsp_buf_sz, DMA_BIDIRECTIONAL); From 6e4842ddfc2b08931ebd6c0bc95322dd56e5232b Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Tue, 13 Feb 2018 18:23:41 -0600 Subject: [PATCH 25/56] ibmvnic: Fix NAPI structures memory leak This memory is allocated during initialization but never freed, so do that now. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 8625f5e5b6d4..23e0b423025a 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -825,6 +825,8 @@ static void release_resources(struct ibmvnic_adapter *adapter) } } } + kfree(adapter->napi); + adapter->napi = NULL; release_login_rsp_buffer(adapter); } From 4b9b0f01350500173f17e2b2e65beb4df4ef99c7 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Tue, 13 Feb 2018 18:23:42 -0600 Subject: [PATCH 26/56] ibmvnic: Free RX socket buffer in case of adapter error If a RX buffer is returned to the client driver with an error, free the corresponding socket buffer before continuing. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 23e0b423025a..bc93fa2be7fa 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1869,6 +1869,7 @@ restart_poll: be16_to_cpu(next->rx_comp.rc)); /* free the entry */ next->rx_comp.first = 0; + dev_kfree_skb_any(rx_buff->skb); remove_buff_from_pool(adapter, rx_buff); continue; } From d0869c0071e40c4407d1a4d7c9497653cf47253b Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Tue, 13 Feb 2018 18:23:43 -0600 Subject: [PATCH 27/56] ibmvnic: Clean RX pool buffers during device close During device close or reset, there were some cases of outstanding RX socket buffers not being freed. Include a function similar to the one that already exists to clean TX socket buffers in this case. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 31 +++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index bc93fa2be7fa..996f47568f9e 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1073,6 +1073,35 @@ static int ibmvnic_open(struct net_device *netdev) return rc; } +static void clean_rx_pools(struct ibmvnic_adapter *adapter) +{ + struct ibmvnic_rx_pool *rx_pool; + u64 rx_entries; + int rx_scrqs; + int i, j; + + if (!adapter->rx_pool) + return; + + rx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs); + rx_entries = adapter->req_rx_add_entries_per_subcrq; + + /* Free any remaining skbs in the rx buffer pools */ + for (i = 0; i < rx_scrqs; i++) { + rx_pool = &adapter->rx_pool[i]; + if (!rx_pool) + continue; + + netdev_dbg(adapter->netdev, "Cleaning rx_pool[%d]\n", i); + for (j = 0; j < rx_entries; j++) { + if (rx_pool->rx_buff[j].skb) { + dev_kfree_skb_any(rx_pool->rx_buff[j].skb); + rx_pool->rx_buff[j].skb = NULL; + } + } + } +} + static void clean_tx_pools(struct ibmvnic_adapter *adapter) { struct ibmvnic_tx_pool *tx_pool; @@ -1150,7 +1179,7 @@ static int __ibmvnic_close(struct net_device *netdev) } } } - + clean_rx_pools(adapter); clean_tx_pools(adapter); adapter->state = VNIC_CLOSED; return rc; From e5d1a1eec0f4b51d0a7a6457d0b1b99b34f3e901 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 14 Feb 2018 13:37:58 +0800 Subject: [PATCH 28/56] tipc: Refactor __tipc_nl_compat_doit As preparation for adding RTNL to make (*cmd->transcode)() and (*cmd->transcode)() constantly protected by RTNL lock, we move out of memory allocations existing between them as many as possible so that the time of holding RTNL can be minimized in __tipc_nl_compat_doit(). Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index e48f0b2c01b9..974169059b9c 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -285,10 +285,6 @@ static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, if (!trans_buf) return -ENOMEM; - err = (*cmd->transcode)(cmd, trans_buf, msg); - if (err) - goto trans_out; - attrbuf = kmalloc((tipc_genl_family.maxattr + 1) * sizeof(struct nlattr *), GFP_KERNEL); if (!attrbuf) { @@ -296,27 +292,32 @@ static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, goto trans_out; } - err = nla_parse(attrbuf, tipc_genl_family.maxattr, - (const struct nlattr *)trans_buf->data, - trans_buf->len, NULL, NULL); - if (err) - goto parse_out; - doit_buf = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!doit_buf) { err = -ENOMEM; - goto parse_out; + goto attrbuf_out; } - doit_buf->sk = msg->dst_sk; - memset(&info, 0, sizeof(info)); info.attrs = attrbuf; + err = (*cmd->transcode)(cmd, trans_buf, msg); + if (err) + goto doit_out; + + err = nla_parse(attrbuf, tipc_genl_family.maxattr, + (const struct nlattr *)trans_buf->data, + trans_buf->len, NULL, NULL); + if (err) + goto doit_out; + + doit_buf->sk = msg->dst_sk; + err = (*cmd->doit)(doit_buf, &info); +doit_out: kfree_skb(doit_buf); -parse_out: +attrbuf_out: kfree(attrbuf); trans_out: kfree_skb(trans_buf); From d59d8b77abf4308e9c6809298341e275eac38404 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 14 Feb 2018 13:37:59 +0800 Subject: [PATCH 29/56] tipc: Introduce __tipc_nl_bearer_disable Introduce __tipc_nl_bearer_disable() which doesn't hold RTNL lock. Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bearer.c | 19 +++++++++++++------ net/tipc/bearer.h | 1 + 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index c8001471da6c..61b6625f93a4 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -813,7 +813,7 @@ err_out: return err; } -int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) +int __tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) { int err; char *name; @@ -835,19 +835,26 @@ int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); - rtnl_lock(); bearer = tipc_bearer_find(net, name); - if (!bearer) { - rtnl_unlock(); + if (!bearer) return -EINVAL; - } bearer_disable(net, bearer); - rtnl_unlock(); return 0; } +int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) +{ + int err; + + rtnl_lock(); + err = __tipc_nl_bearer_disable(skb, info); + rtnl_unlock(); + + return err; +} + int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) { int err; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 42d6eeeb646d..bcc6d5f7014b 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -188,6 +188,7 @@ extern struct tipc_media udp_media_info; #endif int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); +int __tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info); From 45cf7edfbc07b2208d7b4a79d4a36aeddf16aefd Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 14 Feb 2018 13:38:00 +0800 Subject: [PATCH 30/56] tipc: Introduce __tipc_nl_bearer_enable Introduce __tipc_nl_bearer_enable() which doesn't hold RTNL lock. Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bearer.c | 17 ++++++++++------- net/tipc/bearer.h | 1 + 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 61b6625f93a4..faf8fa033740 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -855,7 +855,7 @@ int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) return err; } -int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) +int __tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) { int err; char *bearer; @@ -897,15 +897,18 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); } + return tipc_enable_bearer(net, bearer, domain, prio, attrs); +} + +int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) +{ + int err; + rtnl_lock(); - err = tipc_enable_bearer(net, bearer, domain, prio, attrs); - if (err) { - rtnl_unlock(); - return err; - } + err = __tipc_nl_bearer_enable(skb, info); rtnl_unlock(); - return 0; + return err; } int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index bcc6d5f7014b..fc81150ca9c9 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -190,6 +190,7 @@ extern struct tipc_media udp_media_info; int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); +int __tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); From 93532bb1d436984dac60c92d1a93eecda4fecb29 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 14 Feb 2018 13:38:01 +0800 Subject: [PATCH 31/56] tipc: Introduce __tipc_nl_bearer_set Introduce __tipc_nl_bearer_set() which doesn't holding RTNL lock. Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bearer.c | 23 ++++++++++++++--------- net/tipc/bearer.h | 1 + 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index faf8fa033740..f92c9c58d686 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -954,7 +954,7 @@ int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) return 0; } -int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) +int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) { int err; char *name; @@ -975,22 +975,17 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) return -EINVAL; name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); - rtnl_lock(); b = tipc_bearer_find(net, name); - if (!b) { - rtnl_unlock(); + if (!b) return -EINVAL; - } if (attrs[TIPC_NLA_BEARER_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_BEARER_PROP], props); - if (err) { - rtnl_unlock(); + if (err) return err; - } if (props[TIPC_NLA_PROP_TOL]) b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]); @@ -999,11 +994,21 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_WIN]) b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); } - rtnl_unlock(); return 0; } +int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) +{ + int err; + + rtnl_lock(); + err = __tipc_nl_bearer_set(skb, info); + rtnl_unlock(); + + return err; +} + static int __tipc_nl_add_media(struct tipc_nl_msg *msg, struct tipc_media *media, int nlflags) { diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index fc81150ca9c9..cc0f529a56b5 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -194,6 +194,7 @@ int __tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); +int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb); From 07ffb22357323c7189921935b24d68018e1a2b68 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 14 Feb 2018 13:38:02 +0800 Subject: [PATCH 32/56] tipc: Introduce __tipc_nl_media_set Introduce __tipc_nl_media_set() which doesn't hold RTNL lock. Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bearer.c | 23 ++++++++++++++--------- net/tipc/bearer.h | 1 + 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index f92c9c58d686..3e3dce3d4c63 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1130,7 +1130,7 @@ err_out: return err; } -int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) +int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) { int err; char *name; @@ -1148,22 +1148,17 @@ int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) return -EINVAL; name = nla_data(attrs[TIPC_NLA_MEDIA_NAME]); - rtnl_lock(); m = tipc_media_find(name); - if (!m) { - rtnl_unlock(); + if (!m) return -EINVAL; - } if (attrs[TIPC_NLA_MEDIA_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_MEDIA_PROP], props); - if (err) { - rtnl_unlock(); + if (err) return err; - } if (props[TIPC_NLA_PROP_TOL]) m->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]); @@ -1172,7 +1167,17 @@ int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_WIN]) m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); } - rtnl_unlock(); return 0; } + +int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) +{ + int err; + + rtnl_lock(); + err = __tipc_nl_media_set(skb, info); + rtnl_unlock(); + + return err; +} diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index cc0f529a56b5..a53613d95bc9 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -200,6 +200,7 @@ int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); +int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); int tipc_media_set_priority(const char *name, u32 new_value); int tipc_media_set_window(const char *name, u32 new_value); From 5631f65decf390ae480d157838c0c393a991328e Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 14 Feb 2018 13:38:03 +0800 Subject: [PATCH 33/56] tipc: Introduce __tipc_nl_net_set Introduce __tipc_nl_net_set() which doesn't hold RTNL lock. Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/net.c | 15 ++++++++++++--- net/tipc/net.h | 1 + 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/net/tipc/net.c b/net/tipc/net.c index 719c5924b638..1a2fde0d6f61 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -200,7 +200,7 @@ out: return skb->len; } -int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) +int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); @@ -241,10 +241,19 @@ int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) if (!tipc_addr_node_valid(addr)) return -EINVAL; - rtnl_lock(); tipc_net_start(net, addr); - rtnl_unlock(); } return 0; } + +int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) +{ + int err; + + rtnl_lock(); + err = __tipc_nl_net_set(skb, info); + rtnl_unlock(); + + return err; +} diff --git a/net/tipc/net.h b/net/tipc/net.h index c7c254902873..c0306aa2374b 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -47,5 +47,6 @@ void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); +int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); #endif From ed4ffdfec26dfe1bb02435afd1e01f61426f7212 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 14 Feb 2018 13:38:04 +0800 Subject: [PATCH 34/56] tipc: Fix missing RTNL lock protection during setting link properties Currently when user changes link properties, TIPC first checks if user's command message contains media name or bearer name through tipc_media_find() or tipc_bearer_find() which is protected by RTNL lock. But when tipc_nl_compat_link_set() conducts the checking with the two functions, it doesn't hold RTNL lock at all, as a result, the following complaints were reported: audit: type=1400 audit(1514679888.244:9): avc: denied { write } for pid=3194 comm="syzkaller021477" path="socket:[11143]" dev="sockfs" ino=11143 scontext=unconfined_u:system_r:insmod_t:s0-s0:c0.c1023 tcontext=unconfined_u:system_r:insmod_t:s0-s0:c0.c1023 tclass=netlink_generic_socket permissive=1 Reviewed-by: Kirill Tkhai ============================= WARNING: suspicious RCU usage 4.15.0-rc5+ #152 Not tainted ----------------------------- net/tipc/bearer.c:177 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by syzkaller021477/3194: #0: (cb_lock){++++}, at: [<00000000d20133ea>] genl_rcv+0x19/0x40 net/netlink/genetlink.c:634 #1: (genl_mutex){+.+.}, at: [<00000000fcc5d1bc>] genl_lock net/netlink/genetlink.c:33 [inline] #1: (genl_mutex){+.+.}, at: [<00000000fcc5d1bc>] genl_rcv_msg+0x115/0x140 net/netlink/genetlink.c:622 stack backtrace: CPU: 1 PID: 3194 Comm: syzkaller021477 Not tainted 4.15.0-rc5+ #152 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585 tipc_bearer_find+0x2b4/0x3b0 net/tipc/bearer.c:177 tipc_nl_compat_link_set+0x329/0x9f0 net/tipc/netlink_compat.c:729 __tipc_nl_compat_doit net/tipc/netlink_compat.c:288 [inline] tipc_nl_compat_doit+0x15b/0x660 net/tipc/netlink_compat.c:335 tipc_nl_compat_handle net/tipc/netlink_compat.c:1119 [inline] tipc_nl_compat_recv+0x112f/0x18f0 net/tipc/netlink_compat.c:1201 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:599 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:624 netlink_rcv_skb+0x21e/0x460 net/netlink/af_netlink.c:2408 genl_rcv+0x28/0x40 net/netlink/genetlink.c:635 netlink_unicast_kernel net/netlink/af_netlink.c:1275 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1301 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1864 sock_sendmsg_nosec net/socket.c:636 [inline] sock_sendmsg+0xca/0x110 net/socket.c:646 sock_write_iter+0x31a/0x5d0 net/socket.c:915 call_write_iter include/linux/fs.h:1772 [inline] new_sync_write fs/read_write.c:469 [inline] __vfs_write+0x684/0x970 fs/read_write.c:482 vfs_write+0x189/0x510 fs/read_write.c:544 SYSC_write fs/read_write.c:589 [inline] SyS_write+0xef/0x220 fs/read_write.c:581 do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline] do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389 entry_SYSENTER_compat+0x54/0x63 arch/x86/entry/entry_64_compat.S:129 In order to correct the mistake, __tipc_nl_compat_doit() has been protected by RTNL lock, which means the whole operation of setting bearer/media properties is under RTNL protection. Signed-off-by: Ying Xue Reported-by: syzbot Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 974169059b9c..4492cda45566 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -301,6 +301,7 @@ static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, memset(&info, 0, sizeof(info)); info.attrs = attrbuf; + rtnl_lock(); err = (*cmd->transcode)(cmd, trans_buf, msg); if (err) goto doit_out; @@ -315,6 +316,7 @@ static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, err = (*cmd->doit)(doit_buf, &info); doit_out: + rtnl_unlock(); kfree_skb(doit_buf); attrbuf_out: @@ -723,13 +725,13 @@ static int tipc_nl_compat_link_set(struct tipc_nl_compat_cmd_doit *cmd, media = tipc_media_find(lc->name); if (media) { - cmd->doit = &tipc_nl_media_set; + cmd->doit = &__tipc_nl_media_set; return tipc_nl_compat_media_set(skb, msg); } bearer = tipc_bearer_find(msg->net, lc->name); if (bearer) { - cmd->doit = &tipc_nl_bearer_set; + cmd->doit = &__tipc_nl_bearer_set; return tipc_nl_compat_bearer_set(skb, msg); } @@ -1090,12 +1092,12 @@ static int tipc_nl_compat_handle(struct tipc_nl_compat_msg *msg) return tipc_nl_compat_dumpit(&dump, msg); case TIPC_CMD_ENABLE_BEARER: msg->req_type = TIPC_TLV_BEARER_CONFIG; - doit.doit = tipc_nl_bearer_enable; + doit.doit = __tipc_nl_bearer_enable; doit.transcode = tipc_nl_compat_bearer_enable; return tipc_nl_compat_doit(&doit, msg); case TIPC_CMD_DISABLE_BEARER: msg->req_type = TIPC_TLV_BEARER_NAME; - doit.doit = tipc_nl_bearer_disable; + doit.doit = __tipc_nl_bearer_disable; doit.transcode = tipc_nl_compat_bearer_disable; return tipc_nl_compat_doit(&doit, msg); case TIPC_CMD_SHOW_LINK_STATS: @@ -1149,12 +1151,12 @@ static int tipc_nl_compat_handle(struct tipc_nl_compat_msg *msg) return tipc_nl_compat_dumpit(&dump, msg); case TIPC_CMD_SET_NODE_ADDR: msg->req_type = TIPC_TLV_NET_ADDR; - doit.doit = tipc_nl_net_set; + doit.doit = __tipc_nl_net_set; doit.transcode = tipc_nl_compat_net_set; return tipc_nl_compat_doit(&doit, msg); case TIPC_CMD_SET_NETID: msg->req_type = TIPC_TLV_UNSIGNED; - doit.doit = tipc_nl_net_set; + doit.doit = __tipc_nl_net_set; doit.transcode = tipc_nl_compat_net_set; return tipc_nl_compat_doit(&doit, msg); case TIPC_CMD_GET_NETID: From a65820e6956782af6c5330749ae37222350d8d3f Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Wed, 14 Feb 2018 18:05:31 +1100 Subject: [PATCH 35/56] docs: segmentation-offloads.txt: update for UFO depreciation UFO is deprecated except for tuntap and packet per 0c19f846d582, ("net: accept UFO datagrams from tuntap and packet"). Update UFO docs to reflect this. Signed-off-by: Daniel Axtens Signed-off-by: David S. Miller --- Documentation/networking/segmentation-offloads.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/networking/segmentation-offloads.txt b/Documentation/networking/segmentation-offloads.txt index 2f09455a993a..2cda12ab7075 100644 --- a/Documentation/networking/segmentation-offloads.txt +++ b/Documentation/networking/segmentation-offloads.txt @@ -49,6 +49,10 @@ datagram into multiple IPv4 fragments. Many of the requirements for UDP fragmentation offload are the same as TSO. However the IPv4 ID for fragments should not increment as a single IPv4 datagram is fragmented. +UFO is deprecated: modern kernels will no longer generate UFO skbs, but can +still receive them from tuntap and similar devices. Offload of UDP-based +tunnel protocols is still supported. + IPIP, SIT, GRE, UDP Tunnel, and Remote Checksum Offloads ======================================================== From bc3c2431d4173816240679a02fd4d74685e94bc8 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Wed, 14 Feb 2018 18:05:32 +1100 Subject: [PATCH 36/56] docs: segmentation-offloads.txt: Fix ref to SKB_GSO_TUNNEL_REMCSUM The doc originally called it SKB_GSO_REMCSUM. Fix it. Fixes: f7a6272bf3cb ("Documentation: Add documentation for TSO and GSO features") Signed-off-by: Daniel Axtens Signed-off-by: David S. Miller --- Documentation/networking/segmentation-offloads.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/segmentation-offloads.txt b/Documentation/networking/segmentation-offloads.txt index 2cda12ab7075..b247471a183c 100644 --- a/Documentation/networking/segmentation-offloads.txt +++ b/Documentation/networking/segmentation-offloads.txt @@ -87,10 +87,10 @@ SKB_GSO_UDP_TUNNEL_CSUM. These two additional tunnel types reflect the fact that the outer header also requests to have a non-zero checksum included in the outer header. -Finally there is SKB_GSO_REMCSUM which indicates that a given tunnel header -has requested a remote checksum offload. In this case the inner headers -will be left with a partial checksum and only the outer header checksum -will be computed. +Finally there is SKB_GSO_TUNNEL_REMCSUM which indicates that a given tunnel +header has requested a remote checksum offload. In this case the inner +headers will be left with a partial checksum and only the outer header +checksum will be computed. Generic Segmentation Offload ============================ From a677088922831d94d292ca3891b148a8ba0b5fa1 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Wed, 14 Feb 2018 18:05:33 +1100 Subject: [PATCH 37/56] docs: segmentation-offloads.txt: add SCTP info Most of this is extracted from 90017accff61 ("sctp: Add GSO support"), with some extra text about GSO_BY_FRAGS and the need to check for it. Cc: Marcelo Ricardo Leitner Signed-off-by: Daniel Axtens Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- .../networking/segmentation-offloads.txt | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/Documentation/networking/segmentation-offloads.txt b/Documentation/networking/segmentation-offloads.txt index b247471a183c..d47480b61ac6 100644 --- a/Documentation/networking/segmentation-offloads.txt +++ b/Documentation/networking/segmentation-offloads.txt @@ -13,6 +13,7 @@ The following technologies are described: * Generic Segmentation Offload - GSO * Generic Receive Offload - GRO * Partial Generic Segmentation Offload - GSO_PARTIAL + * SCTP accelleration with GSO - GSO_BY_FRAGS TCP Segmentation Offload ======================== @@ -132,3 +133,28 @@ values for if the header was simply duplicated. The one exception to this is the outer IPv4 ID field. It is up to the device drivers to guarantee that the IPv4 ID field is incremented in the case that a given header does not have the DF bit set. + +SCTP accelleration with GSO +=========================== + +SCTP - despite the lack of hardware support - can still take advantage of +GSO to pass one large packet through the network stack, rather than +multiple small packets. + +This requires a different approach to other offloads, as SCTP packets +cannot be just segmented to (P)MTU. Rather, the chunks must be contained in +IP segments, padding respected. So unlike regular GSO, SCTP can't just +generate a big skb, set gso_size to the fragmentation point and deliver it +to IP layer. + +Instead, the SCTP protocol layer builds an skb with the segments correctly +padded and stored as chained skbs, and skb_segment() splits based on those. +To signal this, gso_size is set to the special value GSO_BY_FRAGS. + +Therefore, any code in the core networking stack must be aware of the +possibility that gso_size will be GSO_BY_FRAGS and handle that case +appropriately. (For size checks, the skb_gso_validate_*_len family of +helpers do this automatically.) + +This also affects drivers with the NETIF_F_FRAGLIST & NETIF_F_GSO_SCTP bits +set. Note also that NETIF_F_GSO_SCTP is included in NETIF_F_GSO_SOFTWARE. From a1dfa6812b682eef750412dd5a90e7d38d7af068 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Wed, 14 Feb 2018 10:46:06 +0200 Subject: [PATCH 38/56] tls: retrun the correct IV in getsockopt Current code returns four bytes of salt followed by four bytes of IV. This patch returns all eight bytes of IV. fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- net/tls/tls_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b0d5fcea47e7..a6c3702e0ddb 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -308,7 +308,8 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, goto out; } lock_sock(sk); - memcpy(crypto_info_aes_gcm_128->iv, ctx->iv, + memcpy(crypto_info_aes_gcm_128->iv, + ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); release_sock(sk); if (copy_to_user(optval, From 257082e6ae23e92898440f6bcb2857555bf7957c Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Wed, 14 Feb 2018 10:46:07 +0200 Subject: [PATCH 39/56] tls: reset the crypto info if copy_from_user fails copy_from_user could copy some partial information, as a result TLS_CRYPTO_INFO_READY(crypto_info) could be true while crypto_info is using uninitialzed data. This patch resets crypto_info when copy_from_user fails. fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- net/tls/tls_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a6c3702e0ddb..c105f86a7ea6 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -376,7 +376,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info)); if (rc) { rc = -EFAULT; - goto out; + goto err_crypto_info; } /* check version */ From c410c1966fe6fcfb23bcac0924aaa6a6e7449829 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Wed, 14 Feb 2018 10:46:08 +0200 Subject: [PATCH 40/56] tls: getsockopt return record sequence number Return the TLS record sequence number in getsockopt. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- net/tls/tls_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index c105f86a7ea6..e9b4b53ab53e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -311,6 +311,8 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, memcpy(crypto_info_aes_gcm_128->iv, ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); + memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->rec_seq, + TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE); release_sock(sk); if (copy_to_user(optval, crypto_info_aes_gcm_128, From fe9c842695e26d8116b61b80bfb905356f07834b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 14 Feb 2018 15:45:07 -0800 Subject: [PATCH 41/56] NFC: llcp: Limit size of SDP URI The tlv_len is u8, so we need to limit the size of the SDP URI. Enforce this both in the NLA policy and in the code that performs the allocation and copy, to avoid writing past the end of the allocated buffer. Fixes: d9b8d8e19b073 ("NFC: llcp: Service Name Lookup netlink interface") Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/nfc/llcp_commands.c | 4 ++++ net/nfc/netlink.c | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 367d8c027101..2ceefa183cee 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -149,6 +149,10 @@ struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri, pr_debug("uri: %s, len: %zu\n", uri, uri_len); + /* sdreq->tlv_len is u8, takes uri_len, + 3 for header, + 1 for NULL */ + if (WARN_ON_ONCE(uri_len > U8_MAX - 4)) + return NULL; + sdreq = kzalloc(sizeof(struct nfc_llcp_sdp_tlv), GFP_KERNEL); if (sdreq == NULL) return NULL; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index c0b83dc9d993..f018eafc2a0d 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -61,7 +61,8 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { }; static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = { - [NFC_SDP_ATTR_URI] = { .type = NLA_STRING }, + [NFC_SDP_ATTR_URI] = { .type = NLA_STRING, + .len = U8_MAX - 4 }, [NFC_SDP_ATTR_SAP] = { .type = NLA_U8 }, }; From a8c6db1dfd1b1d18359241372bb204054f2c3174 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 15 Feb 2018 09:46:03 +0100 Subject: [PATCH 42/56] fib_semantics: Don't match route with mismatching tclassid In fib_nh_match(), if output interface or gateway are passed in the FIB configuration, we don't have to check next hops of multipath routes to conclude whether we have a match or not. However, we might still have routes with different realms matching the same output interface and gateway configuration, and this needs to cause the match to fail. Otherwise the first route inserted in the FIB will match, regardless of the realms: # ip route add 1.1.1.1 dev eth0 table 1234 realms 1/2 # ip route append 1.1.1.1 dev eth0 table 1234 realms 3/4 # ip route list table 1234 1.1.1.1 dev eth0 scope link realms 1/2 1.1.1.1 dev eth0 scope link realms 3/4 # ip route del 1.1.1.1 dev ens3 table 1234 realms 3/4 # ip route list table 1234 1.1.1.1 dev ens3 scope link realms 3/4 whereas route with realms 3/4 should have been deleted instead. Explicitly check for fc_flow passed in the FIB configuration (this comes from RTA_FLOW extracted by rtm_to_fib_config()) and fail matching if it differs from nh_tclassid. The handling of RTA_FLOW for multipath routes later in fib_nh_match() is still needed, as we can have multiple RTA_FLOW attributes that need to be matched against the tclassid of each next hop. v2: Check that fc_flow is set before discarding the match, so that the user can still select the first matching rule by not specifying any realm, as suggested by David Ahern. Reported-by: Jianlin Shi Signed-off-by: Stefano Brivio Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c586597da20d..7d36a950d961 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -646,6 +646,11 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, fi->fib_nh, cfg, extack)) return 1; } +#ifdef CONFIG_IP_ROUTE_CLASSID + if (cfg->fc_flow && + cfg->fc_flow != fi->fib_nh->nh_tclassid) + return 1; +#endif if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) return 0; From c4e43e14cd4617d57babc7a9f251bf3e9ad360a0 Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Thu, 15 Feb 2018 18:16:57 +0530 Subject: [PATCH 43/56] cxgb4: free up resources of pf 0-3 free pf 0-3 resources, commit baf5086840ab ("cxgb4: restructure VF mgmt code") erroneously removed the code which frees the pf 0-3 resources, causing the probe of pf 0-3 to fail in case of driver reload. Fixes: baf5086840ab ("cxgb4: restructure VF mgmt code") Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller --- .../net/ethernet/chelsio/cxgb4/cxgb4_main.c | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 56bc626ef006..7b452e85de2a 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -4982,9 +4982,10 @@ static int cxgb4_iov_configure(struct pci_dev *pdev, int num_vfs) pcie_fw = readl(adap->regs + PCIE_FW_A); /* Check if cxgb4 is the MASTER and fw is initialized */ - if (!(pcie_fw & PCIE_FW_INIT_F) || + if (num_vfs && + (!(pcie_fw & PCIE_FW_INIT_F) || !(pcie_fw & PCIE_FW_MASTER_VLD_F) || - PCIE_FW_MASTER_G(pcie_fw) != CXGB4_UNIFIED_PF) { + PCIE_FW_MASTER_G(pcie_fw) != CXGB4_UNIFIED_PF)) { dev_warn(&pdev->dev, "cxgb4 driver needs to be MASTER to support SRIOV\n"); return -EOPNOTSUPP; @@ -5599,24 +5600,24 @@ static void remove_one(struct pci_dev *pdev) #if IS_ENABLED(CONFIG_IPV6) t4_cleanup_clip_tbl(adapter); #endif - iounmap(adapter->regs); if (!is_t4(adapter->params.chip)) iounmap(adapter->bar2); - pci_disable_pcie_error_reporting(pdev); - if ((adapter->flags & DEV_ENABLED)) { - pci_disable_device(pdev); - adapter->flags &= ~DEV_ENABLED; - } - pci_release_regions(pdev); - kfree(adapter->mbox_log); - synchronize_rcu(); - kfree(adapter); } #ifdef CONFIG_PCI_IOV else { cxgb4_iov_configure(adapter->pdev, 0); } #endif + iounmap(adapter->regs); + pci_disable_pcie_error_reporting(pdev); + if ((adapter->flags & DEV_ENABLED)) { + pci_disable_device(pdev); + adapter->flags &= ~DEV_ENABLED; + } + pci_release_regions(pdev); + kfree(adapter->mbox_log); + synchronize_rcu(); + kfree(adapter); } /* "Shutdown" quiesces the device, stopping Ingress Packet and Interrupt From e6f02a4d57cc438099bc8abfba43ba1400d77b38 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Thu, 15 Feb 2018 18:20:01 +0530 Subject: [PATCH 44/56] cxgb4: fix trailing zero in CIM LA dump Set correct size of the CIM LA dump for T6. Fixes: 27887bc7cb7f ("cxgb4: collect hardware LA dumps") Signed-off-by: Rahul Lakkireddy Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c index 557fd8bfd54e..00a1d2d13169 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c @@ -472,7 +472,7 @@ int cudbg_collect_cim_la(struct cudbg_init *pdbg_init, if (is_t6(padap->params.chip)) { size = padap->params.cim_la_size / 10 + 1; - size *= 11 * sizeof(u32); + size *= 10 * sizeof(u32); } else { size = padap->params.cim_la_size / 8; size *= 8 * sizeof(u32); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c index 30485f9a598f..143686c60234 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c @@ -102,7 +102,7 @@ static u32 cxgb4_get_entity_length(struct adapter *adap, u32 entity) case CUDBG_CIM_LA: if (is_t6(adap->params.chip)) { len = adap->params.cim_la_size / 10 + 1; - len *= 11 * sizeof(u32); + len *= 10 * sizeof(u32); } else { len = adap->params.cim_la_size / 8; len *= 8 * sizeof(u32); From 7dcf688d4c78a18ba9538b2bf1b11dc7a43fe9be Mon Sep 17 00:00:00 2001 From: Casey Leedom Date: Thu, 15 Feb 2018 20:03:18 +0530 Subject: [PATCH 45/56] PCI/cxgb4: Extend T3 PCI quirk to T4+ devices We've run into a problem where our device is attached to a Virtual Machine and the use of the new pci_set_vpd_size() API doesn't help. The VM kernel has been informed that the accesses are okay, but all of the actual VPD Capability Accesses are trapped down into the KVM Hypervisor where it goes ahead and imposes the silent denials. The right idea is to follow the kernel.org commit 1c7de2b4ff88 ("PCI: Enable access to non-standard VPD for Chelsio devices (cxgb3)") which Alexey Kardashevskiy authored to establish a PCI Quirk for our T3-based adapters. This commit extends that PCI Quirk to cover Chelsio T4 devices and later. The advantage of this approach is that the VPD Size gets set early in the Base OS/Hypervisor Boot and doesn't require that the cxgb4 driver even be available in the Base OS/Hypervisor. Thus PF4 can be exported to a Virtual Machine and everything should work. Fixes: 67e658794ca1 ("cxgb4: Set VPD size so we can read both VPD structures") Cc: # v4.9+ Signed-off-by: Casey Leedom Signed-off-by: Arjun Vynipadath Signed-off-by: Ganesh Goudar Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 10 ------- drivers/pci/quirks.c | 35 +++++++++++++--------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 047609ef0515..920bccd6bc40 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -2637,7 +2637,6 @@ void t4_get_regs(struct adapter *adap, void *buf, size_t buf_size) } #define EEPROM_STAT_ADDR 0x7bfc -#define VPD_SIZE 0x800 #define VPD_BASE 0x400 #define VPD_BASE_OLD 0 #define VPD_LEN 1024 @@ -2704,15 +2703,6 @@ int t4_get_raw_vpd_params(struct adapter *adapter, struct vpd_params *p) if (!vpd) return -ENOMEM; - /* We have two VPD data structures stored in the adapter VPD area. - * By default, Linux calculates the size of the VPD area by traversing - * the first VPD area at offset 0x0, so we need to tell the OS what - * our real VPD size is. - */ - ret = pci_set_vpd_size(adapter->pdev, VPD_SIZE); - if (ret < 0) - goto out; - /* Card information normally starts at VPD_BASE but early cards had * it at 0. */ diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index fc734014206f..8b14bd326d4a 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3419,22 +3419,29 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_PORT_RIDGE, static void quirk_chelsio_extend_vpd(struct pci_dev *dev) { - pci_set_vpd_size(dev, 8192); + int chip = (dev->device & 0xf000) >> 12; + int func = (dev->device & 0x0f00) >> 8; + int prod = (dev->device & 0x00ff) >> 0; + + /* + * If this is a T3-based adapter, there's a 1KB VPD area at offset + * 0xc00 which contains the preferred VPD values. If this is a T4 or + * later based adapter, the special VPD is at offset 0x400 for the + * Physical Functions (the SR-IOV Virtual Functions have no VPD + * Capabilities). The PCI VPD Access core routines will normally + * compute the size of the VPD by parsing the VPD Data Structure at + * offset 0x000. This will result in silent failures when attempting + * to accesses these other VPD areas which are beyond those computed + * limits. + */ + if (chip == 0x0 && prod >= 0x20) + pci_set_vpd_size(dev, 8192); + else if (chip >= 0x4 && func < 0x8) + pci_set_vpd_size(dev, 2048); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x20, quirk_chelsio_extend_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x21, quirk_chelsio_extend_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x22, quirk_chelsio_extend_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x23, quirk_chelsio_extend_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x24, quirk_chelsio_extend_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x25, quirk_chelsio_extend_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x26, quirk_chelsio_extend_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x30, quirk_chelsio_extend_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x31, quirk_chelsio_extend_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x32, quirk_chelsio_extend_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x35, quirk_chelsio_extend_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x36, quirk_chelsio_extend_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, 0x37, quirk_chelsio_extend_vpd); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, PCI_ANY_ID, + quirk_chelsio_extend_vpd); #ifdef CONFIG_ACPI /* From dfec091439bb2acf763497cfc58f2bdfc67c56b7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 15 Feb 2018 16:59:49 +0100 Subject: [PATCH 46/56] dn_getsockoptdecnet: move nf_{get/set}sockopt outside sock lock After commit 3f34cfae1238 ("netfilter: on sockopt() acquire sock lock only in the required scope"), the caller of nf_{get/set}sockopt() must not hold any lock, but, in such changeset, I forgot to cope with DECnet. This commit addresses the issue moving the nf call outside the lock, in the dn_{get,set}sockopt() with the same schema currently used by ipv4 and ipv6. Also moves the unhandled sockopts of the end of the main switch statements, to improve code readability. Reported-by: Petr Vandrovec BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=198791#c2 Fixes: 3f34cfae1238 ("netfilter: on sockopt() acquire sock lock only in the required scope") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/decnet/af_decnet.c | 62 ++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 91dd09f79808..791aff68af88 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1338,6 +1338,12 @@ static int dn_setsockopt(struct socket *sock, int level, int optname, char __use lock_sock(sk); err = __dn_setsockopt(sock, level, optname, optval, optlen, 0); release_sock(sk); +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs except default case */ + if (err == -ENOPROTOOPT && optname != DSO_LINKINFO && + optname != DSO_STREAM && optname != DSO_SEQPACKET) + err = nf_setsockopt(sk, PF_DECnet, optname, optval, optlen); +#endif return err; } @@ -1445,15 +1451,6 @@ static int __dn_setsockopt(struct socket *sock, int level,int optname, char __us dn_nsp_send_disc(sk, 0x38, 0, sk->sk_allocation); break; - default: -#ifdef CONFIG_NETFILTER - return nf_setsockopt(sk, PF_DECnet, optname, optval, optlen); -#endif - case DSO_LINKINFO: - case DSO_STREAM: - case DSO_SEQPACKET: - return -ENOPROTOOPT; - case DSO_MAXWINDOW: if (optlen != sizeof(unsigned long)) return -EINVAL; @@ -1501,6 +1498,12 @@ static int __dn_setsockopt(struct socket *sock, int level,int optname, char __us return -EINVAL; scp->info_loc = u.info; break; + + case DSO_LINKINFO: + case DSO_STREAM: + case DSO_SEQPACKET: + default: + return -ENOPROTOOPT; } return 0; @@ -1514,6 +1517,20 @@ static int dn_getsockopt(struct socket *sock, int level, int optname, char __use lock_sock(sk); err = __dn_getsockopt(sock, level, optname, optval, optlen, 0); release_sock(sk); +#ifdef CONFIG_NETFILTER + if (err == -ENOPROTOOPT && optname != DSO_STREAM && + optname != DSO_SEQPACKET && optname != DSO_CONACCEPT && + optname != DSO_CONREJECT) { + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + err = nf_getsockopt(sk, PF_DECnet, optname, optval, &len); + if (err >= 0) + err = put_user(len, optlen); + } +#endif return err; } @@ -1579,26 +1596,6 @@ static int __dn_getsockopt(struct socket *sock, int level,int optname, char __us r_data = &link; break; - default: -#ifdef CONFIG_NETFILTER - { - int ret, len; - - if (get_user(len, optlen)) - return -EFAULT; - - ret = nf_getsockopt(sk, PF_DECnet, optname, optval, &len); - if (ret >= 0) - ret = put_user(len, optlen); - return ret; - } -#endif - case DSO_STREAM: - case DSO_SEQPACKET: - case DSO_CONACCEPT: - case DSO_CONREJECT: - return -ENOPROTOOPT; - case DSO_MAXWINDOW: if (r_len > sizeof(unsigned long)) r_len = sizeof(unsigned long); @@ -1630,6 +1627,13 @@ static int __dn_getsockopt(struct socket *sock, int level,int optname, char __us r_len = sizeof(unsigned char); r_data = &scp->info_rem; break; + + case DSO_STREAM: + case DSO_SEQPACKET: + case DSO_CONACCEPT: + case DSO_CONREJECT: + default: + return -ENOPROTOOPT; } if (r_data) { From da27988766e338e4a4fe198170497c0920395d4c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 16 Feb 2018 15:52:42 -0500 Subject: [PATCH 47/56] skbuff: Fix comment mis-spelling. 'peform' --> 'perform' Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5ebc0f869720..c1e66bdcf583 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3646,7 +3646,7 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, return true; } -/* For small packets <= CHECKSUM_BREAK peform checksum complete directly +/* For small packets <= CHECKSUM_BREAK perform checksum complete directly * in checksum_init. */ #define CHECKSUM_BREAK 76 From 15f35d49c93f4fa9875235e7bf3e3783d2dd7a1b Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Thu, 15 Feb 2018 20:18:43 +0300 Subject: [PATCH 48/56] udplite: fix partial checksum initialization Since UDP-Lite is always using checksum, the following path is triggered when calculating pseudo header for it: udp4_csum_init() or udp6_csum_init() skb_checksum_init_zero_check() __skb_checksum_validate_complete() The problem can appear if skb->len is less than CHECKSUM_BREAK. In this particular case __skb_checksum_validate_complete() also invokes __skb_checksum_complete(skb). If UDP-Lite is using partial checksum that covers only part of a packet, the function will return bad checksum and the packet will be dropped. It can be fixed if we skip skb_checksum_init_zero_check() and only set the required pseudo header checksum for UDP-Lite with partial checksum before udp4_csum_init()/udp6_csum_init() functions return. Fixes: ed70fcfcee95 ("net: Call skb_checksum_init in IPv4") Fixes: e4f45b7f40bd ("net: Call skb_checksum_init in IPv6") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- include/net/udplite.h | 1 + net/ipv4/udp.c | 5 +++++ net/ipv6/ip6_checksum.c | 5 +++++ 3 files changed, 11 insertions(+) diff --git a/include/net/udplite.h b/include/net/udplite.h index 81bdbf97319b..9185e45b997f 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -64,6 +64,7 @@ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) UDP_SKB_CB(skb)->cscov = cscov; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; + skb->csum_valid = 0; } return 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index bfaefe560b5c..e5ef7c38c934 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2024,6 +2024,11 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, err = udplite_checksum_init(skb, uh); if (err) return err; + + if (UDP_SKB_CB(skb)->partial_cov) { + skb->csum = inet_compute_pseudo(skb, proto); + return 0; + } } /* Note, we are only interested in != 0 or == 0, thus the diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index ec43d18b5ff9..547515e8450a 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -73,6 +73,11 @@ int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) err = udplite_checksum_init(skb, uh); if (err) return err; + + if (UDP_SKB_CB(skb)->partial_cov) { + skb->csum = ip6_compute_pseudo(skb, proto); + return 0; + } } /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) From 43a08e0f58b3f236165029710a4e3b303815253b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Feb 2018 14:47:15 -0800 Subject: [PATCH 49/56] tun: fix tun_napi_alloc_frags() frag allocator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While fuzzing arm64 v4.16-rc1 with Syzkaller, I've been hitting a misaligned atomic in __skb_clone:         atomic_inc(&(skb_shinfo(skb)->dataref)); where dataref doesn't have the required natural alignment, and the atomic operation faults. e.g. i often see it aligned to a single byte boundary rather than a four byte boundary. AFAICT, the skb_shared_info is misaligned at the instant it's allocated in __napi_alloc_skb() __napi_alloc_skb() Problem is caused by tun_napi_alloc_frags() using napi_alloc_frag() with user provided seg sizes, leading to other users of this API getting unaligned page fragments. Since we would like to not necessarily add paddings or alignments to the frags that tun_napi_alloc_frags() attaches to the skb, switch to another page frag allocator. As a bonus skb_page_frag_refill() can use GFP_KERNEL allocations, meaning that we can not deplete memory reserves as easily. Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver") Signed-off-by: Eric Dumazet Reported-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: David S. Miller --- drivers/net/tun.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 81e6cc951e7f..b52258c327d2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1489,27 +1489,23 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { + struct page_frag *pfrag = ¤t->task_frag; size_t fragsz = it->iov[i].iov_len; - unsigned long offset; - struct page *page; - void *data; if (fragsz == 0 || fragsz > PAGE_SIZE) { err = -EINVAL; goto free; } - local_bh_disable(); - data = napi_alloc_frag(fragsz); - local_bh_enable(); - if (!data) { + if (!skb_page_frag_refill(fragsz, pfrag, GFP_KERNEL)) { err = -ENOMEM; goto free; } - page = virt_to_head_page(data); - offset = data - page_address(page); - skb_fill_page_desc(skb, i - 1, page, offset, fragsz); + skb_fill_page_desc(skb, i - 1, pfrag->page, + pfrag->offset, fragsz); + page_ref_inc(pfrag->page); + pfrag->offset += fragsz; } return skb; From a16b8d0cf2ec1e626d24bc2a7b9e64ace6f7501d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 15 Feb 2018 22:59:00 +0000 Subject: [PATCH 50/56] rxrpc: Work around usercopy check Due to a check recently added to copy_to_user(), it's now not permitted to copy from slab-held data to userspace unless the slab is whitelisted. This affects rxrpc_recvmsg() when it attempts to place an RXRPC_USER_CALL_ID control message in the userspace control message buffer. A warning is generated by usercopy_warn() because the source is the copy of the user_call_ID retained in the rxrpc_call struct. Work around the issue by copying the user_call_ID to a variable on the stack and passing that to put_cmsg(). The warning generated looks like: Bad or missing usercopy whitelist? Kernel memory exposure attempt detected from SLUB object 'dmaengine-unmap-128' (offset 680, size 8)! WARNING: CPU: 0 PID: 1401 at mm/usercopy.c:81 usercopy_warn+0x7e/0xa0 ... RIP: 0010:usercopy_warn+0x7e/0xa0 ... Call Trace: __check_object_size+0x9c/0x1a0 put_cmsg+0x98/0x120 rxrpc_recvmsg+0x6fc/0x1010 [rxrpc] ? finish_wait+0x80/0x80 ___sys_recvmsg+0xf8/0x240 ? __clear_rsb+0x25/0x3d ? __clear_rsb+0x15/0x3d ? __clear_rsb+0x25/0x3d ? __clear_rsb+0x15/0x3d ? __clear_rsb+0x25/0x3d ? __clear_rsb+0x15/0x3d ? __clear_rsb+0x25/0x3d ? __clear_rsb+0x15/0x3d ? finish_task_switch+0xa6/0x2b0 ? trace_hardirqs_on_caller+0xed/0x180 ? _raw_spin_unlock_irq+0x29/0x40 ? __sys_recvmsg+0x4e/0x90 __sys_recvmsg+0x4e/0x90 do_syscall_64+0x7a/0x220 entry_SYSCALL_64_after_hwframe+0x26/0x9b Reported-by: Jonathan Billings Signed-off-by: David Howells Acked-by: Kees Cook Tested-by: Jonathan Billings Signed-off-by: David S. Miller --- net/rxrpc/recvmsg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index cc21e8db25b0..9d45d8b56744 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -517,9 +517,10 @@ try_again: ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, sizeof(unsigned int), &id32); } else { + unsigned long idl = call->user_call_ID; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned long), - &call->user_call_ID); + sizeof(unsigned long), &idl); } if (ret < 0) goto error_unlock_call; From 9ab2323ca184168c288f7355fc19ec0838efc20c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 16 Feb 2018 17:18:33 +0800 Subject: [PATCH 51/56] sctp: remove the left unnecessary check for chunk in sctp_renege_events Commit fb23403536ea ("sctp: remove the useless check in sctp_renege_events") forgot to remove another check for chunk in sctp_renege_events. Dan found this when doing a static check. This patch is to remove that check, and also to merge two checks into one 'if statement'. Fixes: fb23403536ea ("sctp: remove the useless check in sctp_renege_events") Reported-by: Dan Carpenter Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/stream_interleave.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index 65ac03b44df8..d3764c181299 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -968,9 +968,8 @@ static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, needed); } - if (chunk && freed >= needed) - if (sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0) - sctp_intl_start_pd(ulpq, gfp); + if (freed >= needed && sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0) + sctp_intl_start_pd(ulpq, gfp); sk_mem_reclaim(asoc->base.sk); } From b37f78f234bf4fd98979d6c3ccc0f85e508f978f Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Fri, 16 Feb 2018 15:56:37 -0700 Subject: [PATCH 52/56] net: qualcomm: rmnet: Fix crash on real dev unregistration With CONFIG_DEBUG_PREEMPT enabled, a crash with the following call stack was observed when removing a real dev which had rmnet devices attached to it. To fix this, remove the netdev_upper link APIs and instead use the existing information in rmnet_port and rmnet_priv to get the association between real and rmnet devs. BUG: sleeping function called from invalid context in_atomic(): 0, irqs_disabled(): 0, pid: 5762, name: ip Preemption disabled at: [] debug_object_active_state+0xa4/0x16c Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Modules linked in: PC is at ___might_sleep+0x13c/0x180 LR is at ___might_sleep+0x17c/0x180 [] ___might_sleep+0x13c/0x180 [] __might_sleep+0x58/0x8c [] mutex_lock+0x2c/0x48 [] kernfs_remove_by_name_ns+0x48/0xa8 [] sysfs_remove_link+0x30/0x58 [] __netdev_adjacent_dev_remove+0x14c/0x1e0 [] __netdev_adjacent_dev_unlink_lists+0x40/0x68 [] netdev_upper_dev_unlink+0xb4/0x1fc [] rmnet_dev_walk_unreg+0x6c/0xc8 [] netdev_walk_all_lower_dev_rcu+0x58/0xb4 [] rmnet_config_notify_cb+0xf4/0x134 [] raw_notifier_call_chain+0x58/0x78 [] call_netdevice_notifiers_info+0x48/0x78 [] rollback_registered_many+0x230/0x3c8 [] unregister_netdevice_many+0x38/0x94 [] rtnl_delete_link+0x58/0x88 [] rtnl_dellink+0xbc/0x1cc [] rtnetlink_rcv_msg+0xb0/0x244 [] netlink_rcv_skb+0xb4/0xdc [] rtnetlink_rcv+0x34/0x44 [] netlink_unicast+0x1ec/0x294 [] netlink_sendmsg+0x320/0x390 [] sock_sendmsg+0x54/0x60 [] ___sys_sendmsg+0x298/0x2b0 [] SyS_sendmsg+0xb4/0xf0 [] el0_svc_naked+0x24/0x28 Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation") Fixes: 60d58f971c10 ("net: qualcomm: rmnet: Implement bridge mode") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- .../ethernet/qualcomm/rmnet/rmnet_config.c | 68 ++++--------------- 1 file changed, 14 insertions(+), 54 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 7e7704daf5f1..c4949183eef3 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -43,12 +43,6 @@ /* Local Definitions and Declarations */ -struct rmnet_walk_data { - struct net_device *real_dev; - struct list_head *head; - struct rmnet_port *port; -}; - static int rmnet_is_real_dev_registered(const struct net_device *real_dev) { return rcu_access_pointer(real_dev->rx_handler) == rmnet_rx_handler; @@ -112,17 +106,14 @@ static int rmnet_register_real_device(struct net_device *real_dev) static void rmnet_unregister_bridge(struct net_device *dev, struct rmnet_port *port) { - struct net_device *rmnet_dev, *bridge_dev; struct rmnet_port *bridge_port; + struct net_device *bridge_dev; if (port->rmnet_mode != RMNET_EPMODE_BRIDGE) return; /* bridge slave handling */ if (!port->nr_rmnet_devs) { - rmnet_dev = netdev_master_upper_dev_get_rcu(dev); - netdev_upper_dev_unlink(dev, rmnet_dev); - bridge_dev = port->bridge_ep; bridge_port = rmnet_get_port_rtnl(bridge_dev); @@ -132,9 +123,6 @@ static void rmnet_unregister_bridge(struct net_device *dev, bridge_dev = port->bridge_ep; bridge_port = rmnet_get_port_rtnl(bridge_dev); - rmnet_dev = netdev_master_upper_dev_get_rcu(bridge_dev); - netdev_upper_dev_unlink(bridge_dev, rmnet_dev); - rmnet_unregister_real_device(bridge_dev, bridge_port); } } @@ -173,10 +161,6 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, if (err) goto err1; - err = netdev_master_upper_dev_link(dev, real_dev, NULL, NULL, extack); - if (err) - goto err2; - port->rmnet_mode = mode; hlist_add_head_rcu(&ep->hlnode, &port->muxed_ep[mux_id]); @@ -193,8 +177,6 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, return 0; -err2: - rmnet_vnd_dellink(mux_id, port, ep); err1: rmnet_unregister_real_device(real_dev, port); err0: @@ -204,14 +186,13 @@ err0: static void rmnet_dellink(struct net_device *dev, struct list_head *head) { + struct rmnet_priv *priv = netdev_priv(dev); struct net_device *real_dev; struct rmnet_endpoint *ep; struct rmnet_port *port; u8 mux_id; - rcu_read_lock(); - real_dev = netdev_master_upper_dev_get_rcu(dev); - rcu_read_unlock(); + real_dev = priv->real_dev; if (!real_dev || !rmnet_is_real_dev_registered(real_dev)) return; @@ -219,7 +200,6 @@ static void rmnet_dellink(struct net_device *dev, struct list_head *head) port = rmnet_get_port_rtnl(real_dev); mux_id = rmnet_vnd_get_mux(dev); - netdev_upper_dev_unlink(dev, real_dev); ep = rmnet_get_endpoint(port, mux_id); if (ep) { @@ -233,30 +213,13 @@ static void rmnet_dellink(struct net_device *dev, struct list_head *head) unregister_netdevice_queue(dev, head); } -static int rmnet_dev_walk_unreg(struct net_device *rmnet_dev, void *data) -{ - struct rmnet_walk_data *d = data; - struct rmnet_endpoint *ep; - u8 mux_id; - - mux_id = rmnet_vnd_get_mux(rmnet_dev); - ep = rmnet_get_endpoint(d->port, mux_id); - if (ep) { - hlist_del_init_rcu(&ep->hlnode); - rmnet_vnd_dellink(mux_id, d->port, ep); - kfree(ep); - } - netdev_upper_dev_unlink(rmnet_dev, d->real_dev); - unregister_netdevice_queue(rmnet_dev, d->head); - - return 0; -} - static void rmnet_force_unassociate_device(struct net_device *dev) { struct net_device *real_dev = dev; - struct rmnet_walk_data d; + struct hlist_node *tmp_ep; + struct rmnet_endpoint *ep; struct rmnet_port *port; + unsigned long bkt_ep; LIST_HEAD(list); if (!rmnet_is_real_dev_registered(real_dev)) @@ -264,16 +227,19 @@ static void rmnet_force_unassociate_device(struct net_device *dev) ASSERT_RTNL(); - d.real_dev = real_dev; - d.head = &list; - port = rmnet_get_port_rtnl(dev); - d.port = port; rcu_read_lock(); rmnet_unregister_bridge(dev, port); - netdev_walk_all_lower_dev_rcu(real_dev, rmnet_dev_walk_unreg, &d); + hash_for_each_safe(port->muxed_ep, bkt_ep, tmp_ep, ep, hlnode) { + unregister_netdevice_queue(ep->egress_dev, &list); + rmnet_vnd_dellink(ep->mux_id, port, ep); + + hlist_del_init_rcu(&ep->hlnode); + kfree(ep); + } + rcu_read_unlock(); unregister_netdevice_many(&list); @@ -422,11 +388,6 @@ int rmnet_add_bridge(struct net_device *rmnet_dev, if (err) return -EBUSY; - err = netdev_master_upper_dev_link(slave_dev, rmnet_dev, NULL, NULL, - extack); - if (err) - return -EINVAL; - slave_port = rmnet_get_port(slave_dev); slave_port->rmnet_mode = RMNET_EPMODE_BRIDGE; slave_port->bridge_ep = real_dev; @@ -449,7 +410,6 @@ int rmnet_del_bridge(struct net_device *rmnet_dev, port->rmnet_mode = RMNET_EPMODE_VND; port->bridge_ep = NULL; - netdev_upper_dev_unlink(slave_dev, rmnet_dev); slave_port = rmnet_get_port(slave_dev); rmnet_unregister_real_device(slave_dev, slave_port); From 4dba8bbce94541c560940ac65ca9cd563fd43348 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Fri, 16 Feb 2018 15:56:38 -0700 Subject: [PATCH 53/56] net: qualcomm: rmnet: Fix warning seen with 64 bit stats With CONFIG_DEBUG_PREEMPT enabled, a warning was seen on device creation. This occurs due to the incorrect cpu API usage in ndo_get_stats64 handler. BUG: using smp_processor_id() in preemptible [00000000] code: rmnetcli/5743 caller is debug_smp_processor_id+0x1c/0x24 Call trace: [] dump_backtrace+0x0/0x2a8 [] show_stack+0x20/0x28 [] dump_stack+0xa8/0xe0 [] check_preemption_disabled+0x104/0x108 [] debug_smp_processor_id+0x1c/0x24 [] rmnet_get_stats64+0x64/0x13c [] dev_get_stats+0x68/0xd8 [] rtnl_fill_stats+0x54/0x140 [] rtnl_fill_ifinfo+0x428/0x9cc [] rtmsg_ifinfo_build_skb+0x80/0xf4 [] rtnetlink_event+0x88/0xb4 [] raw_notifier_call_chain+0x58/0x78 [] call_netdevice_notifiers_info+0x48/0x78 [] __netdev_upper_dev_link+0x290/0x5e8 [] netdev_master_upper_dev_link+0x3c/0x48 [] rmnet_newlink+0xf0/0x1c8 [] rtnl_newlink+0x57c/0x6c8 [] rtnetlink_rcv_msg+0xb0/0x244 [] netlink_rcv_skb+0xb4/0xdc [] rtnetlink_rcv+0x34/0x44 [] netlink_unicast+0x1ec/0x294 [] netlink_sendmsg+0x320/0x390 [] sock_sendmsg+0x54/0x60 [] SyS_sendto+0x1a0/0x1e4 [] el0_svc_naked+0x24/0x28 Fixes: 192c4b5d48f2 ("net: qualcomm: rmnet: Add support for 64 bit stats") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c index 570a227acdd8..346d310914df 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c @@ -121,7 +121,7 @@ static void rmnet_get_stats64(struct net_device *dev, memset(&total_stats, 0, sizeof(struct rmnet_vnd_stats)); for_each_possible_cpu(cpu) { - pcpu_ptr = this_cpu_ptr(priv->pcpu_stats); + pcpu_ptr = per_cpu_ptr(priv->pcpu_stats, cpu); do { start = u64_stats_fetch_begin_irq(&pcpu_ptr->syncp); From f57bbaae7271a47dc6486d489c503faeb248b6d5 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Fri, 16 Feb 2018 15:56:39 -0700 Subject: [PATCH 54/56] net: qualcomm: rmnet: Fix possible null dereference in command processing If a command packet with invalid mux id is received, the packet would not have a valid endpoint. This invalid endpoint maybe dereferenced leading to a crash. Identified by manual code inspection. Fixes: 3352e6c45760 ("net: qualcomm: rmnet: Convert the muxed endpoint to hlist") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c index 6bc328fb88e1..b0dbca070c00 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c @@ -38,6 +38,11 @@ static u8 rmnet_map_do_flow_control(struct sk_buff *skb, } ep = rmnet_get_endpoint(port, mux_id); + if (!ep) { + kfree_skb(skb); + return RX_HANDLER_CONSUMED; + } + vnd = ep->egress_dev; ip_family = cmd->flow_control.ip_family; From d1c95af366961101819f07e3c64d44f3be7f0367 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 17 Feb 2018 00:30:44 +0100 Subject: [PATCH 55/56] mlxsw: spectrum_router: Do not unconditionally clear route offload indication When mlxsw replaces (or deletes) a route it removes the offload indication from the replaced route. This is problematic for IPv4 routes, as the offload indication is stored in the fib_info which is usually shared between multiple routes. Instead of unconditionally clearing the offload indication, only clear it if no other route is using the fib_info. Fixes: 3984d1a89fe7 ("mlxsw: spectrum_router: Provide offload indication using nexthop flags") Signed-off-by: Ido Schimmel Reported-by: Alexander Petrovskiy Tested-by: Alexander Petrovskiy Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index dcc6305f7c22..f7948e983637 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -3794,6 +3794,9 @@ mlxsw_sp_fib4_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; int i; + if (!list_is_singular(&nh_grp->fib_list)) + return; + for (i = 0; i < nh_grp->count; i++) { struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; From 506b0a395f26e52b3f18827e0de1be051acb77ab Mon Sep 17 00:00:00 2001 From: Prashant Sreedharan Date: Mon, 19 Feb 2018 12:27:04 +0530 Subject: [PATCH 56/56] tg3: APE heartbeat changes In ungraceful host shutdown or driver crash case BMC connectivity is lost. APE firmware is missing the driver state in this case to keep the BMC connectivity alive. This patch has below change to address this issue. Heartbeat mechanism with APE firmware. This heartbeat mechanism is needed to notify the APE firmware about driver state. This patch also has the change in wait time for APE event from 1ms to 20ms as there can be some delay in getting response. v2: Drop inline keyword as per David suggestion. Signed-off-by: Prashant Sreedharan Signed-off-by: Satish Baddipadige Signed-off-by: Siva Reddy Kallam Acked-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 35 ++++++++++++++++++++--------- drivers/net/ethernet/broadcom/tg3.h | 5 +++++ 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index a77ee2f8fb8d..c1841db1b500 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -820,7 +820,7 @@ static int tg3_ape_event_lock(struct tg3 *tp, u32 timeout_us) tg3_ape_unlock(tp, TG3_APE_LOCK_MEM); - udelay(10); + usleep_range(10, 20); timeout_us -= (timeout_us > 10) ? 10 : timeout_us; } @@ -922,8 +922,8 @@ static int tg3_ape_send_event(struct tg3 *tp, u32 event) if (!(apedata & APE_FW_STATUS_READY)) return -EAGAIN; - /* Wait for up to 1 millisecond for APE to service previous event. */ - err = tg3_ape_event_lock(tp, 1000); + /* Wait for up to 20 millisecond for APE to service previous event. */ + err = tg3_ape_event_lock(tp, 20000); if (err) return err; @@ -946,6 +946,7 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, int kind) switch (kind) { case RESET_KIND_INIT: + tg3_ape_write32(tp, TG3_APE_HOST_HEARTBEAT_COUNT, tp->ape_hb++); tg3_ape_write32(tp, TG3_APE_HOST_SEG_SIG, APE_HOST_SEG_SIG_MAGIC); tg3_ape_write32(tp, TG3_APE_HOST_SEG_LEN, @@ -962,13 +963,6 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, int kind) event = APE_EVENT_STATUS_STATE_START; break; case RESET_KIND_SHUTDOWN: - /* With the interface we are currently using, - * APE does not track driver state. Wiping - * out the HOST SEGMENT SIGNATURE forces - * the APE to assume OS absent status. - */ - tg3_ape_write32(tp, TG3_APE_HOST_SEG_SIG, 0x0); - if (device_may_wakeup(&tp->pdev->dev) && tg3_flag(tp, WOL_ENABLE)) { tg3_ape_write32(tp, TG3_APE_HOST_WOL_SPEED, @@ -990,6 +984,18 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, int kind) tg3_ape_send_event(tp, event); } +static void tg3_send_ape_heartbeat(struct tg3 *tp, + unsigned long interval) +{ + /* Check if hb interval has exceeded */ + if (!tg3_flag(tp, ENABLE_APE) || + time_before(jiffies, tp->ape_hb_jiffies + interval)) + return; + + tg3_ape_write32(tp, TG3_APE_HOST_HEARTBEAT_COUNT, tp->ape_hb++); + tp->ape_hb_jiffies = jiffies; +} + static void tg3_disable_ints(struct tg3 *tp) { int i; @@ -7262,6 +7268,7 @@ static int tg3_poll_msix(struct napi_struct *napi, int budget) } } + tg3_send_ape_heartbeat(tp, TG3_APE_HB_INTERVAL << 1); return work_done; tx_recovery: @@ -7344,6 +7351,7 @@ static int tg3_poll(struct napi_struct *napi, int budget) } } + tg3_send_ape_heartbeat(tp, TG3_APE_HB_INTERVAL << 1); return work_done; tx_recovery: @@ -10732,7 +10740,7 @@ static int tg3_reset_hw(struct tg3 *tp, bool reset_phy) if (tg3_flag(tp, ENABLE_APE)) /* Write our heartbeat update interval to APE. */ tg3_ape_write32(tp, TG3_APE_HOST_HEARTBEAT_INT_MS, - APE_HOST_HEARTBEAT_INT_DISABLE); + APE_HOST_HEARTBEAT_INT_5SEC); tg3_write_sig_post_reset(tp, RESET_KIND_INIT); @@ -11077,6 +11085,9 @@ static void tg3_timer(struct timer_list *t) tp->asf_counter = tp->asf_multiplier; } + /* Update the APE heartbeat every 5 seconds.*/ + tg3_send_ape_heartbeat(tp, TG3_APE_HB_INTERVAL); + spin_unlock(&tp->lock); restart_timer: @@ -16653,6 +16664,8 @@ static int tg3_get_invariants(struct tg3 *tp, const struct pci_device_id *ent) pci_state_reg); tg3_ape_lock_init(tp); + tp->ape_hb_interval = + msecs_to_jiffies(APE_HOST_HEARTBEAT_INT_5SEC); } /* Set up tp->grc_local_ctrl before calling diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h index 47f51cc0566d..1d61aa3efda1 100644 --- a/drivers/net/ethernet/broadcom/tg3.h +++ b/drivers/net/ethernet/broadcom/tg3.h @@ -2508,6 +2508,7 @@ #define TG3_APE_LOCK_PHY3 5 #define TG3_APE_LOCK_GPIO 7 +#define TG3_APE_HB_INTERVAL (tp->ape_hb_interval) #define TG3_EEPROM_SB_F1R2_MBA_OFF 0x10 @@ -3423,6 +3424,10 @@ struct tg3 { struct device *hwmon_dev; bool link_up; bool pcierr_recovery; + + u32 ape_hb; + unsigned long ape_hb_interval; + unsigned long ape_hb_jiffies; }; /* Accessor macros for chip and asic attributes