From 0a16628212b8b0daac643b6a0f1caa4e9482afc8 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Wed, 12 Jul 2017 10:32:48 +0100 Subject: [PATCH 01/76] brcmfmac: fix regression in brcmf_sdio_txpkt_hdalign() Recent change in brcmf_sdio_txpkt_hdalign() changed the behavior and now always returns 0. This resulted in a regression which basically renders the device useless. Fixes: 270a6c1f65fe ("brcmfmac: rework headroom check in .start_xmit()") Reported-by: S. Gilles Tested-by: S. Gilles Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index fbcbb4325936..c3ecec673eb7 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -2053,12 +2053,13 @@ static int brcmf_sdio_txpkt_hdalign(struct brcmf_sdio *bus, struct sk_buff *pkt) atomic_inc(&stats->pktcow_failed); return -ENOMEM; } + head_pad = 0; } skb_push(pkt, head_pad); dat_buf = (u8 *)(pkt->data); } memset(dat_buf, 0, head_pad + bus->tx_hdrlen); - return 0; + return head_pad; } /** From 271612d72da5b46715447bc18add4a1cf7d87687 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Mon, 17 Jul 2017 09:34:19 -0500 Subject: [PATCH 02/76] Revert "rtlwifi: btcoex: rtl8723be: fix ant_sel not work" This reverts commit f95d95a7cd5514549dcf6ba754f0ee834cce3e1f. With commit f95d95a7cd55 ("rtlwifi: btcoex: rtl8723be: fix ant_sel not work"), the kernel has a NULL pointer dereference oops. This content and the proper fix will be included in a later patch. Fixes: f95d95a7cd55 ("rtlwifi: btcoex: rtl8723be: fix ant_sel not work") Signed-off-by: Larry Finger Cc: Ping-Ke Shih Cc: Yan-Hsuan Chuang Cc: Birming Chiu Cc: Shaofu Cc: Steven Ting Signed-off-by: Kalle Valo --- drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c | 3 --- drivers/net/wireless/realtek/rtlwifi/wifi.h | 1 - 2 files changed, 4 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c index 2a7ad5ffe997..cd5dc6dcb19f 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c @@ -846,9 +846,6 @@ static bool _rtl8723be_init_mac(struct ieee80211_hw *hw) return false; } - if (rtlpriv->cfg->ops->get_btc_status()) - rtlpriv->btcoexist.btc_ops->btc_power_on_setting(rtlpriv); - bytetmp = rtl_read_byte(rtlpriv, REG_MULTI_FUNC_CTRL); rtl_write_byte(rtlpriv, REG_MULTI_FUNC_CTRL, bytetmp | BIT(3)); diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h index fb1ebb01133f..70723e67b7d7 100644 --- a/drivers/net/wireless/realtek/rtlwifi/wifi.h +++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h @@ -2547,7 +2547,6 @@ struct bt_coexist_info { struct rtl_btc_ops { void (*btc_init_variables) (struct rtl_priv *rtlpriv); void (*btc_init_hal_vars) (struct rtl_priv *rtlpriv); - void (*btc_power_on_setting)(struct rtl_priv *rtlpriv); void (*btc_init_hw_config) (struct rtl_priv *rtlpriv); void (*btc_ips_notify) (struct rtl_priv *rtlpriv, u8 type); void (*btc_lps_notify)(struct rtl_priv *rtlpriv, u8 type); From 0b0f934e92a8eaed2e6c48a50eae6f84661f74f3 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 8 Jun 2017 10:55:26 +0300 Subject: [PATCH 03/76] iwlwifi: dvm: prevent an out of bounds access iwlagn_check_ratid_empty takes the tid as a parameter, but it doesn't check that it is not IWL_TID_NON_QOS. Since IWL_TID_NON_QOS = 8 and iwl_priv::tid_data is an array with 8 entries, accessing iwl_priv::tid_data[IWL_TID_NON_QOS] is a bad idea. This happened in iwlagn_rx_reply_tx. Since iwlagn_check_ratid_empty is relevant only to check whether we can open A-MPDU, this flow is irrelevant if tid is IWL_TID_NON_QOS. Call iwlagn_check_ratid_empty only inside the if (tid != IWL_TID_NON_QOS) a few lines earlier in the function. Cc: Reported-by: Seraphime Kirkovski Tested-by: Seraphime Kirkovski Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/dvm/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/tx.c b/drivers/net/wireless/intel/iwlwifi/dvm/tx.c index adaa2f0097cc..fb40ddfced99 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/tx.c @@ -1189,11 +1189,11 @@ void iwlagn_rx_reply_tx(struct iwl_priv *priv, struct iwl_rx_cmd_buffer *rxb) next_reclaimed; IWL_DEBUG_TX_REPLY(priv, "Next reclaimed packet:%d\n", next_reclaimed); + iwlagn_check_ratid_empty(priv, sta_id, tid); } iwl_trans_reclaim(priv->trans, txq_id, ssn, &skbs); - iwlagn_check_ratid_empty(priv, sta_id, tid); freed = 0; /* process frames */ From f6eac740a9b6f3737a969bad82931633519a1cc5 Mon Sep 17 00:00:00 2001 From: Mordechai Goodstein Date: Sun, 11 Jun 2017 18:00:36 +0300 Subject: [PATCH 04/76] iwlwifi: pcie: fix unused txq NULL pointer dereference Before TVQM, all TX queues were allocated straight at init. With TVQM, queues are allocated on demand and hence we need to check if a queue exists before dereferencing it. Fixes: 66128fa08806 ("iwlwifi: move to TVQM mode") Signed-off-by: Mordechai Goodstein Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c index de50418adae5..034bdb4a0b06 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c @@ -298,6 +298,9 @@ void iwl_pcie_txq_check_wrptrs(struct iwl_trans *trans) for (i = 0; i < trans->cfg->base_params->num_of_queues; i++) { struct iwl_txq *txq = trans_pcie->txq[i]; + if (!test_bit(i, trans_pcie->queue_used)) + continue; + spin_lock_bh(&txq->lock); if (txq->need_update) { iwl_pcie_txq_inc_wr_ptr(trans, txq); From 61dd8a8a6a0c3cbfb6b02ab652c4f4efb93f3d79 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 12 Jun 2017 15:10:09 +0300 Subject: [PATCH 05/76] iwlwifi: mvm: fix a NULL pointer dereference of error in recovery Sometimes, we can have an firmware crash while trying to recover from a previous firmware problem. When that happens, lots of things can go wrong. For example the stations don't get added properly to mvm->fw_id_to_mac_id. Mac80211 tries to stop A-MPDU upon reconfig but in case of a firmware crash we will bail out fairly early and in the end, we won't delete the A-MPDU Rx timeout. When that timer expired after a double firmware crash, we end up dereferencing mvm->fw_id_to_mac_id[sta_id] which is NULL. Fixes: 10b2b2019d81 ("iwlwifi: mvm: add infrastructure for tracking BA session in driver") Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index 4df5f13fcdae..4a6df45b73df 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -277,6 +277,18 @@ static void iwl_mvm_rx_agg_session_expired(unsigned long data) /* Timer expired */ sta = rcu_dereference(ba_data->mvm->fw_id_to_mac_id[ba_data->sta_id]); + + /* + * sta should be valid unless the following happens: + * The firmware asserts which triggers a reconfig flow, but + * the reconfig fails before we set the pointer to sta into + * the fw_id_to_mac_id pointer table. Mac80211 can't stop + * A-MDPU and hence the timer continues to run. Then, the + * timer expires and sta is NULL. + */ + if (!sta) + goto unlock; + mvm_sta = iwl_mvm_sta_from_mac80211(sta); ieee80211_stop_rx_ba_session_offl(mvm_sta->vif, sta->addr, ba_data->tid); From 2388bd7b133504fa0991f483db66fad3a0de8694 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Jun 2017 13:44:51 +0300 Subject: [PATCH 06/76] iwlwifi: missing error code in iwl_trans_pcie_alloc() We don't set the error code here so we end up returning ERR_PTR(0) which is NULL. The caller doesn't expect that so it results in a NULL dereference. Fixes: 2e5d4a8f61dc ("iwlwifi: pcie: Add new configuration to enable MSIX") Signed-off-by: Dan Carpenter Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index 92b3a55d0fbc..f95eec52508e 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -3150,7 +3150,8 @@ struct iwl_trans *iwl_trans_pcie_alloc(struct pci_dev *pdev, init_waitqueue_head(&trans_pcie->d0i3_waitq); if (trans_pcie->msix_enabled) { - if (iwl_pcie_init_msix_handler(pdev, trans_pcie)) + ret = iwl_pcie_init_msix_handler(pdev, trans_pcie); + if (ret) goto out_no_pci; } else { ret = iwl_pcie_alloc_ict(trans); From 5462bcd8c9dc7e7ff2dd54c3f1bb5a9a729f7a73 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 14 Jun 2017 12:21:05 +0300 Subject: [PATCH 07/76] iwlwifi: fix tracing when tx only is enabled iwl_trace_data is somewhat confusing. It returns a bool that tells if the payload of the skb should be added to the tx_data event. If it returns false, then the payload of the skb is added to the tx event. The purpose is to be able to start tracing with -e iwlwifi and record non-data packets only which saves bandwidth. Since EAPOLs are important, seldom and not real data packet (despite being WiFi data packets), they are included in tx event and thus iwl_trace_data returns false on those. This last part was buggy, and because of that, all the data packets were included in the tx event. Fix that. Fixes: 0c4cb7314d15 ("iwlwifi: tracing: decouple from mac80211") Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h index 545d14b0bc92..f5c1127253cb 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h @@ -55,8 +55,8 @@ static inline bool iwl_trace_data(struct sk_buff *skb) /* also account for the RFC 1042 header, of course */ offs += 6; - return skb->len > offs + 2 && - *(__be16 *)(skb->data + offs) == cpu_to_be16(ETH_P_PAE); + return skb->len <= offs + 2 || + *(__be16 *)(skb->data + offs) != cpu_to_be16(ETH_P_PAE); } static inline size_t iwl_rx_trace_len(const struct iwl_trans *trans, From 7b758a111819006ba64dd23aa016d42a20ba8557 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Tue, 20 Jun 2017 13:40:03 +0300 Subject: [PATCH 08/76] iwlwifi: mvm: handle IBSS probe_queue in a few missing places When IBSS was implemented for DQA, we missid a few places where it should be handled in the same way as AP. Fixes: ee48b72211f8 ("iwlwifi: mvm: support ibss in dqa mode") Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index 4a6df45b73df..ab66b4394dfc 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -2027,7 +2027,8 @@ int iwl_mvm_send_add_bcast_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif) IWL_MAX_TID_COUNT, wdg_timeout); - if (vif->type == NL80211_IFTYPE_AP) + if (vif->type == NL80211_IFTYPE_AP || + vif->type == NL80211_IFTYPE_ADHOC) mvm->probe_queue = queue; else if (vif->type == NL80211_IFTYPE_P2P_DEVICE) mvm->p2p_dev_queue = queue; From bf8b286f86fcc66d138fd992acfa37839340218d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 30 Jun 2017 10:48:28 +0200 Subject: [PATCH 09/76] iwlwifi: mvm: defer setting IWL_MVM_STATUS_IN_HW_RESTART MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A hardware/firmware error may happen at any point in time. In particular, it might happen while mac80211 is in the middle of a flow. We observed the following situation: * mac80211 is in authentication flow, in ieee80211_prep_connection() * iwlwifi firmware crashes, but no error can be reported at this precise point (mostly because the driver method is void, but even if it wasn't we'd just shift to a race condition) * mac80211 continues the flow, trying to add the AP station * iwlwifi has already set its internal restart flag, and so thinks that adding the station is part of the restart and already set up, so it uses the information that's supposed to already be in the struct This can happen with any flow in mac80211 and with any information we try to preserve across hardware restarts. To fix this, only set a new HW_RESTART_REQUESTED flag and translate that to IN_HW_RESTART once mac80211 actually starts the restart by calling our start() method. As a consequence, any mac80211 flow in progress at the time of the restart will properly finish (certainly with errors), before the restart is attempted. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=195299. Reported-by: djagoo Reported-by: Łukasz Siudut Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 8 +++++++- drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 2 ++ drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 6 +++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index bcde1ba0f1c8..c7b1e58e3384 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -1084,7 +1084,13 @@ int __iwl_mvm_mac_start(struct iwl_mvm *mvm) lockdep_assert_held(&mvm->mutex); - if (test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status)) { + if (test_bit(IWL_MVM_STATUS_HW_RESTART_REQUESTED, &mvm->status)) { + /* + * Now convert the HW_RESTART_REQUESTED flag to IN_HW_RESTART + * so later code will - from now on - see that we're doing it. + */ + set_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status); + clear_bit(IWL_MVM_STATUS_HW_RESTART_REQUESTED, &mvm->status); /* Clean up some internal and mac80211 state on restart */ iwl_mvm_restart_cleanup(mvm); } else { diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index eaacfaf37206..ddd8719f27b8 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -1090,6 +1090,7 @@ struct iwl_mvm { * @IWL_MVM_STATUS_HW_RFKILL: HW RF-kill is asserted * @IWL_MVM_STATUS_HW_CTKILL: CT-kill is active * @IWL_MVM_STATUS_ROC_RUNNING: remain-on-channel is running + * @IWL_MVM_STATUS_HW_RESTART_REQUESTED: HW restart was requested * @IWL_MVM_STATUS_IN_HW_RESTART: HW restart is active * @IWL_MVM_STATUS_IN_D0I3: NIC is in D0i3 * @IWL_MVM_STATUS_ROC_AUX_RUNNING: AUX remain-on-channel is running @@ -1101,6 +1102,7 @@ enum iwl_mvm_status { IWL_MVM_STATUS_HW_RFKILL, IWL_MVM_STATUS_HW_CTKILL, IWL_MVM_STATUS_ROC_RUNNING, + IWL_MVM_STATUS_HW_RESTART_REQUESTED, IWL_MVM_STATUS_IN_HW_RESTART, IWL_MVM_STATUS_IN_D0I3, IWL_MVM_STATUS_ROC_AUX_RUNNING, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index 4d1188b8736a..9c175d5e9d67 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -1235,9 +1235,8 @@ void iwl_mvm_nic_restart(struct iwl_mvm *mvm, bool fw_error) */ if (!mvm->fw_restart && fw_error) { iwl_mvm_fw_dbg_collect_desc(mvm, &iwl_mvm_dump_desc_assert, - NULL); - } else if (test_and_set_bit(IWL_MVM_STATUS_IN_HW_RESTART, - &mvm->status)) { + NULL); + } else if (test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status)) { struct iwl_mvm_reprobe *reprobe; IWL_ERR(mvm, @@ -1268,6 +1267,7 @@ void iwl_mvm_nic_restart(struct iwl_mvm *mvm, bool fw_error) if (fw_error && mvm->fw_restart > 0) mvm->fw_restart--; + set_bit(IWL_MVM_STATUS_HW_RESTART_REQUESTED, &mvm->status); ieee80211_restart_hw(mvm->hw); } } From 6c7fce6fa86a110d6455662d823c4e09f8f7be4a Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Sat, 22 Jul 2017 20:45:55 +0800 Subject: [PATCH 10/76] net: ethernet: mediatek: avoid potential invalid memory access Potential dangerous invalid memory might be accessed if invalid mac value reflected from the forward port field in rxd4 caused by possible potential hardware defects. So added a simple sanity checker to avoid the kind of situation happening. Signed-off-by: Sean Wang Acked-by: John Crispin Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index b3d0c2e6347a..3e3232fd7c05 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -947,6 +947,10 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, RX_DMA_FPORT_MASK; mac--; + if (unlikely(mac < 0 || mac >= MTK_MAC_COUNT || + !eth->netdev[mac])) + goto release_desc; + netdev = eth->netdev[mac]; if (unlikely(test_bit(MTK_RESETTING, ð->state))) From 5edfbd3c0697e643f179dba819a7c09375b42543 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 20 Jul 2017 02:41:34 -0700 Subject: [PATCH 11/76] tun/tap: Add the missed return value check of register_netdevice_notifier There is some codes of tun/tap module which did not check the return value of register_netdevice_notifier. Add the check now. Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- drivers/net/tun.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3d4c24572ecd..32ad87345f57 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2598,8 +2598,16 @@ static int __init tun_init(void) goto err_misc; } - register_netdevice_notifier(&tun_notifier_block); + ret = register_netdevice_notifier(&tun_notifier_block); + if (ret) { + pr_err("Can't register netdevice notifier\n"); + goto err_notifier; + } + return 0; + +err_notifier: + misc_deregister(&tun_miscdev); err_misc: rtnl_link_unregister(&tun_link_ops); err_linkops: From 70dba204a396c1c094749c4ef9bd27d5e8176a08 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 20 Jul 2017 11:06:31 +0100 Subject: [PATCH 12/76] net: ethernet: mediatek: Explicitly include linux/interrupt.h The mediatek ethernet driver uses interrupts but does not explicitly include linux/interrupt.h, relying on implicit includes. Fix this so we don't get build breaks as happened for ARM in next-20170720. Signed-off-by: Mark Brown Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 3e3232fd7c05..e588a0cdb074 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "mtk_eth_soc.h" From 545722cb0fc993226a01844fb27cf832459eb1c0 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 21 Jul 2017 14:36:57 +0100 Subject: [PATCH 13/76] selftests/bpf: subtraction bounds test There is a bug in the verifier's handling of BPF_SUB: [a,b] - [c,d] yields was [a-c, b-d] rather than the correct [a-d, b-c]. So here is a test which, with the bogus handling, will produce ranges of [0,0] and thus allowed accesses; whereas the correct handling will give a range of [-255, 255] (and hence the right-shift will give a range of [0, 255]) and the accesses will be rejected. Signed-off-by: Edward Cree Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- tools/testing/selftests/bpf/test_verifier.c | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index af7d173910f4..addea82f76c9 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -5980,6 +5980,34 @@ static struct bpf_test tests[] = { .result = REJECT, .result_unpriv = REJECT, }, + { + "subtraction bounds (map value)", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 0xff, 7), + BPF_LDX_MEM(BPF_B, BPF_REG_3, BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JGT, BPF_REG_3, 0xff, 5), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_3), + BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 56), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, + .errstr_unpriv = "R0 pointer arithmetic prohibited", + .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", + .result = REJECT, + .result_unpriv = REJECT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) From 9305706c2e808ae59f1eb201867f82f1ddf6d7a6 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 21 Jul 2017 14:37:34 +0100 Subject: [PATCH 14/76] bpf/verifier: fix min/max handling in BPF_SUB We have to subtract the src max from the dst min, and vice-versa, since (e.g.) the smallest result comes from the largest subtrahend. Fixes: 484611357c19 ("bpf: allow access into map value arrays") Signed-off-by: Edward Cree Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index af9e84a4944e..664d93972373 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1865,10 +1865,12 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, * do our normal operations to the register, we need to set the values * to the min/max since they are undefined. */ - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (max_val == BPF_REGISTER_MAX_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (opcode != BPF_SUB) { + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + } switch (opcode) { case BPF_ADD: @@ -1879,10 +1881,17 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg->min_align = min(src_align, dst_align); break; case BPF_SUB: + /* If one of our values was at the end of our ranges, then the + * _opposite_ value in the dst_reg goes to the end of our range. + */ + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value -= min_val; + dst_reg->min_value -= max_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value -= max_val; + dst_reg->max_value -= min_val; dst_reg->min_align = min(src_align, dst_align); break; case BPF_MUL: From e859afe1ee0c5ae981c55387ccd45eba258a7842 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 21 Jul 2017 16:51:31 +0200 Subject: [PATCH 15/76] lib: test_rhashtable: fix for large entry counts During concurrent access testing, threadfunc() concatenated thread ID and object index to create a unique key like so: | tdata->objs[i].value = (tdata->id << 16) | i; This breaks if a user passes an entries parameter of 64k or higher, since 'i' might use more than 16 bits then. Effectively, this will lead to duplicate keys in the table. Fix the problem by introducing a struct holding object and thread ID and using that as key instead of a single integer type field. Fixes: f4a3e90ba5739 ("rhashtable-test: extend to test concurrency") Reported by: Manuel Messner Signed-off-by: Phil Sutter Acked-by: Herbert Xu Signed-off-by: David S. Miller --- lib/test_rhashtable.c | 53 ++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 64e899b63337..16949d219291 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -56,8 +56,13 @@ static bool enomem_retry = false; module_param(enomem_retry, bool, 0); MODULE_PARM_DESC(enomem_retry, "Retry insert even if -ENOMEM was returned (default: off)"); +struct test_obj_val { + int id; + int tid; +}; + struct test_obj { - int value; + struct test_obj_val value; struct rhash_head node; }; @@ -72,7 +77,7 @@ static struct test_obj array[MAX_ENTRIES]; static struct rhashtable_params test_rht_params = { .head_offset = offsetof(struct test_obj, node), .key_offset = offsetof(struct test_obj, value), - .key_len = sizeof(int), + .key_len = sizeof(struct test_obj_val), .hashfn = jhash, .nulls_base = (3U << RHT_BASE_SHIFT), }; @@ -109,24 +114,26 @@ static int __init test_rht_lookup(struct rhashtable *ht) for (i = 0; i < entries * 2; i++) { struct test_obj *obj; bool expected = !(i % 2); - u32 key = i; + struct test_obj_val key = { + .id = i, + }; - if (array[i / 2].value == TEST_INSERT_FAIL) + if (array[i / 2].value.id == TEST_INSERT_FAIL) expected = false; obj = rhashtable_lookup_fast(ht, &key, test_rht_params); if (expected && !obj) { - pr_warn("Test failed: Could not find key %u\n", key); + pr_warn("Test failed: Could not find key %u\n", key.id); return -ENOENT; } else if (!expected && obj) { pr_warn("Test failed: Unexpected entry found for key %u\n", - key); + key.id); return -EEXIST; } else if (expected && obj) { - if (obj->value != i) { + if (obj->value.id != i) { pr_warn("Test failed: Lookup value mismatch %u!=%u\n", - obj->value, i); + obj->value.id, i); return -EINVAL; } } @@ -195,7 +202,7 @@ static s64 __init test_rhashtable(struct rhashtable *ht) for (i = 0; i < entries; i++) { struct test_obj *obj = &array[i]; - obj->value = i * 2; + obj->value.id = i * 2; err = insert_retry(ht, &obj->node, test_rht_params); if (err > 0) insert_retries += err; @@ -218,7 +225,7 @@ static s64 __init test_rhashtable(struct rhashtable *ht) for (i = 0; i < entries; i++) { u32 key = i * 2; - if (array[i].value != TEST_INSERT_FAIL) { + if (array[i].value.id != TEST_INSERT_FAIL) { obj = rhashtable_lookup_fast(ht, &key, test_rht_params); BUG_ON(!obj); @@ -242,18 +249,21 @@ static int thread_lookup_test(struct thread_data *tdata) for (i = 0; i < entries; i++) { struct test_obj *obj; - int key = (tdata->id << 16) | i; + struct test_obj_val key = { + .id = i, + .tid = tdata->id, + }; obj = rhashtable_lookup_fast(&ht, &key, test_rht_params); - if (obj && (tdata->objs[i].value == TEST_INSERT_FAIL)) { - pr_err(" found unexpected object %d\n", key); + if (obj && (tdata->objs[i].value.id == TEST_INSERT_FAIL)) { + pr_err(" found unexpected object %d-%d\n", key.tid, key.id); err++; - } else if (!obj && (tdata->objs[i].value != TEST_INSERT_FAIL)) { - pr_err(" object %d not found!\n", key); + } else if (!obj && (tdata->objs[i].value.id != TEST_INSERT_FAIL)) { + pr_err(" object %d-%d not found!\n", key.tid, key.id); err++; - } else if (obj && (obj->value != key)) { - pr_err(" wrong object returned (got %d, expected %d)\n", - obj->value, key); + } else if (obj && memcmp(&obj->value, &key, sizeof(key))) { + pr_err(" wrong object returned (got %d-%d, expected %d-%d)\n", + obj->value.tid, obj->value.id, key.tid, key.id); err++; } @@ -272,7 +282,8 @@ static int threadfunc(void *data) pr_err(" thread[%d]: down_interruptible failed\n", tdata->id); for (i = 0; i < entries; i++) { - tdata->objs[i].value = (tdata->id << 16) | i; + tdata->objs[i].value.id = i; + tdata->objs[i].value.tid = tdata->id; err = insert_retry(&ht, &tdata->objs[i].node, test_rht_params); if (err > 0) { insert_retries += err; @@ -295,7 +306,7 @@ static int threadfunc(void *data) for (step = 10; step > 0; step--) { for (i = 0; i < entries; i += step) { - if (tdata->objs[i].value == TEST_INSERT_FAIL) + if (tdata->objs[i].value.id == TEST_INSERT_FAIL) continue; err = rhashtable_remove_fast(&ht, &tdata->objs[i].node, test_rht_params); @@ -304,7 +315,7 @@ static int threadfunc(void *data) tdata->id); goto out; } - tdata->objs[i].value = TEST_INSERT_FAIL; + tdata->objs[i].value.id = TEST_INSERT_FAIL; cond_resched(); } From 1819ae3dfebe7bb2ade39dbf04518448c922dc51 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 21 Jul 2017 18:04:28 +0200 Subject: [PATCH 16/76] mlxsw: spectrum_router: Don't offload routes next in list Each FIB node holds a linked list of routes sharing the same prefix and length. In the case of IPv4 it's ordered according to table ID, metric and TOS and only the first route in the list is actually programmed to the device. In case a gatewayed route is added somewhere in the list, then after its nexthop group will be refreshed and become valid (due to the resolution of its gateway), it'll mistakenly overwrite the existing entry. Example: 192.168.200.0/24 dev enp3s0np3 scope link metric 1000 offload 192.168.200.0/24 via 192.168.100.1 dev enp3s0np3 metric 1000 offload Both routes are marked as offloaded despite the fact only the first one should actually be present in the device's table. When refreshing the nexthop group, don't write the route to the device's table unless it's the first in its node. Fixes: 9aecce1c7d97 ("mlxsw: spectrum_router: Correctly handle identical routes") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 383fef5a8e24..4b2e0fd7d51e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -1512,6 +1512,10 @@ mlxsw_sp_nexthop_group_mac_update(struct mlxsw_sp *mlxsw_sp, static int mlxsw_sp_fib_entry_update(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib_entry *fib_entry); +static bool +mlxsw_sp_fib_node_entry_is_first(const struct mlxsw_sp_fib_node *fib_node, + const struct mlxsw_sp_fib_entry *fib_entry); + static int mlxsw_sp_nexthop_fib_entries_update(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_nexthop_group *nh_grp) @@ -1520,6 +1524,9 @@ mlxsw_sp_nexthop_fib_entries_update(struct mlxsw_sp *mlxsw_sp, int err; list_for_each_entry(fib_entry, &nh_grp->fib_list, nexthop_group_node) { + if (!mlxsw_sp_fib_node_entry_is_first(fib_entry->fib_node, + fib_entry)) + continue; err = mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry); if (err) return err; From 864d9664245565a6b9df86a68c7664a25a4fcd58 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 21 Jul 2017 18:49:45 +0200 Subject: [PATCH 17/76] net/socket: fix type in assignment and trim long line The commit ffb07550c76f ("copy_msghdr_from_user(): get rid of field-by-field copyin") introduce a new sparse warning: net/socket.c:1919:27: warning: incorrect type in assignment (different address spaces) net/socket.c:1919:27: expected void *msg_control net/socket.c:1919:27: got void [noderef] *[addressable] msg_control and a line above 80 chars, let's fix them Fixes: ffb07550c76f ("copy_msghdr_from_user(): get rid of field-by-field copyin") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/socket.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/socket.c b/net/socket.c index bf2122691fba..ad22df1ffbd1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1916,7 +1916,7 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; - kmsg->msg_control = msg.msg_control; + kmsg->msg_control = (void __force *)msg.msg_control; kmsg->msg_controllen = msg.msg_controllen; kmsg->msg_flags = msg.msg_flags; @@ -1935,7 +1935,8 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, if (msg.msg_name && kmsg->msg_namelen) { if (!save_addr) { - err = move_addr_to_kernel(msg.msg_name, kmsg->msg_namelen, + err = move_addr_to_kernel(msg.msg_name, + kmsg->msg_namelen, kmsg->msg_name); if (err < 0) return err; From f4458b92d2190757bef81a5f282d2644ccbaec23 Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Fri, 21 Jul 2017 16:35:09 -0500 Subject: [PATCH 18/76] net: stmmac: Adjust dump offset of DMA registers for ethtool The commit fbf68229ffe7 ("net: stmmac: unify registers dumps methods") in the Linux kernel modified the register dump to store the DMA registers at the DMA register offset (0x1000) but ethtool (stmmac.c) looks for the DMA registers after the MAC registers which is offset 55. This patch copies the DMA registers from the higher offset to the offset where ethtool expects them. Signed-off-by: Thor Thayer Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h | 3 +++ drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 5 +++++ 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c index 22cf6353ba04..7ecf549c7f1c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c @@ -205,7 +205,7 @@ static void dwmac1000_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space) { int i; - for (i = 0; i < 23; i++) + for (i = 0; i < NUM_DWMAC1000_DMA_REGS; i++) if ((i < 12) || (i > 17)) reg_space[DMA_BUS_MODE / 4 + i] = readl(ioaddr + DMA_BUS_MODE + i * 4); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c index eef2f222ce9a..6502b9aa3bf5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c @@ -70,7 +70,7 @@ static void dwmac100_dump_dma_regs(void __iomem *ioaddr, u32 *reg_space) { int i; - for (i = 0; i < 9; i++) + for (i = 0; i < NUM_DWMAC100_DMA_REGS; i++) reg_space[DMA_BUS_MODE / 4 + i] = readl(ioaddr + DMA_BUS_MODE + i * 4); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h index 9091df86723a..adc54006f884 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac_dma.h @@ -136,6 +136,9 @@ #define DMA_STATUS_TI 0x00000001 /* Transmit Interrupt */ #define DMA_CONTROL_FTF 0x00100000 /* Flush transmit FIFO */ +#define NUM_DWMAC100_DMA_REGS 9 +#define NUM_DWMAC1000_DMA_REGS 23 + void dwmac_enable_dma_transmission(void __iomem *ioaddr); void dwmac_enable_dma_irq(void __iomem *ioaddr, u32 chan); void dwmac_disable_dma_irq(void __iomem *ioaddr, u32 chan); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index babb39c646ff..af30b4857c3b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -33,6 +33,8 @@ #define MAC100_ETHTOOL_NAME "st_mac100" #define GMAC_ETHTOOL_NAME "st_gmac" +#define ETHTOOL_DMA_OFFSET 55 + struct stmmac_stats { char stat_string[ETH_GSTRING_LEN]; int sizeof_stat; @@ -442,6 +444,9 @@ static void stmmac_ethtool_gregs(struct net_device *dev, priv->hw->mac->dump_regs(priv->hw, reg_space); priv->hw->dma->dump_regs(priv->ioaddr, reg_space); + /* Copy DMA registers to where ethtool expects them */ + memcpy(®_space[ETHTOOL_DMA_OFFSET], ®_space[DMA_BUS_MODE / 4], + NUM_DWMAC1000_DMA_REGS * 4); } static void From 9476d393667968b4a02afbe9d35a3558482b943e Mon Sep 17 00:00:00 2001 From: Thomas Jarosch Date: Sat, 22 Jul 2017 17:14:34 +0200 Subject: [PATCH 19/76] mcs7780: Fix initialization when CONFIG_VMAP_STACK is enabled DMA transfers are not allowed to buffers that are on the stack. Therefore allocate a buffer to store the result of usb_control_message(). Fixes these bugreports: https://bugzilla.kernel.org/show_bug.cgi?id=195217 https://bugzilla.redhat.com/show_bug.cgi?id=1421387 https://bugzilla.redhat.com/show_bug.cgi?id=1427398 Shortened kernel backtrace from 4.11.9-200.fc25.x86_64: kernel: ------------[ cut here ]------------ kernel: WARNING: CPU: 3 PID: 2957 at drivers/usb/core/hcd.c:1587 kernel: transfer buffer not dma capable kernel: Call Trace: kernel: dump_stack+0x63/0x86 kernel: __warn+0xcb/0xf0 kernel: warn_slowpath_fmt+0x5a/0x80 kernel: usb_hcd_map_urb_for_dma+0x37f/0x570 kernel: ? try_to_del_timer_sync+0x53/0x80 kernel: usb_hcd_submit_urb+0x34e/0xb90 kernel: ? schedule_timeout+0x17e/0x300 kernel: ? del_timer_sync+0x50/0x50 kernel: ? __slab_free+0xa9/0x300 kernel: usb_submit_urb+0x2f4/0x560 kernel: ? urb_destroy+0x24/0x30 kernel: usb_start_wait_urb+0x6e/0x170 kernel: usb_control_msg+0xdc/0x120 kernel: mcs_get_reg+0x36/0x40 [mcs7780] kernel: mcs_net_open+0xb5/0x5c0 [mcs7780] ... Regression goes back to 4.9, so it's a good candidate for -stable. Though it's the decision of the maintainer. Thanks to Dan Williams for adding the "transfer buffer not dma capable" warning in the first place. It instantly pointed me in the right direction. Patch has been tested with transferring data from a Polar watch. Signed-off-by: Thomas Jarosch Signed-off-by: David S. Miller --- drivers/net/irda/mcs7780.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/net/irda/mcs7780.c b/drivers/net/irda/mcs7780.c index 6f6ed75b63c9..765de3bedb88 100644 --- a/drivers/net/irda/mcs7780.c +++ b/drivers/net/irda/mcs7780.c @@ -141,9 +141,19 @@ static int mcs_set_reg(struct mcs_cb *mcs, __u16 reg, __u16 val) static int mcs_get_reg(struct mcs_cb *mcs, __u16 reg, __u16 * val) { struct usb_device *dev = mcs->usbdev; - int ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), MCS_RDREQ, - MCS_RD_RTYPE, 0, reg, val, 2, - msecs_to_jiffies(MCS_CTRL_TIMEOUT)); + void *dmabuf; + int ret; + + dmabuf = kmalloc(sizeof(__u16), GFP_KERNEL); + if (!dmabuf) + return -ENOMEM; + + ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), MCS_RDREQ, + MCS_RD_RTYPE, 0, reg, dmabuf, 2, + msecs_to_jiffies(MCS_CTRL_TIMEOUT)); + + memcpy(val, dmabuf, sizeof(__u16)); + kfree(dmabuf); return ret; } From 69ec932e364b1ba9c3a2085fe96b76c8a3f71e7c Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 23 Jul 2017 17:52:23 +0800 Subject: [PATCH 20/76] openvswitch: fix potential out of bound access in parse_ct Before the 'type' is validated, we shouldn't use it to fetch the ovs_ct_attr_lens's minlen and maxlen, else, out of bound access may happen. Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") Signed-off-by: Liping Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e3c4c6c3fef7..03859e386b47 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1310,8 +1310,8 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, nla_for_each_nested(a, attr, rem) { int type = nla_type(a); - int maxlen = ovs_ct_attr_lens[type].maxlen; - int minlen = ovs_ct_attr_lens[type].minlen; + int maxlen; + int minlen; if (type > OVS_CT_ATTR_MAX) { OVS_NLERR(log, @@ -1319,6 +1319,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, type, OVS_CT_ATTR_MAX); return -EINVAL; } + + maxlen = ovs_ct_attr_lens[type].maxlen; + minlen = ovs_ct_attr_lens[type].minlen; if (nla_len(a) < minlen || nla_len(a) > maxlen) { OVS_NLERR(log, "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)", From c7472ec4a00c298d2b55c0aa945b8a4c10f6f898 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 24 Jul 2017 16:59:01 +1000 Subject: [PATCH 21/76] ftgmac100: Increase reset timeout We had reports of 50us not being sufficient to reset the MAC, thus hitting the "Hardware reset failed" error bringing the interface up on some AST2400 based machines. This bumps the timeout to 200us. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: David S. Miller --- drivers/net/ethernet/faraday/ftgmac100.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 95bf5e89cfd1..bfda0b2e2b82 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -125,7 +125,7 @@ static int ftgmac100_reset_mac(struct ftgmac100 *priv, u32 maccr) iowrite32(maccr, priv->base + FTGMAC100_OFFSET_MACCR); iowrite32(maccr | FTGMAC100_MACCR_SW_RST, priv->base + FTGMAC100_OFFSET_MACCR); - for (i = 0; i < 50; i++) { + for (i = 0; i < 200; i++) { unsigned int maccr; maccr = ioread32(priv->base + FTGMAC100_OFFSET_MACCR); From d57b9db1ae0cde366c9cc6b038cb1e8ff05557a1 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 24 Jul 2017 16:59:07 +1000 Subject: [PATCH 22/76] ftgmac100: Make the MDIO bus a child of the ethernet device Populate mii_bus->parent with our own platform device before registering, which makes it easier to locate the MDIO bus in sysfs when trying to diagnose problems. Signed-off-by: Benjamin Herrenschmidt Acked-by: Joel Stanley Signed-off-by: David S. Miller --- drivers/net/ethernet/faraday/ftgmac100.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index bfda0b2e2b82..cdb979440ca5 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1682,6 +1682,7 @@ static int ftgmac100_setup_mdio(struct net_device *netdev) priv->mii_bus->name = "ftgmac100_mdio"; snprintf(priv->mii_bus->id, MII_BUS_ID_SIZE, "%s-%d", pdev->name, pdev->id); + priv->mii_bus->parent = priv->dev; priv->mii_bus->priv = priv->netdev; priv->mii_bus->read = ftgmac100_mdiobus_read; priv->mii_bus->write = ftgmac100_mdiobus_write; From c800aaf8d869f2b9b47b10c5c312fe19f0a94042 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 24 Jul 2017 10:07:32 -0700 Subject: [PATCH 23/76] packet: fix use-after-free in prb_retire_rx_blk_timer_expired() There are multiple reports showing we have a use-after-free in the timer prb_retire_rx_blk_timer_expired(), where we use struct tpacket_kbdq_core::pkbdq, a pg_vec, after it gets freed by free_pg_vec(). The interesting part is it is not freed via packet_release() but via packet_setsockopt(), which means we are not closing the socket. Looking into the big and fat function packet_set_ring(), this could happen if we satisfy the following conditions: 1. closing == 0, not on packet_release() path 2. req->tp_block_nr == 0, we don't allocate a new pg_vec 3. rx_ring->pg_vec is already set as V3, which means we already called packet_set_ring() wtih req->tp_block_nr > 0 previously 4. req->tp_frame_nr == 0, pass sanity check 5. po->mapped == 0, never called mmap() In this scenario we are clearing the old rx_ring->pg_vec, so we need to free this pg_vec, but we don't stop the timer on this path because of closing==0. The timer has to be stopped as long as we need to free pg_vec, therefore the check on closing!=0 is wrong, we should check pg_vec!=NULL instead. Thanks to liujian for testing different fixes. Reported-by: alexander.levin@verizon.com Reported-by: Dave Jones Reported-by: liujian (CE) Tested-by: liujian (CE) Cc: Ding Tianhong Cc: Willem de Bruijn Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 008bb34ee324..0615c2a950fa 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4329,7 +4329,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, register_prot_hook(sk); } spin_unlock(&po->bind_lock); - if (closing && (po->tp_version > TPACKET_V2)) { + if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); From 9f9e772da2db279c8d8c9206d271b8009c9093f4 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 24 Jul 2017 10:49:23 -0700 Subject: [PATCH 24/76] net: dsa: Initialize ds->cpu_port_mask earlier The mt7530 driver has its dsa_switch_ops::get_tag_protocol function check ds->cpu_port_mask to issue a warning in case the configured CPU port is not capable of supporting tags. After commit 14be36c2c96c ("net: dsa: Initialize all CPU and enabled ports masks in dsa_ds_parse()") we slightly re-arranged the initialization such that this was no longer working. Just make sure that ds->cpu_port_mask is set prior to the first call to get_tag_protocol, thus restoring the expected contract. In case of error, the CPU port bit is cleared. Fixes: 14be36c2c96c ("net: dsa: Initialize all CPU and enabled ports masks in dsa_ds_parse()") Reported-by: Sean Wang Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 56e46090526b..c442051d5a55 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -509,21 +509,22 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, dst->cpu_dp->netdev = ethernet_dev; } - tag_protocol = ds->ops->get_tag_protocol(ds); - dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(dst->tag_ops)) { - dev_warn(ds->dev, "No tagger for this switch\n"); - return PTR_ERR(dst->tag_ops); - } - - dst->rcv = dst->tag_ops->rcv; - /* Initialize cpu_port_mask now for drv->setup() * to have access to a correct value, just like what * net/dsa/dsa.c::dsa_switch_setup_one does. */ ds->cpu_port_mask |= BIT(index); + tag_protocol = ds->ops->get_tag_protocol(ds); + dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); + if (IS_ERR(dst->tag_ops)) { + dev_warn(ds->dev, "No tagger for this switch\n"); + ds->cpu_port_mask &= ~BIT(index); + return PTR_ERR(dst->tag_ops); + } + + dst->rcv = dst->tag_ops->rcv; + return 0; } From dce4551cb2adb1ac9a30f8ab5299d614392b3cff Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 25 Jul 2017 17:57:47 +0200 Subject: [PATCH 25/76] udp: preserve head state for IP_CMSG_PASSSEC Paul Moore reported a SELinux/IP_PASSSEC regression caused by missing skb->sp at recvmsg() time. We need to preserve the skb head state to process the IP_CMSG_PASSSEC cmsg. With this commit we avoid releasing the skb head state in the BH even if a secpath is attached to the current skb, and stores the skb status (with/without head states) in the scratch area, so that we can access it at skb deallocation time, without incurring in cache-miss penalties. This also avoids misusing the skb CB for ipv6 packets, as introduced by the commit 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing"). Clean a bit the scratch area helpers implementation, to reduce the code differences between 32 and 64 bits build. Reported-by: Paul Moore Fixes: 0a463c78d25b ("udp: avoid a cache miss on dequeue") Fixes: 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing") Signed-off-by: Paolo Abeni Tested-by: Paul Moore Signed-off-by: David S. Miller --- include/net/udp.h | 33 ++++++++++++++++++++---------- net/ipv4/udp.c | 52 +++++++++++++++++++++++------------------------ 2 files changed, 47 insertions(+), 38 deletions(-) diff --git a/include/net/udp.h b/include/net/udp.h index 972ce4baab6b..56ce2d2a612d 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -305,33 +305,44 @@ struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() */ -#if BITS_PER_LONG == 64 - -/* truesize, len and the bit needed to compute skb_csum_unnecessary will be on - * cold cache lines at recvmsg time. - * skb->len can be stored on 16 bits since the udp header has been already - * validated and pulled. - */ struct udp_dev_scratch { - u32 truesize; + /* skb->truesize and the stateless bit are embedded in a single field; + * do not use a bitfield since the compiler emits better/smaller code + * this way + */ + u32 _tsize_state; + +#if BITS_PER_LONG == 64 + /* len and the bit needed to compute skb_csum_unnecessary + * will be on cold cache lines at recvmsg time. + * skb->len can be stored on 16 bits since the udp header has been + * already validated and pulled. + */ u16 len; bool is_linear; bool csum_unnecessary; +#endif }; +static inline struct udp_dev_scratch *udp_skb_scratch(struct sk_buff *skb) +{ + return (struct udp_dev_scratch *)&skb->dev_scratch; +} + +#if BITS_PER_LONG == 64 static inline unsigned int udp_skb_len(struct sk_buff *skb) { - return ((struct udp_dev_scratch *)&skb->dev_scratch)->len; + return udp_skb_scratch(skb)->len; } static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb) { - return ((struct udp_dev_scratch *)&skb->dev_scratch)->csum_unnecessary; + return udp_skb_scratch(skb)->csum_unnecessary; } static inline bool udp_skb_is_linear(struct sk_buff *skb) { - return ((struct udp_dev_scratch *)&skb->dev_scratch)->is_linear; + return udp_skb_scratch(skb)->is_linear; } #else diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b057653ceca9..d243772f6efc 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1163,34 +1163,32 @@ out: return ret; } -#if BITS_PER_LONG == 64 +#define UDP_SKB_IS_STATELESS 0x80000000 + static void udp_set_dev_scratch(struct sk_buff *skb) { - struct udp_dev_scratch *scratch; + struct udp_dev_scratch *scratch = udp_skb_scratch(skb); BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long)); - scratch = (struct udp_dev_scratch *)&skb->dev_scratch; - scratch->truesize = skb->truesize; + scratch->_tsize_state = skb->truesize; +#if BITS_PER_LONG == 64 scratch->len = skb->len; scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); -} - -static int udp_skb_truesize(struct sk_buff *skb) -{ - return ((struct udp_dev_scratch *)&skb->dev_scratch)->truesize; -} -#else -static void udp_set_dev_scratch(struct sk_buff *skb) -{ - skb->dev_scratch = skb->truesize; -} - -static int udp_skb_truesize(struct sk_buff *skb) -{ - return skb->dev_scratch; -} #endif + if (likely(!skb->_skb_refdst)) + scratch->_tsize_state |= UDP_SKB_IS_STATELESS; +} + +static int udp_skb_truesize(struct sk_buff *skb) +{ + return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS; +} + +static bool udp_skb_has_head_state(struct sk_buff *skb) +{ + return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS); +} /* fully reclaim rmem/fwd memory allocated for skb */ static void udp_rmem_release(struct sock *sk, int size, int partial, @@ -1388,10 +1386,10 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) unlock_sock_fast(sk, slow); } - /* we cleared the head states previously only if the skb lacks any IP - * options, see __udp_queue_rcv_skb(). + /* In the more common cases we cleared the head states previously, + * see __udp_queue_rcv_skb(). */ - if (unlikely(IPCB(skb)->opt.optlen > 0)) + if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); consume_stateless_skb(skb); } @@ -1784,11 +1782,11 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id_once(sk, skb); } - /* At recvmsg() time we need skb->dst to process IP options-related - * cmsg, elsewhere can we clear all pending head states while they are - * hot in the cache + /* At recvmsg() time we may access skb->dst or skb->sp depending on + * the IP options and the cmsg flags, elsewhere can we clear all + * pending head states while they are hot in the cache */ - if (likely(IPCB(skb)->opt.optlen == 0)) + if (likely(IPCB(skb)->opt.optlen == 0 && !skb->sp)) skb_release_head_state(skb); rc = __udp_enqueue_schedule_skb(sk, skb); From 2eaa38d9fcba5294182268b8d11770cf3fdc9bc9 Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Tue, 25 Jul 2017 11:08:15 +0200 Subject: [PATCH 26/76] net: phy: Remove trailing semicolon in macro definition Commit e5a03bfd873c2 ("phy: Add an mdio_device structure") introduced a spurious trailing semicolon. Remove it. Signed-off-by: Marc Gonzalez Signed-off-by: David S. Miller --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/phy.h b/include/linux/phy.h index 2a9567bb8186..0bb5b212ab42 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -830,7 +830,7 @@ static inline int phy_read_status(struct phy_device *phydev) dev_err(&_phydev->mdio.dev, format, ##args) #define phydev_dbg(_phydev, format, args...) \ - dev_dbg(&_phydev->mdio.dev, format, ##args); + dev_dbg(&_phydev->mdio.dev, format, ##args) static inline const char *phydev_name(const struct phy_device *phydev) { From 783692558a60cd69d8d86900b33846263598ca6c Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 25 Jul 2017 13:36:21 +0200 Subject: [PATCH 27/76] lib: test_rhashtable: Fix KASAN warning I forgot one spot when introducing struct test_obj_val. Fixes: e859afe1ee0c5 ("lib: test_rhashtable: fix for large entry counts") Reported by: kernel test robot Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- lib/test_rhashtable.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 16949d219291..0ffca990a833 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -223,7 +223,9 @@ static s64 __init test_rhashtable(struct rhashtable *ht) pr_info(" Deleting %d keys\n", entries); for (i = 0; i < entries; i++) { - u32 key = i * 2; + struct test_obj_val key = { + .id = i * 2, + }; if (array[i].value.id != TEST_INSERT_FAIL) { obj = rhashtable_lookup_fast(ht, &key, test_rht_params); From afce615aaabfbaad02550e75c0bec106dafa1adf Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Mon, 24 Jul 2017 23:14:28 +0200 Subject: [PATCH 28/76] ipv6: Don't increase IPSTATS_MIB_FRAGFAILS twice in ip6_fragment() RFC 2465 defines ipv6IfStatsOutFragFails as: "The number of IPv6 datagrams that have been discarded because they needed to be fragmented at this output interface but could not be." The existing implementation, instead, would increase the counter twice in case we fail to allocate room for single fragments: once for the fragment, once for the datagram. This didn't look intentional though. In one of the two affected affected failure paths, the double increase was simply a result of a new 'goto fail' statement, introduced to avoid a skb leak. The other path appears to be affected since at least 2.6.12-rc2. Reported-by: Sabrina Dubroca Fixes: 1d325d217c7f ("ipv6: ip6_fragment: fix headroom tests and skb leak") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1422d6c08377..162efba0d0cd 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -673,8 +673,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, *prevhdr = NEXTHDR_FRAGMENT; tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); if (!tmp_hdr) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; goto fail; } @@ -789,8 +787,6 @@ slow_path: frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + hroom + troom, GFP_ATOMIC); if (!frag) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; goto fail; } From 6cee9d649cd1824a4d9ce6940dd716f2bf2ef24f Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 25 Jul 2017 10:19:01 +0930 Subject: [PATCH 29/76] ftgmac100: return error in ftgmac100_alloc_rx_buf The error paths set err, but it's not returned. I wondered if we should fix all of the callers to check the returned value, but Ben explains why the code is this way: > Most call sites ignore it on purpose. There's nothing we can do if > we fail to get a buffer at interrupt time, so we point the buffer to > the scratch page so the HW doesn't DMA into lalaland and lose the > packet. > > The one call site that tests and can fail is the one used when brining > the interface up. If we fail to allocate at that point, we fail the > ifup. But as you noticed, I do have a bug not returning the error. Acked-by: Benjamin Herrenschmidt Signed-off-by: Joel Stanley Signed-off-by: David S. Miller --- drivers/net/ethernet/faraday/ftgmac100.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index cdb979440ca5..34dae51effd4 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -392,7 +392,7 @@ static int ftgmac100_alloc_rx_buf(struct ftgmac100 *priv, unsigned int entry, struct net_device *netdev = priv->netdev; struct sk_buff *skb; dma_addr_t map; - int err; + int err = 0; skb = netdev_alloc_skb_ip_align(netdev, RX_BUF_SIZE); if (unlikely(!skb)) { @@ -428,7 +428,7 @@ static int ftgmac100_alloc_rx_buf(struct ftgmac100 *priv, unsigned int entry, else rxdes->rxdes0 = 0; - return 0; + return err; } static unsigned int ftgmac100_next_rx_pointer(struct ftgmac100 *priv, From 80d887dbb673f007938c467fbfa118bba3e9f37d Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 24 Jul 2017 21:03:19 -0700 Subject: [PATCH 30/76] Revert "netvsc: optimize calculation of number of slots" The logic for computing page buffer scatter does not take into account the impact of compound pages. Therefore the optimization to compute number of slots was incorrect and could cause stack corruption a skb was sent with lots of fragments from huge pages. This reverts commit 60b86665af0dfbeebda8aae43f0ba451cd2dcfe5. Fixes: 60b86665af0d ("netvsc: optimize calculation of number of slots") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 43 +++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 63c98bbbc596..0d78727f1a14 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -315,14 +315,34 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, return slots_used; } -/* Estimate number of page buffers neede to transmit - * Need at most 2 for RNDIS header plus skb body and fragments. - */ -static unsigned int netvsc_get_slots(const struct sk_buff *skb) +static int count_skb_frag_slots(struct sk_buff *skb) { - return PFN_UP(offset_in_page(skb->data) + skb_headlen(skb)) - + skb_shinfo(skb)->nr_frags - + 2; + int i, frags = skb_shinfo(skb)->nr_frags; + int pages = 0; + + for (i = 0; i < frags; i++) { + skb_frag_t *frag = skb_shinfo(skb)->frags + i; + unsigned long size = skb_frag_size(frag); + unsigned long offset = frag->page_offset; + + /* Skip unused frames from start of page */ + offset &= ~PAGE_MASK; + pages += PFN_UP(offset + size); + } + return pages; +} + +static int netvsc_get_slots(struct sk_buff *skb) +{ + char *data = skb->data; + unsigned int offset = offset_in_page(data); + unsigned int len = skb_headlen(skb); + int slots; + int frag_slots; + + slots = DIV_ROUND_UP(offset + len, PAGE_SIZE); + frag_slots = count_skb_frag_slots(skb); + return slots + frag_slots; } static u32 net_checksum_info(struct sk_buff *skb) @@ -360,18 +380,21 @@ static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net) struct hv_page_buffer page_buf[MAX_PAGE_BUFFER_COUNT]; struct hv_page_buffer *pb = page_buf; - /* We can only transmit MAX_PAGE_BUFFER_COUNT number + /* We will atmost need two pages to describe the rndis + * header. We can only transmit MAX_PAGE_BUFFER_COUNT number * of pages in a single packet. If skb is scattered around * more pages we try linearizing it. */ - num_data_pgs = netvsc_get_slots(skb); + + num_data_pgs = netvsc_get_slots(skb) + 2; + if (unlikely(num_data_pgs > MAX_PAGE_BUFFER_COUNT)) { ++net_device_ctx->eth_stats.tx_scattered; if (skb_linearize(skb)) goto no_memory; - num_data_pgs = netvsc_get_slots(skb); + num_data_pgs = netvsc_get_slots(skb) + 2; if (num_data_pgs > MAX_PAGE_BUFFER_COUNT) { ++net_device_ctx->eth_stats.tx_too_big; goto drop; From 4813497b537c6208c90d6cbecac5072d347de900 Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Tue, 25 Jul 2017 14:35:03 +0200 Subject: [PATCH 31/76] net: ethernet: nb8800: Handle all 4 RGMII modes identically Before commit bf8f6952a233 ("Add blurb about RGMII") it was unclear whose responsibility it was to insert the required clock skew, and in hindsight, some PHY drivers got it wrong. The solution forward is to introduce a new property, explicitly requiring skew from the node to which it is attached. In the interim, this driver will handle all 4 RGMII modes identically (no skew). Fixes: 52dfc8301248 ("net: ethernet: add driver for Aurora VLSI NB8800 Ethernet controller") Signed-off-by: Marc Gonzalez Signed-off-by: David S. Miller --- drivers/net/ethernet/aurora/nb8800.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/aurora/nb8800.c b/drivers/net/ethernet/aurora/nb8800.c index 041cfb7952f8..e94159507847 100644 --- a/drivers/net/ethernet/aurora/nb8800.c +++ b/drivers/net/ethernet/aurora/nb8800.c @@ -609,7 +609,7 @@ static void nb8800_mac_config(struct net_device *dev) mac_mode |= HALF_DUPLEX; if (gigabit) { - if (priv->phy_mode == PHY_INTERFACE_MODE_RGMII) + if (phy_interface_is_rgmii(dev->phydev)) mac_mode |= RGMII_MODE; mac_mode |= GMAC_MODE; @@ -1268,11 +1268,10 @@ static int nb8800_tangox_init(struct net_device *dev) break; case PHY_INTERFACE_MODE_RGMII: - pad_mode = PAD_MODE_RGMII; - break; - + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: case PHY_INTERFACE_MODE_RGMII_TXID: - pad_mode = PAD_MODE_RGMII | PAD_MODE_GTX_CLK_DELAY; + pad_mode = PAD_MODE_RGMII; break; default: From 9688f9b020263c4a17471d09bb18e34eccc1c0a5 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 26 Jul 2017 11:33:49 +0200 Subject: [PATCH 32/76] udp: unbreak build lacking CONFIG_XFRM We must use pre-processor conditional block or suitable accessors to manipulate skb->sp elsewhere builds lacking the CONFIG_XFRM will break. Fixes: dce4551cb2ad ("udp: preserve head state for IP_CMSG_PASSSEC") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d243772f6efc..fac7cb9e3b0f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1786,7 +1786,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) * the IP options and the cmsg flags, elsewhere can we clear all * pending head states while they are hot in the cache */ - if (likely(IPCB(skb)->opt.optlen == 0 && !skb->sp)) + if (likely(IPCB(skb)->opt.optlen == 0 && !skb_sec_path(skb))) skb_release_head_state(skb); rc = __udp_enqueue_schedule_skb(sk, skb); From d94708a553022bf012fa95af10532a134eeb5a52 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 25 Jul 2017 09:44:25 -0700 Subject: [PATCH 33/76] bonding: commit link status change after propose Commit de77ecd4ef02 ("bonding: improve link-status update in mii-monitoring") moves link status commitment into bond_mii_monitor(), but it still relies on the return value of bond_miimon_inspect() as the hint. We need to return non-zero as long as we propose a link status change. Fixes: de77ecd4ef02 ("bonding: improve link-status update in mii-monitoring") Reported-by: Benjamin Gilbert Tested-by: Benjamin Gilbert Cc: Mahesh Bandewar Signed-off-by: Cong Wang Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 181839d6fbea..9bee6c1c70cc 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2050,6 +2050,7 @@ static int bond_miimon_inspect(struct bonding *bond) continue; bond_propose_link_state(slave, BOND_LINK_FAIL); + commit++; slave->delay = bond->params.downdelay; if (slave->delay) { netdev_info(bond->dev, "link status down for %sinterface %s, disabling it in %d ms\n", @@ -2088,6 +2089,7 @@ static int bond_miimon_inspect(struct bonding *bond) continue; bond_propose_link_state(slave, BOND_LINK_BACK); + commit++; slave->delay = bond->params.updelay; if (slave->delay) { From 0c3a8f8b8fabff4f3ad2dd7b95ae0e90cdd1aebb Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 25 Jul 2017 11:36:25 -0700 Subject: [PATCH 34/76] netpoll: Fix device name check in netpoll_setup() Apparently netpoll_setup() assumes that netpoll.dev_name is a pointer when checking if the device name is set: if (np->dev_name) { ... However the field is a character array, therefore the condition always yields true. Check instead whether the first byte of the array has a non-zero value. Signed-off-by: Matthias Kaehlcke Signed-off-by: David S. Miller --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 8357f164c660..912731bed7b7 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -666,7 +666,7 @@ int netpoll_setup(struct netpoll *np) int err; rtnl_lock(); - if (np->dev_name) { + if (np->dev_name[0]) { struct net *net = current->nsproxy->net_ns; ndev = __dev_get_by_name(net, np->dev_name); } From d777b2ddbecf509bc61ee4f0fe0d3b5a909d698a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jul 2017 15:16:12 -0700 Subject: [PATCH 35/76] bpf: don't zero out the info struct in bpf_obj_get_info_by_fd() The buffer passed to bpf_obj_get_info_by_fd() should be initialized to zeros. Kernel will enforce that to guarantee we can safely extend info structures in the future. Making the bpf_obj_get_info_by_fd() call in libbpf perform the zeroing is problematic, however, since some members of the info structures may need to be initialized by the callers (for instance pointers to buffers to which kernel is to dump translated and jited images). Remove the zeroing and fix up the in-tree callers before any kernel has been released with this code. As Daniel points out this seems to be the intended operation anyway, since commit 95b9afd3987f ("bpf: Test for bpf ID") is itself setting the buffer pointers before calling bpf_obj_get_info_by_fd(). Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- tools/lib/bpf/bpf.c | 1 - tools/testing/selftests/bpf/test_progs.c | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 412a7c82995a..256f571f2ab5 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -314,7 +314,6 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len) int err; bzero(&attr, sizeof(attr)); - bzero(info, *info_len); attr.info.bpf_fd = prog_fd; attr.info.info_len = *info_len; attr.info.info = ptr_to_u64(info); diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 5855cd3d3d45..1f7dd35551b9 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -340,6 +340,7 @@ static void test_bpf_obj_id(void) /* Check getting prog info */ info_len = sizeof(struct bpf_prog_info) * 2; + bzero(&prog_infos[i], info_len); prog_infos[i].jited_prog_insns = ptr_to_u64(jited_insns); prog_infos[i].jited_prog_len = sizeof(jited_insns); prog_infos[i].xlated_prog_insns = ptr_to_u64(xlated_insns); @@ -369,6 +370,7 @@ static void test_bpf_obj_id(void) /* Check getting map info */ info_len = sizeof(struct bpf_map_info) * 2; + bzero(&map_infos[i], info_len); err = bpf_obj_get_info_by_fd(map_fds[i], &map_infos[i], &info_len); if (CHECK(err || @@ -394,7 +396,7 @@ static void test_bpf_obj_id(void) nr_id_found = 0; next_id = 0; while (!bpf_prog_get_next_id(next_id, &next_id)) { - struct bpf_prog_info prog_info; + struct bpf_prog_info prog_info = {}; int prog_fd; info_len = sizeof(prog_info); @@ -418,6 +420,8 @@ static void test_bpf_obj_id(void) nr_id_found++; err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len); + prog_infos[i].jited_prog_insns = 0; + prog_infos[i].xlated_prog_insns = 0; CHECK(err || info_len != sizeof(struct bpf_prog_info) || memcmp(&prog_info, &prog_infos[i], info_len), "get-prog-info(next_id->fd)", @@ -436,7 +440,7 @@ static void test_bpf_obj_id(void) nr_id_found = 0; next_id = 0; while (!bpf_map_get_next_id(next_id, &next_id)) { - struct bpf_map_info map_info; + struct bpf_map_info map_info = {}; int map_fd; info_len = sizeof(map_info); From 0c2232b0a71db0ac1d22f751aa1ac0cadb950fd2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jul 2017 14:19:09 +0800 Subject: [PATCH 36/76] dccp: fix a memleak that dccp_ipv6 doesn't put reqsk properly In dccp_v6_conn_request, after reqsk gets alloced and hashed into ehash table, reqsk's refcnt is set 3. one is for req->rsk_timer, one is for hlist, and the other one is for current using. The problem is when dccp_v6_conn_request returns and finishes using reqsk, it doesn't put reqsk. This will cause reqsk refcnt leaks and reqsk obj never gets freed. Jianlin found this issue when running dccp_memleak.c in a loop, the system memory would run out. dccp_memleak.c: int s1 = socket(PF_INET6, 6, IPPROTO_IP); bind(s1, &sa1, 0x20); listen(s1, 0x9); int s2 = socket(PF_INET6, 6, IPPROTO_IP); connect(s2, &sa1, 0x20); close(s1); close(s2); This patch is to put the reqsk before dccp_v6_conn_request returns, just as what tcp_conn_request does. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/dccp/ipv6.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index c376af5bfdfb..1b58eac8aad3 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -380,6 +380,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + reqsk_put(req); return 0; drop_and_free: From b7953d3c0e30a5fc944f6b7bd0bcceb0794bcd85 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jul 2017 14:19:46 +0800 Subject: [PATCH 37/76] dccp: fix a memleak that dccp_ipv4 doesn't put reqsk properly The patch "dccp: fix a memleak that dccp_ipv6 doesn't put reqsk properly" fixed reqsk refcnt leak for dccp_ipv6. The same issue exists on dccp_ipv4. This patch is to fix it for dccp_ipv4. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index f85d901f4e3f..1b202f16531f 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -631,6 +631,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + reqsk_put(req); return 0; drop_and_free: From e90ce2fc27cad7e7b1e72b9e66201a7a4c124c2b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jul 2017 14:20:15 +0800 Subject: [PATCH 38/76] dccp: fix a memleak for dccp_feat_init err process In dccp_feat_init, when ccid_get_builtin_ccids failsto alloc memory for rx.val, it should free tx.val before returning an error. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/dccp/feat.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 1704948e6a12..f227f002c73d 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1471,9 +1471,12 @@ int dccp_feat_init(struct sock *sk) * singleton values (which always leads to failure). * These settings can still (later) be overridden via sockopts. */ - if (ccid_get_builtin_ccids(&tx.val, &tx.len) || - ccid_get_builtin_ccids(&rx.val, &rx.len)) + if (ccid_get_builtin_ccids(&tx.val, &tx.len)) return -ENOBUFS; + if (ccid_get_builtin_ccids(&rx.val, &rx.len)) { + kfree(tx.val); + return -ENOBUFS; + } if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) From 6b84202c946cd3da3a8daa92c682510e9ed80321 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jul 2017 16:24:59 +0800 Subject: [PATCH 39/76] sctp: fix the check for _sctp_walk_params and _sctp_walk_errors Commit b1f5bfc27a19 ("sctp: don't dereference ptr before leaving _sctp_walk_{params, errors}()") tried to fix the issue that it may overstep the chunk end for _sctp_walk_{params, errors} with 'chunk_end > offset(length) + sizeof(length)'. But it introduced a side effect: When processing INIT, it verifies the chunks with 'param.v == chunk_end' after iterating all params by sctp_walk_params(). With the check 'chunk_end > offset(length) + sizeof(length)', it would return when the last param is not yet accessed. Because the last param usually is fwdtsn supported param whose size is 4 and 'chunk_end == offset(length) + sizeof(length)' This is a badly issue even causing sctp couldn't process 4-shakes. Client would always get abort when connecting to server, due to the failure of INIT chunk verification on server. The patch is to use 'chunk_end <= offset(length) + sizeof(length)' instead of 'chunk_end < offset(length) + sizeof(length)' for both _sctp_walk_params and _sctp_walk_errors. Fixes: b1f5bfc27a19 ("sctp: don't dereference ptr before leaving _sctp_walk_{params, errors}()") Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 980807d7506f..45fd4c6056b5 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -469,7 +469,7 @@ _sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member) #define _sctp_walk_params(pos, chunk, end, member)\ for (pos.v = chunk->member;\ - (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <\ + (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <=\ (void *)chunk + end) &&\ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ ntohs(pos.p->length) >= sizeof(struct sctp_paramhdr);\ @@ -481,7 +481,7 @@ _sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length)) #define _sctp_walk_errors(err, chunk_hdr, end)\ for (err = (sctp_errhdr_t *)((void *)chunk_hdr + \ sizeof(struct sctp_chunkhdr));\ - ((void *)err + offsetof(sctp_errhdr_t, length) + sizeof(err->length) <\ + ((void *)err + offsetof(sctp_errhdr_t, length) + sizeof(err->length) <=\ (void *)chunk_hdr + end) &&\ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ ntohs(err->length) >= sizeof(sctp_errhdr_t); \ From 58f36b4526add4870476e1dc97a8467cf16aeee6 Mon Sep 17 00:00:00 2001 From: Daniel Stone Date: Wed, 26 Jul 2017 12:24:10 +0100 Subject: [PATCH 40/76] brcmfmac: Don't grow SKB by negative size The commit to rework the headroom check in start_xmit() now calls pxskb_expand_head() unconditionally if the header is CoW. Unfortunately, it does so with the delta between the extant headroom and the header length, which may be negative if there is already sufficient headroom. pskb_expand_head() does allow for size being 0, in which case it just copies, so clamp the header delta to zero. Opening Chrome (and all my tabs) on a PCIE device was enough to reliably hit this. Fixes: 270a6c1f65fe ("brcmfmac: rework headroom check in .start_xmit()") Signed-off-by: Daniel Stone Cc: Arend Van Spriel Cc: James Hughes Cc: Hante Meuleman Cc: Pieter-Paul Giesberts Cc: Franky Lin Tested-by: Hans de Goede Signed-off-by: Kalle Valo --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c index 2153e8062b4c..5cc3a07dda9e 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c @@ -214,7 +214,7 @@ static netdev_tx_t brcmf_netdev_start_xmit(struct sk_buff *skb, /* Make sure there's enough writeable headroom */ if (skb_headroom(skb) < drvr->hdrlen || skb_header_cloned(skb)) { - head_delta = drvr->hdrlen - skb_headroom(skb); + head_delta = max_t(int, drvr->hdrlen - skb_headroom(skb), 0); brcmf_dbg(INFO, "%s: insufficient headroom (%d)\n", brcmf_ifname(ifp), head_delta); From 5f5d03143de5e0c593da4ab18fc6393c2815e108 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Wed, 26 Jul 2017 13:09:24 +0100 Subject: [PATCH 41/76] brcmfmac: fix memleak due to calling brcmf_sdiod_sgtable_alloc() twice Due to a bugfix in wireless tree and the commit mentioned below a merge was needed which went haywire. So the submitted change resulted in the function brcmf_sdiod_sgtable_alloc() being called twice during the probe thus leaking the memory of the first call. Cc: stable@vger.kernel.org # 4.6.x Fixes: 4d7928959832 ("brcmfmac: switch to new platform data") Reported-by: Stefan Wahren Tested-by: Stefan Wahren Reviewed-by: Hante Meuleman Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index c3ecec673eb7..f3556122c6ac 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -4175,11 +4175,6 @@ struct brcmf_sdio *brcmf_sdio_probe(struct brcmf_sdio_dev *sdiodev) goto fail; } - /* allocate scatter-gather table. sg support - * will be disabled upon allocation failure. - */ - brcmf_sdiod_sgtable_alloc(bus->sdiodev); - /* Query the F2 block size, set roundup accordingly */ bus->blocksize = bus->sdiodev->func[2]->cur_blksize; bus->roundup = min(max_roundup, bus->blocksize); From 079adf05398f12371ea91dd53fdf61ee6352bc01 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 1 Jun 2017 09:40:56 +0300 Subject: [PATCH 42/76] net/mlx5: Clean SRIOV eswitch resources upon VF creation failure Upon sriov enable, eswitch is always enabled. Currently, if enable hca failed over all VFs, we would skip eswitch disable as part of sriov disable, which will lead to resources leak. Fix it by disabling eswitch if it was enabled (use indication from eswitch mode). Fixes: 6b6adee3dad2 ('net/mlx5: SRIOV core code refactoring') Signed-off-by: Eran Ben Elisha Signed-off-by: Noa Osherovich Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 89bfda419efe..8b18cc9ec026 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1668,7 +1668,8 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) int i; if (!esw || !MLX5_CAP_GEN(esw->dev, vport_group_manager) || - MLX5_CAP_GEN(esw->dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + MLX5_CAP_GEN(esw->dev, port_type) != MLX5_CAP_PORT_TYPE_ETH || + esw->mode == SRIOV_NONE) return; esw_info(esw->dev, "disable SRIOV: active vports(%d) mode(%d)\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index bcdf7779c48d..bf99d40e30b4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -88,7 +88,11 @@ static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev) int vf; if (!sriov->enabled_vfs) +#ifdef CONFIG_MLX5_CORE_EN + goto disable_sriov_resources; +#else return; +#endif for (vf = 0; vf < sriov->num_vfs; vf++) { if (!sriov->vfs_ctx[vf].enabled) @@ -103,6 +107,7 @@ static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev) } #ifdef CONFIG_MLX5_CORE_EN +disable_sriov_resources: mlx5_eswitch_disable_sriov(dev->priv.eswitch); #endif From dc798b4cc0f2a06e7ad7d522403de274b86a0a6f Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Sun, 2 Jul 2017 19:13:43 +0300 Subject: [PATCH 43/76] net/mlx5: Consider tx_enabled in all modes on remap The tx_enabled lag event field is used to determine whether a slave is active. Current logic uses this value only if the mode is active-backup. However, LACP mode, although considered a load balancing mode, can mark a slave as inactive in certain situations (e.g., LACP timeout). This fix takes the tx_enabled value into account when remapping, with no respect to the LAG mode (this should not affect the behavior in XOR mode, since in this mode both slaves are marked as active). Fixes: 7907f23adc18 (net/mlx5: Implement RoCE LAG feature) Signed-off-by: Aviv Heller Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lag.c | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index a3a836bdcfd2..f26f97fe4666 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -162,22 +162,17 @@ static bool mlx5_lag_is_bonded(struct mlx5_lag *ldev) static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker, u8 *port1, u8 *port2) { - if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { - if (tracker->netdev_state[0].tx_enabled) { - *port1 = 1; - *port2 = 1; - } else { - *port1 = 2; - *port2 = 2; - } - } else { - *port1 = 1; - *port2 = 2; - if (!tracker->netdev_state[0].link_up) - *port1 = 2; - else if (!tracker->netdev_state[1].link_up) - *port2 = 1; + *port1 = 1; + *port2 = 2; + if (!tracker->netdev_state[0].tx_enabled || + !tracker->netdev_state[0].link_up) { + *port1 = 2; + return; } + + if (!tracker->netdev_state[1].tx_enabled || + !tracker->netdev_state[1].link_up) + *port2 = 1; } static void mlx5_activate_lag(struct mlx5_lag *ldev, From 061870800efb4e3d1ad4082a2569363629bdfcfc Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 6 Jul 2017 15:48:40 +0300 Subject: [PATCH 44/76] net/mlx5: Fix command completion after timeout access invalid structure Completion on timeout should not free the driver command entry structure as it will need to access it again once real completion event from FW will occur. Fixes: 73dd3a4839c1 ('net/mlx5: Avoid using pending command interface slots') Signed-off-by: Moshe Shemesh Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index f5a2c605749f..25fd32fcdf79 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -967,7 +967,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, err = wait_func(dev, ent); if (err == -ETIMEDOUT) - goto out_free; + goto out; ds = ent->ts2 - ent->ts1; op = MLX5_GET(mbox_in, in->first.data, opcode); @@ -1430,6 +1430,7 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced) mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n", ent->idx); free_ent(cmd, ent->idx); + free_cmd(ent); } continue; } @@ -1488,7 +1489,8 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced) free_msg(dev, ent->in); err = err ? err : ent->status; - free_cmd(ent); + if (!forced) + free_cmd(ent); callback(err, context); } else { complete(&ent->done); From 219c81f7d1d5a89656cb3b53d3b4e11e93608d80 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 25 Jun 2017 18:45:32 +0300 Subject: [PATCH 45/76] net/mlx5: Fix command bad flow on command entry allocation failure When driver fail to allocate an entry to send command to FW, it must notify the calling function and release the memory allocated for this command. Fixes: e126ba97dba9e ('mlx5: Add driver for Mellanox Connect-IB adapters') Signed-off-by: Moshe Shemesh Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 25fd32fcdf79..31cbe5e86a01 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -786,6 +786,10 @@ static void cb_timeout_handler(struct work_struct *work) mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); } +static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg); +static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev, + struct mlx5_cmd_msg *msg); + static void cmd_work_handler(struct work_struct *work) { struct mlx5_cmd_work_ent *ent = container_of(work, struct mlx5_cmd_work_ent, work); @@ -796,17 +800,28 @@ static void cmd_work_handler(struct work_struct *work) struct semaphore *sem; unsigned long flags; bool poll_cmd = ent->polling; + int alloc_ret; sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem; down(sem); if (!ent->page_queue) { - ent->idx = alloc_ent(cmd); - if (ent->idx < 0) { + alloc_ret = alloc_ent(cmd); + if (alloc_ret < 0) { mlx5_core_err(dev, "failed to allocate command entry\n"); + if (ent->callback) { + ent->callback(-EAGAIN, ent->context); + mlx5_free_cmd_msg(dev, ent->out); + free_msg(dev, ent->in); + free_cmd(ent); + } else { + ent->ret = -EAGAIN; + complete(&ent->done); + } up(sem); return; } + ent->idx = alloc_ret; } else { ent->idx = cmd->max_reg_cmds; spin_lock_irqsave(&cmd->alloc_lock, flags); From 58569ef8f619761548e7d198f59e8ebe3af91d04 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 6 Jul 2017 15:40:32 +0300 Subject: [PATCH 46/76] net/mlx5e: IPoIB, Modify add/remove underlay QPN flows On interface remove, the clean-up was done incorrectly causing an error in the log: "SET_FLOW_TABLE_ROOT(0x92f) op_mod(0x0) failed...syndrome (0x7e9f14)" This was caused by the following flow: -ndo_uninit: Move QP state to RST (this disconnects the QP from FT), the QP cannot be attached to any FT unless it is in RTS. -mlx5_rdma_netdev_free: cleanup_rx: Destroy FT cleanup_tx: Destroy QP and remove QPN from FT This caused a problem when destroying current FT we tried to re-attach the QP to the next FT which is not needed. The correct flow is: -mlx5_rdma_netdev_free: cleanup_rx: remove QPN from FT & Destroy FT cleanup_tx: Destroy QP Fixes: 508541146af1 ("net/mlx5: Use underlay QPN from the root name space") Signed-off-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 1ee5bce85901..85298051a3e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -178,8 +178,6 @@ out: static void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp) { - mlx5_fs_remove_rx_underlay_qpn(mdev, qp->qpn); - mlx5_core_destroy_qp(mdev, qp); } @@ -194,8 +192,6 @@ static int mlx5i_init_tx(struct mlx5e_priv *priv) return err; } - mlx5_fs_add_rx_underlay_qpn(priv->mdev, ipriv->qp.qpn); - err = mlx5e_create_tis(priv->mdev, 0 /* tc */, ipriv->qp.qpn, &priv->tisn[0]); if (err) { mlx5_core_warn(priv->mdev, "create tis failed, %d\n", err); @@ -253,6 +249,7 @@ static void mlx5i_destroy_flow_steering(struct mlx5e_priv *priv) static int mlx5i_init_rx(struct mlx5e_priv *priv) { + struct mlx5i_priv *ipriv = priv->ppriv; int err; err = mlx5e_create_indirect_rqt(priv); @@ -271,12 +268,18 @@ static int mlx5i_init_rx(struct mlx5e_priv *priv) if (err) goto err_destroy_indirect_tirs; - err = mlx5i_create_flow_steering(priv); + err = mlx5_fs_add_rx_underlay_qpn(priv->mdev, ipriv->qp.qpn); if (err) goto err_destroy_direct_tirs; + err = mlx5i_create_flow_steering(priv); + if (err) + goto err_remove_rx_underlay_qpn; + return 0; +err_remove_rx_underlay_qpn: + mlx5_fs_remove_rx_underlay_qpn(priv->mdev, ipriv->qp.qpn); err_destroy_direct_tirs: mlx5e_destroy_direct_tirs(priv); err_destroy_indirect_tirs: @@ -290,6 +293,9 @@ err_destroy_indirect_rqts: static void mlx5i_cleanup_rx(struct mlx5e_priv *priv) { + struct mlx5i_priv *ipriv = priv->ppriv; + + mlx5_fs_remove_rx_underlay_qpn(priv->mdev, ipriv->qp.qpn); mlx5i_destroy_flow_steering(priv); mlx5e_destroy_direct_tirs(priv); mlx5e_destroy_indirect_tirs(priv); From 0242f4a0bb03906010bbf80495512be00494a0ef Mon Sep 17 00:00:00 2001 From: Ilan Tayari Date: Wed, 5 Jul 2017 10:17:04 +0300 Subject: [PATCH 47/76] net/mlx5e: Fix outer_header_zero() check size outer_header_zero() routine checks if the outer_headers match of a flow-table entry are all zero. This function uses the size of whole fte_match_param, instead of just the outer_headers member, causing failure to detect all-zeros if any other members of the fte_match_param are non-zero. Use the correct size for zero check. Fixes: 6dc6071cfcde ("net/mlx5e: Add ethtool flow steering support") Signed-off-by: Ilan Tayari Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index bdd82c9b3992..22fb993987b4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -276,7 +276,7 @@ static void add_rule_to_list(struct mlx5e_priv *priv, static bool outer_header_zero(u32 *match_criteria) { - int size = MLX5_ST_SZ_BYTES(fte_match_param); + int size = MLX5_FLD_SZ_BYTES(fte_match_param, outer_headers); char *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_criteria, outer_headers); From 0b794ffae7afa7c4e5accac8791c4b78e8d080ce Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Thu, 25 May 2017 15:11:26 +0300 Subject: [PATCH 48/76] net/mlx5: Fix mlx5_ifc_mtpps_reg_bits structure size Fix miscalculation in reserved_at_1a0 field. Fixes: ee7f12205abc ('net/mlx5e: Implement 1PPS support') Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 87869c04849a..fd98aef4545c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8175,7 +8175,7 @@ struct mlx5_ifc_mtpps_reg_bits { u8 out_pulse_duration[0x10]; u8 out_periodic_adjustment[0x10]; - u8 reserved_at_1a0[0x60]; + u8 reserved_at_1a0[0x40]; }; struct mlx5_ifc_mtppse_reg_bits { From fa3676885e3b5be1edfa1b2cc775e20a45b34a19 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Thu, 25 May 2017 16:09:34 +0300 Subject: [PATCH 49/76] net/mlx5e: Add field select to MTPPS register In order to mark relevant fields while setting the MTPPS register add field select. Otherwise it can cause a misconfiguration in firmware. Fixes: ee7f12205abc ('net/mlx5e: Implement 1PPS support') Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_clock.c | 29 +++++++++++++++---- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- .../ethernet/mellanox/mlx5/core/mlx5_core.h | 5 ++++ include/linux/mlx5/mlx5_ifc.h | 10 +++++-- 4 files changed, 36 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c index 66f432385dbb..ab07233e2faa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c @@ -53,6 +53,15 @@ enum { MLX5E_EVENT_MODE_ONCE_TILL_ARM = 0x2, }; +enum { + MLX5E_MTPPS_FS_ENABLE = BIT(0x0), + MLX5E_MTPPS_FS_PATTERN = BIT(0x2), + MLX5E_MTPPS_FS_PIN_MODE = BIT(0x3), + MLX5E_MTPPS_FS_TIME_STAMP = BIT(0x4), + MLX5E_MTPPS_FS_OUT_PULSE_DURATION = BIT(0x5), + MLX5E_MTPPS_FS_ENH_OUT_PER_ADJ = BIT(0x7), +}; + void mlx5e_fill_hwstamp(struct mlx5e_tstamp *tstamp, u64 timestamp, struct skb_shared_hwtstamps *hwts) { @@ -221,7 +230,10 @@ static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) /* For future use need to add a loop for finding all 1PPS out pins */ MLX5_SET(mtpps_reg, in, pin_mode, MLX5E_PIN_MODE_OUT); - MLX5_SET(mtpps_reg, in, out_periodic_adjustment, delta & 0xFFFF); + MLX5_SET(mtpps_reg, in, enhanced_out_periodic_adjustment, delta); + MLX5_SET(mtpps_reg, in, field_select, + MLX5E_MTPPS_FS_PIN_MODE | + MLX5E_MTPPS_FS_ENH_OUT_PER_ADJ); mlx5_set_mtpps(priv->mdev, in, sizeof(in)); } @@ -257,8 +269,7 @@ static int mlx5e_extts_configure(struct ptp_clock_info *ptp, int pin = -1; int err = 0; - if (!MLX5_CAP_GEN(priv->mdev, pps) || - !MLX5_CAP_GEN(priv->mdev, pps_modify)) + if (!MLX5_PPS_CAP(priv->mdev)) return -EOPNOTSUPP; if (rq->extts.index >= tstamp->ptp_info.n_pins) @@ -277,6 +288,9 @@ static int mlx5e_extts_configure(struct ptp_clock_info *ptp, MLX5_SET(mtpps_reg, in, pin_mode, MLX5E_PIN_MODE_IN); MLX5_SET(mtpps_reg, in, pattern, pattern); MLX5_SET(mtpps_reg, in, enable, on); + MLX5_SET(mtpps_reg, in, field_select, MLX5E_MTPPS_FS_PIN_MODE | + MLX5E_MTPPS_FS_PATTERN | + MLX5E_MTPPS_FS_ENABLE); err = mlx5_set_mtpps(priv->mdev, in, sizeof(in)); if (err) @@ -302,7 +316,7 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, int pin = -1; s64 ns; - if (!MLX5_CAP_GEN(priv->mdev, pps_modify)) + if (!MLX5_PPS_CAP(priv->mdev)) return -EOPNOTSUPP; if (rq->perout.index >= tstamp->ptp_info.n_pins) @@ -337,7 +351,10 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, MLX5_SET(mtpps_reg, in, pattern, MLX5E_OUT_PATTERN_PERIODIC); MLX5_SET(mtpps_reg, in, enable, on); MLX5_SET64(mtpps_reg, in, time_stamp, time_stamp); - + MLX5_SET(mtpps_reg, in, field_select, MLX5E_MTPPS_FS_PIN_MODE | + MLX5E_MTPPS_FS_PATTERN | + MLX5E_MTPPS_FS_ENABLE | + MLX5E_MTPPS_FS_TIME_STAMP); return mlx5_set_mtpps(priv->mdev, in, sizeof(in)); } @@ -487,7 +504,7 @@ void mlx5e_timestamp_init(struct mlx5e_priv *priv) #define MAX_PIN_NUM 8 tstamp->pps_pin_caps = kzalloc(sizeof(u8) * MAX_PIN_NUM, GFP_KERNEL); if (tstamp->pps_pin_caps) { - if (MLX5_CAP_GEN(priv->mdev, pps)) + if (MLX5_PPS_CAP(priv->mdev)) mlx5e_get_pps_caps(priv, tstamp); if (tstamp->ptp_info.n_pins) mlx5e_init_pin_config(tstamp); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index af51a5d2b912..52b9a64cd3a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -698,7 +698,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) else mlx5_core_dbg(dev, "port_module_event is not set\n"); - if (MLX5_CAP_GEN(dev, pps)) + if (MLX5_PPS_CAP(dev)) async_event_mask |= (1ull << MLX5_EVENT_TYPE_PPS_EVENT); if (MLX5_CAP_GEN(dev, fpga)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 6a3d6bef7dd4..6a263e8d883a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -154,6 +154,11 @@ int mlx5_set_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size); int mlx5_query_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 *arm, u8 *mode); int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode); +#define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) && \ + MLX5_CAP_GEN((mdev), pps_modify) && \ + MLX5_CAP_MCAM_FEATURE((mdev), mtpps_fs) && \ + MLX5_CAP_MCAM_FEATURE((mdev), mtpps_enh_out_per_adj)) + int mlx5_firmware_flash(struct mlx5_core_dev *dev, const struct firmware *fw); void mlx5e_init(void); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fd98aef4545c..3030121b4746 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -7749,8 +7749,10 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x7f]; + u8 reserved_at_0[0x7d]; + u8 mtpps_enh_out_per_adj[0x1]; + u8 mtpps_fs[0x1]; u8 pcie_performance_group[0x1]; }; @@ -8159,7 +8161,8 @@ struct mlx5_ifc_mtpps_reg_bits { u8 reserved_at_78[0x4]; u8 cap_pin_4_mode[0x4]; - u8 reserved_at_80[0x80]; + u8 field_select[0x20]; + u8 reserved_at_a0[0x60]; u8 enable[0x1]; u8 reserved_at_101[0xb]; @@ -8174,8 +8177,9 @@ struct mlx5_ifc_mtpps_reg_bits { u8 out_pulse_duration[0x10]; u8 out_periodic_adjustment[0x10]; + u8 enhanced_out_periodic_adjustment[0x20]; - u8 reserved_at_1a0[0x40]; + u8 reserved_at_1c0[0x20]; }; struct mlx5_ifc_mtppse_reg_bits { From 49c5031ca6f0628ef973a11b17e463e088bf859e Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Sun, 28 May 2017 12:01:38 +0300 Subject: [PATCH 50/76] net/mlx5e: Fix broken disable 1PPS flow Need to disable the MTPPS and unsubscribe from the pulse events when user disables the 1PPS functionality. Fixes: ee7f12205abc ('net/mlx5e: Implement 1PPS support') Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_clock.c | 75 ++++++++++++------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c index ab07233e2faa..25d43767c888 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c @@ -265,6 +265,8 @@ static int mlx5e_extts_configure(struct ptp_clock_info *ptp, struct mlx5e_priv *priv = container_of(tstamp, struct mlx5e_priv, tstamp); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + u32 field_select = 0; + u8 pin_mode = 0; u8 pattern = 0; int pin = -1; int err = 0; @@ -279,18 +281,21 @@ static int mlx5e_extts_configure(struct ptp_clock_info *ptp, pin = ptp_find_pin(tstamp->ptp, PTP_PF_EXTTS, rq->extts.index); if (pin < 0) return -EBUSY; + pin_mode = MLX5E_PIN_MODE_IN; + pattern = !!(rq->extts.flags & PTP_FALLING_EDGE); + field_select = MLX5E_MTPPS_FS_PIN_MODE | + MLX5E_MTPPS_FS_PATTERN | + MLX5E_MTPPS_FS_ENABLE; + } else { + pin = rq->extts.index; + field_select = MLX5E_MTPPS_FS_ENABLE; } - if (rq->extts.flags & PTP_FALLING_EDGE) - pattern = 1; - MLX5_SET(mtpps_reg, in, pin, pin); - MLX5_SET(mtpps_reg, in, pin_mode, MLX5E_PIN_MODE_IN); + MLX5_SET(mtpps_reg, in, pin_mode, pin_mode); MLX5_SET(mtpps_reg, in, pattern, pattern); MLX5_SET(mtpps_reg, in, enable, on); - MLX5_SET(mtpps_reg, in, field_select, MLX5E_MTPPS_FS_PIN_MODE | - MLX5E_MTPPS_FS_PATTERN | - MLX5E_MTPPS_FS_ENABLE); + MLX5_SET(mtpps_reg, in, field_select, field_select); err = mlx5_set_mtpps(priv->mdev, in, sizeof(in)); if (err) @@ -313,6 +318,9 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, u64 cycles_now, cycles_delta; struct timespec64 ts; unsigned long flags; + u32 field_select = 0; + u8 pin_mode = 0; + u8 pattern = 0; int pin = -1; s64 ns; @@ -327,34 +335,43 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, rq->perout.index); if (pin < 0) return -EBUSY; - } - ts.tv_sec = rq->perout.period.sec; - ts.tv_nsec = rq->perout.period.nsec; - ns = timespec64_to_ns(&ts); - if (on) + pin_mode = MLX5E_PIN_MODE_OUT; + pattern = MLX5E_OUT_PATTERN_PERIODIC; + ts.tv_sec = rq->perout.period.sec; + ts.tv_nsec = rq->perout.period.nsec; + ns = timespec64_to_ns(&ts); + if ((ns >> 1) != 500000000LL) return -EINVAL; - ts.tv_sec = rq->perout.start.sec; - ts.tv_nsec = rq->perout.start.nsec; - ns = timespec64_to_ns(&ts); - cycles_now = mlx5_read_internal_timer(tstamp->mdev); - write_lock_irqsave(&tstamp->lock, flags); - nsec_now = timecounter_cyc2time(&tstamp->clock, cycles_now); - nsec_delta = ns - nsec_now; - cycles_delta = div64_u64(nsec_delta << tstamp->cycles.shift, - tstamp->cycles.mult); - write_unlock_irqrestore(&tstamp->lock, flags); - time_stamp = cycles_now + cycles_delta; + + ts.tv_sec = rq->perout.start.sec; + ts.tv_nsec = rq->perout.start.nsec; + ns = timespec64_to_ns(&ts); + cycles_now = mlx5_read_internal_timer(tstamp->mdev); + write_lock_irqsave(&tstamp->lock, flags); + nsec_now = timecounter_cyc2time(&tstamp->clock, cycles_now); + nsec_delta = ns - nsec_now; + cycles_delta = div64_u64(nsec_delta << tstamp->cycles.shift, + tstamp->cycles.mult); + write_unlock_irqrestore(&tstamp->lock, flags); + time_stamp = cycles_now + cycles_delta; + field_select = MLX5E_MTPPS_FS_PIN_MODE | + MLX5E_MTPPS_FS_PATTERN | + MLX5E_MTPPS_FS_ENABLE | + MLX5E_MTPPS_FS_TIME_STAMP; + } else { + pin = rq->perout.index; + field_select = MLX5E_MTPPS_FS_ENABLE; + } + MLX5_SET(mtpps_reg, in, pin, pin); - MLX5_SET(mtpps_reg, in, pin_mode, MLX5E_PIN_MODE_OUT); - MLX5_SET(mtpps_reg, in, pattern, MLX5E_OUT_PATTERN_PERIODIC); + MLX5_SET(mtpps_reg, in, pin_mode, pin_mode); + MLX5_SET(mtpps_reg, in, pattern, pattern); MLX5_SET(mtpps_reg, in, enable, on); MLX5_SET64(mtpps_reg, in, time_stamp, time_stamp); - MLX5_SET(mtpps_reg, in, field_select, MLX5E_MTPPS_FS_PIN_MODE | - MLX5E_MTPPS_FS_PATTERN | - MLX5E_MTPPS_FS_ENABLE | - MLX5E_MTPPS_FS_TIME_STAMP); + MLX5_SET(mtpps_reg, in, field_select, field_select); + return mlx5_set_mtpps(priv->mdev, in, sizeof(in)); } From 4272f9b88db9223216cdf87314f570f6d81295b4 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Sun, 28 May 2017 14:06:01 +0300 Subject: [PATCH 51/76] net/mlx5e: Change 1PPS out scheme In order to fix the drift in 1PPS out need to adjust the next pulse. On each 1PPS out falling edge driver gets the event, then the event handler adjusts the next pulse starting time. Fixes: ee7f12205abc ('net/mlx5e: Implement 1PPS support') Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 9 +- .../ethernet/mellanox/mlx5/core/en_clock.c | 116 ++++++++++++------ 2 files changed, 87 insertions(+), 38 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index e1b7ddfecd01..d42826f0f0d6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -266,6 +266,13 @@ struct mlx5e_dcbx { }; #endif +#define MAX_PIN_NUM 8 +struct mlx5e_pps { + u8 pin_caps[MAX_PIN_NUM]; + struct work_struct out_work; + u64 start[MAX_PIN_NUM]; +}; + struct mlx5e_tstamp { rwlock_t lock; struct cyclecounter cycles; @@ -277,7 +284,7 @@ struct mlx5e_tstamp { struct mlx5_core_dev *mdev; struct ptp_clock *ptp; struct ptp_clock_info ptp_info; - u8 *pps_pin_caps; + struct mlx5e_pps pps_info; }; enum { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c index 25d43767c888..464ddd10ebbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c @@ -82,6 +82,33 @@ static u64 mlx5e_read_internal_timer(const struct cyclecounter *cc) return mlx5_read_internal_timer(tstamp->mdev) & cc->mask; } +static void mlx5e_pps_out(struct work_struct *work) +{ + struct mlx5e_pps *pps_info = container_of(work, struct mlx5e_pps, + out_work); + struct mlx5e_tstamp *tstamp = container_of(pps_info, struct mlx5e_tstamp, + pps_info); + u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + unsigned long flags; + int i; + + for (i = 0; i < tstamp->ptp_info.n_pins; i++) { + u64 tstart; + + write_lock_irqsave(&tstamp->lock, flags); + tstart = tstamp->pps_info.start[i]; + tstamp->pps_info.start[i] = 0; + write_unlock_irqrestore(&tstamp->lock, flags); + if (!tstart) + continue; + + MLX5_SET(mtpps_reg, in, pin, i); + MLX5_SET64(mtpps_reg, in, time_stamp, tstart); + MLX5_SET(mtpps_reg, in, field_select, MLX5E_MTPPS_FS_TIME_STAMP); + mlx5_set_mtpps(tstamp->mdev, in, sizeof(in)); + } +} + static void mlx5e_timestamp_overflow(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); @@ -222,21 +249,6 @@ static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) int neg_adj = 0; struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, ptp_info); - struct mlx5e_priv *priv = - container_of(tstamp, struct mlx5e_priv, tstamp); - - if (MLX5_CAP_GEN(priv->mdev, pps_modify)) { - u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; - - /* For future use need to add a loop for finding all 1PPS out pins */ - MLX5_SET(mtpps_reg, in, pin_mode, MLX5E_PIN_MODE_OUT); - MLX5_SET(mtpps_reg, in, enhanced_out_periodic_adjustment, delta); - MLX5_SET(mtpps_reg, in, field_select, - MLX5E_MTPPS_FS_PIN_MODE | - MLX5E_MTPPS_FS_ENH_OUT_PER_ADJ); - - mlx5_set_mtpps(priv->mdev, in, sizeof(in)); - } if (delta < 0) { neg_adj = 1; @@ -314,7 +326,7 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, struct mlx5e_priv *priv = container_of(tstamp, struct mlx5e_priv, tstamp); u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; - u64 nsec_now, nsec_delta, time_stamp; + u64 nsec_now, nsec_delta, time_stamp = 0; u64 cycles_now, cycles_delta; struct timespec64 ts; unsigned long flags; @@ -322,6 +334,7 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, u8 pin_mode = 0; u8 pattern = 0; int pin = -1; + int err = 0; s64 ns; if (!MLX5_PPS_CAP(priv->mdev)) @@ -372,7 +385,12 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, MLX5_SET64(mtpps_reg, in, time_stamp, time_stamp); MLX5_SET(mtpps_reg, in, field_select, field_select); - return mlx5_set_mtpps(priv->mdev, in, sizeof(in)); + err = mlx5_set_mtpps(priv->mdev, in, sizeof(in)); + if (err) + return err; + + return mlx5_set_mtppse(priv->mdev, pin, 0, + MLX5E_EVENT_MODE_REPETETIVE & on); } static int mlx5e_ptp_enable(struct ptp_clock_info *ptp, @@ -456,22 +474,50 @@ static void mlx5e_get_pps_caps(struct mlx5e_priv *priv, tstamp->ptp_info.n_per_out = MLX5_GET(mtpps_reg, out, cap_max_num_of_pps_out_pins); - tstamp->pps_pin_caps[0] = MLX5_GET(mtpps_reg, out, cap_pin_0_mode); - tstamp->pps_pin_caps[1] = MLX5_GET(mtpps_reg, out, cap_pin_1_mode); - tstamp->pps_pin_caps[2] = MLX5_GET(mtpps_reg, out, cap_pin_2_mode); - tstamp->pps_pin_caps[3] = MLX5_GET(mtpps_reg, out, cap_pin_3_mode); - tstamp->pps_pin_caps[4] = MLX5_GET(mtpps_reg, out, cap_pin_4_mode); - tstamp->pps_pin_caps[5] = MLX5_GET(mtpps_reg, out, cap_pin_5_mode); - tstamp->pps_pin_caps[6] = MLX5_GET(mtpps_reg, out, cap_pin_6_mode); - tstamp->pps_pin_caps[7] = MLX5_GET(mtpps_reg, out, cap_pin_7_mode); + tstamp->pps_info.pin_caps[0] = MLX5_GET(mtpps_reg, out, cap_pin_0_mode); + tstamp->pps_info.pin_caps[1] = MLX5_GET(mtpps_reg, out, cap_pin_1_mode); + tstamp->pps_info.pin_caps[2] = MLX5_GET(mtpps_reg, out, cap_pin_2_mode); + tstamp->pps_info.pin_caps[3] = MLX5_GET(mtpps_reg, out, cap_pin_3_mode); + tstamp->pps_info.pin_caps[4] = MLX5_GET(mtpps_reg, out, cap_pin_4_mode); + tstamp->pps_info.pin_caps[5] = MLX5_GET(mtpps_reg, out, cap_pin_5_mode); + tstamp->pps_info.pin_caps[6] = MLX5_GET(mtpps_reg, out, cap_pin_6_mode); + tstamp->pps_info.pin_caps[7] = MLX5_GET(mtpps_reg, out, cap_pin_7_mode); } void mlx5e_pps_event_handler(struct mlx5e_priv *priv, struct ptp_clock_event *event) { + struct net_device *netdev = priv->netdev; struct mlx5e_tstamp *tstamp = &priv->tstamp; + struct timespec64 ts; + u64 nsec_now, nsec_delta; + u64 cycles_now, cycles_delta; + int pin = event->index; + s64 ns; + unsigned long flags; - ptp_clock_event(tstamp->ptp, event); + switch (tstamp->ptp_info.pin_config[pin].func) { + case PTP_PF_EXTTS: + ptp_clock_event(tstamp->ptp, event); + break; + case PTP_PF_PEROUT: + mlx5e_ptp_gettime(&tstamp->ptp_info, &ts); + cycles_now = mlx5_read_internal_timer(tstamp->mdev); + ts.tv_sec += 1; + ts.tv_nsec = 0; + ns = timespec64_to_ns(&ts); + write_lock_irqsave(&tstamp->lock, flags); + nsec_now = timecounter_cyc2time(&tstamp->clock, cycles_now); + nsec_delta = ns - nsec_now; + cycles_delta = div64_u64(nsec_delta << tstamp->cycles.shift, + tstamp->cycles.mult); + tstamp->pps_info.start[pin] = cycles_now + cycles_delta; + queue_work(priv->wq, &tstamp->pps_info.out_work); + write_unlock_irqrestore(&tstamp->lock, flags); + break; + default: + netdev_err(netdev, "%s: Unhandled event\n", __func__); + } } void mlx5e_timestamp_init(struct mlx5e_priv *priv) @@ -507,6 +553,7 @@ void mlx5e_timestamp_init(struct mlx5e_priv *priv) do_div(ns, NSEC_PER_SEC / 2 / HZ); tstamp->overflow_period = ns; + INIT_WORK(&tstamp->pps_info.out_work, mlx5e_pps_out); INIT_DELAYED_WORK(&tstamp->overflow_work, mlx5e_timestamp_overflow); if (tstamp->overflow_period) schedule_delayed_work(&tstamp->overflow_work, 0); @@ -518,16 +565,10 @@ void mlx5e_timestamp_init(struct mlx5e_priv *priv) snprintf(tstamp->ptp_info.name, 16, "mlx5 ptp"); /* Initialize 1PPS data structures */ -#define MAX_PIN_NUM 8 - tstamp->pps_pin_caps = kzalloc(sizeof(u8) * MAX_PIN_NUM, GFP_KERNEL); - if (tstamp->pps_pin_caps) { - if (MLX5_PPS_CAP(priv->mdev)) - mlx5e_get_pps_caps(priv, tstamp); - if (tstamp->ptp_info.n_pins) - mlx5e_init_pin_config(tstamp); - } else { - mlx5_core_warn(priv->mdev, "1PPS initialization failed\n"); - } + if (MLX5_PPS_CAP(priv->mdev)) + mlx5e_get_pps_caps(priv, tstamp); + if (tstamp->ptp_info.n_pins) + mlx5e_init_pin_config(tstamp); tstamp->ptp = ptp_clock_register(&tstamp->ptp_info, &priv->mdev->pdev->dev); @@ -550,7 +591,8 @@ void mlx5e_timestamp_cleanup(struct mlx5e_priv *priv) priv->tstamp.ptp = NULL; } - kfree(tstamp->pps_pin_caps); + cancel_work_sync(&tstamp->pps_info.out_work); + kfree(tstamp->ptp_info.pin_config); cancel_delayed_work_sync(&tstamp->overflow_work); From cf5033089b078303b102b65e3ccbbfa3ce0f4367 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Sun, 28 May 2017 14:27:02 +0300 Subject: [PATCH 52/76] net/mlx5e: Add missing support for PTP_CLK_REQ_PPS request Add the missing option to enable the PTP_CLK_PPS function. In this case pin should be configured as 1PPS IN first and then it will be connected to PPS mechanism. Events will be reported as PTP_CLOCK_PPSUSR events to relevant sysfs. Fixes: ee7f12205abc ('net/mlx5e: Implement 1PPS support') Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + .../ethernet/mellanox/mlx5/core/en_clock.c | 20 +++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/en_main.c | 1 - 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index d42826f0f0d6..0039b4725405 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -271,6 +271,7 @@ struct mlx5e_pps { u8 pin_caps[MAX_PIN_NUM]; struct work_struct out_work; u64 start[MAX_PIN_NUM]; + u8 enabled; }; struct mlx5e_tstamp { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c index 464ddd10ebbc..ba355c6b6e1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c @@ -393,6 +393,17 @@ static int mlx5e_perout_configure(struct ptp_clock_info *ptp, MLX5E_EVENT_MODE_REPETETIVE & on); } +static int mlx5e_pps_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) +{ + struct mlx5e_tstamp *tstamp = + container_of(ptp, struct mlx5e_tstamp, ptp_info); + + tstamp->pps_info.enabled = !!on; + return 0; +} + static int mlx5e_ptp_enable(struct ptp_clock_info *ptp, struct ptp_clock_request *rq, int on) @@ -402,6 +413,8 @@ static int mlx5e_ptp_enable(struct ptp_clock_info *ptp, return mlx5e_extts_configure(ptp, rq, on); case PTP_CLK_REQ_PEROUT: return mlx5e_perout_configure(ptp, rq, on); + case PTP_CLK_REQ_PPS: + return mlx5e_pps_configure(ptp, rq, on); default: return -EOPNOTSUPP; } @@ -447,6 +460,7 @@ static int mlx5e_init_pin_config(struct mlx5e_tstamp *tstamp) return -ENOMEM; tstamp->ptp_info.enable = mlx5e_ptp_enable; tstamp->ptp_info.verify = mlx5e_ptp_verify; + tstamp->ptp_info.pps = 1; for (i = 0; i < tstamp->ptp_info.n_pins; i++) { snprintf(tstamp->ptp_info.pin_config[i].name, @@ -498,6 +512,12 @@ void mlx5e_pps_event_handler(struct mlx5e_priv *priv, switch (tstamp->ptp_info.pin_config[pin].func) { case PTP_PF_EXTTS: + if (tstamp->pps_info.enabled) { + event->type = PTP_CLOCK_PPSUSR; + event->pps_times.ts_real = ns_to_timespec64(event->timestamp); + } else { + event->type = PTP_CLOCK_EXTTS; + } ptp_clock_event(tstamp->ptp, event); break; case PTP_PF_PEROUT: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 1eac5003084f..57f31fa478ce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -377,7 +377,6 @@ static void mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv, break; case MLX5_DEV_EVENT_PPS: eqe = (struct mlx5_eqe *)param; - ptp_event.type = PTP_CLOCK_EXTTS; ptp_event.index = eqe->data.pps.pin; ptp_event.timestamp = timecounter_cyc2time(&priv->tstamp.clock, From d439c84509a510e864fdc6166c760482cd03fc57 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Wed, 12 Jul 2017 17:27:18 +0300 Subject: [PATCH 53/76] net/mlx5e: Fix wrong delay calculation for overflow check scheduling The overflow_period is calculated in seconds. In order to use it for delayed work scheduling translation to jiffies is needed. Fixes: ef9814deafd0 ('net/mlx5e: Add HW timestamping (TS) support') Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_clock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c index ba355c6b6e1f..cde8e8e772ec 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c @@ -119,7 +119,8 @@ static void mlx5e_timestamp_overflow(struct work_struct *work) write_lock_irqsave(&tstamp->lock, flags); timecounter_read(&tstamp->clock); write_unlock_irqrestore(&tstamp->lock, flags); - schedule_delayed_work(&tstamp->overflow_work, tstamp->overflow_period); + schedule_delayed_work(&tstamp->overflow_work, + msecs_to_jiffies(tstamp->overflow_period * 1000)); } int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) From f08c39ed0bfb503c7b3e013cd40d036ce6a0941a Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Wed, 12 Jul 2017 17:44:07 +0300 Subject: [PATCH 54/76] net/mlx5e: Schedule overflow check work to mlx5e workqueue This is done in order to ensure that work will not run after the cleanup. Fixes: ef9814deafd0 ('net/mlx5e: Add HW timestamping (TS) support') Signed-off-by: Eugenia Emantayev Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_clock.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c index cde8e8e772ec..84dd63e74041 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c @@ -114,13 +114,14 @@ static void mlx5e_timestamp_overflow(struct work_struct *work) struct delayed_work *dwork = to_delayed_work(work); struct mlx5e_tstamp *tstamp = container_of(dwork, struct mlx5e_tstamp, overflow_work); + struct mlx5e_priv *priv = container_of(tstamp, struct mlx5e_priv, tstamp); unsigned long flags; write_lock_irqsave(&tstamp->lock, flags); timecounter_read(&tstamp->clock); write_unlock_irqrestore(&tstamp->lock, flags); - schedule_delayed_work(&tstamp->overflow_work, - msecs_to_jiffies(tstamp->overflow_period * 1000)); + queue_delayed_work(priv->wq, &tstamp->overflow_work, + msecs_to_jiffies(tstamp->overflow_period * 1000)); } int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) @@ -577,7 +578,7 @@ void mlx5e_timestamp_init(struct mlx5e_priv *priv) INIT_WORK(&tstamp->pps_info.out_work, mlx5e_pps_out); INIT_DELAYED_WORK(&tstamp->overflow_work, mlx5e_timestamp_overflow); if (tstamp->overflow_period) - schedule_delayed_work(&tstamp->overflow_work, 0); + queue_delayed_work(priv->wq, &tstamp->overflow_work, 0); else mlx5_core_warn(priv->mdev, "invalid overflow period, overflow_work is not scheduled\n"); @@ -613,8 +614,6 @@ void mlx5e_timestamp_cleanup(struct mlx5e_priv *priv) } cancel_work_sync(&tstamp->pps_info.out_work); - - kfree(tstamp->ptp_info.pin_config); - cancel_delayed_work_sync(&tstamp->overflow_work); + kfree(tstamp->ptp_info.pin_config); } From bcec601f30fb41e9233674942fa4040a6e63657a Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Thu, 6 Jul 2017 16:40:34 +0300 Subject: [PATCH 55/76] net/mlx5: Fix mlx5_add_flow_rules call with correct num of dests When adding ethtool steering rule with action DISCARD we wrongly pass a NULL dest with dest_num 1 to mlx5_add_flow_rules(). What this error seems to have caused is sending VPORT 0 (MLX5_FLOW_DESTINATION_TYPE_VPORT) as the fte dest instead of no dests. We have fte action correctly set to DROP so it might been ignored anyways. To reproduce use: # sudo ethtool --config-nfc flow-type ether \ dst aa:bb:cc:dd:ee:ff action -1 Fixes: 74491de93712 ("net/mlx5: Add multi dest support") Signed-off-by: Paul Blakey Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c index 22fb993987b4..eafc59280ada 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -320,7 +320,7 @@ add_ethtool_flow_rule(struct mlx5e_priv *priv, spec->match_criteria_enable = (!outer_header_zero(spec->match_criteria)); flow_act.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG; - rule = mlx5_add_flow_rules(ft, spec, &flow_act, dst, 1); + rule = mlx5_add_flow_rules(ft, spec, &flow_act, dst, dst ? 1 : 0); if (IS_ERR(rule)) { err = PTR_ERR(rule); netdev_err(priv->netdev, "%s: failed to add ethtool steering rule: %d\n", From 4c3464a8c31829e6e7efbf8258dc062f761509d7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Jul 2017 17:13:59 +0200 Subject: [PATCH 56/76] net: phy: rework Kconfig settings for MDIO_BUS I still see build errors in randconfig builds and have had this patch for a while to locally work around it: drivers/built-in.o: In function `xgene_mdio_probe': mux-core.c:(.text+0x352154): undefined reference to `of_mdiobus_register' mux-core.c:(.text+0x352168): undefined reference to `mdiobus_free' mux-core.c:(.text+0x3521c0): undefined reference to `mdiobus_alloc_size' The idea is that CONFIG_MDIO_BUS now reflects whether the mdio_bus code is built-in or a module, and other drivers that use the core code can simply depend on that, instead of having a complex dependency line. Fixes: 90eff9096c01 ("net: phy: Allow splitting MDIO bus/device support from PHYs") Signed-off-by: Arnd Bergmann Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 2dda72004a7d..928fd892f167 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -7,7 +7,16 @@ menuconfig MDIO_DEVICE help MDIO devices and driver infrastructure code. -if MDIO_DEVICE +config MDIO_BUS + tristate + default m if PHYLIB=m + default MDIO_DEVICE + help + This internal symbol is used for link time dependencies and it + reflects whether the mdio_bus/mdio_device code is built as a + loadable module or built-in. + +if MDIO_BUS config MDIO_BCM_IPROC tristate "Broadcom iProc MDIO bus controller" @@ -28,7 +37,6 @@ config MDIO_BCM_UNIMAC config MDIO_BITBANG tristate "Bitbanged MDIO buses" - depends on !(MDIO_DEVICE=y && PHYLIB=m) help This module implements the MDIO bus protocol in software, for use by low level drivers that export the ability to @@ -127,7 +135,6 @@ config MDIO_THUNDER tristate "ThunderX SOCs MDIO buses" depends on 64BIT depends on PCI - depends on !(MDIO_DEVICE=y && PHYLIB=m) select MDIO_CAVIUM help This driver supports the MDIO interfaces found on Cavium From 245db3c349e0b413e0fec8d6179f6b767649ad63 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Jul 2017 17:14:00 +0200 Subject: [PATCH 57/76] phy: bcm-ns-usb3: fix MDIO_BUS dependency The driver attempts to 'select MDIO_DEVICE', but the code is actually a loadable module when PHYLIB=m: drivers/phy/broadcom/phy-bcm-ns-usb3.o: In function `bcm_ns_usb3_mdiodev_phy_write': phy-bcm-ns-usb3.c:(.text.bcm_ns_usb3_mdiodev_phy_write+0x28): undefined reference to `mdiobus_write' drivers/phy/broadcom/phy-bcm-ns-usb3.o: In function `bcm_ns_usb3_module_exit': phy-bcm-ns-usb3.c:(.exit.text+0x18): undefined reference to `mdio_driver_unregister' drivers/phy/broadcom/phy-bcm-ns-usb3.o: In function `bcm_ns_usb3_module_init': phy-bcm-ns-usb3.c:(.init.text+0x18): undefined reference to `mdio_driver_register' phy-bcm-ns-usb3.c:(.init.text+0x38): undefined reference to `mdio_driver_unregister' Using 'depends on MDIO_BUS' instead will avoid the link error. Fixes: af850e14a7ae ("phy: bcm-ns-usb3: add MDIO driver using proper bus layer") Signed-off-by: Arnd Bergmann Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/phy/broadcom/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/broadcom/Kconfig b/drivers/phy/broadcom/Kconfig index 37371b89b14f..64fc59c3ae6d 100644 --- a/drivers/phy/broadcom/Kconfig +++ b/drivers/phy/broadcom/Kconfig @@ -30,8 +30,8 @@ config PHY_BCM_NS_USB3 tristate "Broadcom Northstar USB 3.0 PHY Driver" depends on ARCH_BCM_IPROC || COMPILE_TEST depends on HAS_IOMEM && OF + depends on MDIO_BUS select GENERIC_PHY - select MDIO_DEVICE help Enable this to support Broadcom USB 3.0 PHY connected to the USB controller on Northstar family. From 0254e0c632bfe4a0cf610e2d90397474144f00d2 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 26 Jul 2017 15:22:06 -0700 Subject: [PATCH 58/76] net: check dev->addr_len for dev_set_mac_address() Historically, dev_ifsioc() uses struct sockaddr as mac address definition, this is why dev_set_mac_address() accepts a struct sockaddr pointer as input but now we have various types of mac addresse whose lengths are up to MAX_ADDR_LEN, longer than struct sockaddr, and saved in dev->addr_len. It is too late to fix dev_ifsioc() due to API compatibility, so just reject those larger than sizeof(struct sockaddr), otherwise we would read and use some random bytes from kernel stack. Fortunately, only a few IPv6 tunnel devices have addr_len larger than sizeof(struct sockaddr) and they don't support ndo_set_mac_addr(). But with team driver, in lb mode, they can still be enslaved to a team master and make its mac addr length as the same. Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 06b147d7d9e2..709a4e6fb447 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -263,6 +263,8 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return dev_set_mtu(dev, ifr->ifr_mtu); case SIOCSIFHWADDR: + if (dev->addr_len > sizeof(struct sockaddr)) + return -EINVAL; return dev_set_mac_address(dev, &ifr->ifr_hwaddr); case SIOCSIFHWBROADCAST: From 996f6e12cfb33ff6c3e39b4511bb3b8bc564b342 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 26 Jul 2017 15:22:07 -0700 Subject: [PATCH 59/76] team: use a larger struct for mac address IPv6 tunnels use sizeof(struct in6_addr) as dev->addr_len, but in many places especially bonding, we use struct sockaddr to copy and set mac addr, this could lead to stack out-of-bounds access. Fix it by using a larger address storage like bonding. Reported-by: Andrey Konovalov Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/team/team.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 464570409796..ae53e899259f 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -60,11 +60,11 @@ static struct team_port *team_port_get_rtnl(const struct net_device *dev) static int __set_port_dev_addr(struct net_device *port_dev, const unsigned char *dev_addr) { - struct sockaddr addr; + struct sockaddr_storage addr; - memcpy(addr.sa_data, dev_addr, port_dev->addr_len); - addr.sa_family = port_dev->type; - return dev_set_mac_address(port_dev, &addr); + memcpy(addr.__data, dev_addr, port_dev->addr_len); + addr.ss_family = port_dev->type; + return dev_set_mac_address(port_dev, (struct sockaddr *)&addr); } static int team_port_set_orig_dev_addr(struct team_port *port) From 8d65843c44269c21e95c98090d9bb4848d473853 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 27 Jul 2017 11:22:05 +0800 Subject: [PATCH 60/76] Revert "vhost: cache used event for better performance" This reverts commit 809ecb9bca6a9424ccd392d67e368160f8b76c92. Since it was reported to break vhost_net. We want to cache used event and use it to check for notification. The assumption was that guest won't move the event idx back, but this could happen in fact when 16 bit index wraps around after 64K entries. Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/vhost/vhost.c | 28 ++++++---------------------- drivers/vhost/vhost.h | 3 --- 2 files changed, 6 insertions(+), 25 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index e4613a3c362d..9cb3f722dce1 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -308,7 +308,6 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->avail = NULL; vq->used = NULL; vq->last_avail_idx = 0; - vq->last_used_event = 0; vq->avail_idx = 0; vq->last_used_idx = 0; vq->signalled_used = 0; @@ -1402,7 +1401,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) r = -EINVAL; break; } - vq->last_avail_idx = vq->last_used_event = s.num; + vq->last_avail_idx = s.num; /* Forget the cached index value. */ vq->avail_idx = vq->last_avail_idx; break; @@ -2241,6 +2240,10 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) __u16 old, new; __virtio16 event; bool v; + /* Flush out used index updates. This is paired + * with the barrier that the Guest executes when enabling + * interrupts. */ + smp_mb(); if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) && unlikely(vq->avail_idx == vq->last_avail_idx)) @@ -2248,10 +2251,6 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { __virtio16 flags; - /* Flush out used index updates. This is paired - * with the barrier that the Guest executes when enabling - * interrupts. */ - smp_mb(); if (vhost_get_avail(vq, flags, &vq->avail->flags)) { vq_err(vq, "Failed to get flags"); return true; @@ -2266,26 +2265,11 @@ static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) if (unlikely(!v)) return true; - /* We're sure if the following conditions are met, there's no - * need to notify guest: - * 1) cached used event is ahead of new - * 2) old to new updating does not cross cached used event. */ - if (vring_need_event(vq->last_used_event, new + vq->num, new) && - !vring_need_event(vq->last_used_event, new, old)) - return false; - - /* Flush out used index updates. This is paired - * with the barrier that the Guest executes when enabling - * interrupts. */ - smp_mb(); - if (vhost_get_avail(vq, event, vhost_used_event(vq))) { vq_err(vq, "Failed to get used event idx"); return true; } - vq->last_used_event = vhost16_to_cpu(vq, event); - - return vring_need_event(vq->last_used_event, new, old); + return vring_need_event(vhost16_to_cpu(vq, event), new, old); } /* This actually signals the guest, using eventfd. */ diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index f72095868b93..bb7c29b8b9fc 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -115,9 +115,6 @@ struct vhost_virtqueue { /* Last index we used. */ u16 last_used_idx; - /* Last used evet we've seen */ - u16 last_used_event; - /* Used flags */ u16 used_flags; From 500268e9f28de972514de89a5a1b4106d0417989 Mon Sep 17 00:00:00 2001 From: Sunil Goutham Date: Thu, 27 Jul 2017 12:53:04 +0530 Subject: [PATCH 61/76] net: thunderx: Fix BGX transmit stall due to underflow For SGMII/RGMII/QSGMII interfaces when physical link goes down while traffic is high is resulting in underflow condition being set on that specific BGX's LMAC. Which assets a backpresure and VNIC stops transmitting packets. This is due to BGX being disabled in link status change callback while packet is in transit. This patch fixes this issue by not disabling BGX but instead just disables packet Rx and Tx. Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller --- .../net/ethernet/cavium/thunder/thunder_bgx.c | 27 +++++++++++++++---- .../net/ethernet/cavium/thunder/thunder_bgx.h | 2 ++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index 79112563a25a..5e5c4d7796b8 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -292,11 +292,30 @@ static void bgx_sgmii_change_link_state(struct lmac *lmac) u64 cmr_cfg; u64 port_cfg = 0; u64 misc_ctl = 0; + bool tx_en, rx_en; cmr_cfg = bgx_reg_read(bgx, lmac->lmacid, BGX_CMRX_CFG); - cmr_cfg &= ~CMR_EN; + tx_en = cmr_cfg & CMR_PKT_TX_EN; + rx_en = cmr_cfg & CMR_PKT_RX_EN; + cmr_cfg &= ~(CMR_PKT_RX_EN | CMR_PKT_TX_EN); bgx_reg_write(bgx, lmac->lmacid, BGX_CMRX_CFG, cmr_cfg); + /* Wait for BGX RX to be idle */ + if (bgx_poll_reg(bgx, lmac->lmacid, BGX_GMP_GMI_PRTX_CFG, + GMI_PORT_CFG_RX_IDLE, false)) { + dev_err(&bgx->pdev->dev, "BGX%d LMAC%d GMI RX not idle\n", + bgx->bgx_id, lmac->lmacid); + return; + } + + /* Wait for BGX TX to be idle */ + if (bgx_poll_reg(bgx, lmac->lmacid, BGX_GMP_GMI_PRTX_CFG, + GMI_PORT_CFG_TX_IDLE, false)) { + dev_err(&bgx->pdev->dev, "BGX%d LMAC%d GMI TX not idle\n", + bgx->bgx_id, lmac->lmacid); + return; + } + port_cfg = bgx_reg_read(bgx, lmac->lmacid, BGX_GMP_GMI_PRTX_CFG); misc_ctl = bgx_reg_read(bgx, lmac->lmacid, BGX_GMP_PCS_MISCX_CTL); @@ -347,10 +366,8 @@ static void bgx_sgmii_change_link_state(struct lmac *lmac) bgx_reg_write(bgx, lmac->lmacid, BGX_GMP_PCS_MISCX_CTL, misc_ctl); bgx_reg_write(bgx, lmac->lmacid, BGX_GMP_GMI_PRTX_CFG, port_cfg); - port_cfg = bgx_reg_read(bgx, lmac->lmacid, BGX_GMP_GMI_PRTX_CFG); - - /* Re-enable lmac */ - cmr_cfg |= CMR_EN; + /* Restore CMR config settings */ + cmr_cfg |= (rx_en ? CMR_PKT_RX_EN : 0) | (tx_en ? CMR_PKT_TX_EN : 0); bgx_reg_write(bgx, lmac->lmacid, BGX_CMRX_CFG, cmr_cfg); if (bgx->is_rgx && (cmr_cfg & (CMR_PKT_RX_EN | CMR_PKT_TX_EN))) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.h b/drivers/net/ethernet/cavium/thunder/thunder_bgx.h index 6b7fe6fdd13b..23acdc5ab896 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.h +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.h @@ -170,6 +170,8 @@ #define GMI_PORT_CFG_DUPLEX BIT_ULL(2) #define GMI_PORT_CFG_SLOT_TIME BIT_ULL(3) #define GMI_PORT_CFG_SPEED_MSB BIT_ULL(8) +#define GMI_PORT_CFG_RX_IDLE BIT_ULL(12) +#define GMI_PORT_CFG_TX_IDLE BIT_ULL(13) #define BGX_GMP_GMI_RXX_JABBER 0x38038 #define BGX_GMP_GMI_TXX_THRESH 0x38210 #define BGX_GMP_GMI_TXX_APPEND 0x38218 From c9f2c1ae123a751d4e4f949144500219354d5ee1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 27 Jul 2017 14:45:09 +0200 Subject: [PATCH 62/76] udp6: fix socket leak on early demux When an early demuxed packet reaches __udp6_lib_lookup_skb(), the sk reference is retrieved and used, but the relevant reference count is leaked and the socket destructor is never called. Beyond leaking the sk memory, if there are pending UDP packets in the receive queue, even the related accounted memory is leaked. In the long run, this will cause persistent forward allocation errors and no UDP skbs (both ipv4 and ipv6) will be able to reach the user-space. Fix this by explicitly accessing the early demux reference before the lookup, and properly decreasing the socket reference count after usage. Also drop the skb_steal_sock() in __udp6_lib_lookup_skb(), and the now obsoleted comment about "socket cache". The newly added code is derived from the current ipv4 code for the similar path. v1 -> v2: fixed the __udp6_lib_rcv() return code for resubmission, as suggested by Eric Reported-by: Sam Edwards Reported-by: Marc Haber Fixes: 5425077d73e0 ("net: ipv6: Add early demux handler for UDP unicast") Signed-off-by: Paolo Abeni Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/udp.h | 1 + net/ipv4/udp.c | 3 ++- net/ipv6/udp.c | 27 ++++++++++++++++++--------- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/net/udp.h b/include/net/udp.h index 56ce2d2a612d..cc8036987dcb 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -260,6 +260,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, } void udp_v4_early_demux(struct sk_buff *skb); +void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fac7cb9e3b0f..e6276fa3750b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1928,7 +1928,7 @@ drop: /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ -static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; @@ -1937,6 +1937,7 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) dst_release(old); } } +EXPORT_SYMBOL(udp_sk_rx_dst_set); /* * Multicasts and broadcasts go to each listener. diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4a3e65626e8b..98fe4560e24c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -291,11 +291,7 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, struct udp_table *udptable) { const struct ipv6hdr *iph = ipv6_hdr(skb); - struct sock *sk; - sk = skb_steal_sock(skb); - if (unlikely(sk)) - return sk; return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), udptable, skb); @@ -804,6 +800,24 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp6_csum_init(skb, uh, proto)) goto csum_error; + /* Check if the socket is already available, e.g. due to early demux */ + sk = skb_steal_sock(skb); + if (sk) { + struct dst_entry *dst = skb_dst(skb); + int ret; + + if (unlikely(sk->sk_rx_dst != dst)) + udp_sk_rx_dst_set(sk, dst); + + ret = udpv6_queue_rcv_skb(sk, skb); + sock_put(sk); + + /* a return value > 0 means to resubmit the input */ + if (ret > 0) + return ret; + return 0; + } + /* * Multicast receive code */ @@ -812,11 +826,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, saddr, daddr, udptable, proto); /* Unicast */ - - /* - * check socket cache ... must talk to Alan about his plans - * for sock caches... i'll skip this for now. - */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) { int ret; From 89b096898a8450b0a5b97d521e000ae9f94f81f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 27 Jul 2017 21:02:46 +0200 Subject: [PATCH 63/76] bpf: don't indicate success when copy_from_user fails err in bpf_prog_get_info_by_fd() still holds 0 at that time from prior check_uarg_tail_zero() check. Explicitly return -EFAULT instead, so user space can be notified of buggy behavior. Fixes: 1e2709769086 ("bpf: Add BPF_OBJ_GET_INFO_BY_FD") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 045646da97cc..84bb39975ad4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1289,7 +1289,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info_len = min_t(u32, sizeof(info), info_len); if (copy_from_user(&info, uinfo, info_len)) - return err; + return -EFAULT; info.type = prog->type; info.id = prog->aux->id; From b103ec73b27ad385241c806a21f8e2bdeae2f13a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 27 Jul 2017 23:15:09 +0100 Subject: [PATCH 64/76] net: tc35815: fix spelling mistake: "Intterrupt" -> "Interrupt" Trivial fix to spelling mistake in printk message Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/toshiba/tc35815.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/toshiba/tc35815.c b/drivers/net/ethernet/toshiba/tc35815.c index d9db8a06afd2..cce9c9ed46aa 100644 --- a/drivers/net/ethernet/toshiba/tc35815.c +++ b/drivers/net/ethernet/toshiba/tc35815.c @@ -1338,7 +1338,7 @@ static int tc35815_send_packet(struct sk_buff *skb, struct net_device *dev) static void tc35815_fatal_error_interrupt(struct net_device *dev, u32 status) { static int count; - printk(KERN_WARNING "%s: Fatal Error Intterrupt (%#x):", + printk(KERN_WARNING "%s: Fatal Error Interrupt (%#x):", dev->name, status); if (status & Int_IntPCI) printk(" IntPCI"); From efe967cdec32af93e15839cc639695ec5f637771 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 28 Jul 2017 16:41:37 +0200 Subject: [PATCH 65/76] tcp: avoid bogus gcc-7 array-bounds warning When using CONFIG_UBSAN_SANITIZE_ALL, the TCP code produces a false-positive warning: net/ipv4/tcp_output.c: In function 'tcp_connect': net/ipv4/tcp_output.c:2207:40: error: array subscript is below array bounds [-Werror=array-bounds] tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; ^~ net/ipv4/tcp_output.c:2207:40: error: array subscript is below array bounds [-Werror=array-bounds] tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~ I have opened a gcc bug for this, but distros have already shipped compilers with this problem, and it's not clear yet whether there is a way for gcc to avoid the warning. As the problem is related to the bitfield access, this introduces a temporary variable to store the old enum value. I did not notice this warning earlier, since UBSAN is disabled when building with COMPILE_TEST, and that was always turned on in both allmodconfig and randconfig tests. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81601 Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4e985dea1dd2..2f1588bf73da 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2202,9 +2202,10 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { const u32 now = tcp_jiffies32; + enum tcp_chrono old = tp->chrono_type; - if (tp->chrono_type > TCP_CHRONO_UNSPEC) - tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; + if (old > TCP_CHRONO_UNSPEC) + tp->chrono_stat[old - 1] += now - tp->chrono_start; tp->chrono_start = now; tp->chrono_type = new; } From 9975a54b3c9ecf029cbf5dd7a8c9701b1d74029e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 28 Jul 2017 17:05:25 +0200 Subject: [PATCH 66/76] bpf: fix bpf_prog_get_info_by_fd to dump correct xlated_prog_len bpf_prog_size(prog->len) is not the correct length we want to dump back to user space. The code in bpf_prog_get_info_by_fd() uses this to copy prog->insnsi to user space, but bpf_prog_size(prog->len) also includes the size of struct bpf_prog itself plus program instructions and is usually used either in context of accounting or for bpf_prog_alloc() et al, thus we copy out of bounds in bpf_prog_get_info_by_fd() potentially. Use the correct bpf_prog_insn_size() instead. Fixes: 1e2709769086 ("bpf: Add BPF_OBJ_GET_INFO_BY_FD") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 84bb39975ad4..6c772adabad2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1312,7 +1312,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.xlated_prog_len; - info.xlated_prog_len = bpf_prog_size(prog->len); + info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); From 96a734bae00f4ea46d0f3f3a8e347c31d24489bd Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 27 Jul 2017 17:26:00 +0100 Subject: [PATCH 67/76] sunhme: fix up GREG_STAT and GREG_IMASK register offsets Update the values to match those from the STP2002QFP documentation. Signed-off-by: Mark Cave-Ayland Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/sunhme.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/sun/sunhme.h b/drivers/net/ethernet/sun/sunhme.h index 3af540adb3c5..fca1bca7f69d 100644 --- a/drivers/net/ethernet/sun/sunhme.h +++ b/drivers/net/ethernet/sun/sunhme.h @@ -13,9 +13,9 @@ /* Happy Meal global registers. */ #define GREG_SWRESET 0x000UL /* Software Reset */ #define GREG_CFG 0x004UL /* Config Register */ -#define GREG_STAT 0x108UL /* Status */ -#define GREG_IMASK 0x10cUL /* Interrupt Mask */ -#define GREG_REG_SIZE 0x110UL +#define GREG_STAT 0x100UL /* Status */ +#define GREG_IMASK 0x104UL /* Interrupt Mask */ +#define GREG_REG_SIZE 0x108UL /* Global reset register. */ #define GREG_RESET_ETX 0x01 From 7ad813f208533cebfcc32d3d7474dc1677d1b09a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 28 Jul 2017 11:58:36 -0700 Subject: [PATCH 68/76] net: phy: Correctly process PHY_HALTED in phy_stop_machine() Marc reported that he was not getting the PHY library adjust_link() callback function to run when calling phy_stop() + phy_disconnect() which does not indeed happen because we set the state machine to PHY_HALTED but we don't get to run it to process this state past that point. Fix this with a synchronous call to phy_state_machine() in order to have the state machine actually act on PHY_HALTED, set the PHY device's link down, turn the network device's carrier off and finally call the adjust_link() function. Reported-by: Marc Gonzalez Fixes: a390d1f379cf ("phylib: convert state_queue work to delayed_work") Signed-off-by: Florian Fainelli Signed-off-by: Marc Gonzalez Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index d0626bf5c540..5068c582d502 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -749,6 +749,9 @@ void phy_stop_machine(struct phy_device *phydev) if (phydev->state > PHY_UP && phydev->state != PHY_HALTED) phydev->state = PHY_UP; mutex_unlock(&phydev->lock); + + /* Now we can run the state machine synchronously */ + phy_state_machine(&phydev->state_queue.work); } /** From 71ed7ee35ad2c5300f4b51634185a0193b4fb0fa Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 28 Jul 2017 23:27:44 +0300 Subject: [PATCH 69/76] ipv4: fib: Fix NULL pointer deref during fib_sync_down_dev() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Michał reported a NULL pointer deref during fib_sync_down_dev() when unregistering a netdevice. The problem is that we don't check for 'in_dev' being NULL, which can happen in very specific cases. Usually routes are flushed upon NETDEV_DOWN sent in either the netdev or the inetaddr notification chains. However, if an interface isn't configured with any IP address, then it's possible for host routes to be flushed following NETDEV_UNREGISTER, after NULLing dev->ip_ptr in inetdev_destroy(). To reproduce: $ ip link add type dummy $ ip route add local 1.1.1.0/24 dev dummy0 $ ip link del dev dummy0 Fix this by checking for the presence of 'in_dev' before referencing it. Fixes: 982acb97560c ("ipv4: fib: Notify about nexthop status changes") Signed-off-by: Ido Schimmel Reported-by: Michał Mirosław Tested-by: Michał Mirosław Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 222100103808..b8d18171cca3 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1452,7 +1452,7 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh, return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type, &info.info); case FIB_EVENT_NH_DEL: - if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + if ((in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && fib_nh->nh_flags & RTNH_F_LINKDOWN) || (fib_nh->nh_flags & RTNH_F_DEAD)) return call_fib_notifiers(dev_net(fib_nh->nh_dev), From 13332db54fb5a821c86432238d33be1ff0429394 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 31 Jul 2017 09:47:50 -0700 Subject: [PATCH 70/76] MAINTAINERS: Add more files to the PHY LIBRARY section Include missing files that are provided by, used, or directly maintained within the PHY LIBRARY, this include uapi header, header files used by Device Tree code etc. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- MAINTAINERS | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 297e610c9163..a7a392a74859 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5138,12 +5138,20 @@ M: Andrew Lunn M: Florian Fainelli L: netdev@vger.kernel.org S: Maintained -F: include/linux/phy.h -F: include/linux/phy_fixed.h -F: drivers/net/phy/ +F: Documentation/ABI/testing/sysfs-bus-mdio +F: Documentation/devicetree/bindings/net/mdio* F: Documentation/networking/phy.txt +F: drivers/net/phy/ F: drivers/of/of_mdio.c F: drivers/of/of_net.c +F: include/linux/*mdio*.h +F: include/linux/of_net.h +F: include/linux/phy.h +F: include/linux/phy_fixed.h +F: include/linux/platform_data/mdio-gpio.h +F: include/trace/events/mdio.h +F: include/uapi/linux/mdio.h +F: include/uapi/linux/mii.h EXT2 FILE SYSTEM M: Jan Kara From cfbcb61f6c2ac9aeb7731d2ec59fe3bf53786c49 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 29 Jul 2017 22:18:41 +0300 Subject: [PATCH 71/76] mv643xx_eth: fix of_irq_to_resource() error check of_irq_to_resource() has recently been fixed to return negative error #'s along with 0 in case of failure, however the Marvell MV643xx Ethernet driver still only regards 0 as invalid IRQ -- fix it up. Fixes: 7a4228bbff76 ("of: irq: use of_irq_get() in of_irq_to_resource()") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mv643xx_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 5794d98d946f..9c94ea9b2b80 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2734,7 +2734,7 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, ppd.shared = pdev; memset(&res, 0, sizeof(res)); - if (!of_irq_to_resource(pnp, 0, &res)) { + if (of_irq_to_resource(pnp, 0, &res) <= 0) { dev_err(&pdev->dev, "missing interrupt on %s\n", pnp->name); return -EINVAL; } From 1daa8790d0280d2c719658e39bd59fce65efa909 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 31 Jul 2017 21:49:49 +0300 Subject: [PATCH 72/76] virtio_net: fix truesize for mergeable buffers Seth Forshee noticed a performance degradation with some workloads. This turns out to be due to packet drops. Euan Kemp noticed that this is because we drop all packets where length exceeds the truesize, but for some packets we add in extra memory without updating the truesize. This in turn was kept around unchanged from ab7db91705e95 ("virtio-net: auto-tune mergeable rx buffer size for improved performance"). That commit had an internal reason not to account for the extra space: not enough bits to do it. No longer true so let's account for the allocated length exactly. Many thanks to Seth Forshee for the report and bisecting and Euan Kemp for debugging the issue. Fixes: 680557cf79f8 ("virtio_net: rework mergeable buffer handling") Reported-by: Euan Kemp Tested-by: Euan Kemp Reported-by: Seth Forshee Tested-by: Seth Forshee Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 99a26a9efec1..075dc18829ab 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -889,21 +889,20 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; buf += headroom; /* advance address leaving hole at front of pkt */ - ctx = (void *)(unsigned long)len; get_page(alloc_frag->page); alloc_frag->offset += len + headroom; hole = alloc_frag->size - alloc_frag->offset; if (hole < len + headroom) { /* To avoid internal fragmentation, if there is very likely not * enough space for another buffer, add the remaining space to - * the current buffer. This extra space is not included in - * the truesize stored in ctx. + * the current buffer. */ len += hole; alloc_frag->offset += hole; } sg_init_one(rq->sg, buf, len); + ctx = (void *)(unsigned long)len; err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) put_page(virt_to_head_page(buf)); From 00d51094b8a5856ae790048a332ae24f1dd7c994 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 31 Jul 2017 11:05:32 -0700 Subject: [PATCH 73/76] Revert "net: bcmgenet: Remove init parameter from bcmgenet_mii_config" This reverts commit 28b45910ccda ("net: bcmgenet: Remove init parameter from bcmgenet_mii_config") because in the process of moving from dev_info() to dev_info_once() we essentially lost the helpful printed messages once the second instance of the driver is loaded. dev_info_once() does not actually print the message once per device instance, but once period. Fixes: 28b45910ccda ("net: bcmgenet: Remove init parameter from bcmgenet_mii_config") Signed-off-by: Florian Fainelli Reviewed-by: Doug Berger Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 2 +- drivers/net/ethernet/broadcom/genet/bcmgenet.h | 2 +- drivers/net/ethernet/broadcom/genet/bcmmii.c | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 7b0b399aaedd..a981c4ee9d72 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -3669,7 +3669,7 @@ static int bcmgenet_resume(struct device *d) phy_init_hw(priv->phydev); /* Speed settings must be restored */ - bcmgenet_mii_config(priv->dev); + bcmgenet_mii_config(priv->dev, false); /* disable ethernet MAC while updating its registers */ umac_enable_set(priv, CMD_TX_EN | CMD_RX_EN, false); diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h index b9344de669f8..3a34fdba5301 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h @@ -698,7 +698,7 @@ GENET_IO_MACRO(rbuf, GENET_RBUF_OFF); /* MDIO routines */ int bcmgenet_mii_init(struct net_device *dev); -int bcmgenet_mii_config(struct net_device *dev); +int bcmgenet_mii_config(struct net_device *dev, bool init); int bcmgenet_mii_probe(struct net_device *dev); void bcmgenet_mii_exit(struct net_device *dev); void bcmgenet_mii_reset(struct net_device *dev); diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index 071fcbd14e6a..30cb97b4a1d7 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -238,7 +238,7 @@ static void bcmgenet_moca_phy_setup(struct bcmgenet_priv *priv) bcmgenet_fixed_phy_link_update); } -int bcmgenet_mii_config(struct net_device *dev) +int bcmgenet_mii_config(struct net_device *dev, bool init) { struct bcmgenet_priv *priv = netdev_priv(dev); struct phy_device *phydev = priv->phydev; @@ -327,7 +327,8 @@ int bcmgenet_mii_config(struct net_device *dev) bcmgenet_ext_writel(priv, reg, EXT_RGMII_OOB_CTRL); } - dev_info_once(kdev, "configuring instance for %s\n", phy_name); + if (init) + dev_info(kdev, "configuring instance for %s\n", phy_name); return 0; } @@ -375,7 +376,7 @@ int bcmgenet_mii_probe(struct net_device *dev) * PHY speed which is needed for bcmgenet_mii_config() to configure * things appropriately. */ - ret = bcmgenet_mii_config(dev); + ret = bcmgenet_mii_config(dev, true); if (ret) { phy_disconnect(priv->phydev); return ret; From ddab82821fa633da768879fc515e76dcb1207151 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Mon, 31 Jul 2017 18:07:38 +0800 Subject: [PATCH 74/76] ppp: Fix a scheduling-while-atomic bug in del_chan The PPTP set the pptp_sock_destruct as the sock's sk_destruct, it would trigger this bug when __sk_free is invoked in atomic context, because of the call path pptp_sock_destruct->del_chan->synchronize_rcu. Now move the synchronize_rcu to pptp_release from del_chan. This is the only one case which would free the sock and need the synchronize_rcu. The following is the panic I met with kernel 3.3.8, but this issue should exist in current kernel too according to the codes. BUG: scheduling while atomic __schedule_bug+0x5e/0x64 __schedule+0x55/0x580 ? ppp_unregister_channel+0x1cd5/0x1de0 [ppp_generic] ? dev_hard_start_xmit+0x423/0x530 ? sch_direct_xmit+0x73/0x170 __cond_resched+0x16/0x30 _cond_resched+0x22/0x30 wait_for_common+0x18/0x110 ? call_rcu_bh+0x10/0x10 wait_for_completion+0x12/0x20 wait_rcu_gp+0x34/0x40 ? wait_rcu_gp+0x40/0x40 synchronize_sched+0x1e/0x20 0xf8417298 0xf8417484 ? sock_queue_rcv_skb+0x109/0x130 __sk_free+0x16/0x110 ? udp_queue_rcv_skb+0x1f2/0x290 sk_free+0x16/0x20 __udp4_lib_rcv+0x3b8/0x650 Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- drivers/net/ppp/pptp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index eac499c58aa7..6dde9a0cfe76 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -131,7 +131,6 @@ static void del_chan(struct pppox_sock *sock) clear_bit(sock->proto.pptp.src_addr.call_id, callid_bitmap); RCU_INIT_POINTER(callid_sock[sock->proto.pptp.src_addr.call_id], NULL); spin_unlock(&chan_lock); - synchronize_rcu(); } static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) @@ -520,6 +519,7 @@ static int pptp_release(struct socket *sock) po = pppox_sk(sk); del_chan(po); + synchronize_rcu(); pppox_unbind_sock(sk); sk->sk_state = PPPOX_DEAD; From cb891fa6a1d5f52c5f5c07b6f7f4c6d65ea55fc0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 31 Jul 2017 16:52:36 +0200 Subject: [PATCH 75/76] udp6: fix jumbogram reception Since commit 67a51780aebb ("ipv6: udp: leverage scratch area helpers") udp6_recvmsg() read the skb len from the scratch area, to avoid a cache miss. But the UDP6 rx path support RFC 2675 UDPv6 jumbograms, and their length exceeds the 16 bits available in the scratch area. As a side effect the length returned by recvmsg() is: % (1<<16) This commit addresses the issue allocating one more bit in the IP6CB flags field and setting it for incoming jumbograms. Such field is still in the first cacheline, so at recvmsg() time we can check it and fallback to access skb->len if required, without a measurable overhead. Fixes: 67a51780aebb ("ipv6: udp: leverage scratch area helpers") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/ipv6.h | 6 ++++++ net/ipv6/exthdrs.c | 1 + net/ipv6/udp.c | 11 ++++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index e1b442996f81..474d6bbc158c 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -128,6 +128,7 @@ struct inet6_skb_parm { #define IP6SKB_FRAGMENTED 16 #define IP6SKB_HOPBYHOP 32 #define IP6SKB_L3SLAVE 64 +#define IP6SKB_JUMBOGRAM 128 }; #if defined(CONFIG_NET_L3_MASTER_DEV) @@ -152,6 +153,11 @@ static inline int inet6_iif(const struct sk_buff *skb) return l3_slave ? skb->skb_iif : IP6CB(skb)->iif; } +static inline bool inet6_is_jumbogram(const struct sk_buff *skb) +{ + return !!(IP6CB(skb)->flags & IP6SKB_JUMBOGRAM); +} + /* can not be used in TCP layer after tcp_v6_fill_cb */ static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb) { diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 4996d734f1d2..3cec529c6113 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -756,6 +756,7 @@ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto drop; + IP6CB(skb)->flags |= IP6SKB_JUMBOGRAM; return true; drop: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 98fe4560e24c..578142b7ca3e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -328,6 +328,15 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be EXPORT_SYMBOL_GPL(udp6_lib_lookup); #endif +/* do not use the scratch area len for jumbogram: their length execeeds the + * scratch area space; note that the IP6CB flags is still in the first + * cacheline, so checking for jumbograms is cheap + */ +static int udp6_skb_len(struct sk_buff *skb) +{ + return unlikely(inet6_is_jumbogram(skb)) ? skb->len : udp_skb_len(skb); +} + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -358,7 +367,7 @@ try_again: if (!skb) return err; - ulen = udp_skb_len(skb); + ulen = udp6_skb_len(skb); copied = len; if (copied > ulen - off) copied = ulen - off; From cc75f8514db6a3aec517760fccaf954e5b46478c Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 31 Jul 2017 14:40:50 -0700 Subject: [PATCH 76/76] samples/bpf: fix bpf tunnel cleanup test_tunnel_bpf.sh fails to remove the vxlan11 tunnel device, causing the next geneve tunnelling test case fails. In addition, the geneve reserved bit in tcbpf2_kern.c should be zero, according to the RFC. Signed-off-by: William Tu Signed-off-by: David S. Miller --- samples/bpf/tcbpf2_kern.c | 4 ++-- samples/bpf/test_tunnel_bpf.sh | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c index 9c823a609e75..270edcc149a1 100644 --- a/samples/bpf/tcbpf2_kern.c +++ b/samples/bpf/tcbpf2_kern.c @@ -147,9 +147,9 @@ int _geneve_set_tunnel(struct __sk_buff *skb) __builtin_memset(&gopt, 0x0, sizeof(gopt)); gopt.opt_class = 0x102; /* Open Virtual Networking (OVN) */ gopt.type = 0x08; - gopt.r1 = 1; + gopt.r1 = 0; gopt.r2 = 0; - gopt.r3 = 1; + gopt.r3 = 0; gopt.length = 2; /* 4-byte multiple */ *(int *) &gopt.opt_data = 0xdeadbeef; diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh index 1ff634f187b7..a70d2ea90313 100755 --- a/samples/bpf/test_tunnel_bpf.sh +++ b/samples/bpf/test_tunnel_bpf.sh @@ -149,6 +149,7 @@ function cleanup { ip link del veth1 ip link del ipip11 ip link del gretap11 + ip link del vxlan11 ip link del geneve11 pkill tcpdump pkill cat