From 06799a9085e12a778fe2851db550ab5911ad28fe Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 31 Jul 2022 15:41:05 +0300 Subject: [PATCH 1/4] net: bonding: replace dev_trans_start() with the jiffies of the last ARP/NS The bonding driver piggybacks on time stamps kept by the network stack for the purpose of the netdev TX watchdog, and this is problematic because it does not work with NETIF_F_LLTX devices. It is hard to say why the driver looks at dev_trans_start() of the slave->dev, considering that this is updated even by non-ARP/NS probes sent by us, and even by traffic not sent by us at all (for example PTP on physical slave devices). ARP monitoring in active-backup mode appears to still work even if we track only the last TX time of actual ARP probes. Signed-off-by: Vladimir Oltean Acked-by: Jay Vosburgh Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 35 +++++++++++++++++++-------------- include/net/bonding.h | 13 +++++++++++- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e75acb14d066..ab7fdbbc2530 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2001,6 +2001,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) new_slave->target_last_arp_rx[i] = new_slave->last_rx; + new_slave->last_tx = new_slave->last_rx; + if (bond->params.miimon && !bond->params.use_carrier) { link_reporting = bond_check_dev_link(bond, slave_dev, 1); @@ -2884,8 +2886,11 @@ static void bond_arp_send(struct slave *slave, int arp_op, __be32 dest_ip, return; } - if (bond_handle_vlan(slave, tags, skb)) + if (bond_handle_vlan(slave, tags, skb)) { + slave_update_last_tx(slave); arp_xmit(skb); + } + return; } @@ -3074,8 +3079,7 @@ static int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, curr_active_slave->last_link_up)) bond_validate_arp(bond, slave, tip, sip); else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) && - bond_time_in_interval(bond, - dev_trans_start(curr_arp_slave->dev), 1)) + bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1)) bond_validate_arp(bond, slave, sip, tip); out_unlock: @@ -3103,8 +3107,10 @@ static void bond_ns_send(struct slave *slave, const struct in6_addr *daddr, } addrconf_addr_solict_mult(daddr, &mcaddr); - if (bond_handle_vlan(slave, tags, skb)) + if (bond_handle_vlan(slave, tags, skb)) { + slave_update_last_tx(slave); ndisc_send_skb(skb, &mcaddr, saddr); + } } static void bond_ns_send_all(struct bonding *bond, struct slave *slave) @@ -3246,8 +3252,7 @@ static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond, curr_active_slave->last_link_up)) bond_validate_ns(bond, slave, saddr, daddr); else if (curr_arp_slave && - bond_time_in_interval(bond, - dev_trans_start(curr_arp_slave->dev), 1)) + bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1)) bond_validate_ns(bond, slave, saddr, daddr); out: @@ -3335,12 +3340,12 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) * so it can wait */ bond_for_each_slave_rcu(bond, slave, iter) { - unsigned long trans_start = dev_trans_start(slave->dev); + unsigned long last_tx = slave_last_tx(slave); bond_propose_link_state(slave, BOND_LINK_NOCHANGE); if (slave->link != BOND_LINK_UP) { - if (bond_time_in_interval(bond, trans_start, 1) && + if (bond_time_in_interval(bond, last_tx, 1) && bond_time_in_interval(bond, slave->last_rx, 1)) { bond_propose_link_state(slave, BOND_LINK_UP); @@ -3365,7 +3370,7 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) * when the source ip is 0, so don't take the link down * if we don't know our ip yet */ - if (!bond_time_in_interval(bond, trans_start, bond->params.missed_max) || + if (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) || !bond_time_in_interval(bond, slave->last_rx, bond->params.missed_max)) { bond_propose_link_state(slave, BOND_LINK_DOWN); @@ -3431,7 +3436,7 @@ re_arm: */ static int bond_ab_arp_inspect(struct bonding *bond) { - unsigned long trans_start, last_rx; + unsigned long last_tx, last_rx; struct list_head *iter; struct slave *slave; int commit = 0; @@ -3482,9 +3487,9 @@ static int bond_ab_arp_inspect(struct bonding *bond) * - (more than missed_max*delta since receive AND * the bond has an IP address) */ - trans_start = dev_trans_start(slave->dev); + last_tx = slave_last_tx(slave); if (bond_is_active_slave(slave) && - (!bond_time_in_interval(bond, trans_start, bond->params.missed_max) || + (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) || !bond_time_in_interval(bond, last_rx, bond->params.missed_max))) { bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; @@ -3501,8 +3506,8 @@ static int bond_ab_arp_inspect(struct bonding *bond) */ static void bond_ab_arp_commit(struct bonding *bond) { - unsigned long trans_start; struct list_head *iter; + unsigned long last_tx; struct slave *slave; bond_for_each_slave(bond, slave, iter) { @@ -3511,10 +3516,10 @@ static void bond_ab_arp_commit(struct bonding *bond) continue; case BOND_LINK_UP: - trans_start = dev_trans_start(slave->dev); + last_tx = slave_last_tx(slave); if (rtnl_dereference(bond->curr_active_slave) != slave || (!rtnl_dereference(bond->curr_active_slave) && - bond_time_in_interval(bond, trans_start, 1))) { + bond_time_in_interval(bond, last_tx, 1))) { struct slave *current_arp_slave; current_arp_slave = rtnl_dereference(bond->current_arp_slave); diff --git a/include/net/bonding.h b/include/net/bonding.h index 6e78d657aa05..afd606df149a 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -161,8 +161,9 @@ struct slave { struct net_device *dev; /* first - useful for panic debug */ struct bonding *bond; /* our master */ int delay; - /* all three in jiffies */ + /* all 4 in jiffies */ unsigned long last_link_up; + unsigned long last_tx; unsigned long last_rx; unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; s8 link; /* one of BOND_LINK_XXXX */ @@ -540,6 +541,16 @@ static inline unsigned long slave_last_rx(struct bonding *bond, return slave->last_rx; } +static inline void slave_update_last_tx(struct slave *slave) +{ + WRITE_ONCE(slave->last_tx, jiffies); +} + +static inline unsigned long slave_last_tx(struct slave *slave) +{ + return READ_ONCE(slave->last_tx); +} + #ifdef CONFIG_NET_POLL_CONTROLLER static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) From 4873a1b2024dce9a501b56d1039ff0752027a92e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 31 Jul 2022 15:41:06 +0300 Subject: [PATCH 2/4] net/sched: remove hacks added to dev_trans_start() for bonding to work Now that the bonding driver keeps track of the last TX time of ARP and NS probes, we effectively revert the following commits: 32d3e51a82d4 ("net_sched: use macvlan real dev trans_start in dev_trans_start()") 07ce76aa9bcf ("net_sched: make dev_trans_start return vlan's real dev trans_start") Note that the approach of continuing to hack at this function would not get us very far, hence the desire to take a different approach. DSA is also a virtual device that uses NETIF_F_LLTX, but there, many uppers share the same lower (DSA master, i.e. the physical host port of a switch). By making dev_trans_start() on a DSA interface return the dev_trans_start() of the master, we effectively assume that all other DSA interfaces are silent, otherwise this corrupts the validity of the probe timestamp data from the bonding driver's perspective. Furthermore, the hacks didn't take into consideration the fact that the lower interface of @dev may not have been physical either. For example, VLAN over VLAN, or DSA with 2 masters in a LAG. And even furthermore, there are NETIF_F_LLTX devices which are not stacked, like veth. The hack here would not work with those, because it would not have to provide the bonding driver something to chew at all. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- net/sched/sch_generic.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cc6eabee2830..d47b9689eba6 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -427,14 +427,10 @@ void __qdisc_run(struct Qdisc *q) unsigned long dev_trans_start(struct net_device *dev) { - unsigned long val, res; + unsigned long res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start); + unsigned long val; unsigned int i; - if (is_vlan_dev(dev)) - dev = vlan_dev_real_dev(dev); - else if (netif_is_macvlan(dev)) - dev = macvlan_dev_real_dev(dev); - res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start); for (i = 1; i < dev->num_tx_queues; i++) { val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start); if (val && time_after(val, res)) From 08b403d5bf07d9e0d97ba12649b198eee42f826d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 31 Jul 2022 15:41:07 +0300 Subject: [PATCH 3/4] Revert "veth: Add updating of trans_start" This reverts commit e66e257a5d8368d9c0ba13d4630f474436533e8b. The veth driver no longer needs these hacks which are slightly detrimential to the fast path performance, because the bonding driver is keeping track of TX times of ARP and NS probes by itself, which it should. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/veth.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 2cb833b3006a..466da01ba2e3 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -312,7 +312,6 @@ static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); - struct netdev_queue *queue = NULL; struct veth_rq *rq = NULL; struct net_device *rcv; int length = skb->len; @@ -330,7 +329,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) rxq = skb_get_queue_mapping(skb); if (rxq < rcv->real_num_rx_queues) { rq = &rcv_priv->rq[rxq]; - queue = netdev_get_tx_queue(dev, rxq); /* The napi pointer is available when an XDP program is * attached or when GRO is enabled @@ -342,8 +340,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) skb_tx_timestamp(skb); if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { - if (queue) - txq_trans_cond_update(queue); if (!use_napi) dev_lstats_add(dev, length); } else { From cba8d8f57dfb1d01d961a0e50e7fddb82df57ad7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 31 Jul 2022 15:41:08 +0300 Subject: [PATCH 4/4] docs: net: bonding: remove mentions of trans_start ARP monitoring no longer depends on dev->last_rx or dev_trans_start(), so delete this information. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- Documentation/networking/bonding.rst | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index 53a18ff7cf23..7823a069a903 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -1982,15 +1982,6 @@ uses the response as an indication that the link is operating. This gives some assurance that traffic is actually flowing to and from one or more peers on the local network. -The ARP monitor relies on the device driver itself to verify -that traffic is flowing. In particular, the driver must keep up to -date the last receive time, dev->last_rx. Drivers that use NETIF_F_LLTX -flag must also update netdev_queue->trans_start. If they do not, then the -ARP monitor will immediately fail any slaves using that driver, and -those slaves will stay down. If networking monitoring (tcpdump, etc) -shows the ARP requests and replies on the network, then it may be that -your device driver is not updating last_rx and trans_start. - 7.2 Configuring Multiple ARP Targets ------------------------------------