diff --git a/Documentation/networking/failover.rst b/Documentation/networking/failover.rst
new file mode 100644
index 000000000000..f0c8483cdbf5
--- /dev/null
+++ b/Documentation/networking/failover.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========
+FAILOVER
+========
+
+Overview
+========
+
+The failover module provides a generic interface for paravirtual drivers
+to register a netdev and a set of ops with a failover instance. The ops
+are used as event handlers that get called to handle netdev register/
+unregister/link change/name change events on slave pci ethernet devices
+with the same mac address as the failover netdev.
+
+This enables paravirtual drivers to use a VF as an accelerated low latency
+datapath. It also allows live migration of VMs with direct attached VFs by
+failing over to the paravirtual datapath when the VF is unplugged.
diff --git a/Documentation/networking/net_failover.rst b/Documentation/networking/net_failover.rst
new file mode 100644
index 000000000000..70ca2f5800c4
--- /dev/null
+++ b/Documentation/networking/net_failover.rst
@@ -0,0 +1,116 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+NET_FAILOVER
+============
+
+Overview
+========
+
+The net_failover driver provides an automated failover mechanism via APIs
+to create and destroy a failover master netdev and mananges a primary and
+standby slave netdevs that get registered via the generic failover
+infrastructrure.
+
+The failover netdev acts a master device and controls 2 slave devices. The
+original paravirtual interface is registered as 'standby' slave netdev and
+a passthru/vf device with the same MAC gets registered as 'primary' slave
+netdev. Both 'standby' and 'failover' netdevs are associated with the same
+'pci' device. The user accesses the network interface via 'failover' netdev.
+The 'failover' netdev chooses 'primary' netdev as default for transmits when
+it is available with link up and running.
+
+This can be used by paravirtual drivers to enable an alternate low latency
+datapath. It also enables hypervisor controlled live migration of a VM with
+direct attached VF by failing over to the paravirtual datapath when the VF
+is unplugged.
+
+virtio-net accelerated datapath: STANDBY mode
+=============================================
+
+net_failover enables hypervisor controlled accelerated datapath to virtio-net
+enabled VMs in a transparent manner with no/minimal guest userspace chanages.
+
+To support this, the hypervisor needs to enable VIRTIO_NET_F_STANDBY
+feature on the virtio-net interface and assign the same MAC address to both
+virtio-net and VF interfaces.
+
+Here is an example XML snippet that shows such configuration.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Booting a VM with the above configuration will result in the following 3
+netdevs created in the VM.
+
+4: ens10: mtu 1500 qdisc noqueue state UP group default qlen 1000
+ link/ether 52:54:00:00:12:53 brd ff:ff:ff:ff:ff:ff
+ inet 192.168.12.53/24 brd 192.168.12.255 scope global dynamic ens10
+ valid_lft 42482sec preferred_lft 42482sec
+ inet6 fe80::97d8:db2:8c10:b6d6/64 scope link
+ valid_lft forever preferred_lft forever
+5: ens10nsby: mtu 1500 qdisc fq_codel master ens10 state UP group default qlen 1000
+ link/ether 52:54:00:00:12:53 brd ff:ff:ff:ff:ff:ff
+7: ens11: mtu 1500 qdisc mq master ens10 state UP group default qlen 1000
+ link/ether 52:54:00:00:12:53 brd ff:ff:ff:ff:ff:ff
+
+ens10 is the 'failover' master netdev, ens10nsby and ens11 are the slave
+'standby' and 'primary' netdevs respectively.
+
+Live Migration of a VM with SR-IOV VF & virtio-net in STANDBY mode
+==================================================================
+
+net_failover also enables hypervisor controlled live migration to be supported
+with VMs that have direct attached SR-IOV VF devices by automatic failover to
+the paravirtual datapath when the VF is unplugged.
+
+Here is a sample script that shows the steps to initiate live migration on
+the source hypervisor.
+
+# cat vf_xml
+
+
+
+
+
+
+
+
+# Source Hypervisor
+#!/bin/bash
+
+DOMAIN=fedora27-tap01
+PF=enp66s0f0
+VF_NUM=5
+TAP_IF=tap01
+VF_XML=
+
+MAC=52:54:00:00:12:53
+ZERO_MAC=00:00:00:00:00:00
+
+virsh domif-setlink $DOMAIN $TAP_IF up
+bridge fdb del $MAC dev $PF master
+virsh detach-device $DOMAIN $VF_XML
+ip link set $PF vf $VF_NUM mac $ZERO_MAC
+
+virsh migrate --live $DOMAIN qemu+ssh://$REMOTE_HOST/system
+
+# Destination Hypervisor
+#!/bin/bash
+
+virsh attach-device $DOMAIN $VF_XML
+virsh domif-setlink $DOMAIN $TAP_IF down
diff --git a/MAINTAINERS b/MAINTAINERS
index f492431b239b..1831ff5863a1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5411,6 +5411,14 @@ S: Maintained
F: Documentation/hwmon/f71805f
F: drivers/hwmon/f71805f.c
+FAILOVER MODULE
+M: Sridhar Samudrala
+L: netdev@vger.kernel.org
+S: Supported
+F: net/core/failover.c
+F: include/net/failover.h
+F: Documentation/networking/failover.rst
+
FANOTIFY
M: Jan Kara
R: Amir Goldstein
@@ -9646,6 +9654,14 @@ S: Maintained
F: Documentation/hwmon/nct6775
F: drivers/hwmon/nct6775.c
+NET_FAILOVER MODULE
+M: Sridhar Samudrala
+L: netdev@vger.kernel.org
+S: Supported
+F: driver/net/net_failover.c
+F: include/net/net_failover.h
+F: Documentation/networking/net_failover.rst
+
NETEFFECT IWARP RNIC DRIVER (IW_NES)
M: Faisal Latif
L: linux-rdma@vger.kernel.org
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index a029b27fd002..d03775100f7d 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -332,6 +332,7 @@ config VETH
config VIRTIO_NET
tristate "Virtio network driver"
depends on VIRTIO
+ select NET_FAILOVER
---help---
This is the virtual network driver for virtio. It can be used with
QEMU based VMMs (like KVM or Xen). Say Y or M.
@@ -510,4 +511,16 @@ config NETDEVSIM
To compile this driver as a module, choose M here: the module
will be called netdevsim.
+config NET_FAILOVER
+ tristate "Failover driver"
+ select FAILOVER
+ help
+ This provides an automated failover mechanism via APIs to create
+ and destroy a failover master netdev and manages a primary and
+ standby slave netdevs that get registered via the generic failover
+ infrastructure. This can be used by paravirtual drivers to enable
+ an alternate low latency datapath. It alsoenables live migration of
+ a VM with direct attached VF by failing over to the paravirtual
+ datapath when the VF is unplugged.
+
endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 91e67e375dd4..21cde7e78621 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_FUJITSU_ES) += fjes/
thunderbolt-net-y += thunderbolt.o
obj-$(CONFIG_THUNDERBOLT_NET) += thunderbolt-net.o
obj-$(CONFIG_NETDEVSIM) += netdevsim/
+obj-$(CONFIG_NET_FAILOVER) += net_failover.o
diff --git a/drivers/net/hyperv/Kconfig b/drivers/net/hyperv/Kconfig
index 0765d5f61714..23a2d145813a 100644
--- a/drivers/net/hyperv/Kconfig
+++ b/drivers/net/hyperv/Kconfig
@@ -2,5 +2,6 @@ config HYPERV_NET
tristate "Microsoft Hyper-V virtual network driver"
depends on HYPERV
select UCS2_STRING
+ select FAILOVER
help
Select this option to enable the Hyper-V virtual network driver.
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 1be34d2e3563..99d8e7398a5b 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -932,6 +932,8 @@ struct net_device_context {
u32 vf_alloc;
/* Serial number of the VF to team with */
u32 vf_serial;
+
+ struct failover *failover;
};
/* Per channel data */
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 60a5769ef5a1..ebe964203eff 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -43,6 +43,7 @@
#include
#include
#include
+#include
#include "hyperv_net.h"
@@ -1779,46 +1780,6 @@ out_unlock:
rtnl_unlock();
}
-static struct net_device *get_netvsc_bymac(const u8 *mac)
-{
- struct net_device *dev;
-
- ASSERT_RTNL();
-
- for_each_netdev(&init_net, dev) {
- if (dev->netdev_ops != &device_ops)
- continue; /* not a netvsc device */
-
- if (ether_addr_equal(mac, dev->perm_addr))
- return dev;
- }
-
- return NULL;
-}
-
-static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
-{
- struct net_device *dev;
-
- ASSERT_RTNL();
-
- for_each_netdev(&init_net, dev) {
- struct net_device_context *net_device_ctx;
-
- if (dev->netdev_ops != &device_ops)
- continue; /* not a netvsc device */
-
- net_device_ctx = netdev_priv(dev);
- if (!rtnl_dereference(net_device_ctx->nvdev))
- continue; /* device is removed */
-
- if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev)
- return dev; /* a match */
- }
-
- return NULL;
-}
-
/* Called when VF is injecting data into network stack.
* Change the associated network device from VF to netvsc.
* note: already called with rcu_read_lock
@@ -1841,46 +1802,6 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
return RX_HANDLER_ANOTHER;
}
-static int netvsc_vf_join(struct net_device *vf_netdev,
- struct net_device *ndev)
-{
- struct net_device_context *ndev_ctx = netdev_priv(ndev);
- int ret;
-
- ret = netdev_rx_handler_register(vf_netdev,
- netvsc_vf_handle_frame, ndev);
- if (ret != 0) {
- netdev_err(vf_netdev,
- "can not register netvsc VF receive handler (err = %d)\n",
- ret);
- goto rx_handler_failed;
- }
-
- ret = netdev_master_upper_dev_link(vf_netdev, ndev,
- NULL, NULL, NULL);
- if (ret != 0) {
- netdev_err(vf_netdev,
- "can not set master device %s (err = %d)\n",
- ndev->name, ret);
- goto upper_link_failed;
- }
-
- /* set slave flag before open to prevent IPv6 addrconf */
- vf_netdev->flags |= IFF_SLAVE;
-
- schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
-
- call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
-
- netdev_info(vf_netdev, "joined to %s\n", ndev->name);
- return 0;
-
-upper_link_failed:
- netdev_rx_handler_unregister(vf_netdev);
-rx_handler_failed:
- return ret;
-}
-
static void __netvsc_vf_setup(struct net_device *ndev,
struct net_device *vf_netdev)
{
@@ -1931,85 +1852,95 @@ static void netvsc_vf_setup(struct work_struct *w)
rtnl_unlock();
}
-static int netvsc_register_vf(struct net_device *vf_netdev)
+static int netvsc_pre_register_vf(struct net_device *vf_netdev,
+ struct net_device *ndev)
{
- struct net_device *ndev;
struct net_device_context *net_device_ctx;
struct netvsc_device *netvsc_dev;
- if (vf_netdev->addr_len != ETH_ALEN)
- return NOTIFY_DONE;
-
- /*
- * We will use the MAC address to locate the synthetic interface to
- * associate with the VF interface. If we don't find a matching
- * synthetic interface, move on.
- */
- ndev = get_netvsc_bymac(vf_netdev->perm_addr);
- if (!ndev)
- return NOTIFY_DONE;
-
net_device_ctx = netdev_priv(ndev);
netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
- return NOTIFY_DONE;
+ return -ENODEV;
- if (netvsc_vf_join(vf_netdev, ndev) != 0)
- return NOTIFY_DONE;
+ return 0;
+}
- netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
+static int netvsc_register_vf(struct net_device *vf_netdev,
+ struct net_device *ndev)
+{
+ struct net_device_context *ndev_ctx = netdev_priv(ndev);
+
+ /* set slave flag before open to prevent IPv6 addrconf */
+ vf_netdev->flags |= IFF_SLAVE;
+
+ schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
+
+ call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
+
+ netdev_info(vf_netdev, "joined to %s\n", ndev->name);
dev_hold(vf_netdev);
- rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
- return NOTIFY_OK;
+ rcu_assign_pointer(ndev_ctx->vf_netdev, vf_netdev);
+
+ return 0;
}
/* VF up/down change detected, schedule to change data path */
-static int netvsc_vf_changed(struct net_device *vf_netdev)
+static int netvsc_vf_changed(struct net_device *vf_netdev,
+ struct net_device *ndev)
{
struct net_device_context *net_device_ctx;
struct netvsc_device *netvsc_dev;
- struct net_device *ndev;
bool vf_is_up = netif_running(vf_netdev);
- ndev = get_netvsc_byref(vf_netdev);
- if (!ndev)
- return NOTIFY_DONE;
-
net_device_ctx = netdev_priv(ndev);
netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
if (!netvsc_dev)
- return NOTIFY_DONE;
+ return -ENODEV;
netvsc_switch_datapath(ndev, vf_is_up);
netdev_info(ndev, "Data path switched %s VF: %s\n",
vf_is_up ? "to" : "from", vf_netdev->name);
- return NOTIFY_OK;
+ return 0;
}
-static int netvsc_unregister_vf(struct net_device *vf_netdev)
+static int netvsc_pre_unregister_vf(struct net_device *vf_netdev,
+ struct net_device *ndev)
{
- struct net_device *ndev;
struct net_device_context *net_device_ctx;
- ndev = get_netvsc_byref(vf_netdev);
- if (!ndev)
- return NOTIFY_DONE;
-
net_device_ctx = netdev_priv(ndev);
cancel_delayed_work_sync(&net_device_ctx->vf_takeover);
+ return 0;
+}
+
+static int netvsc_unregister_vf(struct net_device *vf_netdev,
+ struct net_device *ndev)
+{
+ struct net_device_context *net_device_ctx;
+
+ net_device_ctx = netdev_priv(ndev);
+
netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
- netdev_rx_handler_unregister(vf_netdev);
- netdev_upper_dev_unlink(vf_netdev, ndev);
RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
dev_put(vf_netdev);
- return NOTIFY_OK;
+ return 0;
}
+static struct failover_ops netvsc_failover_ops = {
+ .slave_pre_register = netvsc_pre_register_vf,
+ .slave_register = netvsc_register_vf,
+ .slave_pre_unregister = netvsc_pre_unregister_vf,
+ .slave_unregister = netvsc_unregister_vf,
+ .slave_link_change = netvsc_vf_changed,
+ .slave_handle_frame = netvsc_vf_handle_frame,
+};
+
static int netvsc_probe(struct hv_device *dev,
const struct hv_vmbus_device_id *dev_id)
{
@@ -2099,8 +2030,14 @@ static int netvsc_probe(struct hv_device *dev,
goto register_failed;
}
+ net_device_ctx->failover = failover_register(net, &netvsc_failover_ops);
+ if (IS_ERR(net_device_ctx->failover))
+ goto err_failover;
+
return ret;
+err_failover:
+ unregister_netdev(net);
register_failed:
rndis_filter_device_remove(dev, nvdev);
rndis_failed:
@@ -2141,13 +2078,15 @@ static int netvsc_remove(struct hv_device *dev)
rtnl_lock();
vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
if (vf_netdev)
- netvsc_unregister_vf(vf_netdev);
+ failover_slave_unregister(vf_netdev);
if (nvdev)
rndis_filter_device_remove(dev, nvdev);
unregister_netdevice(net);
+ failover_unregister(ndev_ctx->failover);
+
rtnl_unlock();
rcu_read_unlock();
@@ -2174,54 +2113,8 @@ static struct hv_driver netvsc_drv = {
.remove = netvsc_remove,
};
-/*
- * On Hyper-V, every VF interface is matched with a corresponding
- * synthetic interface. The synthetic interface is presented first
- * to the guest. When the corresponding VF instance is registered,
- * we will take care of switching the data path.
- */
-static int netvsc_netdev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
-
- /* Skip our own events */
- if (event_dev->netdev_ops == &device_ops)
- return NOTIFY_DONE;
-
- /* Avoid non-Ethernet type devices */
- if (event_dev->type != ARPHRD_ETHER)
- return NOTIFY_DONE;
-
- /* Avoid Vlan dev with same MAC registering as VF */
- if (is_vlan_dev(event_dev))
- return NOTIFY_DONE;
-
- /* Avoid Bonding master dev with same MAC registering as VF */
- if ((event_dev->priv_flags & IFF_BONDING) &&
- (event_dev->flags & IFF_MASTER))
- return NOTIFY_DONE;
-
- switch (event) {
- case NETDEV_REGISTER:
- return netvsc_register_vf(event_dev);
- case NETDEV_UNREGISTER:
- return netvsc_unregister_vf(event_dev);
- case NETDEV_UP:
- case NETDEV_DOWN:
- return netvsc_vf_changed(event_dev);
- default:
- return NOTIFY_DONE;
- }
-}
-
-static struct notifier_block netvsc_netdev_notifier = {
- .notifier_call = netvsc_netdev_event,
-};
-
static void __exit netvsc_drv_exit(void)
{
- unregister_netdevice_notifier(&netvsc_netdev_notifier);
vmbus_driver_unregister(&netvsc_drv);
}
@@ -2241,7 +2134,6 @@ static int __init netvsc_drv_init(void)
if (ret)
return ret;
- register_netdevice_notifier(&netvsc_netdev_notifier);
return 0;
}
diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
new file mode 100644
index 000000000000..8b508e2cf29b
--- /dev/null
+++ b/drivers/net/net_failover.c
@@ -0,0 +1,836 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* This provides a net_failover interface for paravirtual drivers to
+ * provide an alternate datapath by exporting APIs to create and
+ * destroy a upper 'net_failover' netdev. The upper dev manages the
+ * original paravirtual interface as a 'standby' netdev and uses the
+ * generic failover infrastructure to register and manage a direct
+ * attached VF as a 'primary' netdev. This enables live migration of
+ * a VM with direct attached VF by failing over to the paravirtual
+ * datapath when the VF is unplugged.
+ *
+ * Some of the netdev management routines are based on bond/team driver as
+ * this driver provides active-backup functionality similar to those drivers.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static bool net_failover_xmit_ready(struct net_device *dev)
+{
+ return netif_running(dev) && netif_carrier_ok(dev);
+}
+
+static int net_failover_open(struct net_device *dev)
+{
+ struct net_failover_info *nfo_info = netdev_priv(dev);
+ struct net_device *primary_dev, *standby_dev;
+ int err;
+
+ primary_dev = rtnl_dereference(nfo_info->primary_dev);
+ if (primary_dev) {
+ err = dev_open(primary_dev);
+ if (err)
+ goto err_primary_open;
+ }
+
+ standby_dev = rtnl_dereference(nfo_info->standby_dev);
+ if (standby_dev) {
+ err = dev_open(standby_dev);
+ if (err)
+ goto err_standby_open;
+ }
+
+ if ((primary_dev && net_failover_xmit_ready(primary_dev)) ||
+ (standby_dev && net_failover_xmit_ready(standby_dev))) {
+ netif_carrier_on(dev);
+ netif_tx_wake_all_queues(dev);
+ }
+
+ return 0;
+
+err_standby_open:
+ dev_close(primary_dev);
+err_primary_open:
+ netif_tx_disable(dev);
+ return err;
+}
+
+static int net_failover_close(struct net_device *dev)
+{
+ struct net_failover_info *nfo_info = netdev_priv(dev);
+ struct net_device *slave_dev;
+
+ netif_tx_disable(dev);
+
+ slave_dev = rtnl_dereference(nfo_info->primary_dev);
+ if (slave_dev)
+ dev_close(slave_dev);
+
+ slave_dev = rtnl_dereference(nfo_info->standby_dev);
+ if (slave_dev)
+ dev_close(slave_dev);
+
+ return 0;
+}
+
+static netdev_tx_t net_failover_drop_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ atomic_long_inc(&dev->tx_dropped);
+ dev_kfree_skb_any(skb);
+ return NETDEV_TX_OK;
+}
+
+static netdev_tx_t net_failover_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct net_failover_info *nfo_info = netdev_priv(dev);
+ struct net_device *xmit_dev;
+
+ /* Try xmit via primary netdev followed by standby netdev */
+ xmit_dev = rcu_dereference_bh(nfo_info->primary_dev);
+ if (!xmit_dev || !net_failover_xmit_ready(xmit_dev)) {
+ xmit_dev = rcu_dereference_bh(nfo_info->standby_dev);
+ if (!xmit_dev || !net_failover_xmit_ready(xmit_dev))
+ return net_failover_drop_xmit(skb, dev);
+ }
+
+ skb->dev = xmit_dev;
+ skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+
+ return dev_queue_xmit(skb);
+}
+
+static u16 net_failover_select_queue(struct net_device *dev,
+ struct sk_buff *skb, void *accel_priv,
+ select_queue_fallback_t fallback)
+{
+ struct net_failover_info *nfo_info = netdev_priv(dev);
+ struct net_device *primary_dev;
+ u16 txq;
+
+ primary_dev = rcu_dereference(nfo_info->primary_dev);
+ if (primary_dev) {
+ const struct net_device_ops *ops = primary_dev->netdev_ops;
+
+ if (ops->ndo_select_queue)
+ txq = ops->ndo_select_queue(primary_dev, skb,
+ accel_priv, fallback);
+ else
+ txq = fallback(primary_dev, skb);
+
+ qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+
+ return txq;
+ }
+
+ txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
+
+ /* Save the original txq to restore before passing to the driver */
+ qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+
+ if (unlikely(txq >= dev->real_num_tx_queues)) {
+ do {
+ txq -= dev->real_num_tx_queues;
+ } while (txq >= dev->real_num_tx_queues);
+ }
+
+ return txq;
+}
+
+/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
+ * that some drivers can provide 32bit values only.
+ */
+static void net_failover_fold_stats(struct rtnl_link_stats64 *_res,
+ const struct rtnl_link_stats64 *_new,
+ const struct rtnl_link_stats64 *_old)
+{
+ const u64 *new = (const u64 *)_new;
+ const u64 *old = (const u64 *)_old;
+ u64 *res = (u64 *)_res;
+ int i;
+
+ for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
+ u64 nv = new[i];
+ u64 ov = old[i];
+ s64 delta = nv - ov;
+
+ /* detects if this particular field is 32bit only */
+ if (((nv | ov) >> 32) == 0)
+ delta = (s64)(s32)((u32)nv - (u32)ov);
+
+ /* filter anomalies, some drivers reset their stats
+ * at down/up events.
+ */
+ if (delta > 0)
+ res[i] += delta;
+ }
+}
+
+static void net_failover_get_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
+{
+ struct net_failover_info *nfo_info = netdev_priv(dev);
+ const struct rtnl_link_stats64 *new;
+ struct rtnl_link_stats64 temp;
+ struct net_device *slave_dev;
+
+ spin_lock(&nfo_info->stats_lock);
+ memcpy(stats, &nfo_info->failover_stats, sizeof(*stats));
+
+ rcu_read_lock();
+
+ slave_dev = rcu_dereference(nfo_info->primary_dev);
+ if (slave_dev) {
+ new = dev_get_stats(slave_dev, &temp);
+ net_failover_fold_stats(stats, new, &nfo_info->primary_stats);
+ memcpy(&nfo_info->primary_stats, new, sizeof(*new));
+ }
+
+ slave_dev = rcu_dereference(nfo_info->standby_dev);
+ if (slave_dev) {
+ new = dev_get_stats(slave_dev, &temp);
+ net_failover_fold_stats(stats, new, &nfo_info->standby_stats);
+ memcpy(&nfo_info->standby_stats, new, sizeof(*new));
+ }
+
+ rcu_read_unlock();
+
+ memcpy(&nfo_info->failover_stats, stats, sizeof(*stats));
+ spin_unlock(&nfo_info->stats_lock);
+}
+
+static int net_failover_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct net_failover_info *nfo_info = netdev_priv(dev);
+ struct net_device *primary_dev, *standby_dev;
+ int ret = 0;
+
+ primary_dev = rcu_dereference(nfo_info->primary_dev);
+ if (primary_dev) {
+ ret = dev_set_mtu(primary_dev, new_mtu);
+ if (ret)
+ return ret;
+ }
+
+ standby_dev = rcu_dereference(nfo_info->standby_dev);
+ if (standby_dev) {
+ ret = dev_set_mtu(standby_dev, new_mtu);
+ if (ret) {
+ if (primary_dev)
+ dev_set_mtu(primary_dev, dev->mtu);
+ return ret;
+ }
+ }
+
+ dev->mtu = new_mtu;
+
+ return 0;
+}
+
+static void net_failover_set_rx_mode(struct net_device *dev)
+{
+ struct net_failover_info *nfo_info = netdev_priv(dev);
+ struct net_device *slave_dev;
+
+ rcu_read_lock();
+
+ slave_dev = rcu_dereference(nfo_info->primary_dev);
+ if (slave_dev) {
+ dev_uc_sync_multiple(slave_dev, dev);
+ dev_mc_sync_multiple(slave_dev, dev);
+ }
+
+ slave_dev = rcu_dereference(nfo_info->standby_dev);
+ if (slave_dev) {
+ dev_uc_sync_multiple(slave_dev, dev);
+ dev_mc_sync_multiple(slave_dev, dev);
+ }
+
+ rcu_read_unlock();
+}
+
+static int net_failover_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
+ u16 vid)
+{
+ struct net_failover_info *nfo_info = netdev_priv(dev);
+ struct net_device *primary_dev, *standby_dev;
+ int ret = 0;
+
+ primary_dev = rcu_dereference(nfo_info->primary_dev);
+ if (primary_dev) {
+ ret = vlan_vid_add(primary_dev, proto, vid);
+ if (ret)
+ return ret;
+ }
+
+ standby_dev = rcu_dereference(nfo_info->standby_dev);
+ if (standby_dev) {
+ ret = vlan_vid_add(standby_dev, proto, vid);
+ if (ret)
+ if (primary_dev)
+ vlan_vid_del(primary_dev, proto, vid);
+ }
+
+ return ret;
+}
+
+static int net_failover_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
+ u16 vid)
+{
+ struct net_failover_info *nfo_info = netdev_priv(dev);
+ struct net_device *slave_dev;
+
+ slave_dev = rcu_dereference(nfo_info->primary_dev);
+ if (slave_dev)
+ vlan_vid_del(slave_dev, proto, vid);
+
+ slave_dev = rcu_dereference(nfo_info->standby_dev);
+ if (slave_dev)
+ vlan_vid_del(slave_dev, proto, vid);
+
+ return 0;
+}
+
+static const struct net_device_ops failover_dev_ops = {
+ .ndo_open = net_failover_open,
+ .ndo_stop = net_failover_close,
+ .ndo_start_xmit = net_failover_start_xmit,
+ .ndo_select_queue = net_failover_select_queue,
+ .ndo_get_stats64 = net_failover_get_stats,
+ .ndo_change_mtu = net_failover_change_mtu,
+ .ndo_set_rx_mode = net_failover_set_rx_mode,
+ .ndo_vlan_rx_add_vid = net_failover_vlan_rx_add_vid,
+ .ndo_vlan_rx_kill_vid = net_failover_vlan_rx_kill_vid,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_features_check = passthru_features_check,
+};
+
+#define FAILOVER_NAME "net_failover"
+#define FAILOVER_VERSION "0.1"
+
+static void nfo_ethtool_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *drvinfo)
+{
+ strlcpy(drvinfo->driver, FAILOVER_NAME, sizeof(drvinfo->driver));
+ strlcpy(drvinfo->version, FAILOVER_VERSION, sizeof(drvinfo->version));
+}
+
+static int nfo_ethtool_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *cmd)
+{
+ struct net_failover_info *nfo_info = netdev_priv(dev);
+ struct net_device *slave_dev;
+
+ slave_dev = rtnl_dereference(nfo_info->primary_dev);
+ if (!slave_dev || !net_failover_xmit_ready(slave_dev)) {
+ slave_dev = rtnl_dereference(nfo_info->standby_dev);
+ if (!slave_dev || !net_failover_xmit_ready(slave_dev)) {
+ cmd->base.duplex = DUPLEX_UNKNOWN;
+ cmd->base.port = PORT_OTHER;
+ cmd->base.speed = SPEED_UNKNOWN;
+
+ return 0;
+ }
+ }
+
+ return __ethtool_get_link_ksettings(slave_dev, cmd);
+}
+
+static const struct ethtool_ops failover_ethtool_ops = {
+ .get_drvinfo = nfo_ethtool_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+ .get_link_ksettings = nfo_ethtool_get_link_ksettings,
+};
+
+/* Called when slave dev is injecting data into network stack.
+ * Change the associated network device from lower dev to failover dev.
+ * note: already called with rcu_read_lock
+ */
+static rx_handler_result_t net_failover_handle_frame(struct sk_buff **pskb)
+{
+ struct sk_buff *skb = *pskb;
+ struct net_device *dev = rcu_dereference(skb->dev->rx_handler_data);
+ struct net_failover_info *nfo_info = netdev_priv(dev);
+ struct net_device *primary_dev, *standby_dev;
+
+ primary_dev = rcu_dereference(nfo_info->primary_dev);
+ standby_dev = rcu_dereference(nfo_info->standby_dev);
+
+ if (primary_dev && skb->dev == standby_dev)
+ return RX_HANDLER_EXACT;
+
+ skb->dev = dev;
+
+ return RX_HANDLER_ANOTHER;
+}
+
+static void net_failover_compute_features(struct net_device *dev)
+{
+ u32 vlan_features = FAILOVER_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL;
+ netdev_features_t enc_features = FAILOVER_ENC_FEATURES;
+ unsigned short max_hard_header_len = ETH_HLEN;
+ unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
+ IFF_XMIT_DST_RELEASE_PERM;
+ struct net_failover_info *nfo_info = netdev_priv(dev);
+ struct net_device *primary_dev, *standby_dev;
+
+ primary_dev = rcu_dereference(nfo_info->primary_dev);
+ if (primary_dev) {
+ vlan_features =
+ netdev_increment_features(vlan_features,
+ primary_dev->vlan_features,
+ FAILOVER_VLAN_FEATURES);
+ enc_features =
+ netdev_increment_features(enc_features,
+ primary_dev->hw_enc_features,
+ FAILOVER_ENC_FEATURES);
+
+ dst_release_flag &= primary_dev->priv_flags;
+ if (primary_dev->hard_header_len > max_hard_header_len)
+ max_hard_header_len = primary_dev->hard_header_len;
+ }
+
+ standby_dev = rcu_dereference(nfo_info->standby_dev);
+ if (standby_dev) {
+ vlan_features =
+ netdev_increment_features(vlan_features,
+ standby_dev->vlan_features,
+ FAILOVER_VLAN_FEATURES);
+ enc_features =
+ netdev_increment_features(enc_features,
+ standby_dev->hw_enc_features,
+ FAILOVER_ENC_FEATURES);
+
+ dst_release_flag &= standby_dev->priv_flags;
+ if (standby_dev->hard_header_len > max_hard_header_len)
+ max_hard_header_len = standby_dev->hard_header_len;
+ }
+
+ dev->vlan_features = vlan_features;
+ dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL;
+ dev->hard_header_len = max_hard_header_len;
+
+ dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ if (dst_release_flag == (IFF_XMIT_DST_RELEASE |
+ IFF_XMIT_DST_RELEASE_PERM))
+ dev->priv_flags |= IFF_XMIT_DST_RELEASE;
+
+ netdev_change_features(dev);
+}
+
+static void net_failover_lower_state_changed(struct net_device *slave_dev,
+ struct net_device *primary_dev,
+ struct net_device *standby_dev)
+{
+ struct netdev_lag_lower_state_info info;
+
+ if (netif_carrier_ok(slave_dev))
+ info.link_up = true;
+ else
+ info.link_up = false;
+
+ if (slave_dev == primary_dev) {
+ if (netif_running(primary_dev))
+ info.tx_enabled = true;
+ else
+ info.tx_enabled = false;
+ } else {
+ if ((primary_dev && netif_running(primary_dev)) ||
+ (!netif_running(standby_dev)))
+ info.tx_enabled = false;
+ else
+ info.tx_enabled = true;
+ }
+
+ netdev_lower_state_changed(slave_dev, &info);
+}
+
+static int net_failover_slave_pre_register(struct net_device *slave_dev,
+ struct net_device *failover_dev)
+{
+ struct net_device *standby_dev, *primary_dev;
+ struct net_failover_info *nfo_info;
+ bool slave_is_standby;
+
+ nfo_info = netdev_priv(failover_dev);
+ standby_dev = rtnl_dereference(nfo_info->standby_dev);
+ primary_dev = rtnl_dereference(nfo_info->primary_dev);
+ slave_is_standby = slave_dev->dev.parent == failover_dev->dev.parent;
+ if (slave_is_standby ? standby_dev : primary_dev) {
+ netdev_err(failover_dev, "%s attempting to register as slave dev when %s already present\n",
+ slave_dev->name,
+ slave_is_standby ? "standby" : "primary");
+ return -EINVAL;
+ }
+
+ /* We want to allow only a direct attached VF device as a primary
+ * netdev. As there is no easy way to check for a VF device, restrict
+ * this to a pci device.
+ */
+ if (!slave_is_standby && (!slave_dev->dev.parent ||
+ !dev_is_pci(slave_dev->dev.parent)))
+ return -EINVAL;
+
+ if (failover_dev->features & NETIF_F_VLAN_CHALLENGED &&
+ vlan_uses_dev(failover_dev)) {
+ netdev_err(failover_dev, "Device %s is VLAN challenged and failover device has VLAN set up\n",
+ failover_dev->name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int net_failover_slave_register(struct net_device *slave_dev,
+ struct net_device *failover_dev)
+{
+ struct net_device *standby_dev, *primary_dev;
+ struct net_failover_info *nfo_info;
+ bool slave_is_standby;
+ u32 orig_mtu;
+ int err;
+
+ /* Align MTU of slave with failover dev */
+ orig_mtu = slave_dev->mtu;
+ err = dev_set_mtu(slave_dev, failover_dev->mtu);
+ if (err) {
+ netdev_err(failover_dev, "unable to change mtu of %s to %u register failed\n",
+ slave_dev->name, failover_dev->mtu);
+ goto done;
+ }
+
+ dev_hold(slave_dev);
+
+ if (netif_running(failover_dev)) {
+ err = dev_open(slave_dev);
+ if (err && (err != -EBUSY)) {
+ netdev_err(failover_dev, "Opening slave %s failed err:%d\n",
+ slave_dev->name, err);
+ goto err_dev_open;
+ }
+ }
+
+ netif_addr_lock_bh(failover_dev);
+ dev_uc_sync_multiple(slave_dev, failover_dev);
+ dev_uc_sync_multiple(slave_dev, failover_dev);
+ netif_addr_unlock_bh(failover_dev);
+
+ err = vlan_vids_add_by_dev(slave_dev, failover_dev);
+ if (err) {
+ netdev_err(failover_dev, "Failed to add vlan ids to device %s err:%d\n",
+ slave_dev->name, err);
+ goto err_vlan_add;
+ }
+
+ nfo_info = netdev_priv(failover_dev);
+ standby_dev = rtnl_dereference(nfo_info->standby_dev);
+ primary_dev = rtnl_dereference(nfo_info->primary_dev);
+ slave_is_standby = slave_dev->dev.parent == failover_dev->dev.parent;
+
+ if (slave_is_standby) {
+ rcu_assign_pointer(nfo_info->standby_dev, slave_dev);
+ standby_dev = slave_dev;
+ dev_get_stats(standby_dev, &nfo_info->standby_stats);
+ } else {
+ rcu_assign_pointer(nfo_info->primary_dev, slave_dev);
+ primary_dev = slave_dev;
+ dev_get_stats(primary_dev, &nfo_info->primary_stats);
+ failover_dev->min_mtu = slave_dev->min_mtu;
+ failover_dev->max_mtu = slave_dev->max_mtu;
+ }
+
+ net_failover_lower_state_changed(slave_dev, primary_dev, standby_dev);
+ net_failover_compute_features(failover_dev);
+
+ call_netdevice_notifiers(NETDEV_JOIN, slave_dev);
+
+ netdev_info(failover_dev, "failover %s slave:%s registered\n",
+ slave_is_standby ? "standby" : "primary", slave_dev->name);
+
+ return 0;
+
+err_vlan_add:
+ dev_uc_unsync(slave_dev, failover_dev);
+ dev_mc_unsync(slave_dev, failover_dev);
+ dev_close(slave_dev);
+err_dev_open:
+ dev_put(slave_dev);
+ dev_set_mtu(slave_dev, orig_mtu);
+done:
+ return err;
+}
+
+static int net_failover_slave_pre_unregister(struct net_device *slave_dev,
+ struct net_device *failover_dev)
+{
+ struct net_device *standby_dev, *primary_dev;
+ struct net_failover_info *nfo_info;
+
+ nfo_info = netdev_priv(failover_dev);
+ primary_dev = rtnl_dereference(nfo_info->primary_dev);
+ standby_dev = rtnl_dereference(nfo_info->standby_dev);
+
+ if (slave_dev != primary_dev && slave_dev != standby_dev)
+ return -ENODEV;
+
+ return 0;
+}
+
+static int net_failover_slave_unregister(struct net_device *slave_dev,
+ struct net_device *failover_dev)
+{
+ struct net_device *standby_dev, *primary_dev;
+ struct net_failover_info *nfo_info;
+ bool slave_is_standby;
+
+ nfo_info = netdev_priv(failover_dev);
+ primary_dev = rtnl_dereference(nfo_info->primary_dev);
+ standby_dev = rtnl_dereference(nfo_info->standby_dev);
+
+ vlan_vids_del_by_dev(slave_dev, failover_dev);
+ dev_uc_unsync(slave_dev, failover_dev);
+ dev_mc_unsync(slave_dev, failover_dev);
+ dev_close(slave_dev);
+
+ nfo_info = netdev_priv(failover_dev);
+ dev_get_stats(failover_dev, &nfo_info->failover_stats);
+
+ slave_is_standby = slave_dev->dev.parent == failover_dev->dev.parent;
+ if (slave_is_standby) {
+ RCU_INIT_POINTER(nfo_info->standby_dev, NULL);
+ } else {
+ RCU_INIT_POINTER(nfo_info->primary_dev, NULL);
+ if (standby_dev) {
+ failover_dev->min_mtu = standby_dev->min_mtu;
+ failover_dev->max_mtu = standby_dev->max_mtu;
+ }
+ }
+
+ dev_put(slave_dev);
+
+ net_failover_compute_features(failover_dev);
+
+ netdev_info(failover_dev, "failover %s slave:%s unregistered\n",
+ slave_is_standby ? "standby" : "primary", slave_dev->name);
+
+ return 0;
+}
+
+static int net_failover_slave_link_change(struct net_device *slave_dev,
+ struct net_device *failover_dev)
+{
+ struct net_device *primary_dev, *standby_dev;
+ struct net_failover_info *nfo_info;
+
+ nfo_info = netdev_priv(failover_dev);
+
+ primary_dev = rtnl_dereference(nfo_info->primary_dev);
+ standby_dev = rtnl_dereference(nfo_info->standby_dev);
+
+ if (slave_dev != primary_dev && slave_dev != standby_dev)
+ return -ENODEV;
+
+ if ((primary_dev && net_failover_xmit_ready(primary_dev)) ||
+ (standby_dev && net_failover_xmit_ready(standby_dev))) {
+ netif_carrier_on(failover_dev);
+ netif_tx_wake_all_queues(failover_dev);
+ } else {
+ dev_get_stats(failover_dev, &nfo_info->failover_stats);
+ netif_carrier_off(failover_dev);
+ netif_tx_stop_all_queues(failover_dev);
+ }
+
+ net_failover_lower_state_changed(slave_dev, primary_dev, standby_dev);
+
+ return 0;
+}
+
+static int net_failover_slave_name_change(struct net_device *slave_dev,
+ struct net_device *failover_dev)
+{
+ struct net_device *primary_dev, *standby_dev;
+ struct net_failover_info *nfo_info;
+
+ nfo_info = netdev_priv(failover_dev);
+
+ primary_dev = rtnl_dereference(nfo_info->primary_dev);
+ standby_dev = rtnl_dereference(nfo_info->standby_dev);
+
+ if (slave_dev != primary_dev && slave_dev != standby_dev)
+ return -ENODEV;
+
+ /* We need to bring up the slave after the rename by udev in case
+ * open failed with EBUSY when it was registered.
+ */
+ dev_open(slave_dev);
+
+ return 0;
+}
+
+static struct failover_ops net_failover_ops = {
+ .slave_pre_register = net_failover_slave_pre_register,
+ .slave_register = net_failover_slave_register,
+ .slave_pre_unregister = net_failover_slave_pre_unregister,
+ .slave_unregister = net_failover_slave_unregister,
+ .slave_link_change = net_failover_slave_link_change,
+ .slave_name_change = net_failover_slave_name_change,
+ .slave_handle_frame = net_failover_handle_frame,
+};
+
+/**
+ * net_failover_create - Create and register a failover instance
+ *
+ * @dev: standby netdev
+ *
+ * Creates a failover netdev and registers a failover instance for a standby
+ * netdev. Used by paravirtual drivers that use 3-netdev model.
+ * The failover netdev acts as a master device and controls 2 slave devices -
+ * the original standby netdev and a VF netdev with the same MAC gets
+ * registered as primary netdev.
+ *
+ * Return: pointer to failover instance
+ */
+struct failover *net_failover_create(struct net_device *standby_dev)
+{
+ struct device *dev = standby_dev->dev.parent;
+ struct net_device *failover_dev;
+ struct failover *failover;
+ int err;
+
+ /* Alloc at least 2 queues, for now we are going with 16 assuming
+ * that VF devices being enslaved won't have too many queues.
+ */
+ failover_dev = alloc_etherdev_mq(sizeof(struct net_failover_info), 16);
+ if (!failover_dev) {
+ dev_err(dev, "Unable to allocate failover_netdev!\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ dev_net_set(failover_dev, dev_net(standby_dev));
+ SET_NETDEV_DEV(failover_dev, dev);
+
+ failover_dev->netdev_ops = &failover_dev_ops;
+ failover_dev->ethtool_ops = &failover_ethtool_ops;
+
+ /* Initialize the device options */
+ failover_dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
+ failover_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
+ IFF_TX_SKB_SHARING);
+
+ /* don't acquire failover netdev's netif_tx_lock when transmitting */
+ failover_dev->features |= NETIF_F_LLTX;
+
+ /* Don't allow failover devices to change network namespaces. */
+ failover_dev->features |= NETIF_F_NETNS_LOCAL;
+
+ failover_dev->hw_features = FAILOVER_VLAN_FEATURES |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_CTAG_RX |
+ NETIF_F_HW_VLAN_CTAG_FILTER;
+
+ failover_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
+ failover_dev->features |= failover_dev->hw_features;
+
+ memcpy(failover_dev->dev_addr, standby_dev->dev_addr,
+ failover_dev->addr_len);
+
+ failover_dev->min_mtu = standby_dev->min_mtu;
+ failover_dev->max_mtu = standby_dev->max_mtu;
+
+ err = register_netdev(failover_dev);
+ if (err) {
+ dev_err(dev, "Unable to register failover_dev!\n");
+ goto err_register_netdev;
+ }
+
+ netif_carrier_off(failover_dev);
+
+ failover = failover_register(failover_dev, &net_failover_ops);
+ if (IS_ERR(failover))
+ goto err_failover_register;
+
+ return failover;
+
+err_failover_register:
+ unregister_netdev(failover_dev);
+err_register_netdev:
+ free_netdev(failover_dev);
+
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(net_failover_create);
+
+/**
+ * net_failover_destroy - Destroy a failover instance
+ *
+ * @failover: pointer to failover instance
+ *
+ * Unregisters any slave netdevs associated with the failover instance by
+ * calling failover_slave_unregister().
+ * unregisters the failover instance itself and finally frees the failover
+ * netdev. Used by paravirtual drivers that use 3-netdev model.
+ *
+ */
+void net_failover_destroy(struct failover *failover)
+{
+ struct net_failover_info *nfo_info;
+ struct net_device *failover_dev;
+ struct net_device *slave_dev;
+
+ if (!failover)
+ return;
+
+ failover_dev = rcu_dereference(failover->failover_dev);
+ nfo_info = netdev_priv(failover_dev);
+
+ netif_device_detach(failover_dev);
+
+ rtnl_lock();
+
+ slave_dev = rtnl_dereference(nfo_info->primary_dev);
+ if (slave_dev)
+ failover_slave_unregister(slave_dev);
+
+ slave_dev = rtnl_dereference(nfo_info->standby_dev);
+ if (slave_dev)
+ failover_slave_unregister(slave_dev);
+
+ failover_unregister(failover);
+
+ unregister_netdevice(failover_dev);
+
+ rtnl_unlock();
+
+ free_netdev(failover_dev);
+}
+EXPORT_SYMBOL_GPL(net_failover_destroy);
+
+static __init int
+net_failover_init(void)
+{
+ return 0;
+}
+module_init(net_failover_init);
+
+static __exit
+void net_failover_exit(void)
+{
+}
+module_exit(net_failover_exit);
+
+MODULE_DESCRIPTION("Failover driver for Paravirtual drivers");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b2647dd5d302..8f08a3e1bbaa 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -30,8 +30,11 @@
#include
#include
#include
+#include
+#include
#include
#include
+#include
static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);
@@ -210,6 +213,9 @@ struct virtnet_info {
u32 speed;
unsigned long guest_offloads;
+
+ /* failover when STANDBY feature enabled */
+ struct failover *failover;
};
struct padded_vnet_hdr {
@@ -1554,6 +1560,9 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
struct sockaddr *addr;
struct scatterlist sg;
+ if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
+ return -EOPNOTSUPP;
+
addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
if (!addr)
return -ENOMEM;
@@ -2337,6 +2346,22 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
}
}
+static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
+ size_t len)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ int ret;
+
+ if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
+ return -EOPNOTSUPP;
+
+ ret = snprintf(buf, len, "sby");
+ if (ret >= len)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
static const struct net_device_ops virtnet_netdev = {
.ndo_open = virtnet_open,
.ndo_stop = virtnet_close,
@@ -2354,6 +2379,7 @@ static const struct net_device_ops virtnet_netdev = {
.ndo_xdp_xmit = virtnet_xdp_xmit,
.ndo_xdp_flush = virtnet_xdp_flush,
.ndo_features_check = passthru_features_check,
+ .ndo_get_phys_port_name = virtnet_get_phys_port_name,
};
static void virtnet_config_changed_work(struct work_struct *work)
@@ -2907,10 +2933,16 @@ static int virtnet_probe(struct virtio_device *vdev)
virtnet_init_settings(dev);
+ if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
+ vi->failover = net_failover_create(vi->dev);
+ if (IS_ERR(vi->failover))
+ goto free_vqs;
+ }
+
err = register_netdev(dev);
if (err) {
pr_debug("virtio_net: registering device failed\n");
- goto free_vqs;
+ goto free_failover;
}
virtio_device_ready(vdev);
@@ -2947,6 +2979,8 @@ free_unregister_netdev:
vi->vdev->config->reset(vdev);
unregister_netdev(dev);
+free_failover:
+ net_failover_destroy(vi->failover);
free_vqs:
cancel_delayed_work_sync(&vi->refill);
free_receive_page_frags(vi);
@@ -2981,6 +3015,8 @@ static void virtnet_remove(struct virtio_device *vdev)
unregister_netdev(vi->dev);
+ net_failover_destroy(vi->failover);
+
remove_vq_common(vi);
free_netdev(vi->dev);
@@ -3030,7 +3066,7 @@ static struct virtio_device_id id_table[] = {
VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
VIRTIO_NET_F_CTRL_MAC_ADDR, \
VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
- VIRTIO_NET_F_SPEED_DUPLEX
+ VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY
static unsigned int features[] = {
VIRTNET_FEATURES,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8452f72087ef..f45b1a4e37ab 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1425,6 +1425,8 @@ struct net_device_ops {
* entity (i.e. the master device for bridged veth)
* @IFF_MACSEC: device is a MACsec device
* @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
+ * @IFF_FAILOVER: device is a failover master device
+ * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
*/
enum netdev_priv_flags {
IFF_802_1Q_VLAN = 1<<0,
@@ -1454,6 +1456,8 @@ enum netdev_priv_flags {
IFF_PHONY_HEADROOM = 1<<24,
IFF_MACSEC = 1<<25,
IFF_NO_RX_HANDLER = 1<<26,
+ IFF_FAILOVER = 1<<27,
+ IFF_FAILOVER_SLAVE = 1<<28,
};
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
@@ -1482,6 +1486,8 @@ enum netdev_priv_flags {
#define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
#define IFF_MACSEC IFF_MACSEC
#define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
+#define IFF_FAILOVER IFF_FAILOVER
+#define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE
/**
* struct net_device - The DEVICE structure.
@@ -4336,6 +4342,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev)
return dev->priv_flags & IFF_RXFH_CONFIGURED;
}
+static inline bool netif_is_failover(const struct net_device *dev)
+{
+ return dev->priv_flags & IFF_FAILOVER;
+}
+
+static inline bool netif_is_failover_slave(const struct net_device *dev)
+{
+ return dev->priv_flags & IFF_FAILOVER_SLAVE;
+}
+
/* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
static inline void netif_keep_dst(struct net_device *dev)
{
diff --git a/include/net/failover.h b/include/net/failover.h
new file mode 100644
index 000000000000..bb15438f39c7
--- /dev/null
+++ b/include/net/failover.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, Intel Corporation. */
+
+#ifndef _FAILOVER_H
+#define _FAILOVER_H
+
+#include
+
+struct failover_ops {
+ int (*slave_pre_register)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+ int (*slave_register)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+ int (*slave_pre_unregister)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+ int (*slave_unregister)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+ int (*slave_link_change)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+ int (*slave_name_change)(struct net_device *slave_dev,
+ struct net_device *failover_dev);
+ rx_handler_result_t (*slave_handle_frame)(struct sk_buff **pskb);
+};
+
+struct failover {
+ struct list_head list;
+ struct net_device __rcu *failover_dev;
+ struct failover_ops __rcu *ops;
+};
+
+struct failover *failover_register(struct net_device *dev,
+ struct failover_ops *ops);
+void failover_unregister(struct failover *failover);
+int failover_slave_unregister(struct net_device *slave_dev);
+
+#endif /* _FAILOVER_H */
diff --git a/include/net/net_failover.h b/include/net/net_failover.h
new file mode 100644
index 000000000000..b12a1c469d1c
--- /dev/null
+++ b/include/net/net_failover.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, Intel Corporation. */
+
+#ifndef _NET_FAILOVER_H
+#define _NET_FAILOVER_H
+
+#include
+
+/* failover state */
+struct net_failover_info {
+ /* primary netdev with same MAC */
+ struct net_device __rcu *primary_dev;
+
+ /* standby netdev */
+ struct net_device __rcu *standby_dev;
+
+ /* primary netdev stats */
+ struct rtnl_link_stats64 primary_stats;
+
+ /* standby netdev stats */
+ struct rtnl_link_stats64 standby_stats;
+
+ /* aggregated stats */
+ struct rtnl_link_stats64 failover_stats;
+
+ /* spinlock while updating stats */
+ spinlock_t stats_lock;
+};
+
+struct failover *net_failover_create(struct net_device *standby_dev);
+void net_failover_destroy(struct failover *failover);
+
+#define FAILOVER_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \
+ NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
+ NETIF_F_HIGHDMA | NETIF_F_LRO)
+
+#define FAILOVER_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \
+ NETIF_F_RXCSUM | NETIF_F_ALL_TSO)
+
+#endif /* _NET_FAILOVER_H */
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 5de6ed37695b..a3715a3224c1 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -57,6 +57,9 @@
* Steering */
#define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */
+#define VIRTIO_NET_F_STANDBY 62 /* Act as standby for another device
+ * with the same MAC.
+ */
#define VIRTIO_NET_F_SPEED_DUPLEX 63 /* Device set linkspeed and duplex */
#ifndef VIRTIO_NET_NO_LEGACY
diff --git a/net/Kconfig b/net/Kconfig
index ba554cedb615..f738a6f27665 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -432,6 +432,19 @@ config MAY_USE_DEVLINK
config PAGE_POOL
bool
+config FAILOVER
+ tristate "Generic failover module"
+ help
+ The failover module provides a generic interface for paravirtual
+ drivers to register a netdev and a set of ops with a failover
+ instance. The ops are used as event handlers that get called to
+ handle netdev register/unregister/link change/name change events
+ on slave pci ethernet devices with the same mac address as the
+ failover netdev. This enables paravirtual drivers to use a
+ VF as an accelerated low latency datapath. It also allows live
+ migration of VMs with direct attached VFs by failing over to the
+ paravirtual datapath when the VF is unplugged.
+
endif # if NET
# Used by archs to tell that they support BPF JIT compiler plus which flavour.
diff --git a/net/core/Makefile b/net/core/Makefile
index 7080417f8bc8..80175e6a2eb8 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -31,3 +31,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
+obj-$(CONFIG_FAILOVER) += failover.o
diff --git a/net/core/failover.c b/net/core/failover.c
new file mode 100644
index 000000000000..4a92a98ccce9
--- /dev/null
+++ b/net/core/failover.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* A common module to handle registrations and notifications for paravirtual
+ * drivers to enable accelerated datapath and support VF live migration.
+ *
+ * The notifier and event handling code is based on netvsc driver.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+static LIST_HEAD(failover_list);
+static DEFINE_SPINLOCK(failover_lock);
+
+static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
+{
+ struct net_device *failover_dev;
+ struct failover *failover;
+
+ spin_lock(&failover_lock);
+ list_for_each_entry(failover, &failover_list, list) {
+ failover_dev = rtnl_dereference(failover->failover_dev);
+ if (ether_addr_equal(failover_dev->perm_addr, mac)) {
+ *ops = rtnl_dereference(failover->ops);
+ spin_unlock(&failover_lock);
+ return failover_dev;
+ }
+ }
+ spin_unlock(&failover_lock);
+ return NULL;
+}
+
+/**
+ * failover_slave_register - Register a slave netdev
+ *
+ * @slave_dev: slave netdev that is being registered
+ *
+ * Registers a slave device to a failover instance. Only ethernet devices
+ * are supported.
+ */
+static int failover_slave_register(struct net_device *slave_dev)
+{
+ struct netdev_lag_upper_info lag_upper_info;
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+ int err;
+
+ if (slave_dev->type != ARPHRD_ETHER)
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (fops && fops->slave_pre_register &&
+ fops->slave_pre_register(slave_dev, failover_dev))
+ goto done;
+
+ err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame,
+ failover_dev);
+ if (err) {
+ netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
+ err);
+ goto done;
+ }
+
+ lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
+ err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL,
+ &lag_upper_info, NULL);
+ if (err) {
+ netdev_err(slave_dev, "can not set failover device %s (err = %d)\n",
+ failover_dev->name, err);
+ goto err_upper_link;
+ }
+
+ slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
+
+ if (fops && fops->slave_register &&
+ !fops->slave_register(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+ netdev_upper_dev_unlink(slave_dev, failover_dev);
+ slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+err_upper_link:
+ netdev_rx_handler_unregister(slave_dev);
+done:
+ return NOTIFY_DONE;
+}
+
+/**
+ * failover_slave_unregister - Unregister a slave netdev
+ *
+ * @slave_dev: slave netdev that is being unregistered
+ *
+ * Unregisters a slave device from a failover instance.
+ */
+int failover_slave_unregister(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (fops && fops->slave_pre_unregister &&
+ fops->slave_pre_unregister(slave_dev, failover_dev))
+ goto done;
+
+ netdev_rx_handler_unregister(slave_dev);
+ netdev_upper_dev_unlink(slave_dev, failover_dev);
+ slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+
+ if (fops && fops->slave_unregister &&
+ !fops->slave_unregister(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_GPL(failover_slave_unregister);
+
+static int failover_slave_link_change(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (!netif_running(failover_dev))
+ goto done;
+
+ if (fops && fops->slave_link_change &&
+ !fops->slave_link_change(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+
+static int failover_slave_name_change(struct net_device *slave_dev)
+{
+ struct net_device *failover_dev;
+ struct failover_ops *fops;
+
+ if (!netif_is_failover_slave(slave_dev))
+ goto done;
+
+ ASSERT_RTNL();
+
+ failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+ if (!failover_dev)
+ goto done;
+
+ if (!netif_running(failover_dev))
+ goto done;
+
+ if (fops && fops->slave_name_change &&
+ !fops->slave_name_change(slave_dev, failover_dev))
+ return NOTIFY_OK;
+
+done:
+ return NOTIFY_DONE;
+}
+
+static int
+failover_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+ /* Skip parent events */
+ if (netif_is_failover(event_dev))
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ return failover_slave_register(event_dev);
+ case NETDEV_UNREGISTER:
+ return failover_slave_unregister(event_dev);
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ case NETDEV_CHANGE:
+ return failover_slave_link_change(event_dev);
+ case NETDEV_CHANGENAME:
+ return failover_slave_name_change(event_dev);
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block failover_notifier = {
+ .notifier_call = failover_event,
+};
+
+static void
+failover_existing_slave_register(struct net_device *failover_dev)
+{
+ struct net *net = dev_net(failover_dev);
+ struct net_device *dev;
+
+ rtnl_lock();
+ for_each_netdev(net, dev) {
+ if (netif_is_failover(dev))
+ continue;
+ if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
+ failover_slave_register(dev);
+ }
+ rtnl_unlock();
+}
+
+/**
+ * failover_register - Register a failover instance
+ *
+ * @dev: failover netdev
+ * @ops: failover ops
+ *
+ * Allocate and register a failover instance for a failover netdev. ops
+ * provides handlers for slave device register/unregister/link change/
+ * name change events.
+ *
+ * Return: pointer to failover instance
+ */
+struct failover *failover_register(struct net_device *dev,
+ struct failover_ops *ops)
+{
+ struct failover *failover;
+
+ if (dev->type != ARPHRD_ETHER)
+ return ERR_PTR(-EINVAL);
+
+ failover = kzalloc(sizeof(*failover), GFP_KERNEL);
+ if (!failover)
+ return ERR_PTR(-ENOMEM);
+
+ rcu_assign_pointer(failover->ops, ops);
+ dev_hold(dev);
+ dev->priv_flags |= IFF_FAILOVER;
+ rcu_assign_pointer(failover->failover_dev, dev);
+
+ spin_lock(&failover_lock);
+ list_add_tail(&failover->list, &failover_list);
+ spin_unlock(&failover_lock);
+
+ netdev_info(dev, "failover master:%s registered\n", dev->name);
+
+ failover_existing_slave_register(dev);
+
+ return failover;
+}
+EXPORT_SYMBOL_GPL(failover_register);
+
+/**
+ * failover_unregister - Unregister a failover instance
+ *
+ * @failover: pointer to failover instance
+ *
+ * Unregisters and frees a failover instance.
+ */
+void failover_unregister(struct failover *failover)
+{
+ struct net_device *failover_dev;
+
+ failover_dev = rcu_dereference(failover->failover_dev);
+
+ netdev_info(failover_dev, "failover master:%s unregistered\n",
+ failover_dev->name);
+
+ failover_dev->priv_flags &= ~IFF_FAILOVER;
+ dev_put(failover_dev);
+
+ spin_lock(&failover_lock);
+ list_del(&failover->list);
+ spin_unlock(&failover_lock);
+
+ kfree(failover);
+}
+EXPORT_SYMBOL_GPL(failover_unregister);
+
+static __init int
+failover_init(void)
+{
+ register_netdevice_notifier(&failover_notifier);
+
+ return 0;
+}
+module_init(failover_init);
+
+static __exit
+void failover_exit(void)
+{
+ unregister_netdevice_notifier(&failover_notifier);
+}
+module_exit(failover_exit);
+
+MODULE_DESCRIPTION("Generic failover infrastructure/interface");
+MODULE_LICENSE("GPL v2");