Merge branch 'lag-offload-for-ocelot-dsa-switches'

Vladimir Oltean says:

====================
LAG offload for Ocelot DSA switches

This patch series reworks the ocelot switchdev driver such that it could
share the same implementation for LAG offload as the felix DSA driver.

Testing has been done in the following topology:

         +----------------------------------+
         | Board 1         br0              |
         |             +---------+          |
         |            /           \         |
         |            |           |         |
         |            |         bond0       |
         |            |        +-----+      |
         |            |       /       \     |
         |  eno0     swp0    swp1    swp2   |
         +---|--------|-------|-------|-----+
             |        |       |       |
             +--------+       |       |
               Cable          |       |
                         Cable|       |Cable
               Cable          |       |
             +--------+       |       |
             |        |       |       |
         +---|--------|-------|-------|-----+
         |  eno0     swp0    swp1    swp2   |
         |            |       \       /     |
         |            |        +-----+      |
         |            |         bond0       |
         |            |           |         |
         |            \           /         |
         |             +---------+          |
         | Board 2         br0              |
         +----------------------------------+

The same script can be run on both Board 1 and Board 2 to set this up:

ip link del bond0
ip link add bond0 type bond mode balance-xor miimon 1
OR
ip link add bond0 type bond mode 802.3ad
ip link set swp1 down && ip link set swp1 master bond0 && ip link set swp1 up
ip link set swp2 down && ip link set swp2 master bond0 && ip link set swp2 up
ip link del br0
ip link add br0 type bridge
ip link set bond0 master br0
ip link set swp0 master br0

Then traffic can be tested between eno0 of Board 1 and eno0 of Board 2.
====================

Link: https://lore.kernel.org/r/20210205220221.255646-1-olteanv@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2021-02-06 14:51:53 -08:00
commit bfc213f159
7 changed files with 266 additions and 147 deletions

View File

@ -569,6 +569,35 @@ static void felix_bridge_leave(struct dsa_switch *ds, int port,
ocelot_port_bridge_leave(ocelot, port, br);
}
static int felix_lag_join(struct dsa_switch *ds, int port,
struct net_device *bond,
struct netdev_lag_upper_info *info)
{
struct ocelot *ocelot = ds->priv;
return ocelot_port_lag_join(ocelot, port, bond, info);
}
static int felix_lag_leave(struct dsa_switch *ds, int port,
struct net_device *bond)
{
struct ocelot *ocelot = ds->priv;
ocelot_port_lag_leave(ocelot, port, bond);
return 0;
}
static int felix_lag_change(struct dsa_switch *ds, int port)
{
struct dsa_port *dp = dsa_to_port(ds, port);
struct ocelot *ocelot = ds->priv;
ocelot_port_lag_change(ocelot, port, dp->lag_tx_enabled);
return 0;
}
static int felix_vlan_prepare(struct dsa_switch *ds, int port,
const struct switchdev_obj_port_vlan *vlan)
{
@ -1331,6 +1360,9 @@ const struct dsa_switch_ops felix_switch_ops = {
.port_mdb_del = felix_mdb_del,
.port_bridge_join = felix_bridge_join,
.port_bridge_leave = felix_bridge_leave,
.port_lag_join = felix_lag_join,
.port_lag_leave = felix_lag_leave,
.port_lag_change = felix_lag_change,
.port_stp_state_set = felix_bridge_stp_state_set,
.port_vlan_filtering = felix_vlan_filtering,
.port_vlan_add = felix_vlan_add,

View File

@ -889,6 +889,29 @@ int ocelot_get_ts_info(struct ocelot *ocelot, int port,
}
EXPORT_SYMBOL(ocelot_get_ts_info);
static u32 ocelot_get_bond_mask(struct ocelot *ocelot, struct net_device *bond,
bool only_active_ports)
{
u32 mask = 0;
int port;
for (port = 0; port < ocelot->num_phys_ports; port++) {
struct ocelot_port *ocelot_port = ocelot->ports[port];
if (!ocelot_port)
continue;
if (ocelot_port->bond == bond) {
if (only_active_ports && !ocelot_port->lag_tx_active)
continue;
mask |= BIT(port);
}
}
return mask;
}
static u32 ocelot_get_dsa_8021q_cpu_mask(struct ocelot *ocelot)
{
u32 mask = 0;
@ -939,20 +962,12 @@ void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot)
mask = GENMASK(ocelot->num_phys_ports - 1, 0);
mask &= ~cpu_fwd_mask;
} else if (ocelot->bridge_fwd_mask & BIT(port)) {
int lag;
struct net_device *bond = ocelot_port->bond;
mask = ocelot->bridge_fwd_mask & ~BIT(port);
for (lag = 0; lag < ocelot->num_phys_ports; lag++) {
unsigned long bond_mask = ocelot->lags[lag];
if (!bond_mask)
continue;
if (bond_mask & BIT(port)) {
mask &= ~bond_mask;
break;
}
if (bond) {
mask &= ~ocelot_get_bond_mask(ocelot, bond,
false);
}
} else {
/* Standalone ports forward only to DSA tag_8021q CPU
@ -1259,6 +1274,7 @@ EXPORT_SYMBOL(ocelot_port_bridge_leave);
static void ocelot_set_aggr_pgids(struct ocelot *ocelot)
{
unsigned long visited = GENMASK(ocelot->num_phys_ports - 1, 0);
int i, port, lag;
/* Reset destination and aggregation PGIDS */
@ -1269,22 +1285,40 @@ static void ocelot_set_aggr_pgids(struct ocelot *ocelot)
ocelot_write_rix(ocelot, GENMASK(ocelot->num_phys_ports - 1, 0),
ANA_PGID_PGID, i);
/* Now, set PGIDs for each LAG */
/* The visited ports bitmask holds the list of ports offloading any
* bonding interface. Initially we mark all these ports as unvisited,
* then every time we visit a port in this bitmask, we know that it is
* the lowest numbered port, i.e. the one whose logical ID == physical
* port ID == LAG ID. So we mark as visited all further ports in the
* bitmask that are offloading the same bonding interface. This way,
* we set up the aggregation PGIDs only once per bonding interface.
*/
for (port = 0; port < ocelot->num_phys_ports; port++) {
struct ocelot_port *ocelot_port = ocelot->ports[port];
if (!ocelot_port || !ocelot_port->bond)
continue;
visited &= ~BIT(port);
}
/* Now, set PGIDs for each active LAG */
for (lag = 0; lag < ocelot->num_phys_ports; lag++) {
struct net_device *bond = ocelot->ports[lag]->bond;
int num_active_ports = 0;
unsigned long bond_mask;
int aggr_count = 0;
u8 aggr_idx[16];
bond_mask = ocelot->lags[lag];
if (!bond_mask)
if (!bond || (visited & BIT(lag)))
continue;
bond_mask = ocelot_get_bond_mask(ocelot, bond, true);
for_each_set_bit(port, &bond_mask, ocelot->num_phys_ports) {
// Destination mask
ocelot_write_rix(ocelot, bond_mask,
ANA_PGID_PGID, port);
aggr_idx[aggr_count] = port;
aggr_count++;
aggr_idx[num_active_ports++] = port;
}
for_each_aggr_pgid(ocelot, i) {
@ -1292,63 +1326,73 @@ static void ocelot_set_aggr_pgids(struct ocelot *ocelot)
ac = ocelot_read_rix(ocelot, ANA_PGID_PGID, i);
ac &= ~bond_mask;
ac |= BIT(aggr_idx[i % aggr_count]);
/* Don't do division by zero if there was no active
* port. Just make all aggregation codes zero.
*/
if (num_active_ports)
ac |= BIT(aggr_idx[i % num_active_ports]);
ocelot_write_rix(ocelot, ac, ANA_PGID_PGID, i);
}
/* Mark all ports in the same LAG as visited to avoid applying
* the same config again.
*/
for (port = lag; port < ocelot->num_phys_ports; port++) {
struct ocelot_port *ocelot_port = ocelot->ports[port];
if (!ocelot_port)
continue;
if (ocelot_port->bond == bond)
visited |= BIT(port);
}
}
}
static void ocelot_setup_lag(struct ocelot *ocelot, int lag)
/* When offloading a bonding interface, the switch ports configured under the
* same bond must have the same logical port ID, equal to the physical port ID
* of the lowest numbered physical port in that bond. Otherwise, in standalone/
* bridged mode, each port has a logical port ID equal to its physical port ID.
*/
static void ocelot_setup_logical_port_ids(struct ocelot *ocelot)
{
unsigned long bond_mask = ocelot->lags[lag];
unsigned int p;
int port;
for_each_set_bit(p, &bond_mask, ocelot->num_phys_ports) {
u32 port_cfg = ocelot_read_gix(ocelot, ANA_PORT_PORT_CFG, p);
for (port = 0; port < ocelot->num_phys_ports; port++) {
struct ocelot_port *ocelot_port = ocelot->ports[port];
struct net_device *bond;
port_cfg &= ~ANA_PORT_PORT_CFG_PORTID_VAL_M;
if (!ocelot_port)
continue;
/* Use lag port as logical port for port i */
ocelot_write_gix(ocelot, port_cfg |
ANA_PORT_PORT_CFG_PORTID_VAL(lag),
ANA_PORT_PORT_CFG, p);
bond = ocelot_port->bond;
if (bond) {
int lag = __ffs(ocelot_get_bond_mask(ocelot, bond,
false));
ocelot_rmw_gix(ocelot,
ANA_PORT_PORT_CFG_PORTID_VAL(lag),
ANA_PORT_PORT_CFG_PORTID_VAL_M,
ANA_PORT_PORT_CFG, port);
} else {
ocelot_rmw_gix(ocelot,
ANA_PORT_PORT_CFG_PORTID_VAL(port),
ANA_PORT_PORT_CFG_PORTID_VAL_M,
ANA_PORT_PORT_CFG, port);
}
}
}
int ocelot_port_lag_join(struct ocelot *ocelot, int port,
struct net_device *bond)
struct net_device *bond,
struct netdev_lag_upper_info *info)
{
struct net_device *ndev;
u32 bond_mask = 0;
int lag, lp;
if (info->tx_type != NETDEV_LAG_TX_TYPE_HASH)
return -EOPNOTSUPP;
rcu_read_lock();
for_each_netdev_in_bond_rcu(bond, ndev) {
struct ocelot_port_private *priv = netdev_priv(ndev);
ocelot->ports[port]->bond = bond;
bond_mask |= BIT(priv->chip_port);
}
rcu_read_unlock();
lp = __ffs(bond_mask);
/* If the new port is the lowest one, use it as the logical port from
* now on
*/
if (port == lp) {
lag = port;
ocelot->lags[port] = bond_mask;
bond_mask &= ~BIT(port);
if (bond_mask) {
lp = __ffs(bond_mask);
ocelot->lags[lp] = 0;
}
} else {
lag = lp;
ocelot->lags[lp] |= BIT(port);
}
ocelot_setup_lag(ocelot, lag);
ocelot_setup_logical_port_ids(ocelot);
ocelot_apply_bridge_fwd_mask(ocelot);
ocelot_set_aggr_pgids(ocelot);
@ -1359,35 +1403,25 @@ EXPORT_SYMBOL(ocelot_port_lag_join);
void ocelot_port_lag_leave(struct ocelot *ocelot, int port,
struct net_device *bond)
{
u32 port_cfg;
int i;
/* Remove port from any lag */
for (i = 0; i < ocelot->num_phys_ports; i++)
ocelot->lags[i] &= ~BIT(port);
/* if it was the logical port of the lag, move the lag config to the
* next port
*/
if (ocelot->lags[port]) {
int n = __ffs(ocelot->lags[port]);
ocelot->lags[n] = ocelot->lags[port];
ocelot->lags[port] = 0;
ocelot_setup_lag(ocelot, n);
}
port_cfg = ocelot_read_gix(ocelot, ANA_PORT_PORT_CFG, port);
port_cfg &= ~ANA_PORT_PORT_CFG_PORTID_VAL_M;
ocelot_write_gix(ocelot, port_cfg | ANA_PORT_PORT_CFG_PORTID_VAL(port),
ANA_PORT_PORT_CFG, port);
ocelot->ports[port]->bond = NULL;
ocelot_setup_logical_port_ids(ocelot);
ocelot_apply_bridge_fwd_mask(ocelot);
ocelot_set_aggr_pgids(ocelot);
}
EXPORT_SYMBOL(ocelot_port_lag_leave);
void ocelot_port_lag_change(struct ocelot *ocelot, int port, bool lag_tx_active)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
ocelot_port->lag_tx_active = lag_tx_active;
/* Rebalance the LAGs */
ocelot_set_aggr_pgids(ocelot);
}
EXPORT_SYMBOL(ocelot_port_lag_change);
/* Configure the maximum SDU (L2 payload) on RX to the value specified in @sdu.
* The length of VLAN tags is accounted for automatically via DEV_MAC_TAGS_CFG.
* In the special case that it's the NPI port that we're configuring, the
@ -1563,11 +1597,6 @@ int ocelot_init(struct ocelot *ocelot)
}
}
ocelot->lags = devm_kcalloc(ocelot->dev, ocelot->num_phys_ports,
sizeof(u32), GFP_KERNEL);
if (!ocelot->lags)
return -ENOMEM;
ocelot->stats = devm_kcalloc(ocelot->dev,
ocelot->num_phys_ports * ocelot->num_stats,
sizeof(u64), GFP_KERNEL);
@ -1611,7 +1640,10 @@ int ocelot_init(struct ocelot *ocelot)
ocelot_write(ocelot, ANA_AGGR_CFG_AC_SMAC_ENA |
ANA_AGGR_CFG_AC_DMAC_ENA |
ANA_AGGR_CFG_AC_IP4_SIPDIP_ENA |
ANA_AGGR_CFG_AC_IP4_TCPUDP_ENA, ANA_AGGR_CFG);
ANA_AGGR_CFG_AC_IP4_TCPUDP_ENA |
ANA_AGGR_CFG_AC_IP6_FLOW_LBL_ENA |
ANA_AGGR_CFG_AC_IP6_TCPUDP_ENA,
ANA_AGGR_CFG);
/* Set MAC age time to default value. The entry is aged after
* 2*AGE_PERIOD

View File

@ -109,10 +109,6 @@ int ocelot_mact_learn(struct ocelot *ocelot, int port,
unsigned int vid, enum macaccess_entry_type type);
int ocelot_mact_forget(struct ocelot *ocelot,
const unsigned char mac[ETH_ALEN], unsigned int vid);
int ocelot_port_lag_join(struct ocelot *ocelot, int port,
struct net_device *bond);
void ocelot_port_lag_leave(struct ocelot *ocelot, int port,
struct net_device *bond);
struct net_device *ocelot_port_to_netdev(struct ocelot *ocelot, int port);
int ocelot_netdev_to_port(struct net_device *dev);

View File

@ -1110,9 +1110,8 @@ static int ocelot_port_obj_del(struct net_device *dev,
return ret;
}
static int ocelot_netdevice_port_event(struct net_device *dev,
unsigned long event,
struct netdev_notifier_changeupper_info *info)
static int ocelot_netdevice_changeupper(struct net_device *dev,
struct netdev_notifier_changeupper_info *info)
{
struct ocelot_port_private *priv = netdev_priv(dev);
struct ocelot_port *ocelot_port = &priv->port;
@ -1120,71 +1119,103 @@ static int ocelot_netdevice_port_event(struct net_device *dev,
int port = priv->chip_port;
int err = 0;
switch (event) {
case NETDEV_CHANGEUPPER:
if (netif_is_bridge_master(info->upper_dev)) {
if (info->linking) {
err = ocelot_port_bridge_join(ocelot, port,
info->upper_dev);
} else {
err = ocelot_port_bridge_leave(ocelot, port,
info->upper_dev);
}
}
if (netif_is_lag_master(info->upper_dev)) {
if (info->linking)
err = ocelot_port_lag_join(ocelot, port,
info->upper_dev);
else
ocelot_port_lag_leave(ocelot, port,
if (netif_is_bridge_master(info->upper_dev)) {
if (info->linking) {
err = ocelot_port_bridge_join(ocelot, port,
info->upper_dev);
} else {
err = ocelot_port_bridge_leave(ocelot, port,
info->upper_dev);
}
}
if (netif_is_lag_master(info->upper_dev)) {
if (info->linking) {
err = ocelot_port_lag_join(ocelot, port,
info->upper_dev,
info->upper_info);
if (err == -EOPNOTSUPP) {
NL_SET_ERR_MSG_MOD(info->info.extack,
"Offloading not supported");
err = 0;
}
} else {
ocelot_port_lag_leave(ocelot, port,
info->upper_dev);
}
break;
default:
break;
}
return err;
return notifier_from_errno(err);
}
static int
ocelot_netdevice_lag_changeupper(struct net_device *dev,
struct netdev_notifier_changeupper_info *info)
{
struct net_device *lower;
struct list_head *iter;
int err = NOTIFY_DONE;
netdev_for_each_lower_dev(dev, lower, iter) {
err = ocelot_netdevice_changeupper(lower, info);
if (err)
return notifier_from_errno(err);
}
return NOTIFY_DONE;
}
static int
ocelot_netdevice_changelowerstate(struct net_device *dev,
struct netdev_lag_lower_state_info *info)
{
struct ocelot_port_private *priv = netdev_priv(dev);
bool is_active = info->link_up && info->tx_enabled;
struct ocelot_port *ocelot_port = &priv->port;
struct ocelot *ocelot = ocelot_port->ocelot;
int port = priv->chip_port;
if (!ocelot_port->bond)
return NOTIFY_DONE;
if (ocelot_port->lag_tx_active == is_active)
return NOTIFY_DONE;
ocelot_port_lag_change(ocelot, port, is_active);
return NOTIFY_OK;
}
static int ocelot_netdevice_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct netdev_notifier_changeupper_info *info = ptr;
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
int ret = 0;
if (event == NETDEV_PRECHANGEUPPER &&
ocelot_netdevice_dev_check(dev) &&
netif_is_lag_master(info->upper_dev)) {
struct netdev_lag_upper_info *lag_upper_info = info->upper_info;
struct netlink_ext_ack *extack;
switch (event) {
case NETDEV_CHANGEUPPER: {
struct netdev_notifier_changeupper_info *info = ptr;
if (lag_upper_info &&
lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH) {
extack = netdev_notifier_info_to_extack(&info->info);
NL_SET_ERR_MSG_MOD(extack, "LAG device using unsupported Tx type");
if (ocelot_netdevice_dev_check(dev))
return ocelot_netdevice_changeupper(dev, info);
ret = -EINVAL;
goto notify;
}
if (netif_is_lag_master(dev))
return ocelot_netdevice_lag_changeupper(dev, info);
break;
}
case NETDEV_CHANGELOWERSTATE: {
struct netdev_notifier_changelowerstate_info *info = ptr;
if (!ocelot_netdevice_dev_check(dev))
break;
return ocelot_netdevice_changelowerstate(dev,
info->lower_state_info);
}
default:
break;
}
if (netif_is_lag_master(dev)) {
struct net_device *slave;
struct list_head *iter;
netdev_for_each_lower_dev(dev, slave, iter) {
ret = ocelot_netdevice_port_event(slave, event, info);
if (ret)
goto notify;
}
} else {
ret = ocelot_netdevice_port_event(dev, event, info);
}
notify:
return notifier_from_errno(ret);
return NOTIFY_DONE;
}
struct notifier_block ocelot_netdevice_nb __read_mostly = {

View File

@ -611,6 +611,9 @@ struct ocelot_port {
u8 *xmit_template;
bool is_dsa_8021q_cpu;
struct net_device *bond;
bool lag_tx_active;
};
struct ocelot {
@ -655,8 +658,6 @@ struct ocelot {
enum ocelot_tag_prefix npi_inj_prefix;
enum ocelot_tag_prefix npi_xtr_prefix;
u32 *lags;
struct list_head multicast;
struct list_head pgids;
@ -797,6 +798,12 @@ int ocelot_port_mdb_add(struct ocelot *ocelot, int port,
const struct switchdev_obj_port_mdb *mdb);
int ocelot_port_mdb_del(struct ocelot *ocelot, int port,
const struct switchdev_obj_port_mdb *mdb);
int ocelot_port_lag_join(struct ocelot *ocelot, int port,
struct net_device *bond,
struct netdev_lag_upper_info *info);
void ocelot_port_lag_leave(struct ocelot *ocelot, int port,
struct net_device *bond);
void ocelot_port_lag_change(struct ocelot *ocelot, int port, bool lag_tx_active);
int ocelot_devlink_sb_register(struct ocelot *ocelot);
void ocelot_devlink_sb_unregister(struct ocelot *ocelot);

View File

@ -209,6 +209,19 @@ static inline bool dsa_port_offloads_netdev(struct dsa_port *dp,
return false;
}
/* Returns true if any port of this tree offloads the given net_device */
static inline bool dsa_tree_offloads_netdev(struct dsa_switch_tree *dst,
struct net_device *dev)
{
struct dsa_port *dp;
list_for_each_entry(dp, &dst->ports, list)
if (dsa_port_offloads_netdev(dp, dev))
return true;
return false;
}
/* slave.c */
extern const struct dsa_device_ops notag_netdev_ops;
void dsa_slave_mii_bus_init(struct dsa_switch *ds);

View File

@ -2242,6 +2242,14 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
if (!dp->ds->assisted_learning_on_cpu_port)
return NOTIFY_DONE;
/* When the bridge learns an address on an offloaded
* LAG we don't want to send traffic to the CPU, the
* other ports bridged with the LAG should be able to
* autonomously forward towards it.
*/
if (dsa_tree_offloads_netdev(dp->ds->dst, dev))
return NOTIFY_DONE;
}
if (!dp->ds->ops->port_fdb_add || !dp->ds->ops->port_fdb_del)