2019-05-27 08:55:01 +02:00
// SPDX-License-Identifier: GPL-2.0-or-later
2017-09-19 11:57:00 -04:00
/*
 * Handling of a master device, switching frames via its switch fabric CPU port
 *
 * Copyright (c) 2017 Savoir-faire Linux Inc.
 *	Vivien Didelot <vivien.didelot@savoirfairelinux.com>
 */
# include "dsa_priv.h"
2019-08-02 15:34:55 -04:00
static int dsa_master_get_regs_len ( struct net_device * dev )
{
struct dsa_port * cpu_dp = dev - > dsa_ptr ;
const struct ethtool_ops * ops = cpu_dp - > orig_ethtool_ops ;
struct dsa_switch * ds = cpu_dp - > ds ;
int port = cpu_dp - > index ;
int ret = 0 ;
int len ;
if ( ops - > get_regs_len ) {
len = ops - > get_regs_len ( dev ) ;
if ( len < 0 )
return len ;
ret + = len ;
}
ret + = sizeof ( struct ethtool_drvinfo ) ;
ret + = sizeof ( struct ethtool_regs ) ;
if ( ds - > ops - > get_regs_len ) {
len = ds - > ops - > get_regs_len ( ds , port ) ;
if ( len < 0 )
return len ;
ret + = len ;
}
return ret ;
}
static void dsa_master_get_regs ( struct net_device * dev ,
struct ethtool_regs * regs , void * data )
{
struct dsa_port * cpu_dp = dev - > dsa_ptr ;
const struct ethtool_ops * ops = cpu_dp - > orig_ethtool_ops ;
struct dsa_switch * ds = cpu_dp - > ds ;
struct ethtool_drvinfo * cpu_info ;
struct ethtool_regs * cpu_regs ;
int port = cpu_dp - > index ;
int len ;
if ( ops - > get_regs_len & & ops - > get_regs ) {
len = ops - > get_regs_len ( dev ) ;
if ( len < 0 )
return ;
regs - > len = len ;
ops - > get_regs ( dev , regs , data ) ;
data + = regs - > len ;
}
cpu_info = ( struct ethtool_drvinfo * ) data ;
2022-08-18 23:02:16 +02:00
strscpy ( cpu_info - > driver , " dsa " , sizeof ( cpu_info - > driver ) ) ;
2019-08-02 15:34:55 -04:00
data + = sizeof ( * cpu_info ) ;
cpu_regs = ( struct ethtool_regs * ) data ;
data + = sizeof ( * cpu_regs ) ;
if ( ds - > ops - > get_regs_len & & ds - > ops - > get_regs ) {
len = ds - > ops - > get_regs_len ( ds , port ) ;
if ( len < 0 )
return ;
cpu_regs - > len = len ;
ds - > ops - > get_regs ( ds , port , cpu_regs , data ) ;
}
}
2017-09-19 11:57:00 -04:00
/* Collect ethtool statistics for a DSA master: first the master's own
 * counters (through its original ethtool ops), then the counters of the
 * switch CPU port, appended right after them in @data.
 *
 * The offset of the switch counters is the number of master counters reported
 * by get_sset_count(ETH_SS_STATS). A negative (error) return from that
 * callback is treated as zero master counters, matching what
 * dsa_master_get_sset_count() reports, so that the switch never writes in
 * front of @data.
 */
static void dsa_master_get_ethtool_stats(struct net_device *dev,
					 struct ethtool_stats *stats,
					 uint64_t *data)
{
	struct dsa_port *cpu_dp = dev->dsa_ptr;
	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
	struct dsa_switch *ds = cpu_dp->ds;
	int port = cpu_dp->index;
	int count = 0;

	if (ops->get_sset_count && ops->get_ethtool_stats) {
		count = ops->get_sset_count(dev, ETH_SS_STATS);
		ops->get_ethtool_stats(dev, stats, data);
	}

	/* An error code must not be used as a (negative) pointer offset */
	if (count < 0)
		count = 0;

	if (ds->ops->get_ethtool_stats)
		ds->ops->get_ethtool_stats(ds, port, data + count);
}
2018-04-25 12:12:52 -07:00
/* Collect PHY statistics for a DSA master, followed by those of the switch
 * CPU port.
 *
 * The master part comes from the attached PHY device when the master has one
 * and its original ethtool ops lack get_ethtool_phy_stats; otherwise from the
 * original ops when they provide both get_sset_count and
 * get_ethtool_phy_stats. The switch part is written after the master entries;
 * a negative master count is treated as zero.
 */
static void dsa_master_get_ethtool_phy_stats(struct net_device *dev,
					     struct ethtool_stats *stats,
					     uint64_t *data)
{
	struct dsa_port *cpu_dp = dev->dsa_ptr;
	const struct ethtool_ops *master_ops = cpu_dp->orig_ethtool_ops;
	struct dsa_switch *ds = cpu_dp->ds;
	int n_master = 0;

	if (dev->phydev && !master_ops->get_ethtool_phy_stats) {
		n_master = phy_ethtool_get_sset_count(dev->phydev);
		if (n_master >= 0)
			phy_ethtool_get_stats(dev->phydev, stats, data);
	} else if (master_ops->get_sset_count &&
		   master_ops->get_ethtool_phy_stats) {
		n_master = master_ops->get_sset_count(dev, ETH_SS_PHY_STATS);
		master_ops->get_ethtool_phy_stats(dev, stats, data);
	}

	/* Errors count as "no master entries" for the switch offset */
	if (n_master < 0)
		n_master = 0;

	if (!ds->ops->get_ethtool_phy_stats)
		return;

	ds->ops->get_ethtool_phy_stats(ds, cpu_dp->index, data + n_master);
}
2017-09-19 11:57:00 -04:00
static int dsa_master_get_sset_count ( struct net_device * dev , int sset )
{
2017-09-29 17:19:20 -04:00
struct dsa_port * cpu_dp = dev - > dsa_ptr ;
2017-09-29 17:19:16 -04:00
const struct ethtool_ops * ops = cpu_dp - > orig_ethtool_ops ;
struct dsa_switch * ds = cpu_dp - > ds ;
2017-09-19 11:57:00 -04:00
int count = 0 ;
2018-04-25 12:12:52 -07:00
if ( sset = = ETH_SS_PHY_STATS & & dev - > phydev & &
! ops - > get_ethtool_phy_stats )
count = phy_ethtool_get_sset_count ( dev - > phydev ) ;
else if ( ops - > get_sset_count )
2018-04-25 12:12:50 -07:00
count = ops - > get_sset_count ( dev , sset ) ;
2018-04-25 12:12:52 -07:00
if ( count < 0 )
count = 0 ;
2017-09-19 11:57:00 -04:00
2018-04-25 12:12:50 -07:00
if ( ds - > ops - > get_sset_count )
count + = ds - > ops - > get_sset_count ( ds , cpu_dp - > index , sset ) ;
2017-09-19 11:57:00 -04:00
return count ;
}
/* Write the names of string set @stringset for a DSA master into @data: the
 * master's own strings first, then the strings of the switch CPU port.
 *
 * Each entry occupies ETH_GSTRING_LEN bytes. Every switch string is shifted
 * right by four bytes and prefixed with "pNN_" (NN being the CPU port index),
 * so the last four bytes of the name returned by the switch driver are
 * dropped.
 *
 * Switch strings are only emitted when the switch driver implements both
 * get_strings and get_sset_count: without a count the strings cannot be
 * post-processed, and dsa_master_get_sset_count() does not account for them
 * either.
 */
static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
				   uint8_t *data)
{
	struct dsa_port *cpu_dp = dev->dsa_ptr;
	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
	struct dsa_switch *ds = cpu_dp->ds;
	int port = cpu_dp->index;
	int len = ETH_GSTRING_LEN;
	int mcount = 0, count, i;
	uint8_t pfx[4];
	uint8_t *ndata;

	snprintf(pfx, sizeof(pfx), "p%.2d", port);
	/* We do not want to be NULL-terminated, since this is a prefix */
	pfx[sizeof(pfx) - 1] = '_';

	if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
	    !ops->get_ethtool_phy_stats) {
		mcount = phy_ethtool_get_sset_count(dev->phydev);
		if (mcount < 0)
			mcount = 0;
		else
			phy_ethtool_get_strings(dev->phydev, data);
	} else if (ops->get_sset_count && ops->get_strings) {
		mcount = ops->get_sset_count(dev, stringset);
		if (mcount < 0)
			mcount = 0;
		ops->get_strings(dev, stringset, data);
	}

	/* get_sset_count is optional for switch drivers; do not call through
	 * a NULL pointer when only get_strings is implemented.
	 */
	if (!ds->ops->get_strings || !ds->ops->get_sset_count)
		return;

	/* Switch strings start right after the master's entries */
	ndata = data + mcount * len;
	/* This function copies ETH_GSTRING_LEN bytes, we will mangle
	 * the output after to prepend our CPU port prefix we
	 * constructed earlier
	 */
	ds->ops->get_strings(ds, port, stringset, ndata);
	count = ds->ops->get_sset_count(ds, port, stringset);
	if (count < 0)
		return;

	for (i = 0; i < count; i++) {
		/* Regions overlap, hence memmove for the shift */
		memmove(ndata + (i * len + sizeof(pfx)),
			ndata + i * len, len - sizeof(pfx));
		memcpy(ndata + i * len, pfx, sizeof(pfx));
	}
}
net: dsa: Deny PTP on master if switch supports it
It is possible to kill PTP on a DSA switch completely and absolutely,
until a reboot, with a simple command:
tcpdump -i eth2 -j adapter_unsynced
where eth2 is the switch's DSA master.
Why? Well, in short, the PTP API in place today is a bit rudimentary and
relies on applications to retrieve the TX timestamps by polling the
error queue and looking at the cmsg structure. But there is no timestamp
identification of any sort (except whether it's HW or SW), you don't
know how many more timestamps are there to come, which one is this one,
from whom it is, etc. In other words, the SO_TIMESTAMPING API is
fundamentally limited in that you can get a single HW timestamp from the
stack.
And the "-j adapter_unsynced" flag of tcpdump enables hardware
timestamping.
So let's imagine what happens when the DSA master decides it wants to
deliver TX timestamps to the skb's socket too:
- The timestamp that the user space sees is taken by the DSA master.
Whereas the RX timestamp will eventually be overwritten by the DSA
switch. So the RX and TX timestamps will be in different time bases
(aka garbage).
- The user space applications have no way to deal with the second (real)
TX timestamp finally delivered by the DSA switch, or even to know to
wait for it.
Take ptp4l from the linuxptp project, for example. This is its behavior
after running tcpdump, before the patch:
ptp4l[172]: [6469.594] Unexpected data on socket err queue:
ptp4l[172]: [6469.693] rms 8 max 16 freq -21257 +/- 11 delay 748 +/- 0
ptp4l[172]: [6469.711] Unexpected data on socket err queue:
ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 03 aa 05 00 fd
ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: [6469.721] Unexpected data on socket err queue:
ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 10 02
ptp4l[172]: 0010 00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 01 c6 b1 00 fd
ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: [6469.838] Unexpected data on socket err queue:
ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 10 02
ptp4l[172]: 0010 00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 03 aa 06 00 fd
ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: [6469.848] Unexpected data on socket err queue:
ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 13 02
ptp4l[172]: 0010 00 36 00 00 02 00 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 04 1a 45 05 7f
ptp4l[172]: 0030 00 00 5e 05 41 32 27 c2 1a 68 00 04 9f ff fe 05
ptp4l[172]: 0040 de 06 00 01
ptp4l[172]: [6469.855] Unexpected data on socket err queue:
ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 10 02
ptp4l[172]: 0010 00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 01 c6 b2 00 fd
ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: [6469.974] Unexpected data on socket err queue:
ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 10 02
ptp4l[172]: 0010 00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 03 aa 07 00 fd
ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00
The ptp4l program itself is heavily patched to show this (more details
here [0]). Otherwise, by default it just hangs.
On the other hand, with the DSA patch to disallow HW timestamping
applied:
tcpdump -i eth2 -j adapter_unsynced
tcpdump: SIOCSHWTSTAMP failed: Device or resource busy
So it is a fact of life that PTP timestamping on the DSA master is
incompatible with timestamping on the switch MAC, at least with the
current API. And if the switch supports PTP, taking the timestamps from
the switch MAC is highly preferable anyway, due to the fact that those
don't contain the queuing latencies of the switch. So just disallow PTP
on the DSA master if there is any PTP-capable switch attached.
[0]: https://sourceforge.net/p/linuxptp/mailman/message/36880648/
Fixes: 0336369d3a4d ("net: dsa: forward hardware timestamping ioctls to switch driver")
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-28 15:30:46 +02:00
static int dsa_master_ioctl ( struct net_device * dev , struct ifreq * ifr , int cmd )
{
struct dsa_port * cpu_dp = dev - > dsa_ptr ;
struct dsa_switch * ds = cpu_dp - > ds ;
struct dsa_switch_tree * dst ;
int err = - EOPNOTSUPP ;
struct dsa_port * dp ;
dst = ds - > dst ;
switch ( cmd ) {
case SIOCGHWTSTAMP :
case SIOCSHWTSTAMP :
/* Deny PTP operations on master if there is at least one
* switch in the tree that is PTP capable .
*/
list_for_each_entry ( dp , & dst - > ports , list )
if ( dp - > ds - > ops - > port_hwtstamp_get | |
dp - > ds - > ops - > port_hwtstamp_set )
return - EBUSY ;
break ;
}
2021-07-27 15:45:13 +02:00
if ( dev - > netdev_ops - > ndo_eth_ioctl )
err = dev - > netdev_ops - > ndo_eth_ioctl ( dev , ifr , cmd ) ;
net: dsa: Deny PTP on master if switch supports it
It is possible to kill PTP on a DSA switch completely and absolutely,
until a reboot, with a simple command:
tcpdump -i eth2 -j adapter_unsynced
where eth2 is the switch's DSA master.
Why? Well, in short, the PTP API in place today is a bit rudimentary and
relies on applications to retrieve the TX timestamps by polling the
error queue and looking at the cmsg structure. But there is no timestamp
identification of any sorts (except whether it's HW or SW), you don't
know how many more timestamps are there to come, which one is this one,
from whom it is, etc. In other words, the SO_TIMESTAMPING API is
fundamentally limited in that you can get a single HW timestamp from the
stack.
And the "-j adapter_unsynced" flag of tcpdump enables hardware
timestamping.
So let's imagine what happens when the DSA master decides it wants to
deliver TX timestamps to the skb's socket too:
- The timestamp that the user space sees is taken by the DSA master.
Whereas the RX timestamp will eventually be overwritten by the DSA
switch. So the RX and TX timestamps will be in different time bases
(aka garbage).
- The user space applications have no way to deal with the second (real)
TX timestamp finally delivered by the DSA switch, or even to know to
wait for it.
Take ptp4l from the linuxptp project, for example. This is its behavior
after running tcpdump, before the patch:
ptp4l[172]: [6469.594] Unexpected data on socket err queue:
ptp4l[172]: [6469.693] rms 8 max 16 freq -21257 +/- 11 delay 748 +/- 0
ptp4l[172]: [6469.711] Unexpected data on socket err queue:
ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 03 aa 05 00 fd
ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: [6469.721] Unexpected data on socket err queue:
ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 10 02
ptp4l[172]: 0010 00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 01 c6 b1 00 fd
ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: [6469.838] Unexpected data on socket err queue:
ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 10 02
ptp4l[172]: 0010 00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 03 aa 06 00 fd
ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: [6469.848] Unexpected data on socket err queue:
ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 13 02
ptp4l[172]: 0010 00 36 00 00 02 00 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 04 1a 45 05 7f
ptp4l[172]: 0030 00 00 5e 05 41 32 27 c2 1a 68 00 04 9f ff fe 05
ptp4l[172]: 0040 de 06 00 01
ptp4l[172]: [6469.855] Unexpected data on socket err queue:
ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 10 02
ptp4l[172]: 0010 00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 01 c6 b2 00 fd
ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: [6469.974] Unexpected data on socket err queue:
ptp4l[172]: 0000 01 80 c2 00 00 0e 00 1f 7b 63 02 48 88 f7 10 02
ptp4l[172]: 0010 00 2c 00 00 02 00 00 00 00 00 00 00 00 00 00 00
ptp4l[172]: 0020 00 00 00 1f 7b ff fe 63 02 48 00 03 aa 07 00 fd
ptp4l[172]: 0030 00 00 00 00 00 00 00 00 00 00
The ptp4l program itself is heavily patched to show this (more details
here [0]). Otherwise, by default it just hangs.
On the other hand, with the DSA patch to disallow HW timestamping
applied:
tcpdump -i eth2 -j adapter_unsynced
tcpdump: SIOCSHWTSTAMP failed: Device or resource busy
So it is a fact of life that PTP timestamping on the DSA master is
incompatible with timestamping on the switch MAC, at least with the
current API. And if the switch supports PTP, taking the timestamps from
the switch MAC is highly preferable anyway, due to the fact that those
don't contain the queuing latencies of the switch. So just disallow PTP
on the DSA master if there is any PTP-capable switch attached.
[0]: https://sourceforge.net/p/linuxptp/mailman/message/36880648/
Fixes: 0336369d3a4d ("net: dsa: forward hardware timestamping ioctls to switch driver")
Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-28 15:30:46 +02:00
return err ;
}
2020-07-19 20:49:54 -07:00
static const struct dsa_netdevice_ops dsa_netdev_ops = {
2021-07-27 15:45:13 +02:00
. ndo_eth_ioctl = dsa_master_ioctl ,
2020-07-19 20:49:54 -07:00
} ;
2017-11-06 16:11:45 -05:00
static int dsa_master_ethtool_setup ( struct net_device * dev )
2017-09-19 11:57:00 -04:00
{
2017-09-29 17:19:20 -04:00
struct dsa_port * cpu_dp = dev - > dsa_ptr ;
2017-09-29 17:19:16 -04:00
struct dsa_switch * ds = cpu_dp - > ds ;
2017-09-19 11:57:00 -04:00
struct ethtool_ops * ops ;
ops = devm_kzalloc ( ds - > dev , sizeof ( * ops ) , GFP_KERNEL ) ;
if ( ! ops )
return - ENOMEM ;
2017-09-29 17:19:16 -04:00
cpu_dp - > orig_ethtool_ops = dev - > ethtool_ops ;
if ( cpu_dp - > orig_ethtool_ops )
memcpy ( ops , cpu_dp - > orig_ethtool_ops , sizeof ( * ops ) ) ;
2017-09-19 11:57:00 -04:00
2019-08-02 15:34:55 -04:00
ops - > get_regs_len = dsa_master_get_regs_len ;
ops - > get_regs = dsa_master_get_regs ;
2017-09-19 11:57:00 -04:00
ops - > get_sset_count = dsa_master_get_sset_count ;
ops - > get_ethtool_stats = dsa_master_get_ethtool_stats ;
ops - > get_strings = dsa_master_get_strings ;
2018-04-25 12:12:52 -07:00
ops - > get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats ;
2017-09-19 11:57:00 -04:00
dev - > ethtool_ops = ops ;
return 0 ;
}
2017-11-06 16:11:45 -05:00
static void dsa_master_ethtool_teardown ( struct net_device * dev )
2017-09-19 11:57:00 -04:00
{
2017-09-29 17:19:20 -04:00
struct dsa_port * cpu_dp = dev - > dsa_ptr ;
2017-09-19 11:57:00 -04:00
2017-09-29 17:19:16 -04:00
dev - > ethtool_ops = cpu_dp - > orig_ethtool_ops ;
cpu_dp - > orig_ethtool_ops = NULL ;
2017-09-19 11:57:00 -04:00
}
2017-11-06 16:11:45 -05:00
2020-07-19 20:49:54 -07:00
static void dsa_netdev_ops_set ( struct net_device * dev ,
const struct dsa_netdevice_ops * ops )
2019-01-15 14:43:04 -08:00
{
2020-07-19 20:49:54 -07:00
dev - > dsa_ptr - > netdev_ops = ops ;
2019-01-15 14:43:04 -08:00
}
net: dsa: avoid call to __dev_set_promiscuity() while rtnl_mutex isn't held
If the DSA master doesn't support IFF_UNICAST_FLT, then the following
call path is possible:
dsa_slave_switchdev_event_work
-> dsa_port_host_fdb_add
-> dev_uc_add
-> __dev_set_rx_mode
-> __dev_set_promiscuity
Since the blamed commit, dsa_slave_switchdev_event_work() no longer
holds rtnl_lock(), which triggers the ASSERT_RTNL() from
__dev_set_promiscuity().
Taking rtnl_lock() around dev_uc_add() is impossible, because all the
code paths that call dsa_flush_workqueue() do so from contexts where the
rtnl_mutex is already held - so this would lead to an instant deadlock.
dev_uc_add() in itself doesn't require the rtnl_mutex for protection.
There is this comment in __dev_set_rx_mode() which assumes so:
/* Unicast addresses changes may only happen under the rtnl,
* therefore calling __dev_set_promiscuity here is safe.
*/
but it is from commit 4417da668c00 ("[NET]: dev: secondary unicast
address support") dated June 2007, and in the meantime, commit
f1f28aa3510d ("netdev: Add addr_list_lock to struct net_device."), dated
July 2008, has added &dev->addr_list_lock to protect this instead of the
global rtnl_mutex.
Nonetheless, __dev_set_promiscuity() does assume rtnl_mutex protection,
but it is the uncommon path of what we typically expect dev_uc_add()
to do. So since only the uncommon path requires rtnl_lock(), just check
ahead of time whether dev_uc_add() would result in a call to
__dev_set_promiscuity(), and handle that condition separately.
DSA already configures the master interface to be promiscuous if the
tagger requires this. We can extend this to also cover the case where
the master doesn't handle dev_uc_add() (doesn't support IFF_UNICAST_FLT),
and on the premise that we'd end up making it promiscuous during
operation anyway, either if a DSA slave has a non-inherited MAC address,
or if the bridge notifies local FDB entries for its own MAC address, the
address of a station learned on a foreign port, etc.
Fixes: 0faf890fc519 ("net: dsa: drop rtnl_lock from dsa_slave_switchdev_event_work")
Reported-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-18 14:13:02 +02:00
/* Keep the master always promiscuous if the tagging protocol requires that
* ( garbles MAC DA ) or if it doesn ' t support unicast filtering , case in which
* it would revert to promiscuous mode as soon as we call dev_uc_add ( ) on it
* anyway .
*/
2020-09-26 22:32:02 +03:00
static void dsa_master_set_promiscuity ( struct net_device * dev , int inc )
{
const struct dsa_device_ops * ops = dev - > dsa_ptr - > tag_ops ;
net: dsa: avoid call to __dev_set_promiscuity() while rtnl_mutex isn't held
If the DSA master doesn't support IFF_UNICAST_FLT, then the following
call path is possible:
dsa_slave_switchdev_event_work
-> dsa_port_host_fdb_add
-> dev_uc_add
-> __dev_set_rx_mode
-> __dev_set_promiscuity
Since the blamed commit, dsa_slave_switchdev_event_work() no longer
holds rtnl_lock(), which triggers the ASSERT_RTNL() from
__dev_set_promiscuity().
Taking rtnl_lock() around dev_uc_add() is impossible, because all the
code paths that call dsa_flush_workqueue() do so from contexts where the
rtnl_mutex is already held - so this would lead to an instant deadlock.
dev_uc_add() in itself doesn't require the rtnl_mutex for protection.
There is this comment in __dev_set_rx_mode() which assumes so:
/* Unicast addresses changes may only happen under the rtnl,
* therefore calling __dev_set_promiscuity here is safe.
*/
but it is from commit 4417da668c00 ("[NET]: dev: secondary unicast
address support") dated June 2007, and in the meantime, commit
f1f28aa3510d ("netdev: Add addr_list_lock to struct net_device."), dated
July 2008, has added &dev->addr_list_lock to protect this instead of the
global rtnl_mutex.
Nonetheless, __dev_set_promiscuity() does assume rtnl_mutex protection,
but it is the uncommon path of what we typically expect dev_uc_add()
to do. So since only the uncommon path requires rtnl_lock(), just check
ahead of time whether dev_uc_add() would result into a call to
__dev_set_promiscuity(), and handle that condition separately.
DSA already configures the master interface to be promiscuous if the
tagger requires this. We can extend this to also cover the case where
the master doesn't handle dev_uc_add() (doesn't support IFF_UNICAST_FLT),
and on the premise that we'd end up making it promiscuous during
operation anyway, either if a DSA slave has a non-inherited MAC address,
or if the bridge notifies local FDB entries for its own MAC address, the
address of a station learned on a foreign port, etc.
Fixes: 0faf890fc519 ("net: dsa: drop rtnl_lock from dsa_slave_switchdev_event_work")
Reported-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-18 14:13:02 +02:00
if ( ( dev - > priv_flags & IFF_UNICAST_FLT ) & & ! ops - > promisc_on_master )
2020-09-26 22:32:02 +03:00
return ;
2022-01-06 01:11:15 +02:00
ASSERT_RTNL ( ) ;
2020-09-26 22:32:02 +03:00
dev_set_promiscuity ( dev , inc ) ;
}
2018-11-28 13:40:04 -08:00
static ssize_t tagging_show ( struct device * d , struct device_attribute * attr ,
char * buf )
{
struct net_device * dev = to_net_dev ( d ) ;
struct dsa_port * cpu_dp = dev - > dsa_ptr ;
return sprintf ( buf , " %s \n " ,
dsa_tag_protocol_to_str ( cpu_dp - > tag_ops ) ) ;
}
net: dsa: allow changing the tag protocol via the "tagging" device attribute
Currently DSA exposes the following sysfs:
$ cat /sys/class/net/eno2/dsa/tagging
ocelot
which is a read-only device attribute, introduced in the kernel as
commit 98cdb4807123 ("net: dsa: Expose tagging protocol to user-space"),
and used by libpcap since its commit 993db3800d7d ("Add support for DSA
link-layer types").
It would be nice if we could extend this device attribute by making it
writable:
$ echo ocelot-8021q > /sys/class/net/eno2/dsa/tagging
This is useful with DSA switches that can make use of more than one
tagging protocol. It may be useful in dsa_loop in the future too, to
perform offline testing of various taggers, or for changing between dsa
and edsa on Marvell switches, if that is desirable.
In terms of implementation, drivers can support this feature by
implementing .change_tag_protocol, which should always leave the switch
in a consistent state: either with the new protocol if things went well,
or with the old one if something failed. Teardown of the old protocol,
if necessary, must be handled by the driver.
Some things remain as before:
- The .get_tag_protocol is currently only called at probe time, to load
the initial tagging protocol driver. Nonetheless, new drivers should
report the tagging protocol in current use now.
- The driver should manage by itself the initial setup of tagging
protocol, no later than the .setup() method, as well as destroying
resources used by the last tagger in use, no earlier than the
.teardown() method.
For multi-switch DSA trees, error handling is a bit more complicated,
since e.g. the 5th out of 7 switches may fail to change the tag
protocol. When that happens, a revert to the original tag protocol is
attempted, but that may fail too, leaving the tree in an inconsistent
state despite each individual switch implementing .change_tag_protocol
transactionally. Since the intersection between drivers that implement
.change_tag_protocol and drivers that support D in DSA is currently the
empty set, the possibility for this error to happen is ignored for now.
Testing:
$ insmod mscc_felix.ko
[ 79.549784] mscc_felix 0000:00:00.5: Adding to iommu group 14
[ 79.565712] mscc_felix 0000:00:00.5: Failed to register DSA switch: -517
$ insmod tag_ocelot.ko
$ rmmod mscc_felix.ko
$ insmod mscc_felix.ko
[ 97.261724] libphy: VSC9959 internal MDIO bus: probed
[ 97.267363] mscc_felix 0000:00:00.5: Found PCS at internal MDIO address 0
[ 97.274998] mscc_felix 0000:00:00.5: Found PCS at internal MDIO address 1
[ 97.282561] mscc_felix 0000:00:00.5: Found PCS at internal MDIO address 2
[ 97.289700] mscc_felix 0000:00:00.5: Found PCS at internal MDIO address 3
[ 97.599163] mscc_felix 0000:00:00.5 swp0 (uninitialized): PHY [0000:00:00.3:10] driver [Microsemi GE VSC8514 SyncE] (irq=POLL)
[ 97.862034] mscc_felix 0000:00:00.5 swp1 (uninitialized): PHY [0000:00:00.3:11] driver [Microsemi GE VSC8514 SyncE] (irq=POLL)
[ 97.950731] mscc_felix 0000:00:00.5 swp0: configuring for inband/qsgmii link mode
[ 97.964278] 8021q: adding VLAN 0 to HW filter on device swp0
[ 98.146161] mscc_felix 0000:00:00.5 swp2 (uninitialized): PHY [0000:00:00.3:12] driver [Microsemi GE VSC8514 SyncE] (irq=POLL)
[ 98.238649] mscc_felix 0000:00:00.5 swp1: configuring for inband/qsgmii link mode
[ 98.251845] 8021q: adding VLAN 0 to HW filter on device swp1
[ 98.433916] mscc_felix 0000:00:00.5 swp3 (uninitialized): PHY [0000:00:00.3:13] driver [Microsemi GE VSC8514 SyncE] (irq=POLL)
[ 98.485542] mscc_felix 0000:00:00.5: configuring for fixed/internal link mode
[ 98.503584] mscc_felix 0000:00:00.5: Link is Up - 2.5Gbps/Full - flow control rx/tx
[ 98.527948] device eno2 entered promiscuous mode
[ 98.544755] DSA: tree 0 setup
$ ping 10.0.0.1
PING 10.0.0.1 (10.0.0.1): 56 data bytes
64 bytes from 10.0.0.1: seq=0 ttl=64 time=2.337 ms
64 bytes from 10.0.0.1: seq=1 ttl=64 time=0.754 ms
^C
--- 10.0.0.1 ping statistics ---
2 packets transmitted, 2 packets received, 0% packet loss
round-trip min/avg/max = 0.754/1.545/2.337 ms
$ cat /sys/class/net/eno2/dsa/tagging
ocelot
$ cat ./test_ocelot_8021q.sh
#!/bin/bash
ip link set swp0 down
ip link set swp1 down
ip link set swp2 down
ip link set swp3 down
ip link set swp5 down
ip link set eno2 down
echo ocelot-8021q > /sys/class/net/eno2/dsa/tagging
ip link set eno2 up
ip link set swp0 up
ip link set swp1 up
ip link set swp2 up
ip link set swp3 up
ip link set swp5 up
$ ./test_ocelot_8021q.sh
./test_ocelot_8021q.sh: line 9: echo: write error: Protocol not available
$ rmmod tag_ocelot.ko
rmmod: can't unload module 'tag_ocelot': Resource temporarily unavailable
$ insmod tag_ocelot_8021q.ko
$ ./test_ocelot_8021q.sh
$ cat /sys/class/net/eno2/dsa/tagging
ocelot-8021q
$ rmmod tag_ocelot.ko
$ rmmod tag_ocelot_8021q.ko
rmmod: can't unload module 'tag_ocelot_8021q': Resource temporarily unavailable
$ ping 10.0.0.1
PING 10.0.0.1 (10.0.0.1): 56 data bytes
64 bytes from 10.0.0.1: seq=0 ttl=64 time=0.953 ms
64 bytes from 10.0.0.1: seq=1 ttl=64 time=0.787 ms
64 bytes from 10.0.0.1: seq=2 ttl=64 time=0.771 ms
$ rmmod mscc_felix.ko
[ 645.544426] mscc_felix 0000:00:00.5: Link is Down
[ 645.838608] DSA: tree 0 torn down
$ rmmod tag_ocelot_8021q.ko
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-29 03:00:06 +02:00
/* sysfs store() for the "tagging" attribute: switch the DSA tree of this
 * master to the tagging protocol named in @buf.
 *
 * Returns @count on success (including when the requested tagger is already
 * in use), the error from dsa_find_tagger_by_name() when the lookup fails, or
 * the error from dsa_tree_change_tag_proto() when the switch-over fails.
 */
static ssize_t tagging_store(struct device *d, struct device_attribute *attr,
			     const char *buf, size_t count)
{
	struct net_device *dev = to_net_dev(d);
	struct dsa_port *cpu_dp = dev->dsa_ptr;
	const struct dsa_device_ops *old_tag_ops = cpu_dp->tag_ops;
	const struct dsa_device_ops *new_tag_ops;
	int err;

	new_tag_ops = dsa_find_tagger_by_name(buf);
	/* Bad tagger name, or module is not loaded? */
	if (IS_ERR(new_tag_ops))
		return PTR_ERR(new_tag_ops);

	if (new_tag_ops != old_tag_ops) {
		err = dsa_tree_change_tag_proto(cpu_dp->ds->dst, dev,
						new_tag_ops, old_tag_ops);
		if (err) {
			/* On failure the old tagger is restored, so we don't
			 * need the driver for the new one.
			 */
			dsa_tag_driver_put(new_tag_ops);
			return err;
		}
	}

	/* Either the protocol changed and the module for the old tagger is
	 * no longer needed, or the same tagger was requested and the
	 * temporarily held duplicate reference is dropped here.
	 */
	dsa_tag_driver_put(old_tag_ops);

	return count;
}
static DEVICE_ATTR_RW ( tagging ) ;
2018-11-28 13:40:04 -08:00
static struct attribute * dsa_slave_attrs [ ] = {
& dev_attr_tagging . attr ,
NULL
} ;
static const struct attribute_group dsa_group = {
. name = " dsa " ,
. attrs = dsa_slave_attrs ,
} ;
2022-03-31 16:28:54 +03:00
/* Set the MTU of @dev back to ETH_DATA_LEN. A failure is only reported
 * through a debug message; it is not propagated to the caller.
 */
static void dsa_master_reset_mtu(struct net_device *dev)
{
	int err;

	err = dev_set_mtu(dev, ETH_DATA_LEN);
	if (err)
		netdev_dbg(dev,
			   "Unable to reset MTU to exclude DSA overheads\n");
}
2017-11-06 16:11:45 -05:00
/* Attach DSA state to a master net_device.
 *
 * @dev: the master interface
 * @cpu_dp: the CPU port that gets published as dev->dsa_ptr
 *
 * Steps, in order: add a device link between ds->dev and dev->dev.parent,
 * set the master MTU to ETH_DATA_LEN plus the overhead of the tagging
 * protocol, issue a write barrier and publish cpu_dp in dev->dsa_ptr, call
 * dsa_master_set_promiscuity() with +1, run dsa_master_ethtool_setup(),
 * install dsa_netdev_ops and create the dsa_group sysfs group.
 *
 * A failed device link or a failed MTU change is only logged; neither aborts
 * the setup.
 *
 * Returns 0 on success, or the error from dsa_master_ethtool_setup() or
 * sysfs_create_group(). The error paths undo the netdev ops, the ethtool
 * setup and the promiscuity change, but leave dev->dsa_ptr and the MTU as
 * they were set above.
 */
int dsa_master_setup ( struct net_device * dev , struct dsa_port * cpu_dp )
{
2022-03-31 16:28:54 +03:00
const struct dsa_device_ops * tag_ops = cpu_dp - > tag_ops ;
net: dsa: unbind all switches from tree when DSA master unbinds
Currently the following happens when a DSA master driver unbinds while
there are DSA switches attached to it:
$ echo 0000:00:00.5 > /sys/bus/pci/drivers/mscc_felix/unbind
------------[ cut here ]------------
WARNING: CPU: 0 PID: 392 at net/core/dev.c:9507
Call trace:
rollback_registered_many+0x5fc/0x688
unregister_netdevice_queue+0x98/0x120
dsa_slave_destroy+0x4c/0x88
dsa_port_teardown.part.16+0x78/0xb0
dsa_tree_teardown_switches+0x58/0xc0
dsa_unregister_switch+0x104/0x1b8
felix_pci_remove+0x24/0x48
pci_device_remove+0x48/0xf0
device_release_driver_internal+0x118/0x1e8
device_driver_detach+0x28/0x38
unbind_store+0xd0/0x100
Located at the above location is this WARN_ON:
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
Other stacked interfaces, like VLAN, do indeed listen for
NETDEV_UNREGISTER on the real_dev and also unregister themselves at that
time, which is clearly the behavior that rollback_registered_many
expects. But DSA interfaces are not VLAN. They have backing hardware
(platform devices, PCI devices, MDIO, SPI etc) which have a life cycle
of their own and we can't just trigger an unregister from the DSA
framework when we receive a netdev notifier that the master unregisters.
Luckily, there is something we can do, and that is to inform the driver
core that we have a runtime dependency to the DSA master interface's
device, and create a device link where that is the supplier and we are
the consumer. Having this device link will make the DSA switch unbind
before the DSA master unbinds, which is enough to avoid the WARN_ON from
rollback_registered_many.
Note that even before the blamed commit, DSA did nothing intelligent
when the master interface got unregistered either. See the discussion
here:
https://lore.kernel.org/netdev/20200505210253.20311-1-f.fainelli@gmail.com/
But this time, at least the WARN_ON is loud enough that the
upper_dev_link commit can be blamed.
The advantage with this approach vs dev_hold(master) in the attached
link is that the latter is not meant for long term reference counting.
With dev_hold, the only thing that will happen is that when the user
attempts an unbind of the DSA master, netdev_wait_allrefs will keep
waiting and waiting, due to DSA keeping the refcount forever. DSA would
not access freed memory corresponding to the master interface, but the
unbind would still result in a freeze. Whereas with device links,
graceful teardown is ensured. It even works with cascaded DSA trees.
$ echo 0000:00:00.2 > /sys/bus/pci/drivers/fsl_enetc/unbind
[ 1818.797546] device swp0 left promiscuous mode
[ 1819.301112] sja1105 spi2.0: Link is Down
[ 1819.307981] DSA: tree 1 torn down
[ 1819.312408] device eno2 left promiscuous mode
[ 1819.656803] mscc_felix 0000:00:00.5: Link is Down
[ 1819.667194] DSA: tree 0 torn down
[ 1819.711557] fsl_enetc 0000:00:00.2 eno2: Link is Down
This approach allows us to keep the DSA framework absolutely unchanged,
and the driver core will just know to unbind us first when the master
goes away - as opposed to the large (and probably impossible) rework
required if attempting to listen for NETDEV_UNREGISTER.
As per the documentation at Documentation/driver-api/device_link.rst,
specifying the DL_FLAG_AUTOREMOVE_CONSUMER flag causes the device link
to be automatically purged when the consumer fails to probe or later
unbinds. So we don't need to keep the consumer_link variable in struct
dsa_switch.
Fixes: 2f1e8ea726e9 ("net: dsa: link interfaces with the DSA master to get rid of lockdep warnings")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20210111230943.3701806-1-olteanv@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-12 01:09:43 +02:00
struct dsa_switch * ds = cpu_dp - > ds ;
struct device_link * consumer_link ;
2022-03-31 16:28:54 +03:00
int mtu , ret ;
mtu = ETH_DATA_LEN + dsa_tag_protocol_overhead ( tag_ops ) ;
2018-11-28 13:40:04 -08:00
net: dsa: unbind all switches from tree when DSA master unbinds
Currently the following happens when a DSA master driver unbinds while
there are DSA switches attached to it:
$ echo 0000:00:00.5 > /sys/bus/pci/drivers/mscc_felix/unbind
------------[ cut here ]------------
WARNING: CPU: 0 PID: 392 at net/core/dev.c:9507
Call trace:
rollback_registered_many+0x5fc/0x688
unregister_netdevice_queue+0x98/0x120
dsa_slave_destroy+0x4c/0x88
dsa_port_teardown.part.16+0x78/0xb0
dsa_tree_teardown_switches+0x58/0xc0
dsa_unregister_switch+0x104/0x1b8
felix_pci_remove+0x24/0x48
pci_device_remove+0x48/0xf0
device_release_driver_internal+0x118/0x1e8
device_driver_detach+0x28/0x38
unbind_store+0xd0/0x100
Located at the above location is this WARN_ON:
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
Other stacked interfaces, like VLAN, do indeed listen for
NETDEV_UNREGISTER on the real_dev and also unregister themselves at that
time, which is clearly the behavior that rollback_registered_many
expects. But DSA interfaces are not VLAN. They have backing hardware
(platform devices, PCI devices, MDIO, SPI etc) which have a life cycle
of their own and we can't just trigger an unregister from the DSA
framework when we receive a netdev notifier that the master unregisters.
Luckily, there is something we can do, and that is to inform the driver
core that we have a runtime dependency to the DSA master interface's
device, and create a device link where that is the supplier and we are
the consumer. Having this device link will make the DSA switch unbind
before the DSA master unbinds, which is enough to avoid the WARN_ON from
rollback_registered_many.
Note that even before the blamed commit, DSA did nothing intelligent
when the master interface got unregistered either. See the discussion
here:
https://lore.kernel.org/netdev/20200505210253.20311-1-f.fainelli@gmail.com/
But this time, at least the WARN_ON is loud enough that the
upper_dev_link commit can be blamed.
The advantage with this approach vs dev_hold(master) in the attached
link is that the latter is not meant for long term reference counting.
With dev_hold, the only thing that will happen is that when the user
attempts an unbind of the DSA master, netdev_wait_allrefs will keep
waiting and waiting, due to DSA keeping the refcount forever. DSA would
not access freed memory corresponding to the master interface, but the
unbind would still result in a freeze. Whereas with device links,
graceful teardown is ensured. It even works with cascaded DSA trees.
$ echo 0000:00:00.2 > /sys/bus/pci/drivers/fsl_enetc/unbind
[ 1818.797546] device swp0 left promiscuous mode
[ 1819.301112] sja1105 spi2.0: Link is Down
[ 1819.307981] DSA: tree 1 torn down
[ 1819.312408] device eno2 left promiscuous mode
[ 1819.656803] mscc_felix 0000:00:00.5: Link is Down
[ 1819.667194] DSA: tree 0 torn down
[ 1819.711557] fsl_enetc 0000:00:00.2 eno2: Link is Down
This approach allows us to keep the DSA framework absolutely unchanged,
and the driver core will just know to unbind us first when the master
goes away - as opposed to the large (and probably impossible) rework
required if attempting to listen for NETDEV_UNREGISTER.
As per the documentation at Documentation/driver-api/device_link.rst,
specifying the DL_FLAG_AUTOREMOVE_CONSUMER flag causes the device link
to be automatically purged when the consumer fails to probe or later
unbinds. So we don't need to keep the consumer_link variable in struct
dsa_switch.
Fixes: 2f1e8ea726e9 ("net: dsa: link interfaces with the DSA master to get rid of lockdep warnings")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20210111230943.3701806-1-olteanv@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-12 01:09:43 +02:00
/* The DSA master must use SET_NETDEV_DEV for this to work. */
consumer_link = device_link_add ( ds - > dev , dev - > dev . parent ,
DL_FLAG_AUTOREMOVE_CONSUMER ) ;
if ( ! consumer_link )
netdev_err ( dev ,
" Failed to create a device link to DSA switch %s \n " ,
dev_name ( ds - > dev ) ) ;
2022-03-31 16:28:54 +03:00
/* The switch driver may not implement ->port_change_mtu(), case in
* which dsa_slave_change_mtu ( ) will not update the master MTU either ,
* so we need to do that here .
*/
ret = dev_set_mtu ( dev , mtu ) ;
if ( ret )
netdev_warn ( dev , " error %d setting MTU to %d to include DSA overhead \n " ,
ret , mtu ) ;
2017-11-06 16:11:45 -05:00
/* If we use a tagging format that doesn't have an ethertype
* field , make sure that all packets from this point on get
* sent to the tag format ' s receive function .
*/
wmb ( ) ;
dev - > dsa_ptr = cpu_dp ;
2020-09-26 22:32:02 +03:00
dsa_master_set_promiscuity ( dev , 1 ) ;
2018-11-28 13:40:04 -08:00
ret = dsa_master_ethtool_setup ( dev ) ;
if ( ret )
2020-09-26 22:32:02 +03:00
goto out_err_reset_promisc ;
2018-11-28 13:40:04 -08:00
2020-07-19 20:49:54 -07:00
dsa_netdev_ops_set ( dev , & dsa_netdev_ops ) ;
2019-01-15 14:43:04 -08:00
2018-11-28 13:40:04 -08:00
ret = sysfs_create_group ( & dev - > dev . kobj , & dsa_group ) ;
if ( ret )
2019-01-15 14:43:04 -08:00
goto out_err_ndo_teardown ;
/* Success: ret is 0 at this point. */
return ret ;
2018-11-28 13:40:04 -08:00
2019-01-15 14:43:04 -08:00
/* Error unwinding: the labels below undo the steps above in reverse order. */
out_err_ndo_teardown :
2020-07-19 20:49:54 -07:00
dsa_netdev_ops_set ( dev , NULL ) ;
2019-01-15 14:43:04 -08:00
dsa_master_ethtool_teardown ( dev ) ;
2020-09-26 22:32:02 +03:00
out_err_reset_promisc :
dsa_master_set_promiscuity ( dev , - 1 ) ;
2018-11-28 13:40:04 -08:00
return ret ;
2017-11-06 16:11:45 -05:00
}
/* Detach DSA state from a master net_device.
 *
 * @dev: the master interface
 *
 * Removes the dsa_group sysfs group, clears the netdev ops installed through
 * dsa_netdev_ops_set(), runs dsa_master_ethtool_teardown(), resets the MTU
 * through dsa_master_reset_mtu(), calls dsa_master_set_promiscuity() with -1
 * and finally clears dev->dsa_ptr, followed by a write barrier.
 */
void dsa_master_teardown ( struct net_device * dev )
{
2018-11-28 13:40:04 -08:00
sysfs_remove_group ( & dev - > dev . kobj , & dsa_group ) ;
2020-07-19 20:49:54 -07:00
dsa_netdev_ops_set ( dev , NULL ) ;
2017-11-06 16:11:45 -05:00
dsa_master_ethtool_teardown ( dev ) ;
2022-03-31 16:28:54 +03:00
dsa_master_reset_mtu ( dev ) ;
2020-09-26 22:32:02 +03:00
dsa_master_set_promiscuity ( dev , - 1 ) ;
2017-11-06 16:11:45 -05:00
dev - > dsa_ptr = NULL ;
/* If we used a tagging format that doesn't have an ethertype
* field , make sure that all packets from this point get sent
* without the tag and go through the regular receive path .
*/
wmb ( ) ;
}