2019-04-27 19:32:56 +02:00
// SPDX-License-Identifier: GPL-2.0+
2008-10-07 13:46:22 +00:00
/*
* net/dsa/mv88e6060.c - Driver for Marvell 88e6060 switch chips
dsa: add switch chip cascading support
The initial version of the DSA driver only supported a single switch
chip per network interface, while DSA-capable switch chips can be
interconnected to form a tree of switch chips. This patch adds support
for multiple switch chips on a network interface.
An example topology for a 16-port device with an embedded CPU is as
follows:
+-----+ +--------+ +--------+
| |eth0 10| switch |9 10| switch |
| CPU +----------+ +-------+ |
| | | chip 0 | | chip 1 |
+-----+ +---++---+ +---++---+
|| ||
|| ||
||1000baseT ||1000baseT
||ports 1-8 ||ports 9-16
This requires a couple of interdependent changes in the DSA layer:
- The dsa platform driver data needs to be extended: there is still
only one netdevice per DSA driver instance (eth0 in the example
above), but each of the switch chips in the tree needs its own
mii_bus device pointer, MII management bus address, and port name
array. (include/net/dsa.h) The existing in-tree dsa users need
some small changes to deal with this. (arch/arm)
- The DSA and Ethertype DSA tagging modules need to be extended to
use the DSA device ID field on receive and demultiplex the packet
accordingly, and fill in the DSA device ID field on transmit
according to which switch chip the packet is heading to.
(net/dsa/tag_{dsa,edsa}.c)
- The concept of "CPU port", which is the switch chip port that the
CPU is connected to (port 10 on switch chip 0 in the example), needs
to be extended with the concept of "upstream port", which is the
port on the switch chip that will bring us one hop closer to the CPU
(port 10 for both switch chips in the example above).
- The dsa platform data needs to specify which ports on which switch
chips are links to other switch chips, so that we can enable DSA
tagging mode on them. (For inter-switch links, we always use
non-EtherType DSA tagging, since it has lower overhead. The CPU
link uses dsa or edsa tagging depending on what the 'root' switch
chip supports.) This is done by specifying "dsa" for the given
port in the port array.
- The dsa platform data needs to be extended with information on via
which port to reach any given switch chip from any given switch chip.
This info is specified via the per-switch chip data struct ->rtable[]
array, which gives the nexthop ports for each of the other switches
in the tree.
For the example topology above, the dsa platform data would look
something like this:
static struct dsa_chip_data sw[2] = {
{
.mii_bus = &foo,
.sw_addr = 1,
.port_names[0] = "p1",
.port_names[1] = "p2",
.port_names[2] = "p3",
.port_names[3] = "p4",
.port_names[4] = "p5",
.port_names[5] = "p6",
.port_names[6] = "p7",
.port_names[7] = "p8",
.port_names[9] = "dsa",
.port_names[10] = "cpu",
.rtable = (s8 []){ -1, 9, },
}, {
.mii_bus = &foo,
.sw_addr = 2,
.port_names[0] = "p9",
.port_names[1] = "p10",
.port_names[2] = "p11",
.port_names[3] = "p12",
.port_names[4] = "p13",
.port_names[5] = "p14",
.port_names[6] = "p15",
.port_names[7] = "p16",
.port_names[10] = "dsa",
.rtable = (s8 []){ 10, -1, },
},
},
static struct dsa_platform_data pd = {
.netdev = &foo,
.nr_switches = 2,
.sw = sw,
};
Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
Tested-by: Gary Thomas <gary@mlbassoc.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-03-20 09:52:09 +00:00
* Copyright (c) 2008-2009 Marvell Semiconductor
2008-10-07 13:46:22 +00:00
*/
2013-01-08 16:05:54 +00:00
# include <linux/delay.h>
2017-10-13 14:18:07 -04:00
# include <linux/etherdevice.h>
2013-01-08 16:05:54 +00:00
# include <linux/jiffies.h>
2008-10-07 13:46:22 +00:00
# include <linux/list.h>
2012-01-24 10:41:40 +00:00
# include <linux/module.h>
2008-10-07 13:46:22 +00:00
# include <linux/netdevice.h>
# include <linux/phy.h>
2011-11-27 17:06:08 +00:00
# include <net/dsa.h>
2015-11-10 16:51:36 +01:00
# include "mv88e6060.h"
2008-10-07 13:46:22 +00:00
2019-04-27 19:32:57 +02:00
static int reg_read ( struct mv88e6060_priv * priv , int addr , int reg )
2008-10-07 13:46:22 +00:00
{
2016-04-13 02:40:42 +02:00
return mdiobus_read_nested ( priv - > bus , priv - > sw_addr + addr , reg ) ;
2008-10-07 13:46:22 +00:00
}
2019-04-27 19:32:57 +02:00
/* Write @val to register @reg of the device located @addr above the
 * switch's base MDIO address (priv->sw_addr), using the nested MDIO bus
 * accessor.
 *
 * Returns whatever mdiobus_write_nested() returns.
 */
static int reg_write(struct mv88e6060_priv *priv, int addr, int reg, u16 val)
{
	int dev_addr = priv->sw_addr + addr;

	return mdiobus_write_nested(priv->bus, dev_addr, reg, val);
}
2016-04-17 13:23:55 -04:00
static const char * mv88e6060_get_name ( struct mii_bus * bus , int sw_addr )
2008-10-07 13:46:22 +00:00
{
int ret ;
2015-11-10 16:51:36 +01:00
ret = mdiobus_read ( bus , sw_addr + REG_PORT ( 0 ) , PORT_SWITCH_ID ) ;
2008-10-07 13:46:22 +00:00
if ( ret > = 0 ) {
2015-11-10 16:51:36 +01:00
if ( ret = = PORT_SWITCH_ID_6060 )
2014-10-29 10:44:54 -07:00
return " Marvell 88E6060 (A0) " ;
2015-11-10 16:51:36 +01:00
if ( ret = = PORT_SWITCH_ID_6060_R1 | |
ret = = PORT_SWITCH_ID_6060_R2 )
2014-10-29 10:44:54 -07:00
return " Marvell 88E6060 (B0) " ;
2015-11-10 16:51:36 +01:00
if ( ( ret & PORT_SWITCH_ID_6060_MASK ) = = PORT_SWITCH_ID_6060 )
2008-10-07 13:46:22 +00:00
return " Marvell 88E6060 " ;
}
return NULL ;
}
2017-11-10 15:22:52 -08:00
static enum dsa_tag_protocol mv88e6060_get_tag_protocol ( struct dsa_switch * ds ,
2020-01-07 21:06:05 -08:00
int port ,
enum dsa_tag_protocol m )
2016-08-22 16:01:01 +02:00
{
return DSA_TAG_PROTO_TRAILER ;
}
2019-04-27 19:32:57 +02:00
static int mv88e6060_switch_reset ( struct mv88e6060_priv * priv )
2008-10-07 13:46:22 +00:00
{
int i ;
int ret ;
2013-01-08 16:05:54 +00:00
unsigned long timeout ;
2008-10-07 13:46:22 +00:00
2013-01-08 16:05:53 +00:00
/* Set all ports to the disabled state. */
2015-11-10 16:51:36 +01:00
for ( i = 0 ; i < MV88E6060_PORTS ; i + + ) {
2019-04-27 19:32:59 +02:00
ret = reg_read ( priv , REG_PORT ( i ) , PORT_CONTROL ) ;
if ( ret < 0 )
return ret ;
2019-04-27 19:32:58 +02:00
ret = reg_write ( priv , REG_PORT ( i ) , PORT_CONTROL ,
ret & ~ PORT_CONTROL_STATE_MASK ) ;
if ( ret )
return ret ;
2008-10-07 13:46:22 +00:00
}
2013-01-08 16:05:53 +00:00
/* Wait for transmit queues to drain. */
2013-01-08 16:05:54 +00:00
usleep_range ( 2000 , 4000 ) ;
2008-10-07 13:46:22 +00:00
2013-01-08 16:05:53 +00:00
/* Reset the switch. */
2019-04-27 19:32:58 +02:00
ret = reg_write ( priv , REG_GLOBAL , GLOBAL_ATU_CONTROL ,
GLOBAL_ATU_CONTROL_SWRESET |
GLOBAL_ATU_CONTROL_LEARNDIS ) ;
if ( ret )
return ret ;
2008-10-07 13:46:22 +00:00
2013-01-08 16:05:53 +00:00
/* Wait up to one second for reset to complete. */
2013-01-08 16:05:54 +00:00
timeout = jiffies + 1 * HZ ;
while ( time_before ( jiffies , timeout ) ) {
2019-04-27 19:32:59 +02:00
ret = reg_read ( priv , REG_GLOBAL , GLOBAL_STATUS ) ;
if ( ret < 0 )
return ret ;
2015-11-10 16:51:36 +01:00
if ( ret & GLOBAL_STATUS_INIT_READY )
2008-10-07 13:46:22 +00:00
break ;
2013-01-08 16:05:54 +00:00
usleep_range ( 1000 , 2000 ) ;
2008-10-07 13:46:22 +00:00
}
2013-01-08 16:05:54 +00:00
if ( time_after ( jiffies , timeout ) )
2008-10-07 13:46:22 +00:00
return - ETIMEDOUT ;
return 0 ;
}
2019-04-27 19:32:57 +02:00
static int mv88e6060_setup_global ( struct mv88e6060_priv * priv )
2008-10-07 13:46:22 +00:00
{
2019-04-27 19:32:58 +02:00
int ret ;
2013-01-08 16:05:53 +00:00
/* Disable discarding of frames with excessive collisions,
2008-10-07 13:46:22 +00:00
* set the maximum frame size to 1536 bytes , and mask all
* interrupt sources .
*/
2019-04-27 19:32:58 +02:00
ret = reg_write ( priv , REG_GLOBAL , GLOBAL_CONTROL ,
GLOBAL_CONTROL_MAX_FRAME_1536 ) ;
if ( ret )
return ret ;
2008-10-07 13:46:22 +00:00
2018-11-30 21:58:36 -02:00
/* Disable automatic address learning.
2008-10-07 13:46:22 +00:00
*/
2019-04-27 19:32:58 +02:00
return reg_write ( priv , REG_GLOBAL , GLOBAL_ATU_CONTROL ,
GLOBAL_ATU_CONTROL_LEARNDIS ) ;
2008-10-07 13:46:22 +00:00
}
2019-04-27 19:32:57 +02:00
static int mv88e6060_setup_port ( struct mv88e6060_priv * priv , int p )
2008-10-07 13:46:22 +00:00
{
int addr = REG_PORT ( p ) ;
2019-04-27 19:32:58 +02:00
int ret ;
2008-10-07 13:46:22 +00:00
2022-08-11 10:09:39 +03:00
if ( dsa_is_unused_port ( priv - > ds , p ) )
return 0 ;
2013-01-08 16:05:53 +00:00
/* Do not force flow control, disable Ingress and Egress
2008-10-07 13:46:22 +00:00
* Header tagging , disable VLAN tunneling , and set the port
* state to Forwarding . Additionally , if this is the CPU
* port , enable Ingress and Egress Trailer tagging mode .
*/
2019-04-27 19:32:58 +02:00
ret = reg_write ( priv , addr , PORT_CONTROL ,
dsa_is_cpu_port ( priv - > ds , p ) ?
2015-11-10 16:51:36 +01:00
PORT_CONTROL_TRAILER |
PORT_CONTROL_INGRESS_MODE |
PORT_CONTROL_STATE_FORWARDING :
PORT_CONTROL_STATE_FORWARDING ) ;
2019-04-27 19:32:58 +02:00
if ( ret )
return ret ;
2008-10-07 13:46:22 +00:00
2013-01-08 16:05:53 +00:00
/* Port based VLAN map: give each port its own address
2008-10-07 13:46:22 +00:00
* database , allow the CPU port to talk to each of the ' real '
* ports , and allow each of the ' real ' ports to only talk to
* the CPU port .
*/
2019-04-27 19:32:58 +02:00
ret = reg_write ( priv , addr , PORT_VLAN_MAP ,
( ( p & 0xf ) < < PORT_VLAN_MAP_DBNUM_SHIFT ) |
( dsa_is_cpu_port ( priv - > ds , p ) ?
dsa_user_ports ( priv - > ds ) :
BIT ( dsa_to_port ( priv - > ds , p ) - > cpu_dp - > index ) ) ) ;
if ( ret )
return ret ;
2008-10-07 13:46:22 +00:00
2013-01-08 16:05:53 +00:00
/* Port Association Vector: when learning source addresses
2008-10-07 13:46:22 +00:00
* of packets , add the address to the address database using
* a port bitmap that has only the bit for this port set and
* the other bits clear .
*/
2019-04-27 19:32:58 +02:00
return reg_write ( priv , addr , PORT_ASSOC_VECTOR , BIT ( p ) ) ;
2008-10-07 13:46:22 +00:00
}
2019-04-27 19:32:57 +02:00
static int mv88e6060_setup_addr ( struct mv88e6060_priv * priv )
2017-10-13 14:18:07 -04:00
{
u8 addr [ ETH_ALEN ] ;
2019-04-27 19:32:58 +02:00
int ret ;
2017-10-13 14:18:07 -04:00
u16 val ;
eth_random_addr ( addr ) ;
val = addr [ 0 ] < < 8 | addr [ 1 ] ;
/* The multicast bit is always transmitted as a zero, so the switch uses
* bit 8 for " DiffAddr " , where 0 means all ports transmit the same SA .
*/
val & = 0xfeff ;
2019-04-27 19:32:58 +02:00
ret = reg_write ( priv , REG_GLOBAL , GLOBAL_MAC_01 , val ) ;
if ( ret )
return ret ;
ret = reg_write ( priv , REG_GLOBAL , GLOBAL_MAC_23 ,
( addr [ 2 ] < < 8 ) | addr [ 3 ] ) ;
if ( ret )
return ret ;
2017-10-13 14:18:07 -04:00
2019-04-27 19:32:58 +02:00
return reg_write ( priv , REG_GLOBAL , GLOBAL_MAC_45 ,
( addr [ 4 ] < < 8 ) | addr [ 5 ] ) ;
2017-10-13 14:18:07 -04:00
}
2008-10-07 13:46:22 +00:00
static int mv88e6060_setup ( struct dsa_switch * ds )
{
2019-04-27 19:32:57 +02:00
struct mv88e6060_priv * priv = ds - > priv ;
2008-10-07 13:46:22 +00:00
int ret ;
2016-04-13 02:40:42 +02:00
int i ;
2008-10-07 13:46:22 +00:00
2019-04-27 19:32:57 +02:00
priv - > ds = ds ;
ret = mv88e6060_switch_reset ( priv ) ;
2008-10-07 13:46:22 +00:00
if ( ret < 0 )
return ret ;
/* @@@ initialise atu */
2019-04-27 19:32:57 +02:00
ret = mv88e6060_setup_global ( priv ) ;
2008-10-07 13:46:22 +00:00
if ( ret < 0 )
return ret ;
2019-04-27 19:32:57 +02:00
ret = mv88e6060_setup_addr ( priv ) ;
2017-10-13 14:18:07 -04:00
if ( ret < 0 )
return ret ;
2015-11-10 16:51:36 +01:00
for ( i = 0 ; i < MV88E6060_PORTS ; i + + ) {
2019-04-27 19:32:57 +02:00
ret = mv88e6060_setup_port ( priv , i ) ;
2008-10-07 13:46:22 +00:00
if ( ret < 0 )
return ret ;
}
return 0 ;
}
static int mv88e6060_port_to_phy_addr ( int port )
{
2015-11-10 16:51:36 +01:00
if ( port > = 0 & & port < MV88E6060_PORTS )
2008-10-07 13:46:22 +00:00
return port ;
return - 1 ;
}
static int mv88e6060_phy_read ( struct dsa_switch * ds , int port , int regnum )
{
2019-04-27 19:32:57 +02:00
struct mv88e6060_priv * priv = ds - > priv ;
2008-10-07 13:46:22 +00:00
int addr ;
addr = mv88e6060_port_to_phy_addr ( port ) ;
if ( addr = = - 1 )
return 0xffff ;
2019-04-27 19:32:57 +02:00
return reg_read ( priv , addr , regnum ) ;
2008-10-07 13:46:22 +00:00
}
/* DSA .phy_write callback: write @val to PHY register @regnum of @port.
 *
 * Returns the result of reg_write(), or -EINVAL when @port has no PHY
 * address mapping.  A write has no data to hand back, so an invalid port
 * is reported as a negative errno rather than the 0xffff value used by
 * the read path.
 */
static int
mv88e6060_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val)
{
	struct mv88e6060_priv *priv = ds->priv;
	int addr;

	addr = mv88e6060_port_to_phy_addr(port);
	if (addr == -1)
		return -EINVAL;

	return reg_write(priv, addr, regnum, val);
}
2023-08-12 10:30:33 +01:00
static void mv88e6060_phylink_get_caps ( struct dsa_switch * ds , int port ,
struct phylink_config * config )
{
unsigned long * interfaces = config - > supported_interfaces ;
struct mv88e6060_priv * priv = ds - > priv ;
int addr = REG_PORT ( port ) ;
int ret ;
ret = reg_read ( priv , addr , PORT_STATUS ) ;
if ( ret < 0 ) {
dev_err ( ds - > dev ,
" port %d: unable to read status register: %pe \n " ,
port , ERR_PTR ( ret ) ) ;
return ;
}
/* If the port is configured in SNI mode (acts as a 10Mbps PHY),
* it should have phy - mode = " sni " , but that doesn ' t yet exist , so
* forcibly fail validation until the need arises to introduce it .
*/
if ( ! ( ret & PORT_STATUS_PORTMODE ) ) {
dev_warn ( ds - > dev , " port %d: SNI mode not supported \n " , port ) ;
return ;
}
config - > mac_capabilities = MAC_100 | MAC_10 | MAC_SYM_PAUSE ;
if ( port > = 4 ) {
/* Ports 4 and 5 can support MII, REVMII and REVRMII modes */
__set_bit ( PHY_INTERFACE_MODE_MII , interfaces ) ;
__set_bit ( PHY_INTERFACE_MODE_REVMII , interfaces ) ;
__set_bit ( PHY_INTERFACE_MODE_REVRMII , interfaces ) ;
}
if ( port < = 4 ) {
/* Ports 0 to 3 have internal PHYs, and port 4 can optionally
* use an internal PHY .
*/
/* Internal PHY */
__set_bit ( PHY_INTERFACE_MODE_INTERNAL , interfaces ) ;
/* Default phylib interface mode */
__set_bit ( PHY_INTERFACE_MODE_GMII , interfaces ) ;
}
}
2017-01-08 14:52:08 -08:00
static const struct dsa_switch_ops mv88e6060_switch_ops = {
2016-08-22 16:01:01 +02:00
. get_tag_protocol = mv88e6060_get_tag_protocol ,
2008-10-07 13:46:22 +00:00
. setup = mv88e6060_setup ,
. phy_read = mv88e6060_phy_read ,
. phy_write = mv88e6060_phy_write ,
2023-08-12 10:30:33 +01:00
. phylink_get_caps = mv88e6060_phylink_get_caps ,
2008-10-07 13:46:22 +00:00
} ;
2019-04-28 02:56:21 +02:00
static int mv88e6060_probe ( struct mdio_device * mdiodev )
{
struct device * dev = & mdiodev - > dev ;
struct mv88e6060_priv * priv ;
struct dsa_switch * ds ;
const char * name ;
priv = devm_kzalloc ( dev , sizeof ( * priv ) , GFP_KERNEL ) ;
if ( ! priv )
return - ENOMEM ;
priv - > bus = mdiodev - > bus ;
priv - > sw_addr = mdiodev - > addr ;
name = mv88e6060_get_name ( priv - > bus , priv - > sw_addr ) ;
if ( ! name )
return - ENODEV ;
dev_info ( dev , " switch %s detected \n " , name ) ;
2019-10-21 16:51:30 -04:00
ds = devm_kzalloc ( dev , sizeof ( * ds ) , GFP_KERNEL ) ;
2019-04-28 02:56:21 +02:00
if ( ! ds )
return - ENOMEM ;
2019-10-21 16:51:30 -04:00
ds - > dev = dev ;
ds - > num_ports = MV88E6060_PORTS ;
2019-04-28 02:56:21 +02:00
ds - > priv = priv ;
ds - > dev = dev ;
ds - > ops = & mv88e6060_switch_ops ;
dev_set_drvdata ( dev , ds ) ;
return dsa_register_switch ( ds ) ;
}
static void mv88e6060_remove ( struct mdio_device * mdiodev )
{
struct dsa_switch * ds = dev_get_drvdata ( & mdiodev - > dev ) ;
net: dsa: be compatible with masters which unregister on shutdown
Lino reports that on his system with bcmgenet as DSA master and KSZ9897
as a switch, rebooting or shutting down never works properly.
What does the bcmgenet driver have special to trigger this, that other
DSA masters do not? It has an implementation of ->shutdown which simply
calls its ->remove implementation. Otherwise said, it unregisters its
network interface on shutdown.
This message can be seen in a loop, and it hangs the reboot process there:
unregister_netdevice: waiting for eth0 to become free. Usage count = 3
So why 3?
A usage count of 1 is normal for a registered network interface, and any
virtual interface which links itself as an upper of that will increment
it via dev_hold. In the case of DSA, this is the call path:
dsa_slave_create
-> netdev_upper_dev_link
-> __netdev_upper_dev_link
-> __netdev_adjacent_dev_insert
-> dev_hold
So a DSA switch with 3 interfaces will result in a usage count elevated
by two, and netdev_wait_allrefs will wait until they have gone away.
Other stacked interfaces, like VLAN, watch NETDEV_UNREGISTER events and
delete themselves, but DSA cannot just vanish and go poof, at most it
can unbind itself from the switch devices, but that must happen strictly
earlier compared to when the DSA master unregisters its net_device, so
reacting on the NETDEV_UNREGISTER event is way too late.
It seems that it is a pretty established pattern to have a driver's
->shutdown hook redirect to its ->remove hook, so the same code is
executed regardless of whether the driver is unbound from the device, or
the system is just shutting down. As Florian puts it, it is quite a big
hammer for bcmgenet to unregister its net_device during shutdown, but
having a common code path with the driver unbind helps ensure it is well
tested.
So DSA, for better or for worse, has to live with that and engage in an
arms race of implementing the ->shutdown hook too, from all individual
drivers, and do something sane when paired with masters that unregister
their net_device there. The only sane thing to do, of course, is to
unlink from the master.
However, complications arise really quickly.
The pattern of redirecting ->shutdown to ->remove is not unique to
bcmgenet or even to net_device drivers. In fact, SPI controllers do it
too (see dspi_shutdown -> dspi_remove), and presumably, I2C controllers
and MDIO controllers do it too (this is something I have not researched
too deeply, but even if this is not the case today, it is certainly
plausible to happen in the future, and must be taken into consideration).
Since DSA switches might be SPI devices, I2C devices, MDIO devices, the
insane implication is that for the exact same DSA switch device, we
might have both ->shutdown and ->remove getting called.
So we need to do something with that insane environment. The pattern
I've come up with is "if this, then not that", so if either ->shutdown
or ->remove gets called, we set the device's drvdata to NULL, and in the
other hook, we check whether the drvdata is NULL and just do nothing.
This is probably not necessary for platform devices, just for devices on
buses, but I would really insist for consistency among drivers, because
when code is copy-pasted, it is not always copy-pasted from the best
sources.
So depending on whether the DSA switch's ->remove or ->shutdown will get
called first, we cannot really guarantee even for the same driver if
rebooting will result in the same code path on all platforms. But
nonetheless, we need to do something minimally reasonable on ->shutdown
too to fix the bug. Of course, the ->remove will do more (a full
teardown of the tree, with all data structures freed, and this is why
the bug was not caught for so long). The new ->shutdown method is kept
separate from dsa_unregister_switch not because we couldn't have
unregistered the switch, but simply in the interest of doing something
quick and to the point.
The big question is: does the DSA switch's ->shutdown get called earlier
than the DSA master's ->shutdown? If not, there is still a risk that we
might still trigger the WARN_ON in unregister_netdevice that says we are
attempting to unregister a net_device which has uppers. That's no good.
Although the reference to the master net_device won't physically go away
even if DSA's ->shutdown comes afterwards, remember we have a dev_hold
on it.
The answer to that question lies in this comment above device_link_add:
* A side effect of the link creation is re-ordering of dpm_list and the
* devices_kset list by moving the consumer device and all devices depending
* on it to the ends of these lists (that does not happen to devices that have
* not been registered when this function is called).
so the fact that DSA uses device_link_add towards its master is not
exactly for nothing. device_shutdown() walks devices_kset from the back,
so this is our guarantee that DSA's shutdown happens before the master's
shutdown.
Fixes: 2f1e8ea726e9 ("net: dsa: link interfaces with the DSA master to get rid of lockdep warnings")
Link: https://lore.kernel.org/netdev/20210909095324.12978-1-LinoSanfilippo@gmx.de/
Reported-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-09-17 16:34:33 +03:00
if ( ! ds )
return ;
2019-04-28 02:56:21 +02:00
dsa_unregister_switch ( ds ) ;
net: dsa: be compatible with masters which unregister on shutdown
Lino reports that on his system with bcmgenet as DSA master and KSZ9897
as a switch, rebooting or shutting down never works properly.
What does the bcmgenet driver have special to trigger this, that other
DSA masters do not? It has an implementation of ->shutdown which simply
calls its ->remove implementation. Otherwise said, it unregisters its
network interface on shutdown.
This message can be seen in a loop, and it hangs the reboot process there:
unregister_netdevice: waiting for eth0 to become free. Usage count = 3
So why 3?
A usage count of 1 is normal for a registered network interface, and any
virtual interface which links itself as an upper of that will increment
it via dev_hold. In the case of DSA, this is the call path:
dsa_slave_create
-> netdev_upper_dev_link
-> __netdev_upper_dev_link
-> __netdev_adjacent_dev_insert
-> dev_hold
So a DSA switch with 3 interfaces will result in a usage count elevated
by two, and netdev_wait_allrefs will wait until they have gone away.
Other stacked interfaces, like VLAN, watch NETDEV_UNREGISTER events and
delete themselves, but DSA cannot just vanish and go poof, at most it
can unbind itself from the switch devices, but that must happen strictly
earlier compared to when the DSA master unregisters its net_device, so
reacting on the NETDEV_UNREGISTER event is way too late.
It seems that it is a pretty established pattern to have a driver's
->shutdown hook redirect to its ->remove hook, so the same code is
executed regardless of whether the driver is unbound from the device, or
the system is just shutting down. As Florian puts it, it is quite a big
hammer for bcmgenet to unregister its net_device during shutdown, but
having a common code path with the driver unbind helps ensure it is well
tested.
So DSA, for better or for worse, has to live with that and engage in an
arms race of implementing the ->shutdown hook too, from all individual
drivers, and do something sane when paired with masters that unregister
their net_device there. The only sane thing to do, of course, is to
unlink from the master.
However, complications arise really quickly.
The pattern of redirecting ->shutdown to ->remove is not unique to
bcmgenet or even to net_device drivers. In fact, SPI controllers do it
too (see dspi_shutdown -> dspi_remove), and presumably, I2C controllers
and MDIO controllers do it too (this is something I have not researched
too deeply, but even if this is not the case today, it is certainly
plausible to happen in the future, and must be taken into consideration).
Since DSA switches might be SPI devices, I2C devices, MDIO devices, the
insane implication is that for the exact same DSA switch device, we
might have both ->shutdown and ->remove getting called.
So we need to do something with that insane environment. The pattern
I've come up with is "if this, then not that", so if either ->shutdown
or ->remove gets called, we set the device's drvdata to NULL, and in the
other hook, we check whether the drvdata is NULL and just do nothing.
This is probably not necessary for platform devices, just for devices on
buses, but I would really insist for consistency among drivers, because
when code is copy-pasted, it is not always copy-pasted from the best
sources.
So depending on whether the DSA switch's ->remove or ->shutdown will get
called first, we cannot really guarantee even for the same driver if
rebooting will result in the same code path on all platforms. But
nonetheless, we need to do something minimally reasonable on ->shutdown
too to fix the bug. Of course, the ->remove will do more (a full
teardown of the tree, with all data structures freed, and this is why
the bug was not caught for so long). The new ->shutdown method is kept
separate from dsa_unregister_switch not because we couldn't have
unregistered the switch, but simply in the interest of doing something
quick and to the point.
The big question is: does the DSA switch's ->shutdown get called earlier
than the DSA master's ->shutdown? If not, there is still a risk that we
might still trigger the WARN_ON in unregister_netdevice that says we are
attempting to unregister a net_device which has uppers. That's no good.
Although the reference to the master net_device won't physically go away
even if DSA's ->shutdown comes afterwards, remember we have a dev_hold
on it.
The answer to that question lies in this comment above device_link_add:
* A side effect of the link creation is re-ordering of dpm_list and the
* devices_kset list by moving the consumer device and all devices depending
* on it to the ends of these lists (that does not happen to devices that have
* not been registered when this function is called).
so the fact that DSA uses device_link_add towards its master is not
exactly for nothing. device_shutdown() walks devices_kset from the back,
so this is our guarantee that DSA's shutdown happens before the master's
shutdown.
Fixes: 2f1e8ea726e9 ("net: dsa: link interfaces with the DSA master to get rid of lockdep warnings")
Link: https://lore.kernel.org/netdev/20210909095324.12978-1-LinoSanfilippo@gmx.de/
Reported-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-09-17 16:34:33 +03:00
}
static void mv88e6060_shutdown ( struct mdio_device * mdiodev )
{
struct dsa_switch * ds = dev_get_drvdata ( & mdiodev - > dev ) ;
if ( ! ds )
return ;
dsa_switch_shutdown ( ds ) ;
dev_set_drvdata ( & mdiodev - > dev , NULL ) ;
2019-04-28 02:56:21 +02:00
}
/* Device tree match table, referenced from the driver's of_match_table. */
static const struct of_device_id mv88e6060_of_match[] = {
	{ .compatible = " marvell,mv88e6060 " },
	{ /* sentinel */ },
};
static struct mdio_driver mv88e6060_driver = {
. probe = mv88e6060_probe ,
. remove = mv88e6060_remove ,
net: dsa: be compatible with masters which unregister on shutdown
Lino reports that on his system with bcmgenet as DSA master and KSZ9897
as a switch, rebooting or shutting down never works properly.
What does the bcmgenet driver have special to trigger this, that other
DSA masters do not? It has an implementation of ->shutdown which simply
calls its ->remove implementation. Otherwise said, it unregisters its
network interface on shutdown.
This message can be seen in a loop, and it hangs the reboot process there:
unregister_netdevice: waiting for eth0 to become free. Usage count = 3
So why 3?
A usage count of 1 is normal for a registered network interface, and any
virtual interface which links itself as an upper of that will increment
it via dev_hold. In the case of DSA, this is the call path:
dsa_slave_create
-> netdev_upper_dev_link
-> __netdev_upper_dev_link
-> __netdev_adjacent_dev_insert
-> dev_hold
So a DSA switch with 3 interfaces will result in a usage count elevated
by two, and netdev_wait_allrefs will wait until they have gone away.
Other stacked interfaces, like VLAN, watch NETDEV_UNREGISTER events and
delete themselves, but DSA cannot just vanish and go poof, at most it
can unbind itself from the switch devices, but that must happen strictly
earlier compared to when the DSA master unregisters its net_device, so
reacting on the NETDEV_UNREGISTER event is way too late.
It seems that it is a pretty established pattern to have a driver's
->shutdown hook redirect to its ->remove hook, so the same code is
executed regardless of whether the driver is unbound from the device, or
the system is just shutting down. As Florian puts it, it is quite a big
hammer for bcmgenet to unregister its net_device during shutdown, but
having a common code path with the driver unbind helps ensure it is well
tested.
So DSA, for better or for worse, has to live with that and engage in an
arms race of implementing the ->shutdown hook too, from all individual
drivers, and do something sane when paired with masters that unregister
their net_device there. The only sane thing to do, of course, is to
unlink from the master.
However, complications arise really quickly.
The pattern of redirecting ->shutdown to ->remove is not unique to
bcmgenet or even to net_device drivers. In fact, SPI controllers do it
too (see dspi_shutdown -> dspi_remove), and presumably, I2C controllers
and MDIO controllers do it too (this is something I have not researched
too deeply, but even if this is not the case today, it is certainly
plausible to happen in the future, and must be taken into consideration).
Since DSA switches might be SPI devices, I2C devices, MDIO devices, the
insane implication is that for the exact same DSA switch device, we
might have both ->shutdown and ->remove getting called.
So we need to do something with that insane environment. The pattern
I've come up with is "if this, then not that", so if either ->shutdown
or ->remove gets called, we set the device's drvdata to NULL, and in the
other hook, we check whether the drvdata is NULL and just do nothing.
This is probably not necessary for platform devices, just for devices on
buses, but I would really insist for consistency among drivers, because
when code is copy-pasted, it is not always copy-pasted from the best
sources.
So depending on whether the DSA switch's ->remove or ->shutdown will get
called first, we cannot really guarantee even for the same driver if
rebooting will result in the same code path on all platforms. But
nonetheless, we need to do something minimally reasonable on ->shutdown
too to fix the bug. Of course, the ->remove will do more (a full
teardown of the tree, with all data structures freed, and this is why
the bug was not caught for so long). The new ->shutdown method is kept
separate from dsa_unregister_switch not because we couldn't have
unregistered the switch, but simply in the interest of doing something
quick and to the point.
The big question is: does the DSA switch's ->shutdown get called earlier
than the DSA master's ->shutdown? If not, there is still a risk that we
might still trigger the WARN_ON in unregister_netdevice that says we are
attempting to unregister a net_device which has uppers. That's no good.
Although the reference to the master net_device won't physically go away
even if DSA's ->shutdown comes afterwards, remember we have a dev_hold
on it.
The answer to that question lies in this comment above device_link_add:
* A side effect of the link creation is re-ordering of dpm_list and the
* devices_kset list by moving the consumer device and all devices depending
* on it to the ends of these lists (that does not happen to devices that have
* not been registered when this function is called).
so the fact that DSA uses device_link_add towards its master is not
exactly for nothing. device_shutdown() walks devices_kset from the back,
so this is our guarantee that DSA's shutdown happens before the master's
shutdown.
Fixes: 2f1e8ea726e9 ("net: dsa: link interfaces with the DSA master to get rid of lockdep warnings")
Link: https://lore.kernel.org/netdev/20210909095324.12978-1-LinoSanfilippo@gmx.de/
Reported-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-09-17 16:34:33 +03:00
. shutdown = mv88e6060_shutdown ,
2019-04-28 02:56:21 +02:00
. mdiodrv . driver = {
. name = " mv88e6060 " ,
. of_match_table = mv88e6060_of_match ,
} ,
} ;
2019-04-28 02:56:22 +02:00
/* Register mv88e6060_driver as this module's MDIO driver. */
mdio_module_driver(mv88e6060_driver);

/* Module metadata. */
MODULE_AUTHOR(" Lennert Buytenhek <buytenh@wantstofly.org> ");
MODULE_DESCRIPTION(" Driver for Marvell 88E6060 ethernet switch chip ");
MODULE_LICENSE(" GPL ");
MODULE_ALIAS(" platform:mv88e6060 ");