2019-05-27 09:55:01 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2005-04-17 02:20:36 +04:00
/*
* Userspace interface
* Linux ethernet bridge
*
* Authors :
* Lennert Buytenhek < buytenh @ gnu . org >
*/
# include <linux/kernel.h>
# include <linux/netdevice.h>
2011-09-30 18:37:26 +04:00
# include <linux/etherdevice.h>
2010-05-06 11:48:24 +04:00
# include <linux/netpoll.h>
2005-04-17 02:20:36 +04:00
# include <linux/ethtool.h>
# include <linux/if_arp.h>
# include <linux/module.h>
# include <linux/init.h>
# include <linux/rtnetlink.h>
2006-01-06 03:35:42 +03:00
# include <linux/if_ether.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities to include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 11:04:11 +03:00
# include <linux/slab.h>
2017-03-29 00:45:06 +03:00
# include <net/dsa.h>
2005-04-17 02:20:36 +04:00
# include <net/sock.h>
2013-02-13 16:00:12 +04:00
# include <linux/if_vlan.h>
2015-10-14 20:40:53 +03:00
# include <net/switchdev.h>
2018-07-21 00:56:54 +03:00
# include <net/net_namespace.h>
2005-04-17 02:20:36 +04:00
# include "br_private.h"
/*
* Determine initial path cost based on speed .
* using recommendations from 802.1 d standard
*
2007-08-01 01:00:02 +04:00
* Since driver might sleep need to not be holding any locks .
2005-04-17 02:20:36 +04:00
*/
2005-12-21 02:19:51 +03:00
static int port_cost ( struct net_device * dev )
2005-04-17 02:20:36 +04:00
{
2016-02-24 21:58:09 +03:00
struct ethtool_link_ksettings ecmd ;
2007-08-31 09:16:22 +04:00
2016-02-24 21:58:09 +03:00
if ( ! __ethtool_get_link_ksettings ( dev , & ecmd ) ) {
switch ( ecmd . base . speed ) {
2011-09-01 07:29:38 +04:00
case SPEED_10000 :
return 2 ;
case SPEED_1000 :
return 4 ;
case SPEED_100 :
return 19 ;
case SPEED_10 :
return 100 ;
2005-04-17 02:20:36 +04:00
}
}
/* Old silly heuristics based on name */
if ( ! strncmp ( dev - > name , " lec " , 3 ) )
return 7 ;
if ( ! strncmp ( dev - > name , " plip " , 4 ) )
return 2500 ;
return 100 ; /* assume old 10Mbps */
}
2005-12-21 02:19:51 +03:00
2013-12-16 17:32:46 +04:00
/* Check for port carrier transitions. */
2018-05-03 13:47:24 +03:00
void br_port_carrier_check ( struct net_bridge_port * p , bool * notified )
2005-12-21 02:19:51 +03:00
{
2007-02-22 12:10:18 +03:00
struct net_device * dev = p - > dev ;
struct net_bridge * br = p - > br ;
2006-03-04 04:14:51 +03:00
2013-04-13 18:06:07 +04:00
if ( ! ( p - > flags & BR_ADMIN_COST ) & &
netif_running ( dev ) & & netif_oper_up ( dev ) )
2006-03-04 04:14:51 +03:00
p - > path_cost = port_cost ( dev ) ;
2018-05-03 13:47:24 +03:00
* notified = false ;
2010-08-24 17:12:56 +04:00
if ( ! netif_running ( br - > dev ) )
return ;
spin_lock_bh ( & br - > lock ) ;
2012-12-28 22:15:22 +04:00
if ( netif_running ( dev ) & & netif_oper_up ( dev ) ) {
2018-05-03 13:47:24 +03:00
if ( p - > state = = BR_STATE_DISABLED ) {
2010-08-24 17:12:56 +04:00
br_stp_enable_port ( p ) ;
2018-05-03 13:47:24 +03:00
* notified = true ;
}
2010-08-24 17:12:56 +04:00
} else {
2018-05-03 13:47:24 +03:00
if ( p - > state ! = BR_STATE_DISABLED ) {
2010-08-24 17:12:56 +04:00
br_stp_disable_port ( p ) ;
2018-05-03 13:47:24 +03:00
* notified = true ;
}
2005-12-21 02:19:51 +03:00
}
2010-08-24 17:12:56 +04:00
spin_unlock_bh ( & br - > lock ) ;
2005-12-21 02:19:51 +03:00
}
2014-05-16 17:59:20 +04:00
static void br_port_set_promisc ( struct net_bridge_port * p )
{
int err = 0 ;
if ( br_promisc_port ( p ) )
return ;
err = dev_set_promiscuity ( p - > dev , 1 ) ;
if ( err )
return ;
br_fdb_unsync_static ( p - > br , p ) ;
p - > flags | = BR_PROMISC ;
}
static void br_port_clear_promisc ( struct net_bridge_port * p )
{
int err ;
/* Check if the port is already non-promisc or if it doesn't
* support UNICAST filtering . Without unicast filtering support
* we ' ll end up re - enabling promisc mode anyway , so just check for
* it here .
*/
if ( ! br_promisc_port ( p ) | | ! ( p - > dev - > priv_flags & IFF_UNICAST_FLT ) )
return ;
/* Since we'll be clearing the promisc mode, program the port
* first so that we don ' t have interruption in traffic .
*/
err = br_fdb_sync_static ( p - > br , p ) ;
if ( err )
return ;
dev_set_promiscuity ( p - > dev , - 1 ) ;
p - > flags & = ~ BR_PROMISC ;
}
/* When a port is added or removed or when certain port flags
* change , this function is called to automatically manage
* promiscuity setting of all the bridge ports . We are always called
* under RTNL so can skip using rcu primitives .
*/
void br_manage_promisc ( struct net_bridge * br )
{
struct net_bridge_port * p ;
bool set_all = false ;
/* If vlan filtering is disabled or bridge interface is placed
* into promiscuous mode , place all ports in promiscuous mode .
*/
2017-05-26 09:37:23 +03:00
if ( ( br - > dev - > flags & IFF_PROMISC ) | | ! br_vlan_enabled ( br - > dev ) )
2014-05-16 17:59:20 +04:00
set_all = true ;
list_for_each_entry ( p , & br - > port_list , list ) {
if ( set_all ) {
br_port_set_promisc ( p ) ;
} else {
/* If the number of auto-ports is <= 1, then all other
* ports will have their output configuration
* statically specified through fdbs . Since ingress
* on the auto - port becomes forwarding / egress to other
* ports and egress configuration is statically known ,
* we can say that ingress configuration of the
* auto - port is also statically known .
* This lets us disable promiscuous mode and write
* this config to hw .
*/
2014-06-05 15:53:32 +04:00
if ( br - > auto_cnt = = 0 | |
( br - > auto_cnt = = 1 & & br_auto_port ( p ) ) )
2014-05-16 17:59:20 +04:00
br_port_clear_promisc ( p ) ;
else
br_port_set_promisc ( p ) ;
}
}
}
2018-07-23 11:16:59 +03:00
int nbp_backup_change ( struct net_bridge_port * p ,
struct net_device * backup_dev )
{
struct net_bridge_port * old_backup = rtnl_dereference ( p - > backup_port ) ;
struct net_bridge_port * backup_p = NULL ;
ASSERT_RTNL ( ) ;
if ( backup_dev ) {
2019-03-29 16:38:19 +03:00
if ( ! netif_is_bridge_port ( backup_dev ) )
2018-07-23 11:16:59 +03:00
return - ENOENT ;
backup_p = br_port_get_rtnl ( backup_dev ) ;
if ( backup_p - > br ! = p - > br )
return - EINVAL ;
}
if ( p = = backup_p )
return - EINVAL ;
if ( old_backup = = backup_p )
return 0 ;
/* if the backup link is already set, clear it */
if ( old_backup )
old_backup - > backup_redirected_cnt - - ;
if ( backup_p )
backup_p - > backup_redirected_cnt + + ;
rcu_assign_pointer ( p - > backup_port , backup_p ) ;
return 0 ;
}
static void nbp_backup_clear ( struct net_bridge_port * p )
{
nbp_backup_change ( p , NULL ) ;
if ( p - > backup_redirected_cnt ) {
struct net_bridge_port * cur_p ;
list_for_each_entry ( cur_p , & p - > br - > port_list , list ) {
struct net_bridge_port * backup_p ;
backup_p = rtnl_dereference ( cur_p - > backup_port ) ;
if ( backup_p = = p )
nbp_backup_change ( cur_p , NULL ) ;
}
}
WARN_ON ( rcu_access_pointer ( p - > backup_port ) | | p - > backup_redirected_cnt ) ;
}
2014-05-16 17:59:16 +04:00
static void nbp_update_port_count ( struct net_bridge * br )
{
struct net_bridge_port * p ;
u32 cnt = 0 ;
list_for_each_entry ( p , & br - > port_list , list ) {
if ( br_auto_port ( p ) )
cnt + + ;
}
2014-05-16 17:59:20 +04:00
if ( br - > auto_cnt ! = cnt ) {
br - > auto_cnt = cnt ;
br_manage_promisc ( br ) ;
}
}
static void nbp_delete_promisc ( struct net_bridge_port * p )
{
2014-05-17 07:46:17 +04:00
/* If port is currently promiscuous, unset promiscuity.
2014-05-16 17:59:20 +04:00
* Otherwise , it is a static port so remove all addresses
* from it .
*/
dev_set_allmulti ( p - > dev , - 1 ) ;
if ( br_promisc_port ( p ) )
dev_set_promiscuity ( p - > dev , - 1 ) ;
else
br_fdb_unsync_static ( p - > br , p ) ;
2014-05-16 17:59:16 +04:00
}
2006-02-10 04:10:12 +03:00
/* kobject release callback: free the bridge port embedding @kobj. */
static void release_nbp(struct kobject *kobj)
{
	kfree(container_of(kobj, struct net_bridge_port, kobj));
}
2018-07-21 00:56:54 +03:00
/*
 * kobject get_ownership callback: report the uid/gid of the network
 * namespace the port's device belongs to.
 */
static void brport_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
	struct net_bridge_port *port = kobj_to_brport(kobj);
	struct net *ns = dev_net(port->dev);

	net_ns_get_ownership(ns, uid, gid);
}
2006-02-10 04:10:12 +03:00
static struct kobj_type brport_ktype = {
# ifdef CONFIG_SYSFS
. sysfs_ops = & brport_sysfs_ops ,
# endif
. release = release_nbp ,
2018-07-21 00:56:54 +03:00
. get_ownership = brport_get_ownership ,
2006-02-10 04:10:12 +03:00
} ;
2005-04-17 02:20:36 +04:00
static void destroy_nbp ( struct net_bridge_port * p )
{
struct net_device * dev = p - > dev ;
p - > br = NULL ;
p - > dev = NULL ;
2021-12-07 04:30:29 +03:00
dev_put_track ( dev , & p - > dev_tracker ) ;
2005-04-17 02:20:36 +04:00
2006-02-10 04:10:12 +03:00
kobject_put ( & p - > kobj ) ;
2005-04-17 02:20:36 +04:00
}
/* RCU callback: destroy the bridge port that embeds @head. */
static void destroy_nbp_rcu(struct rcu_head *head)
{
	destroy_nbp(container_of(head, struct net_bridge_port, rcu));
}
2016-02-26 12:45:38 +03:00
static unsigned get_max_headroom ( struct net_bridge * br )
{
unsigned max_headroom = 0 ;
struct net_bridge_port * p ;
list_for_each_entry ( p , & br - > port_list , list ) {
unsigned dev_headroom = netdev_get_fwd_headroom ( p - > dev ) ;
if ( dev_headroom > max_headroom )
max_headroom = dev_headroom ;
}
return max_headroom ;
}
static void update_headroom ( struct net_bridge * br , int new_hr )
{
struct net_bridge_port * p ;
list_for_each_entry ( p , & br - > port_list , list )
netdev_set_rx_headroom ( p - > dev , new_hr ) ;
br - > dev - > needed_headroom = new_hr ;
}
2006-02-01 04:44:07 +03:00
/* Delete port(interface) from bridge is done in two steps.
* via RCU . First step , marks device as down . That deletes
* all the timers and stops new packets from flowing through .
*
* Final cleanup doesn ' t occur until after all CPU ' s finished
* processing packets .
*
* Protected from multiple admin operations by RTNL mutex
*/
2005-04-17 02:20:36 +04:00
static void del_nbp ( struct net_bridge_port * p )
{
struct net_bridge * br = p - > br ;
struct net_device * dev = p - > dev ;
2010-05-18 23:26:27 +04:00
sysfs_remove_link ( br - > ifobj , p - > dev - > name ) ;
2006-02-10 04:10:12 +03:00
2014-05-16 17:59:20 +04:00
nbp_delete_promisc ( p ) ;
2005-04-17 02:20:36 +04:00
spin_lock_bh ( & br - > lock ) ;
br_stp_disable_port ( p ) ;
spin_unlock_bh ( & br - > lock ) ;
2020-04-26 16:22:07 +03:00
br_mrp_port_del ( br , p ) ;
2020-10-27 13:02:45 +03:00
br_cfm_port_del ( br , p ) ;
2020-04-26 16:22:07 +03:00
2017-11-01 13:18:13 +03:00
br_ifinfo_notify ( RTM_DELLINK , NULL , p ) ;
2007-03-23 00:08:46 +03:00
2005-04-17 02:20:36 +04:00
list_del_rcu ( & p - > list ) ;
2016-02-26 12:45:38 +03:00
if ( netdev_get_fwd_headroom ( dev ) = = br - > dev - > needed_headroom )
update_headroom ( br , get_max_headroom ( br ) ) ;
netdev_reset_rx_headroom ( dev ) ;
2005-04-17 02:20:36 +04:00
2015-10-12 22:47:05 +03:00
nbp_vlan_flush ( p ) ;
2015-06-23 15:28:16 +03:00
br_fdb_delete_by_port ( br , p , 0 , 1 ) ;
2015-10-14 20:40:53 +03:00
switchdev_deferred_process ( ) ;
2018-07-23 11:16:59 +03:00
nbp_backup_clear ( p ) ;
2015-10-14 20:40:53 +03:00
2014-05-16 17:59:16 +04:00
nbp_update_port_count ( br ) ;
2014-09-05 17:51:28 +04:00
netdev_upper_dev_unlink ( dev , br - > dev ) ;
2010-06-15 10:50:45 +04:00
dev - > priv_flags & = ~ IFF_BRIDGE_PORT ;
2010-06-02 01:52:08 +04:00
netdev_rx_handler_unregister ( dev ) ;
2006-02-10 04:08:52 +03:00
2010-02-28 11:49:38 +03:00
br_multicast_del_port ( p ) ;
2006-03-04 04:16:15 +03:00
kobject_uevent ( & p - > kobj , KOBJ_REMOVE ) ;
2006-02-10 04:10:12 +03:00
kobject_del ( & p - > kobj ) ;
2010-06-10 20:12:50 +04:00
br_netpoll_disable ( p ) ;
2005-04-17 02:20:36 +04:00
call_rcu ( & p - > rcu , destroy_nbp_rcu ) ;
}
2011-10-06 15:19:41 +04:00
/* Delete bridge device */
void br_dev_delete ( struct net_device * dev , struct list_head * head )
2005-04-17 02:20:36 +04:00
{
2011-10-06 15:19:41 +04:00
struct net_bridge * br = netdev_priv ( dev ) ;
2005-04-17 02:20:36 +04:00
struct net_bridge_port * p , * n ;
list_for_each_entry_safe ( p , n , & br - > port_list , list ) {
del_nbp ( p ) ;
}
2017-10-07 08:12:37 +03:00
br_recalculate_neigh_suppress_enabled ( br ) ;
2015-06-23 15:28:16 +03:00
br_fdb_delete_by_port ( br , NULL , 0 , 1 ) ;
bridge: flush br's address entry in fdb when remove the
bridge dev
When the following commands are executed:
brctl addbr br0
ifconfig br0 hw ether <addr>
rmmod bridge
The calltrace will occur:
[ 563.312114] device eth1 left promiscuous mode
[ 563.312188] br0: port 1(eth1) entered disabled state
[ 563.468190] kmem_cache_destroy bridge_fdb_cache: Slab cache still has objects
[ 563.468197] CPU: 6 PID: 6982 Comm: rmmod Tainted: G O 3.12.0-0.7-default+ #9
[ 563.468199] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[ 563.468200] 0000000000000880 ffff88010f111e98 ffffffff814d1c92 ffff88010f111eb8
[ 563.468204] ffffffff81148efd ffff88010f111eb8 0000000000000000 ffff88010f111ec8
[ 563.468206] ffffffffa062a270 ffff88010f111ed8 ffffffffa063ac76 ffff88010f111f78
[ 563.468209] Call Trace:
[ 563.468218] [<ffffffff814d1c92>] dump_stack+0x6a/0x78
[ 563.468234] [<ffffffff81148efd>] kmem_cache_destroy+0xfd/0x100
[ 563.468242] [<ffffffffa062a270>] br_fdb_fini+0x10/0x20 [bridge]
[ 563.468247] [<ffffffffa063ac76>] br_deinit+0x4e/0x50 [bridge]
[ 563.468254] [<ffffffff810c7dc9>] SyS_delete_module+0x199/0x2b0
[ 563.468259] [<ffffffff814e0922>] system_call_fastpath+0x16/0x1b
[ 570.377958] Bridge firewalling registered
--------------------------- cut here -------------------------------
The reason is that when the bridge dev's address is changed, the
br_fdb_change_mac_address() will add new address in fdb, but when
the bridge was removed, the address entry in the fdb did not free,
the bridge_fdb_cache still has objects when destroy the cache, Fix
this by flushing the bridge address entry when removing the bridge.
v2: according to the Toshiaki Makita and Vlad's suggestion, I only
delete the vlan0 entry, it still have a leak here if the vlan id
is other number, so I need to call fdb_delete_by_port(br, NULL, 1)
to flush all entries whose dst is NULL for the bridge.
Suggested-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
Suggested-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-12-07 18:12:05 +04:00
2017-02-04 20:05:07 +03:00
cancel_delayed_work_sync ( & br - > gc_work ) ;
2005-04-17 02:20:36 +04:00
br_sysfs_delbr ( br - > dev ) ;
2009-10-28 08:35:35 +03:00
unregister_netdevice_queue ( br - > dev , head ) ;
2005-04-17 02:20:36 +04:00
}
/* find an available port number */
static int find_portno ( struct net_bridge * br )
{
int index ;
struct net_bridge_port * p ;
unsigned long * inuse ;
2018-08-30 13:33:18 +03:00
inuse = bitmap_zalloc ( BR_MAX_PORTS , GFP_KERNEL ) ;
2005-04-17 02:20:36 +04:00
if ( ! inuse )
return - ENOMEM ;
2021-11-14 22:02:35 +03:00
__set_bit ( 0 , inuse ) ; /* zero is reserved */
list_for_each_entry ( p , & br - > port_list , list )
__set_bit ( p - > port_no , inuse ) ;
2005-04-17 02:20:36 +04:00
index = find_first_zero_bit ( inuse , BR_MAX_PORTS ) ;
2018-08-30 13:33:18 +03:00
bitmap_free ( inuse ) ;
2005-04-17 02:20:36 +04:00
return ( index > = BR_MAX_PORTS ) ? - EXFULL : index ;
}
2005-12-21 02:19:51 +03:00
/* called with RTNL but without bridge lock */
2007-02-09 17:24:35 +03:00
static struct net_bridge_port * new_nbp ( struct net_bridge * br ,
2005-12-21 02:19:51 +03:00
struct net_device * dev )
2005-04-17 02:20:36 +04:00
{
struct net_bridge_port * p ;
2016-06-28 17:57:06 +03:00
int index , err ;
2007-02-09 17:24:35 +03:00
2005-04-17 02:20:36 +04:00
index = find_portno ( br ) ;
if ( index < 0 )
return ERR_PTR ( index ) ;
2006-03-21 09:57:03 +03:00
p = kzalloc ( sizeof ( * p ) , GFP_KERNEL ) ;
2005-04-17 02:20:36 +04:00
if ( p = = NULL )
return ERR_PTR ( - ENOMEM ) ;
p - > br = br ;
2021-12-07 04:30:29 +03:00
dev_hold_track ( dev , & p - > dev_tracker , GFP_KERNEL ) ;
2005-04-17 02:20:36 +04:00
p - > dev = dev ;
2005-12-21 02:19:51 +03:00
p - > path_cost = port_cost ( dev ) ;
2007-02-09 17:24:35 +03:00
p - > priority = 0x8000 > > BR_PORT_BITS ;
2005-04-17 02:20:36 +04:00
p - > port_no = index ;
2017-04-26 16:48:09 +03:00
p - > flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD ;
2005-04-17 02:20:36 +04:00
br_init_port ( p ) ;
2014-10-01 03:13:19 +04:00
br_set_state ( p , BR_STATE_DISABLED ) ;
2006-03-04 04:15:34 +03:00
br_stp_port_timer_init ( p ) ;
2016-06-28 17:57:06 +03:00
err = br_multicast_add_port ( p ) ;
if ( err ) {
2021-12-07 04:30:29 +03:00
dev_put_track ( dev , & p - > dev_tracker ) ;
2016-06-28 17:57:06 +03:00
kfree ( p ) ;
p = ERR_PTR ( err ) ;
}
2005-04-17 02:20:36 +04:00
return p ;
}
2008-09-09 03:19:58 +04:00
int br_add_bridge ( struct net * net , const char * name )
2005-04-17 02:20:36 +04:00
{
struct net_device * dev ;
2011-08-22 10:05:59 +04:00
int res ;
2005-04-17 02:20:36 +04:00
net: set name_assign_type in alloc_netdev()
Extend alloc_netdev{,_mq{,s}}() to take name_assign_type as argument, and convert
all users to pass NET_NAME_UNKNOWN.
Coccinelle patch:
@@
expression sizeof_priv, name, setup, txqs, rxqs, count;
@@
(
-alloc_netdev_mqs(sizeof_priv, name, setup, txqs, rxqs)
+alloc_netdev_mqs(sizeof_priv, name, NET_NAME_UNKNOWN, setup, txqs, rxqs)
|
-alloc_netdev_mq(sizeof_priv, name, setup, count)
+alloc_netdev_mq(sizeof_priv, name, NET_NAME_UNKNOWN, setup, count)
|
-alloc_netdev(sizeof_priv, name, setup)
+alloc_netdev(sizeof_priv, name, NET_NAME_UNKNOWN, setup)
)
v9: move comments here from the wrong commit
Signed-off-by: Tom Gundersen <teg@jklm.no>
Reviewed-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-14 18:37:24 +04:00
dev = alloc_netdev ( sizeof ( struct net_bridge ) , name , NET_NAME_UNKNOWN ,
2011-04-04 18:03:32 +04:00
br_dev_setup ) ;
2007-02-09 17:24:35 +03:00
if ( ! dev )
2005-04-17 02:20:36 +04:00
return - ENOMEM ;
2011-04-04 18:03:32 +04:00
dev_net_set ( dev , net ) ;
2012-06-26 09:48:45 +04:00
dev - > rtnl_link_ops = & br_link_ops ;
2008-05-05 04:58:07 +04:00
2021-08-05 11:29:01 +03:00
res = register_netdevice ( dev ) ;
2011-08-22 10:05:59 +04:00
if ( res )
free_netdev ( dev ) ;
return res ;
2005-04-17 02:20:36 +04:00
}
2008-09-09 03:19:58 +04:00
int br_del_bridge ( struct net * net , const char * name )
2005-04-17 02:20:36 +04:00
{
struct net_device * dev ;
int ret = 0 ;
2008-09-09 03:19:58 +04:00
dev = __dev_get_by_name ( net , name ) ;
2007-02-09 17:24:35 +03:00
if ( dev = = NULL )
2005-04-17 02:20:36 +04:00
ret = - ENXIO ; /* Could not find device */
2021-10-16 14:21:36 +03:00
else if ( ! netif_is_bridge_master ( dev ) ) {
2005-04-17 02:20:36 +04:00
/* Attempt to delete non bridge device! */
ret = - EPERM ;
}
else if ( dev - > flags & IFF_UP ) {
/* Not shutdown yet. */
ret = - EBUSY ;
2007-02-09 17:24:35 +03:00
}
2005-04-17 02:20:36 +04:00
2007-02-09 17:24:35 +03:00
else
2011-10-06 15:19:41 +04:00
br_dev_delete ( dev , NULL ) ;
2005-04-17 02:20:36 +04:00
return ret ;
}
2018-03-30 13:46:19 +03:00
/* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */
static int br_mtu_min ( const struct net_bridge * br )
2005-04-17 02:20:36 +04:00
{
const struct net_bridge_port * p ;
2018-03-30 13:46:18 +03:00
int ret_mtu = 0 ;
2005-04-17 02:20:36 +04:00
2018-03-30 13:46:19 +03:00
list_for_each_entry ( p , & br - > port_list , list )
if ( ! ret_mtu | | ret_mtu > p - > dev - > mtu )
2018-03-30 13:46:18 +03:00
ret_mtu = p - > dev - > mtu ;
2005-04-17 02:20:36 +04:00
2018-03-30 13:46:18 +03:00
return ret_mtu ? ret_mtu : ETH_DATA_LEN ;
2018-03-22 18:34:06 +03:00
}
2018-03-30 13:46:19 +03:00
void br_mtu_auto_adjust ( struct net_bridge * br )
{
ASSERT_RTNL ( ) ;
/* if the bridge MTU was manually configured don't mess with it */
2018-09-26 17:01:06 +03:00
if ( br_opt_get ( br , BROPT_MTU_SET_BY_USER ) )
2018-03-30 13:46:19 +03:00
return ;
/* change to the minimum MTU and clear the flag which was set by
* the bridge ndo_change_mtu callback
*/
dev_set_mtu ( br - > dev , br_mtu_min ( br ) ) ;
2018-09-26 17:01:06 +03:00
br_opt_toggle ( br , BROPT_MTU_SET_BY_USER , false ) ;
2018-03-30 13:46:19 +03:00
}
2016-03-21 19:55:11 +03:00
static void br_set_gso_limits ( struct net_bridge * br )
{
unsigned int gso_max_size = GSO_MAX_SIZE ;
u16 gso_max_segs = GSO_MAX_SEGS ;
const struct net_bridge_port * p ;
list_for_each_entry ( p , & br - > port_list , list ) {
gso_max_size = min ( gso_max_size , p - > dev - > gso_max_size ) ;
gso_max_segs = min ( gso_max_segs , p - > dev - > gso_max_segs ) ;
}
2021-11-19 18:43:31 +03:00
netif_set_gso_max_size ( br - > dev , gso_max_size ) ;
2021-11-19 18:43:32 +03:00
netif_set_gso_max_segs ( br - > dev , gso_max_segs ) ;
2016-03-21 19:55:11 +03:00
}
2005-05-30 01:15:17 +04:00
/*
* Recomputes features using slave ' s features
*/
2011-11-15 19:29:55 +04:00
netdev_features_t br_features_recompute ( struct net_bridge * br ,
netdev_features_t features )
2005-05-30 01:15:17 +04:00
{
struct net_bridge_port * p ;
2011-11-15 19:29:55 +04:00
netdev_features_t mask ;
2005-05-30 01:15:17 +04:00
2008-10-23 12:11:29 +04:00
if ( list_empty ( & br - > port_list ) )
2011-04-22 10:31:16 +04:00
return features ;
2008-10-23 12:11:29 +04:00
2011-04-22 10:31:16 +04:00
mask = features ;
2008-10-23 12:11:29 +04:00
features & = ~ NETIF_F_ONE_FOR_ALL ;
2005-05-30 01:15:17 +04:00
list_for_each_entry ( p , & br - > port_list , list ) {
2008-10-23 12:11:29 +04:00
features = netdev_increment_features ( features ,
p - > dev - > features , mask ) ;
2005-05-30 01:15:17 +04:00
}
2015-01-09 08:16:40 +03:00
features = netdev_add_tso_features ( features , mask ) ;
2005-05-30 01:15:17 +04:00
2011-04-22 10:31:16 +04:00
return features ;
2005-05-30 01:15:17 +04:00
}
2005-04-17 02:20:36 +04:00
/* called with RTNL */
2017-10-05 03:48:50 +03:00
int br_add_if ( struct net_bridge * br , struct net_device * dev ,
struct netlink_ext_ack * extack )
2005-04-17 02:20:36 +04:00
{
struct net_bridge_port * p ;
int err = 0 ;
2016-02-26 12:45:38 +03:00
unsigned br_hr , dev_hr ;
2021-07-02 15:07:36 +03:00
bool changed_addr , fdb_synced = false ;
2005-04-17 02:20:36 +04:00
2020-05-10 19:37:40 +03:00
/* Don't allow bridging non-ethernet like devices. */
2009-11-06 07:46:52 +03:00
if ( ( dev - > flags & IFF_LOOPBACK ) | |
2011-09-30 18:37:26 +04:00
dev - > type ! = ARPHRD_ETHER | | dev - > addr_len ! = ETH_ALEN | |
2020-05-10 19:37:40 +03:00
! is_valid_ether_addr ( dev - > dev_addr ) )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
2020-05-10 19:37:40 +03:00
/* Also don't allow bridging of net devices that are DSA masters, since
* the bridge layer rx_handler prevents the DSA fake ethertype handler
* to be invoked , so we don ' t get the chance to strip off and parse the
* DSA switch tag protocol header ( the bridge layer just returns
* RX_HANDLER_CONSUMED , stopping RX processing for these frames ) .
* The only case where that would not be an issue is when bridging can
* already be offloaded , such as when the DSA master is itself a DSA
* or plain switchdev port , and is bridged only with other ports from
* the same hardware device .
*/
if ( netdev_uses_dsa ( dev ) ) {
list_for_each_entry ( p , & br - > port_list , list ) {
if ( ! netdev_port_same_parent_id ( dev , p - > dev ) ) {
NL_SET_ERR_MSG ( extack ,
" Cannot do software bridging with a DSA master " ) ;
return - EINVAL ;
}
}
}
2009-11-06 07:46:52 +03:00
/* No bridging of bridges */
2017-10-05 03:48:50 +03:00
if ( dev - > netdev_ops - > ndo_start_xmit = = br_dev_xmit ) {
NL_SET_ERR_MSG ( extack ,
" Can not enslave a bridge to a bridge " ) ;
2005-04-17 02:20:36 +04:00
return - ELOOP ;
2017-10-05 03:48:50 +03:00
}
2005-04-17 02:20:36 +04:00
2018-04-27 15:59:24 +03:00
/* Device has master upper dev */
if ( netdev_master_upper_dev_get ( dev ) )
2005-04-17 02:20:36 +04:00
return - EBUSY ;
cfg80211: disallow bridging managed/adhoc interfaces
A number of people have tried to add a wireless interface
(in managed mode) to a bridge and then complained that it
doesn't work. It cannot work, however, because in 802.11
networks all packets need to be acknowledged and as such
need to be sent to the right address. Promiscuous doesn't
help here. The wireless address format used for these
links has only space for three addresses, the
* transmitter, which must be equal to the sender (origin)
* receiver (on the wireless medium), which is the AP in
the case of managed mode
* the recipient (destination), which is on the APs local
network segment
In an IBSS, it is similar, but the receiver and recipient
must match and the third address is used as the BSSID.
To avoid such mistakes in the future, disallow adding a
wireless interface to a bridge.
Felix has recently added a four-address mode to the AP
and client side that can be used (after negotiating that
it is possible, which must happen out-of-band by setting
up both sides) for bridging, so allow that case.
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2009-11-19 02:56:30 +03:00
/* No bridging devices that dislike that (e.g. wireless) */
2017-10-05 03:48:50 +03:00
if ( dev - > priv_flags & IFF_DONT_BRIDGE ) {
NL_SET_ERR_MSG ( extack ,
" Device does not allow enslaving to a bridge " ) ;
cfg80211: disallow bridging managed/adhoc interfaces
A number of people have tried to add a wireless interface
(in managed mode) to a bridge and then complained that it
doesn't work. It cannot work, however, because in 802.11
networks all packets need to be acknowledged and as such
need to be sent to the right address. Promiscuous doesn't
help here. The wireless address format used for these
links has only space for three addresses, the
* transmitter, which must be equal to the sender (origin)
* receiver (on the wireless medium), which is the AP in
the case of managed mode
* the recipient (destination), which is on the APs local
network segment
In an IBSS, it is similar, but the receiver and recipient
must match and the third address is used as the BSSID.
To avoid such mistakes in the future, disallow adding a
wireless interface to a bridge.
Felix has recently added a four-address mode to the AP
and client side that can be used (after negotiating that
it is possible, which must happen out-of-band by setting
up both sides) for bridging, so allow that case.
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2009-11-19 02:56:30 +03:00
return - EOPNOTSUPP ;
2017-10-05 03:48:50 +03:00
}
cfg80211: disallow bridging managed/adhoc interfaces
A number of people have tried to add a wireless interface
(in managed mode) to a bridge and then complained that it
doesn't work. It cannot work, however, because in 802.11
networks all packets need to be acknowledged and as such
need to be sent to the right address. Promiscuous doesn't
help here. The wireless address format used for these
links has only space for three addresses, the
* transmitter, which must be equal to the sender (origin)
* receiver (on the wireless medium), which is the AP in
the case of managed mode
* the recipient (destination), which is on the APs local
network segment
In an IBSS, it is similar, but the receiver and recipient
must match and the third address is used as the BSSID.
To avoid such mistakes in the future, disallow adding a
wireless interface to a bridge.
Felix has recently added a four-address mode to the AP
and client side that can be used (after negotiating that
it is possible, which must happen out-of-band by setting
up both sides) for bridging, so allow that case.
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2009-11-19 02:56:30 +03:00
2006-02-10 04:10:12 +03:00
p = new_nbp ( br , dev ) ;
if ( IS_ERR ( p ) )
2005-04-17 02:20:36 +04:00
return PTR_ERR ( p ) ;
2011-05-20 01:39:11 +04:00
call_netdevice_notifiers ( NETDEV_JOIN , dev ) ;
2014-05-16 17:59:20 +04:00
err = dev_set_allmulti ( dev , 1 ) ;
2019-05-10 05:52:12 +03:00
if ( err ) {
2021-08-09 16:20:23 +03:00
br_multicast_del_port ( p ) ;
2019-05-10 05:52:12 +03:00
kfree ( p ) ; /* kobject not yet init'd, manually free */
goto err1 ;
}
2008-07-15 07:53:13 +04:00
2007-12-18 09:05:35 +03:00
err = kobject_init_and_add ( & p - > kobj , & brport_ktype , & ( dev - > dev . kobj ) ,
SYSFS_BRIDGE_PORT_ATTR ) ;
2006-02-10 04:10:12 +03:00
if ( err )
2019-05-10 05:52:12 +03:00
goto err2 ;
2005-04-17 02:20:36 +04:00
2006-02-10 04:10:12 +03:00
err = br_sysfs_addif ( p ) ;
if ( err )
goto err2 ;
2005-04-17 02:20:36 +04:00
2014-03-28 02:36:38 +04:00
err = br_netpoll_enable ( p ) ;
2013-07-24 22:51:41 +04:00
if ( err )
2010-06-10 20:12:50 +04:00
goto err3 ;
2020-05-10 19:37:40 +03:00
err = netdev_rx_handler_register ( dev , br_get_rx_handler ( dev ) , p ) ;
2010-06-02 01:52:08 +04:00
if ( err )
2012-12-20 03:41:43 +04:00
goto err4 ;
2010-06-15 10:50:45 +04:00
2014-09-05 17:51:28 +04:00
dev - > priv_flags | = IFF_BRIDGE_PORT ;
2017-10-05 03:48:50 +03:00
err = netdev_master_upper_dev_link ( dev , br - > dev , NULL , NULL , extack ) ;
2011-02-13 12:33:42 +03:00
if ( err )
2012-12-20 03:41:43 +04:00
goto err5 ;
2011-02-13 12:33:42 +03:00
2008-06-20 03:15:47 +04:00
dev_disable_lro ( dev ) ;
2006-02-10 04:10:12 +03:00
list_add_rcu ( & p - > list , & br - > port_list ) ;
2014-05-16 17:59:16 +04:00
nbp_update_port_count ( br ) ;
2021-07-02 15:07:36 +03:00
if ( ! br_promisc_port ( p ) & & ( p - > dev - > priv_flags & IFF_UNICAST_FLT ) ) {
/* When updating the port count we also update all ports'
* promiscuous mode .
* A port leaving promiscuous mode normally gets the bridge ' s
* fdb synced to the unicast filter ( if supported ) , however ,
* ` br_port_clear_promisc ` does not distinguish between
* non - promiscuous ports and * new * ports , so we need to
* sync explicitly here .
*/
fdb_synced = br_fdb_sync_static ( br , p ) = = 0 ;
if ( ! fdb_synced )
netdev_err ( dev , " failed to sync bridge static fdb addresses to this port \n " ) ;
}
2014-05-16 17:59:16 +04:00
2011-04-22 10:31:16 +04:00
netdev_update_features ( br - > dev ) ;
2016-02-26 12:45:38 +03:00
br_hr = br - > dev - > needed_headroom ;
dev_hr = netdev_get_fwd_headroom ( dev ) ;
if ( br_hr < dev_hr )
update_headroom ( br , dev_hr ) ;
else
netdev_set_rx_headroom ( dev , br_hr ) ;
2013-08-27 15:03:53 +04:00
2021-10-26 17:27:39 +03:00
if ( br_fdb_add_local ( br , p , dev - > dev_addr , 0 ) )
2014-02-07 11:48:21 +04:00
netdev_err ( dev , " failed insert local address bridge forwarding table \n " ) ;
2018-12-13 14:54:37 +03:00
if ( br - > dev - > addr_assign_type ! = NET_ADDR_SET ) {
/* Ask for permission to use this MAC address now, even if we
* don ' t end up choosing it below .
*/
err = dev_pre_changeaddr_notify ( br - > dev , dev - > dev_addr , extack ) ;
if ( err )
net: bridge: switchdev: let drivers inform which bridge ports are offloaded
On reception of an skb, the bridge checks if it was marked as 'already
forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it
is, it assigns the source hardware domain of that skb based on the
hardware domain of the ingress port. Then during forwarding, it enforces
that the egress port must have a different hardware domain than the
ingress one (this is done in nbp_switchdev_allowed_egress).
Non-switchdev drivers don't report any physical switch id (neither
through devlink nor .ndo_get_port_parent_id), therefore the bridge
assigns them a hardware domain of 0, and packets coming from them will
always have skb->offload_fwd_mark = 0. So there aren't any restrictions.
Problems appear due to the fact that DSA would like to perform software
fallback for bonding and team interfaces that the physical switch cannot
offload.
+-- br0 ---+
/ / | \
/ / | \
/ | | bond0
/ | | / \
swp0 swp1 swp2 swp3 swp4
There, it is desirable that the presence of swp3 and swp4 under a
non-offloaded LAG does not preclude us from doing hardware bridging
between swp0, swp1 and swp2. The bandwidth of the CPU is often times high
enough that software bridging between {swp0,swp1,swp2} and bond0 is not
impractical.
But this creates an impossible paradox given the current way in which
port hardware domains are assigned. When the driver receives a packet
from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to
something.
- If we set it to 0, then the bridge will forward it towards swp1, swp2
and bond0. But the switch has already forwarded it towards swp1 and
swp2 (not to bond0, remember, that isn't offloaded, so as far as the
switch is concerned, ports swp3 and swp4 are not looking up the FDB,
and the entire bond0 is a destination that is strictly behind the
CPU). But we don't want duplicated traffic towards swp1 and swp2, so
it's not ok to set skb->offload_fwd_mark = 0.
- If we set it to 1, then the bridge will not forward the skb towards
the ports with the same switchdev mark, i.e. not to swp1, swp2 and
bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should
have forwarded the skb there.
So the real issue is that bond0 will be assigned the same hardware
domain as {swp0,swp1,swp2}, because the function that assigns hardware
domains to bridge ports, nbp_switchdev_add(), recurses through bond0's
lower interfaces until it finds something that implements devlink (calls
dev_get_port_parent_id with bool recurse = true). This is a problem
because the fact that bond0 can be offloaded by swp3 and swp4 in our
example is merely an assumption.
A solution is to give the bridge explicit hints as to what hardware
domain it should use for each port.
Currently, the bridging offload is very 'silent': a driver registers a
netdevice notifier, which is put on the netns's notifier chain, and
which sniffs around for NETDEV_CHANGEUPPER events where the upper is a
bridge, and the lower is an interface it knows about (one registered by
this driver, normally). Then, from within that notifier, it does a bunch
of stuff behind the bridge's back, without the bridge necessarily
knowing that there's somebody offloading that port. It looks like this:
ip link set swp0 master br0
|
v
br_add_if() calls netdev_master_upper_dev_link()
|
v
call_netdevice_notifiers
|
v
dsa_slave_netdevice_event
|
v
oh, hey! it's for me!
|
v
.port_bridge_join
What we do to solve the conundrum is to be less silent, and change the
switchdev drivers to present themselves to the bridge. Something like this:
ip link set swp0 master br0
|
v
br_add_if() calls netdev_master_upper_dev_link()
|
v bridge: Aye! I'll use this
call_netdevice_notifiers ^ ppid as the
| | hardware domain for
v | this port, and zero
dsa_slave_netdevice_event | if I got nothing.
| |
v |
oh, hey! it's for me! |
| |
v |
.port_bridge_join |
| |
+------------------------+
switchdev_bridge_port_offload(swp0, swp0)
Then stacked interfaces (like bond0 on top of swp3/swp4) would be
treated differently in DSA, depending on whether we can or cannot
offload them.
The offload case:
ip link set bond0 master br0
|
v
br_add_if() calls netdev_master_upper_dev_link()
|
v bridge: Aye! I'll use this
call_netdevice_notifiers ^ ppid as the
| | switchdev mark for
v | bond0.
dsa_slave_netdevice_event | Coincidentally (or not),
| | bond0 and swp0, swp1, swp2
v | all have the same switchdev
hmm, it's not quite for me, | mark now, since the ASIC
but my driver has already | is able to forward towards
called .port_lag_join | all these ports in hw.
for it, because I have |
a port with dp->lag_dev == bond0. |
| |
v |
.port_bridge_join |
for swp3 and swp4 |
| |
+------------------------+
switchdev_bridge_port_offload(bond0, swp3)
switchdev_bridge_port_offload(bond0, swp4)
And the non-offload case:
ip link set bond0 master br0
|
v
br_add_if() calls netdev_master_upper_dev_link()
|
v bridge waiting:
call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload
| | wasn't called, okay, I'll use a
v | hwdom of zero for this one.
dsa_slave_netdevice_event : Then packets received on swp0 will
| : not be software-forwarded towards
v : swp1, but they will towards bond0.
it's not for me, but
bond0 is an upper of swp3
and swp4, but their dp->lag_dev
is NULL because they couldn't
offload it.
Basically we can draw the conclusion that the lowers of a bridge port
can come and go, so depending on the configuration of lowers for a
bridge port, it can dynamically toggle between offloaded and unoffloaded.
Therefore, we need an equivalent switchdev_bridge_port_unoffload too.
This patch changes the way any switchdev driver interacts with the
bridge. From now on, everybody needs to call switchdev_bridge_port_offload
and switchdev_bridge_port_unoffload, otherwise the bridge will treat the
port as non-offloaded and allow software flooding to other ports from
the same ASIC.
Note that these functions lay the ground for a more complex handshake
between switchdev drivers and the bridge in the future.
For drivers that will request a replay of the switchdev objects when
they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we
place the call to switchdev_bridge_port_unoffload() strategically inside
the NETDEV_PRECHANGEUPPER notifier's code path, and not inside
NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers
need the netdev adjacency lists to be valid, and that is only true in
NETDEV_PRECHANGEUPPER.
Cc: Vadym Kochan <vkochan@marvell.com>
Cc: Taras Chornyi <tchornyi@marvell.com>
Cc: Ioana Ciornei <ioana.ciornei@nxp.com>
Cc: Lars Povlsen <lars.povlsen@microchip.com>
Cc: Steen Hegelund <Steen.Hegelund@microchip.com>
Cc: UNGLinuxDriver@microchip.com
Cc: Claudiu Manoil <claudiu.manoil@nxp.com>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression
Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch
Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
goto err6 ;
2018-12-13 14:54:37 +03:00
}
2018-12-12 20:02:50 +03:00
err = nbp_vlan_init ( p , extack ) ;
2016-01-06 15:01:04 +03:00
if ( err ) {
2014-10-03 19:29:18 +04:00
netdev_err ( dev , " failed to initialize vlan filtering on this port \n " ) ;
net: bridge: switchdev: let drivers inform which bridge ports are offloaded
On reception of an skb, the bridge checks if it was marked as 'already
forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it
is, it assigns the source hardware domain of that skb based on the
hardware domain of the ingress port. Then during forwarding, it enforces
that the egress port must have a different hardware domain than the
ingress one (this is done in nbp_switchdev_allowed_egress).
Non-switchdev drivers don't report any physical switch id (neither
through devlink nor .ndo_get_port_parent_id), therefore the bridge
assigns them a hardware domain of 0, and packets coming from them will
always have skb->offload_fwd_mark = 0. So there aren't any restrictions.
Problems appear due to the fact that DSA would like to perform software
fallback for bonding and team interfaces that the physical switch cannot
offload.
+-- br0 ---+
/ / | \
/ / | \
/ | | bond0
/ | | / \
swp0 swp1 swp2 swp3 swp4
There, it is desirable that the presence of swp3 and swp4 under a
non-offloaded LAG does not preclude us from doing hardware bridging
between swp0, swp1 and swp2. The bandwidth of the CPU is often times high
enough that software bridging between {swp0,swp1,swp2} and bond0 is not
impractical.
But this creates an impossible paradox given the current way in which
port hardware domains are assigned. When the driver receives a packet
from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to
something.
- If we set it to 0, then the bridge will forward it towards swp1, swp2
and bond0. But the switch has already forwarded it towards swp1 and
swp2 (not to bond0, remember, that isn't offloaded, so as far as the
switch is concerned, ports swp3 and swp4 are not looking up the FDB,
and the entire bond0 is a destination that is strictly behind the
CPU). But we don't want duplicated traffic towards swp1 and swp2, so
it's not ok to set skb->offload_fwd_mark = 0.
- If we set it to 1, then the bridge will not forward the skb towards
the ports with the same switchdev mark, i.e. not to swp1, swp2 and
bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should
have forwarded the skb there.
So the real issue is that bond0 will be assigned the same hardware
domain as {swp0,swp1,swp2}, because the function that assigns hardware
domains to bridge ports, nbp_switchdev_add(), recurses through bond0's
lower interfaces until it finds something that implements devlink (calls
dev_get_port_parent_id with bool recurse = true). This is a problem
because the fact that bond0 can be offloaded by swp3 and swp4 in our
example is merely an assumption.
A solution is to give the bridge explicit hints as to what hardware
domain it should use for each port.
Currently, the bridging offload is very 'silent': a driver registers a
netdevice notifier, which is put on the netns's notifier chain, and
which sniffs around for NETDEV_CHANGEUPPER events where the upper is a
bridge, and the lower is an interface it knows about (one registered by
this driver, normally). Then, from within that notifier, it does a bunch
of stuff behind the bridge's back, without the bridge necessarily
knowing that there's somebody offloading that port. It looks like this:
ip link set swp0 master br0
|
v
br_add_if() calls netdev_master_upper_dev_link()
|
v
call_netdevice_notifiers
|
v
dsa_slave_netdevice_event
|
v
oh, hey! it's for me!
|
v
.port_bridge_join
What we do to solve the conundrum is to be less silent, and change the
switchdev drivers to present themselves to the bridge. Something like this:
ip link set swp0 master br0
|
v
br_add_if() calls netdev_master_upper_dev_link()
|
v bridge: Aye! I'll use this
call_netdevice_notifiers ^ ppid as the
| | hardware domain for
v | this port, and zero
dsa_slave_netdevice_event | if I got nothing.
| |
v |
oh, hey! it's for me! |
| |
v |
.port_bridge_join |
| |
+------------------------+
switchdev_bridge_port_offload(swp0, swp0)
Then stacked interfaces (like bond0 on top of swp3/swp4) would be
treated differently in DSA, depending on whether we can or cannot
offload them.
The offload case:
ip link set bond0 master br0
|
v
br_add_if() calls netdev_master_upper_dev_link()
|
v bridge: Aye! I'll use this
call_netdevice_notifiers ^ ppid as the
| | switchdev mark for
v | bond0.
dsa_slave_netdevice_event | Coincidentally (or not),
| | bond0 and swp0, swp1, swp2
v | all have the same switchdev
hmm, it's not quite for me, | mark now, since the ASIC
but my driver has already | is able to forward towards
called .port_lag_join | all these ports in hw.
for it, because I have |
a port with dp->lag_dev == bond0. |
| |
v |
.port_bridge_join |
for swp3 and swp4 |
| |
+------------------------+
switchdev_bridge_port_offload(bond0, swp3)
switchdev_bridge_port_offload(bond0, swp4)
And the non-offload case:
ip link set bond0 master br0
|
v
br_add_if() calls netdev_master_upper_dev_link()
|
v bridge waiting:
call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload
| | wasn't called, okay, I'll use a
v | hwdom of zero for this one.
dsa_slave_netdevice_event : Then packets received on swp0 will
| : not be software-forwarded towards
v : swp1, but they will towards bond0.
it's not for me, but
bond0 is an upper of swp3
and swp4, but their dp->lag_dev
is NULL because they couldn't
offload it.
Basically we can draw the conclusion that the lowers of a bridge port
can come and go, so depending on the configuration of lowers for a
bridge port, it can dynamically toggle between offloaded and unoffloaded.
Therefore, we need an equivalent switchdev_bridge_port_unoffload too.
This patch changes the way any switchdev driver interacts with the
bridge. From now on, everybody needs to call switchdev_bridge_port_offload
and switchdev_bridge_port_unoffload, otherwise the bridge will treat the
port as non-offloaded and allow software flooding to other ports from
the same ASIC.
Note that these functions lay the ground for a more complex handshake
between switchdev drivers and the bridge in the future.
For drivers that will request a replay of the switchdev objects when
they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we
place the call to switchdev_bridge_port_unoffload() strategically inside
the NETDEV_PRECHANGEUPPER notifier's code path, and not inside
NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers
need the netdev adjacency lists to be valid, and that is only true in
NETDEV_PRECHANGEUPPER.
Cc: Vadym Kochan <vkochan@marvell.com>
Cc: Taras Chornyi <tchornyi@marvell.com>
Cc: Ioana Ciornei <ioana.ciornei@nxp.com>
Cc: Lars Povlsen <lars.povlsen@microchip.com>
Cc: Steen Hegelund <Steen.Hegelund@microchip.com>
Cc: UNGLinuxDriver@microchip.com
Cc: Claudiu Manoil <claudiu.manoil@nxp.com>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression
Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch
Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
goto err6 ;
2016-01-06 15:01:04 +03:00
}
2014-10-03 19:29:18 +04:00
2006-02-10 04:10:12 +03:00
spin_lock_bh ( & br - > lock ) ;
2011-03-24 16:24:01 +03:00
changed_addr = br_stp_recalculate_bridge_id ( br ) ;
2007-03-08 03:10:53 +03:00
2012-12-28 22:15:22 +04:00
if ( netif_running ( dev ) & & netif_oper_up ( dev ) & &
2007-03-08 03:10:53 +03:00
( br - > dev - > flags & IFF_UP ) )
br_stp_enable_port ( p ) ;
2006-02-10 04:10:12 +03:00
spin_unlock_bh ( & br - > lock ) ;
2017-11-01 13:18:13 +03:00
br_ifinfo_notify ( RTM_NEWLINK , NULL , p ) ;
2007-03-23 00:08:46 +03:00
2011-03-24 16:24:01 +03:00
if ( changed_addr )
2011-07-22 11:47:08 +04:00
call_netdevice_notifiers ( NETDEV_CHANGEADDR , br - > dev ) ;
2011-03-24 16:24:01 +03:00
2018-03-30 13:46:19 +03:00
br_mtu_auto_adjust ( br ) ;
2016-03-21 19:55:11 +03:00
br_set_gso_limits ( br ) ;
2007-02-22 12:10:18 +03:00
2006-02-10 04:10:12 +03:00
kobject_uevent ( & p - > kobj , KOBJ_ADD ) ;
2005-04-17 02:20:36 +04:00
2006-02-10 04:10:12 +03:00
return 0 ;
2011-02-13 12:33:42 +03:00
net: bridge: switchdev: let drivers inform which bridge ports are offloaded
On reception of an skb, the bridge checks if it was marked as 'already
forwarded in hardware' (checks if skb->offload_fwd_mark == 1), and if it
is, it assigns the source hardware domain of that skb based on the
hardware domain of the ingress port. Then during forwarding, it enforces
that the egress port must have a different hardware domain than the
ingress one (this is done in nbp_switchdev_allowed_egress).
Non-switchdev drivers don't report any physical switch id (neither
through devlink nor .ndo_get_port_parent_id), therefore the bridge
assigns them a hardware domain of 0, and packets coming from them will
always have skb->offload_fwd_mark = 0. So there aren't any restrictions.
Problems appear due to the fact that DSA would like to perform software
fallback for bonding and team interfaces that the physical switch cannot
offload.
+-- br0 ---+
/ / | \
/ / | \
/ | | bond0
/ | | / \
swp0 swp1 swp2 swp3 swp4
There, it is desirable that the presence of swp3 and swp4 under a
non-offloaded LAG does not preclude us from doing hardware bridging
between swp0, swp1 and swp2. The bandwidth of the CPU is often times high
enough that software bridging between {swp0,swp1,swp2} and bond0 is not
impractical.
But this creates an impossible paradox given the current way in which
port hardware domains are assigned. When the driver receives a packet
from swp0 (say, due to flooding), it must set skb->offload_fwd_mark to
something.
- If we set it to 0, then the bridge will forward it towards swp1, swp2
and bond0. But the switch has already forwarded it towards swp1 and
swp2 (not to bond0, remember, that isn't offloaded, so as far as the
switch is concerned, ports swp3 and swp4 are not looking up the FDB,
and the entire bond0 is a destination that is strictly behind the
CPU). But we don't want duplicated traffic towards swp1 and swp2, so
it's not ok to set skb->offload_fwd_mark = 0.
- If we set it to 1, then the bridge will not forward the skb towards
the ports with the same switchdev mark, i.e. not to swp1, swp2 and
bond0. Towards swp1 and swp2 that's ok, but towards bond0? It should
have forwarded the skb there.
So the real issue is that bond0 will be assigned the same hardware
domain as {swp0,swp1,swp2}, because the function that assigns hardware
domains to bridge ports, nbp_switchdev_add(), recurses through bond0's
lower interfaces until it finds something that implements devlink (calls
dev_get_port_parent_id with bool recurse = true). This is a problem
because the fact that bond0 can be offloaded by swp3 and swp4 in our
example is merely an assumption.
A solution is to give the bridge explicit hints as to what hardware
domain it should use for each port.
Currently, the bridging offload is very 'silent': a driver registers a
netdevice notifier, which is put on the netns's notifier chain, and
which sniffs around for NETDEV_CHANGEUPPER events where the upper is a
bridge, and the lower is an interface it knows about (one registered by
this driver, normally). Then, from within that notifier, it does a bunch
of stuff behind the bridge's back, without the bridge necessarily
knowing that there's somebody offloading that port. It looks like this:
ip link set swp0 master br0
|
v
br_add_if() calls netdev_master_upper_dev_link()
|
v
call_netdevice_notifiers
|
v
dsa_slave_netdevice_event
|
v
oh, hey! it's for me!
|
v
.port_bridge_join
What we do to solve the conundrum is to be less silent, and change the
switchdev drivers to present themselves to the bridge. Something like this:
ip link set swp0 master br0
|
v
br_add_if() calls netdev_master_upper_dev_link()
|
v bridge: Aye! I'll use this
call_netdevice_notifiers ^ ppid as the
| | hardware domain for
v | this port, and zero
dsa_slave_netdevice_event | if I got nothing.
| |
v |
oh, hey! it's for me! |
| |
v |
.port_bridge_join |
| |
+------------------------+
switchdev_bridge_port_offload(swp0, swp0)
Then stacked interfaces (like bond0 on top of swp3/swp4) would be
treated differently in DSA, depending on whether we can or cannot
offload them.
The offload case:
ip link set bond0 master br0
|
v
br_add_if() calls netdev_master_upper_dev_link()
|
v bridge: Aye! I'll use this
call_netdevice_notifiers ^ ppid as the
| | switchdev mark for
v | bond0.
dsa_slave_netdevice_event | Coincidentally (or not),
| | bond0 and swp0, swp1, swp2
v | all have the same switchdev
hmm, it's not quite for me, | mark now, since the ASIC
but my driver has already | is able to forward towards
called .port_lag_join | all these ports in hw.
for it, because I have |
a port with dp->lag_dev == bond0. |
| |
v |
.port_bridge_join |
for swp3 and swp4 |
| |
+------------------------+
switchdev_bridge_port_offload(bond0, swp3)
switchdev_bridge_port_offload(bond0, swp4)
And the non-offload case:
ip link set bond0 master br0
|
v
br_add_if() calls netdev_master_upper_dev_link()
|
v bridge waiting:
call_netdevice_notifiers ^ huh, switchdev_bridge_port_offload
| | wasn't called, okay, I'll use a
v | hwdom of zero for this one.
dsa_slave_netdevice_event : Then packets received on swp0 will
| : not be software-forwarded towards
v : swp1, but they will towards bond0.
it's not for me, but
bond0 is an upper of swp3
and swp4, but their dp->lag_dev
is NULL because they couldn't
offload it.
Basically we can draw the conclusion that the lowers of a bridge port
can come and go, so depending on the configuration of lowers for a
bridge port, it can dynamically toggle between offloaded and unoffloaded.
Therefore, we need an equivalent switchdev_bridge_port_unoffload too.
This patch changes the way any switchdev driver interacts with the
bridge. From now on, everybody needs to call switchdev_bridge_port_offload
and switchdev_bridge_port_unoffload, otherwise the bridge will treat the
port as non-offloaded and allow software flooding to other ports from
the same ASIC.
Note that these functions lay the ground for a more complex handshake
between switchdev drivers and the bridge in the future.
For drivers that will request a replay of the switchdev objects when
they offload and unoffload a bridge port (DSA, dpaa2-switch, ocelot), we
place the call to switchdev_bridge_port_unoffload() strategically inside
the NETDEV_PRECHANGEUPPER notifier's code path, and not inside
NETDEV_CHANGEUPPER. This is because the switchdev object replay helpers
need the netdev adjacency lists to be valid, and that is only true in
NETDEV_PRECHANGEUPPER.
Cc: Vadym Kochan <vkochan@marvell.com>
Cc: Taras Chornyi <tchornyi@marvell.com>
Cc: Ioana Ciornei <ioana.ciornei@nxp.com>
Cc: Lars Povlsen <lars.povlsen@microchip.com>
Cc: Steen Hegelund <Steen.Hegelund@microchip.com>
Cc: UNGLinuxDriver@microchip.com
Cc: Claudiu Manoil <claudiu.manoil@nxp.com>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch: regression
Acked-by: Ioana Ciornei <ioana.ciornei@nxp.com> # dpaa2-switch
Tested-by: Horatiu Vultur <horatiu.vultur@microchip.com> # ocelot-switch
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-21 19:24:01 +03:00
err6 :
2021-07-02 15:07:36 +03:00
if ( fdb_synced )
br_fdb_unsync_static ( br , p ) ;
2016-01-06 15:01:04 +03:00
list_del_rcu ( & p - > list ) ;
br_fdb_delete_by_port ( br , p , 0 , 1 ) ;
nbp_update_port_count ( br ) ;
netdev_upper_dev_unlink ( dev , br - > dev ) ;
2012-12-20 03:41:43 +04:00
err5 :
2014-09-05 17:51:28 +04:00
dev - > priv_flags & = ~ IFF_BRIDGE_PORT ;
netdev_rx_handler_unregister ( dev ) ;
2012-12-20 03:41:43 +04:00
err4 :
br_netpoll_disable ( p ) ;
2010-06-10 20:12:50 +04:00
err3 :
sysfs_remove_link ( br - > ifobj , p - > dev - > name ) ;
2006-02-10 04:10:12 +03:00
err2 :
2021-08-09 16:20:23 +03:00
br_multicast_del_port ( p ) ;
2009-07-24 03:06:32 +04:00
kobject_put ( & p - > kobj ) ;
2014-05-29 06:15:30 +04:00
dev_set_allmulti ( dev , - 1 ) ;
2019-05-10 05:52:12 +03:00
err1 :
2008-04-29 14:17:42 +04:00
dev_put ( dev ) ;
2005-04-17 02:20:36 +04:00
return err ;
}
/* Detach @dev from bridge @br. Called with RTNL held.
 *
 * Returns -EINVAL when @dev has no bridge port or when its port belongs to
 * a bridge other than @br; returns 0 after del_nbp() and the follow-up
 * bridge updates below have run.
 */
int br_del_if ( struct net_bridge * br , struct net_device * dev )
{
2010-06-15 10:50:45 +04:00
struct net_bridge_port * p ;
2011-08-05 15:04:10 +04:00
bool changed_addr ;
2010-06-15 10:50:45 +04:00
2010-11-15 09:38:14 +03:00
/* Only accept devices that are currently a port of this very bridge. */
p = br_port_get_rtnl ( dev ) ;
2010-11-15 09:38:13 +03:00
if ( ! p | | p - > br ! = br )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
2012-08-10 05:24:43 +04:00
/* Since more than one interface can be attached to a bridge,
 * there still may be an alternate path for netconsole to use;
 * therefore there is no reason for a NETDEV_RELEASE event.
 */
2005-04-17 02:20:36 +04:00
del_nbp ( p ) ;
2018-03-30 13:46:19 +03:00
/* NOTE(review): presumably these re-derive the bridge MTU and GSO limits
 * from the remaining ports; the helpers are defined outside this chunk.
 */
br_mtu_auto_adjust ( br ) ;
2016-03-21 19:55:11 +03:00
br_set_gso_limits ( br ) ;
2015-03-13 17:08:22 +03:00
2005-04-17 02:20:36 +04:00
/* The bridge id is recalculated under br->lock; the helper's return
 * value is remembered so that the notifier runs after the lock is dropped.
 */
spin_lock_bh ( & br - > lock ) ;
2011-08-05 15:04:10 +04:00
changed_addr = br_stp_recalculate_bridge_id ( br ) ;
2005-04-17 02:20:36 +04:00
spin_unlock_bh ( & br - > lock ) ;
2011-08-05 15:04:10 +04:00
/* Announce NETDEV_CHANGEADDR for the bridge device when the
 * recalculation reported a change.
 */
if ( changed_addr )
call_netdevice_notifiers ( NETDEV_CHANGEADDR , br - > dev ) ;
2011-04-22 10:31:16 +04:00
netdev_update_features ( br - > dev ) ;
2005-04-17 02:20:36 +04:00
return 0 ;
}
2014-05-16 17:59:16 +04:00
/* Propagate a change of port flags to bridge-wide state.
 *
 * @p:    port whose flags changed
 * @mask: bit mask selecting the flags that changed
 *
 * Only the flag groups tested below trigger any work; other bits in @mask
 * are ignored here.
 */
void br_port_flags_change ( struct net_bridge_port * p , unsigned long mask )
{
struct net_bridge * br = p - > br ;
/* Any flag covered by BR_AUTO_MASK requires the bridge's port count to be
 * refreshed.
 */
if ( mask & BR_AUTO_MASK )
nbp_update_port_count ( br ) ;
2017-10-07 08:12:37 +03:00
/* A neighbour-suppression change is re-evaluated for the whole bridge,
 * not just for this port.
 */
if ( mask & BR_NEIGH_SUPPRESS )
br_recalculate_neigh_suppress_enabled ( br ) ;
2014-05-16 17:59:16 +04:00
}
2018-11-21 11:02:41 +03:00
bool br_port_flag_is_set ( const struct net_device * dev , unsigned long flag )
{
struct net_bridge_port * p ;
p = br_port_get_rtnl_rcu ( dev ) ;
if ( ! p )
return false ;
return p - > flags & flag ;
}
EXPORT_SYMBOL_GPL ( br_port_flag_is_set ) ;