7b4858df3b
For EVPN non-DF (Designated Forwarder) filtering we need to be able to prevent decapsulated traffic from being flooded to a multi-homed host. Filtering of multicast and broadcast traffic can be achieved using the following flower filter: # tc filter add dev bond0 egress pref 1 proto all flower indev vxlan0 dst_mac 01:00:00:00:00:00/01:00:00:00:00:00 action drop Unlike broadcast and multicast traffic, it is not currently possible to filter unknown unicast traffic. The classification into unknown unicast is performed by the bridge driver, but is not visible to other layers such as tc. Solve this by adding a new 'l2_miss' bit to the tc skb extension. Clear the bit whenever a packet enters the bridge (received from a bridge port or transmitted via the bridge) and set it if the packet did not match an FDB or MDB entry. If there is no skb extension and the bit needs to be cleared, then do not allocate one as no extension is equivalent to the bit being cleared. The bit is not set for broadcast packets as they never perform a lookup and therefore never incur a miss. A bit that is set for every flooded packet would also work for the current use case, but it does not allow us to differentiate between registered and unregistered multicast traffic, which might be useful in the future. To keep the performance impact to a minimum, the marking of packets is guarded by the 'tc_skb_ext_tc' static key. When 'false', the skb is not touched and an skb extension is not allocated. Instead, only a 5 bytes nop is executed, as demonstrated below for the call site in br_handle_frame(). 
Before the patch: ``` memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); c37b09: 49 c7 44 24 28 00 00 movq $0x0,0x28(%r12) c37b10: 00 00 p = br_port_get_rcu(skb->dev); c37b12: 49 8b 44 24 10 mov 0x10(%r12),%rax memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); c37b17: 49 c7 44 24 30 00 00 movq $0x0,0x30(%r12) c37b1e: 00 00 c37b20: 49 c7 44 24 38 00 00 movq $0x0,0x38(%r12) c37b27: 00 00 ``` After the patch (when static key is disabled): ``` memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); c37c29: 49 c7 44 24 28 00 00 movq $0x0,0x28(%r12) c37c30: 00 00 c37c32: 49 8d 44 24 28 lea 0x28(%r12),%rax c37c37: 48 c7 40 08 00 00 00 movq $0x0,0x8(%rax) c37c3e: 00 c37c3f: 48 c7 40 10 00 00 00 movq $0x0,0x10(%rax) c37c46: 00 #ifdef CONFIG_HAVE_JUMP_LABEL_HACK static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("1:" c37c47: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) br_tc_skb_miss_set(skb, false); p = br_port_get_rcu(skb->dev); c37c4c: 49 8b 44 24 10 mov 0x10(%r12),%rax ``` Subsequent patches will extend the flower classifier to be able to match on the new 'l2_miss' bit and enable / disable the static key when filters that match on it are added / deleted. Signed-off-by: Ido Schimmel <idosch@nvidia.com> Acked-by: Nikolay Aleksandrov <razor@blackwall.org> Acked-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
349 lines
7.9 KiB
C
349 lines
7.9 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/*
|
|
* Forwarding decision
|
|
* Linux ethernet bridge
|
|
*
|
|
* Authors:
|
|
* Lennert Buytenhek <buytenh@gnu.org>
|
|
*/
|
|
|
|
#include <linux/err.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/netdevice.h>
|
|
#include <linux/netpoll.h>
|
|
#include <linux/skbuff.h>
|
|
#include <linux/if_vlan.h>
|
|
#include <linux/netfilter_bridge.h>
|
|
#include "br_private.h"
|
|
|
|
/* Don't forward packets to originating port or forwarding disabled */
|
|
static inline int should_deliver(const struct net_bridge_port *p,
|
|
const struct sk_buff *skb)
|
|
{
|
|
struct net_bridge_vlan_group *vg;
|
|
|
|
vg = nbp_vlan_group_rcu(p);
|
|
return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
|
|
p->state == BR_STATE_FORWARDING && br_allowed_egress(vg, skb) &&
|
|
nbp_switchdev_allowed_egress(p, skb) &&
|
|
!br_skb_isolated(p, skb);
|
|
}
|
|
|
|
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
skb_push(skb, ETH_HLEN);
|
|
if (!is_skb_forwardable(skb->dev, skb))
|
|
goto drop;
|
|
|
|
br_drop_fake_rtable(skb);
|
|
|
|
if (skb->ip_summed == CHECKSUM_PARTIAL &&
|
|
eth_type_vlan(skb->protocol)) {
|
|
int depth;
|
|
|
|
if (!vlan_get_protocol_and_depth(skb, skb->protocol, &depth))
|
|
goto drop;
|
|
|
|
skb_set_network_header(skb, depth);
|
|
}
|
|
|
|
br_switchdev_frame_set_offload_fwd_mark(skb);
|
|
|
|
dev_queue_xmit(skb);
|
|
|
|
return 0;
|
|
|
|
drop:
|
|
kfree_skb(skb);
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);
|
|
|
|
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
skb_clear_tstamp(skb);
|
|
return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING,
|
|
net, sk, skb, NULL, skb->dev,
|
|
br_dev_queue_push_xmit);
|
|
|
|
}
|
|
EXPORT_SYMBOL_GPL(br_forward_finish);
|
|
|
|
/* Core single-port transmit path shared by forwarding and flooding.
 * Applies egress VLAN handling, selects the appropriate bridge netfilter
 * hook (FORWARD for forwarded traffic, LOCAL_OUT for locally originated
 * traffic) and hands the packet on via br_forward_finish().
 * Consumes @skb on all paths (either forwarded or freed).
 */
static void __br_forward(const struct net_bridge_port *to,
			 struct sk_buff *skb, bool local_orig)
{
	struct net_bridge_vlan_group *vg;
	struct net_device *indev;
	struct net *net;
	int br_hook;

	/* Mark the skb for forwarding offload early so that br_handle_vlan()
	 * can know whether to pop the VLAN header on egress or keep it.
	 */
	nbp_switchdev_frame_mark_tx_fwd_offload(to, skb);

	vg = nbp_vlan_group_rcu(to);
	skb = br_handle_vlan(to->br, to, vg, skb);
	if (!skb)
		return;

	/* Capture the ingress device before retargeting the skb at the
	 * egress port's device; the old value is still needed as the
	 * netfilter hook's input device on the forward path.
	 */
	indev = skb->dev;
	skb->dev = to->dev;
	if (!local_orig) {
		/* LRO-aggregated skbs must not be forwarded. */
		if (skb_warn_if_lro(skb)) {
			kfree_skb(skb);
			return;
		}
		br_hook = NF_BR_FORWARD;
		skb_forward_csum(skb);
		net = dev_net(indev);
	} else {
		/* netpoll (e.g. netconsole) bypasses the normal stack:
		 * transmit directly and skip the netfilter hooks.
		 */
		if (unlikely(netpoll_tx_running(to->br->dev))) {
			skb_push(skb, ETH_HLEN);
			if (!is_skb_forwardable(skb->dev, skb))
				kfree_skb(skb);
			else
				br_netpoll_send_skb(to, skb);
			return;
		}
		br_hook = NF_BR_LOCAL_OUT;
		net = dev_net(skb->dev);
		/* Locally originated traffic has no input device. */
		indev = NULL;
	}

	NF_HOOK(NFPROTO_BRIDGE, br_hook,
		net, NULL, skb, indev, skb->dev,
		br_forward_finish);
}
|
|
|
|
/* Clone @skb and forward the clone via @prev, leaving the original skb
 * with the caller (e.g. for local receive or delivery to further ports).
 * Returns 0 on success or -ENOMEM if the clone could not be allocated,
 * in which case the drop is accounted on the bridge device.
 */
static int deliver_clone(const struct net_bridge_port *prev,
			 struct sk_buff *skb, bool local_orig)
{
	struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;

	skb = skb_clone(skb, GFP_ATOMIC);
	if (!skb) {
		/* This counter can be bumped concurrently from multiple
		 * CPUs without any lock; use DEV_STATS_INC() instead of a
		 * plain increment to avoid a data race / store tearing.
		 */
		DEV_STATS_INC(dev, tx_dropped);
		return -ENOMEM;
	}

	__br_forward(prev, skb, local_orig);
	return 0;
}
|
|
|
|
/**
|
|
* br_forward - forward a packet to a specific port
|
|
* @to: destination port
|
|
* @skb: packet being forwarded
|
|
* @local_rcv: packet will be received locally after forwarding
|
|
* @local_orig: packet is locally originated
|
|
*
|
|
* Should be called with rcu_read_lock.
|
|
*/
|
|
void br_forward(const struct net_bridge_port *to,
|
|
struct sk_buff *skb, bool local_rcv, bool local_orig)
|
|
{
|
|
if (unlikely(!to))
|
|
goto out;
|
|
|
|
/* redirect to backup link if the destination port is down */
|
|
if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) {
|
|
struct net_bridge_port *backup_port;
|
|
|
|
backup_port = rcu_dereference(to->backup_port);
|
|
if (unlikely(!backup_port))
|
|
goto out;
|
|
to = backup_port;
|
|
}
|
|
|
|
if (should_deliver(to, skb)) {
|
|
if (local_rcv)
|
|
deliver_clone(to, skb, local_orig);
|
|
else
|
|
__br_forward(to, skb, local_orig);
|
|
return;
|
|
}
|
|
|
|
out:
|
|
if (!local_rcv)
|
|
kfree_skb(skb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(br_forward);
|
|
|
|
/* Flooding helper implementing deferred delivery: instead of cloning for
 * every eligible port, remember the previous eligible port and deliver a
 * clone to it only once the next one is found. This lets the caller send
 * the original skb (no clone) to the last port in the walk.
 *
 * Returns @prev unchanged when @p is not eligible, @p when it becomes the
 * new pending port, or an ERR_PTR() if cloning for @prev failed.
 */
static struct net_bridge_port *maybe_deliver(
	struct net_bridge_port *prev, struct net_bridge_port *p,
	struct sk_buff *skb, bool local_orig)
{
	u8 igmp_type = br_multicast_igmp_type(skb);

	if (!should_deliver(p, skb))
		return prev;

	nbp_switchdev_frame_mark_tx_fwd_to_hwdom(p, skb);

	if (prev) {
		int err = deliver_clone(prev, skb, local_orig);

		if (err)
			return ERR_PTR(err);
	}

	br_multicast_count(p->br, p, skb, igmp_type, BR_MCAST_DIR_TX);

	return p;
}
|
|
|
|
/* called under rcu_read_lock */
/* Flood @skb to all bridge ports that permit flooding for @pkt_type.
 * Marks the skb's tc extension with an L2 miss for non-broadcast traffic
 * (broadcast never performs an FDB/MDB lookup, so it cannot "miss").
 * Uses maybe_deliver()'s deferred scheme so the original skb - not a
 * clone - is forwarded to the last eligible port. Consumes @skb unless
 * @local_rcv is set, in which case the caller keeps the original.
 */
void br_flood(struct net_bridge *br, struct sk_buff *skb,
	      enum br_pkt_type pkt_type, bool local_rcv, bool local_orig,
	      u16 vid)
{
	struct net_bridge_port *prev = NULL;
	struct net_bridge_port *p;

	br_tc_skb_miss_set(skb, pkt_type != BR_PKT_BROADCAST);

	list_for_each_entry_rcu(p, &br->port_list, list) {
		/* Do not flood unicast traffic to ports that turn it off, nor
		 * other traffic if flood off, except for traffic we originate
		 */
		switch (pkt_type) {
		case BR_PKT_UNICAST:
			if (!(p->flags & BR_FLOOD))
				continue;
			break;
		case BR_PKT_MULTICAST:
			/* skb->dev == br->dev means the bridge itself
			 * originated the packet; flood it regardless of the
			 * per-port flag.
			 */
			if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
				continue;
			break;
		case BR_PKT_BROADCAST:
			if (!(p->flags & BR_BCAST_FLOOD) && skb->dev != br->dev)
				continue;
			break;
		}

		/* Do not flood to ports that enable proxy ARP */
		if (p->flags & BR_PROXYARP)
			continue;
		/* If a proxy ARP/ND reply was already generated, suppress the
		 * flood towards WiFi proxy ports and neigh-suppress ports.
		 */
		if (BR_INPUT_SKB_CB(skb)->proxyarp_replied &&
		    ((p->flags & BR_PROXYARP_WIFI) ||
		     br_is_neigh_suppress_enabled(p, vid)))
			continue;

		prev = maybe_deliver(prev, p, skb, local_orig);
		if (IS_ERR(prev))
			goto out;
	}

	/* No eligible port at all: drop (unless received locally). */
	if (!prev)
		goto out;

	/* Deliver to the last pending port; clone only if the original skb
	 * must survive for local receive.
	 */
	if (local_rcv)
		deliver_clone(prev, skb, local_orig);
	else
		__br_forward(prev, skb, local_orig);
	return;

out:
	if (!local_rcv)
		kfree_skb(skb);
}
|
|
|
|
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
|
|
/* Deliver a copy of @skb to port @p with its destination MAC rewritten to
 * @addr (multicast-to-unicast). A full copy (not a clone) is required
 * because the Ethernet header is modified. Failures to copy are accounted
 * as tx_dropped on the bridge device; the original skb is left untouched.
 */
static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
			       const unsigned char *addr, bool local_orig)
{
	struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
	const unsigned char *src = eth_hdr(skb)->h_source;

	if (!should_deliver(p, skb))
		return;

	/* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */
	if (skb->dev == p->dev && ether_addr_equal(src, addr))
		return;

	skb = skb_copy(skb, GFP_ATOMIC);
	if (!skb) {
		/* This counter can be bumped concurrently from multiple
		 * CPUs without any lock; use DEV_STATS_INC() instead of a
		 * plain increment to avoid a data race / store tearing.
		 */
		DEV_STATS_INC(dev, tx_dropped);
		return;
	}

	if (!is_broadcast_ether_addr(addr))
		memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);

	__br_forward(p, skb, local_orig);
}
|
|
|
|
/* called with rcu_read_lock */
/* Deliver a multicast packet to the union of the MDB entry's port group
 * list (@mdst->ports) and the multicast router port list, both of which
 * are kept sorted by port pointer value. The two lists are merged in a
 * single walk, comparing pointer values so each port is visited exactly
 * once even when it appears on both lists. A missing @mdst means the
 * packet did not match any MDB entry, i.e. an L2 lookup miss for tc.
 */
void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
			struct sk_buff *skb,
			struct net_bridge_mcast *brmctx,
			bool local_rcv, bool local_orig)
{
	struct net_bridge_port *prev = NULL;
	struct net_bridge_port_group *p;
	bool allow_mode_include = true;
	struct hlist_node *rp;

	rp = br_multicast_get_first_rport_node(brmctx, skb);

	if (mdst) {
		p = rcu_dereference(mdst->ports);
		/* For (*, G) entries, INCLUDE-mode ports must not receive
		 * traffic unless explicitly allowed by source filtering.
		 */
		if (br_multicast_should_handle_mode(brmctx, mdst->addr.proto) &&
		    br_multicast_is_star_g(&mdst->addr))
			allow_mode_include = false;
	} else {
		p = NULL;
		/* Unregistered multicast: flag the MDB miss for tc. */
		br_tc_skb_miss_set(skb, true);
	}

	while (p || rp) {
		struct net_bridge_port *port, *lport, *rport;

		/* Heads of the two sorted lists for this iteration. */
		lport = p ? p->key.port : NULL;
		rport = br_multicast_rport_from_node_skb(rp, skb);

		/* Pick the larger pointer; the merge below advances
		 * whichever list(s) matched the chosen port.
		 */
		if ((unsigned long)lport > (unsigned long)rport) {
			port = lport;

			if (port->flags & BR_MULTICAST_TO_UNICAST) {
				maybe_deliver_addr(lport, skb, p->eth_addr,
						   local_orig);
				goto delivered;
			}
			/* Skip blocked ports and (when disallowed)
			 * INCLUDE-mode group memberships.
			 */
			if ((!allow_mode_include &&
			     p->filter_mode == MCAST_INCLUDE) ||
			    (p->flags & MDB_PG_FLAGS_BLOCKED))
				goto delivered;
		} else {
			port = rport;
		}

		prev = maybe_deliver(prev, port, skb, local_orig);
		if (IS_ERR(prev))
			goto out;
delivered:
		/* Advance each list whose head was <= the chosen port;
		 * when lport == rport both lists advance together.
		 */
		if ((unsigned long)lport >= (unsigned long)port)
			p = rcu_dereference(p->next);
		if ((unsigned long)rport >= (unsigned long)port)
			rp = rcu_dereference(hlist_next_rcu(rp));
	}

	if (!prev)
		goto out;

	/* Deliver to the last pending port; clone only when the original
	 * skb is also received locally.
	 */
	if (local_rcv)
		deliver_clone(prev, skb, local_orig);
	else
		__br_forward(prev, skb, local_orig);
	return;

out:
	if (!local_rcv)
		kfree_skb(skb);
}
|
|
#endif
|