linux/net/bridge/br_vlan.c

625 lines
12 KiB
C
Raw Normal View History

#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include "br_private.h"
static void __vlan_add_pvid(struct net_port_vlans *v, u16 vid)
{
if (v->pvid == vid)
return;
smp_wmb();
v->pvid = vid;
}
static void __vlan_delete_pvid(struct net_port_vlans *v, u16 vid)
{
if (v->pvid != vid)
return;
smp_wmb();
v->pvid = 0;
}
static void __vlan_add_flags(struct net_port_vlans *v, u16 vid, u16 flags)
{
if (flags & BRIDGE_VLAN_INFO_PVID)
__vlan_add_pvid(v, vid);
else
__vlan_delete_pvid(v, vid);
if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
set_bit(vid, v->untagged_bitmap);
else
clear_bit(vid, v->untagged_bitmap);
}
static int __vlan_add(struct net_port_vlans *v, u16 vid, u16 flags)
{
struct net_bridge_port *p = NULL;
struct net_bridge *br;
struct net_device *dev;
int err;
if (test_bit(vid, v->vlan_bitmap)) {
__vlan_add_flags(v, vid, flags);
return 0;
}
if (v->port_idx) {
p = v->parent.port;
br = p->br;
dev = p->dev;
} else {
br = v->parent.br;
dev = br->dev;
}
if (p) {
/* Add VLAN to the device filter if it is supported.
* This ensures tagged traffic enters the bridge when
* promiscuous mode is disabled by br_manage_promisc().
*/
err = vlan_vid_add(dev, br->vlan_proto, vid);
if (err)
return err;
}
err = br_fdb_insert(br, p, dev->dev_addr, vid);
if (err) {
br_err(br, "failed insert local address into bridge "
"forwarding table\n");
goto out_filt;
}
set_bit(vid, v->vlan_bitmap);
v->num_vlans++;
__vlan_add_flags(v, vid, flags);
return 0;
out_filt:
if (p)
vlan_vid_del(dev, br->vlan_proto, vid);
return err;
}
static int __vlan_del(struct net_port_vlans *v, u16 vid)
{
if (!test_bit(vid, v->vlan_bitmap))
return -EINVAL;
__vlan_delete_pvid(v, vid);
clear_bit(vid, v->untagged_bitmap);
if (v->port_idx) {
struct net_bridge_port *p = v->parent.port;
vlan_vid_del(p->dev, p->br->vlan_proto, vid);
}
clear_bit(vid, v->vlan_bitmap);
v->num_vlans--;
if (bitmap_empty(v->vlan_bitmap, VLAN_N_VID)) {
if (v->port_idx)
RCU_INIT_POINTER(v->parent.port->vlan_info, NULL);
else
RCU_INIT_POINTER(v->parent.br->vlan_info, NULL);
kfree_rcu(v, rcu);
}
return 0;
}
static void __vlan_flush(struct net_port_vlans *v)
{
smp_wmb();
v->pvid = 0;
bitmap_zero(v->vlan_bitmap, VLAN_N_VID);
if (v->port_idx)
RCU_INIT_POINTER(v->parent.port->vlan_info, NULL);
else
RCU_INIT_POINTER(v->parent.br->vlan_info, NULL);
kfree_rcu(v, rcu);
}
struct sk_buff *br_handle_vlan(struct net_bridge *br,
const struct net_port_vlans *pv,
struct sk_buff *skb)
{
u16 vid;
/* If this packet was not filtered at input, let it pass */
if (!BR_INPUT_SKB_CB(skb)->vlan_filtered)
goto out;
/* Vlan filter table must be configured at this point. The
* only exception is the bridge is set in promisc mode and the
* packet is destined for the bridge device. In this case
* pass the packet as is.
*/
if (!pv) {
if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) {
goto out;
} else {
kfree_skb(skb);
return NULL;
}
}
/* At this point, we know that the frame was filtered and contains
* a valid vlan id. If the vlan id is set in the untagged bitmap,
* send untagged; otherwise, send tagged.
*/
br_vlan_get_tag(skb, &vid);
if (test_bit(vid, pv->untagged_bitmap))
skb->vlan_tci = 0;
out:
return skb;
}
/* Called under RCU */
bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
struct sk_buff *skb, u16 *vid)
{
bool tagged;
__be16 proto;
/* If VLAN filtering is disabled on the bridge, all packets are
* permitted.
*/
if (!br->vlan_enabled) {
BR_INPUT_SKB_CB(skb)->vlan_filtered = false;
return true;
}
/* If there are no vlan in the permitted list, all packets are
* rejected.
*/
if (!v)
goto drop;
BR_INPUT_SKB_CB(skb)->vlan_filtered = true;
proto = br->vlan_proto;
/* If vlan tx offload is disabled on bridge device and frame was
* sent from vlan device on the bridge device, it does not have
* HW accelerated vlan tag.
*/
if (unlikely(!vlan_tx_tag_present(skb) &&
skb->protocol == proto)) {
net: Always untag vlan-tagged traffic on input. Currently the functionality to untag traffic on input resides as part of the vlan module and is build only when VLAN support is enabled in the kernel. When VLAN is disabled, the function vlan_untag() turns into a stub and doesn't really untag the packets. This seems to create an interesting interaction between VMs supporting checksum offloading and some network drivers. There are some drivers that do not allow the user to change tx-vlan-offload feature of the driver. These drivers also seem to assume that any VLAN-tagged traffic they transmit will have the vlan information in the vlan_tci and not in the vlan header already in the skb. When transmitting skbs that already have tagged data with partial checksum set, the checksum doesn't appear to be updated correctly by the card thus resulting in a failure to establish TCP connections. The following is a packet trace taken on the receiver where a sender is a VM with a VLAN configued. The host VM is running on doest not have VLAN support and the outging interface on the host is tg3: 10:12:43.503055 52:54:00:ae:42:3f > 28:d2:44:7d:c2:de, ethertype 802.1Q (0x8100), length 78: vlan 100, p 0, ethertype IPv4, (tos 0x0, ttl 64, id 27243, offset 0, flags [DF], proto TCP (6), length 60) 10.0.100.1.58545 > 10.0.100.10.ircu-2: Flags [S], cksum 0xdc39 (incorrect -> 0x48d9), seq 1069378582, win 29200, options [mss 1460,sackOK,TS val 4294837885 ecr 0,nop,wscale 7], length 0 10:12:44.505556 52:54:00:ae:42:3f > 28:d2:44:7d:c2:de, ethertype 802.1Q (0x8100), length 78: vlan 100, p 0, ethertype IPv4, (tos 0x0, ttl 64, id 27244, offset 0, flags [DF], proto TCP (6), length 60) 10.0.100.1.58545 > 10.0.100.10.ircu-2: Flags [S], cksum 0xdc39 (incorrect -> 0x44ee), seq 1069378582, win 29200, options [mss 1460,sackOK,TS val 4294838888 ecr 0,nop,wscale 7], length 0 This connection finally times out. I've only access to the TG3 hardware in this configuration thus have only tested this with TG3 driver. There are a lot of other drivers that do not permit user changes to vlan acceleration features, and I don't know if they all suffere from a similar issue. The patch attempt to fix this another way. It moves the vlan header stipping code out of the vlan module and always builds it into the kernel network core. This way, even if vlan is not supported on a virtualizatoin host, the virtual machines running on top of such host will still work with VLANs enabled. CC: Patrick McHardy <kaber@trash.net> CC: Nithin Nayak Sujir <nsujir@broadcom.com> CC: Michael Chan <mchan@broadcom.com> CC: Jiri Pirko <jiri@resnulli.us> Signed-off-by: Vladislav Yasevich <vyasevic@redhat.com> Acked-by: Jiri Pirko <jiri@resnulli.us> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-08-08 22:42:13 +04:00
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
return false;
}
if (!br_vlan_get_tag(skb, vid)) {
/* Tagged frame */
if (skb->vlan_proto != proto) {
/* Protocol-mismatch, empty out vlan_tci for new tag */
skb_push(skb, ETH_HLEN);
skb = __vlan_put_tag(skb, skb->vlan_proto,
vlan_tx_tag_get(skb));
if (unlikely(!skb))
return false;
skb_pull(skb, ETH_HLEN);
skb_reset_mac_len(skb);
*vid = 0;
tagged = false;
} else {
tagged = true;
}
} else {
/* Untagged frame */
tagged = false;
}
if (!*vid) {
u16 pvid = br_get_pvid(v);
/* Frame had a tag with VID 0 or did not have a tag.
* See if pvid is set on this port. That tells us which
* vlan untagged or priority-tagged traffic belongs to.
*/
if (pvid == VLAN_N_VID)
goto drop;
/* PVID is set on this port. Any untagged or priority-tagged
* ingress frame is considered to belong to this vlan.
*/
*vid = pvid;
if (likely(!tagged))
/* Untagged Frame. */
__vlan_hwaccel_put_tag(skb, proto, pvid);
else
/* Priority-tagged Frame.
* At this point, We know that skb->vlan_tci had
* VLAN_TAG_PRESENT bit and its VID field was 0x000.
* We update only VID field and preserve PCP field.
*/
skb->vlan_tci |= pvid;
return true;
}
/* Frame had a valid vlan tag. See if vlan is allowed */
if (test_bit(*vid, v->vlan_bitmap))
return true;
drop:
kfree_skb(skb);
return false;
}
/* Called under RCU. */
bool br_allowed_egress(struct net_bridge *br,
const struct net_port_vlans *v,
const struct sk_buff *skb)
{
u16 vid;
/* If this packet was not filtered at input, let it pass */
if (!BR_INPUT_SKB_CB(skb)->vlan_filtered)
return true;
if (!v)
return false;
br_vlan_get_tag(skb, &vid);
if (test_bit(vid, v->vlan_bitmap))
return true;
return false;
}
/* Called under RCU */
bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
{
struct net_bridge *br = p->br;
struct net_port_vlans *v;
/* If filtering was disabled at input, let it pass. */
if (!br->vlan_enabled)
return true;
v = rcu_dereference(p->vlan_info);
if (!v)
return false;
if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto)
*vid = 0;
if (!*vid) {
*vid = br_get_pvid(v);
if (*vid == VLAN_N_VID)
return false;
return true;
}
if (test_bit(*vid, v->vlan_bitmap))
return true;
return false;
}
/* Must be protected by RTNL.
* Must be called with vid in range from 1 to 4094 inclusive.
*/
int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags)
{
struct net_port_vlans *pv = NULL;
int err;
ASSERT_RTNL();
pv = rtnl_dereference(br->vlan_info);
if (pv)
return __vlan_add(pv, vid, flags);
/* Create port vlan infomration
*/
pv = kzalloc(sizeof(*pv), GFP_KERNEL);
if (!pv)
return -ENOMEM;
pv->parent.br = br;
err = __vlan_add(pv, vid, flags);
if (err)
goto out;
rcu_assign_pointer(br->vlan_info, pv);
return 0;
out:
kfree(pv);
return err;
}
/* Must be protected by RTNL.
* Must be called with vid in range from 1 to 4094 inclusive.
*/
int br_vlan_delete(struct net_bridge *br, u16 vid)
{
struct net_port_vlans *pv;
ASSERT_RTNL();
pv = rtnl_dereference(br->vlan_info);
if (!pv)
return -EINVAL;
br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid);
__vlan_del(pv, vid);
return 0;
}
void br_vlan_flush(struct net_bridge *br)
{
struct net_port_vlans *pv;
ASSERT_RTNL();
pv = rtnl_dereference(br->vlan_info);
if (!pv)
return;
__vlan_flush(pv);
}
bridge: Fix the way to check if a local fdb entry can be deleted We should take into account the followings when deleting a local fdb entry. - nbp_vlan_find() can be used only when vid != 0 to check if an entry is deletable, because a fdb entry with vid 0 can exist at any time while nbp_vlan_find() always return false with vid 0. Example of problematic case: ip link set eth0 address 12:34:56:78:90:ab ip link set eth1 address 12:34:56:78:90:ab brctl addif br0 eth0 brctl addif br0 eth1 ip link set eth0 address aa:bb:cc:dd:ee:ff Then, the fdb entry 12:34:56:78:90:ab will be deleted even though the bridge port eth1 still has that address. - The port to which the bridge device is attached might needs a local entry if its mac address is set manually. Example of problematic case: ip link set eth0 address 12:34:56:78:90:ab brctl addif br0 eth0 ip link set br0 address 12:34:56:78:90:ab ip link set eth0 address aa:bb:cc:dd:ee:ff Then, the fdb still must have the entry 12:34:56:78:90:ab, but it will be deleted. We can use br->dev->addr_assign_type to check if the address is manually set or not, but I propose another approach. Since we delete and insert local entries whenever changing mac address of the bridge device, we can change dst of the entry to NULL regardless of addr_assign_type when deleting an entry associated with a certain port, and if it is found to be unnecessary later, then delete it. That is, if changing mac address of a port, the entry might be changed to its dst being NULL first, but is eventually deleted when recalculating and changing bridge id. This approach is especially useful when we want to share the code with deleting vlan in which the bridge device might want such an entry regardless of addr_assign_type, and makes things easy because we don't have to consider if mac address of the bridge device will be changed or not at the time we delete a local entry of a port, which means fdb code will not be bothered even if the bridge id calculating logic is changed in the future. Also, this change reduces inconsistent state, where frames whose dst is the mac address of the bridge, can't reach the bridge because of premature fdb entry deletion. This change reduces the possibility that the bridge device replies unreachable mac address to arp requests, which could occur during the short window between calling del_nbp() and br_stp_recalculate_bridge_id() in br_del_if(). This will effective after br_fdb_delete_by_port() starts to use the same code by following patch. Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Acked-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-07 11:48:22 +04:00
bool br_vlan_find(struct net_bridge *br, u16 vid)
{
struct net_port_vlans *pv;
bool found = false;
rcu_read_lock();
pv = rcu_dereference(br->vlan_info);
if (!pv)
goto out;
if (test_bit(vid, pv->vlan_bitmap))
found = true;
out:
rcu_read_unlock();
return found;
}
/* Must be protected by RTNL. */
static void recalculate_group_addr(struct net_bridge *br)
{
if (br->group_addr_set)
return;
spin_lock_bh(&br->lock);
if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q)) {
/* Bridge Group Address */
br->group_addr[5] = 0x00;
} else { /* vlan_enabled && ETH_P_8021AD */
/* Provider Bridge Group Address */
br->group_addr[5] = 0x08;
}
spin_unlock_bh(&br->lock);
}
/* Must be protected by RTNL. */
void br_recalculate_fwd_mask(struct net_bridge *br)
{
if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q))
br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
else /* vlan_enabled && ETH_P_8021AD */
br->group_fwd_mask_required = BR_GROUPFWD_8021AD &
~(1u << br->group_addr[5]);
}
int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
{
if (!rtnl_trylock())
return restart_syscall();
if (br->vlan_enabled == val)
goto unlock;
br->vlan_enabled = val;
bridge: Automatically manage port promiscuous mode. There exist configurations where the administrator or another management entity has the foreknowledge of all the mac addresses of end systems that are being bridged together. In these environments, the administrator can statically configure known addresses in the bridge FDB and disable flooding and learning on ports. This makes it possible to turn off promiscuous mode on the interfaces connected to the bridge. Here is why disabling flooding and learning allows us to control promiscuity: Consider port X. All traffic coming into this port from outside the bridge (ingress) will be either forwarded through other ports of the bridge (egress) or dropped. Forwarding (egress) is defined by FDB entries and by flooding in the event that no FDB entry exists. In the event that flooding is disabled, only FDB entries define the egress. Once learning is disabled, only static FDB entries provided by a management entity define the egress. If we provide information from these static FDBs to the ingress port X, then we'll be able to accept all traffic that can be successfully forwarded and drop all the other traffic sooner without spending CPU cycles to process it. Another way to define the above is as following equations: ingress = egress + drop expanding egress ingress = static FDB + learned FDB + flooding + drop disabling flooding and learning we a left with ingress = static FDB + drop By adding addresses from the static FDB entries to the MAC address filter of an ingress port X, we fully define what the bridge can process without dropping and can thus turn off promiscuous mode, thus dropping packets sooner. There have been suggestions that we may want to allow learning and update the filters with learned addresses as well. This would require mac-level authentication similar to 802.1x to prevent attacks against the hw filters as they are limited resource. Additionally, if the user places the bridge device in promiscuous mode, all ports are placed in promiscuous mode regardless of the changes to flooding and learning. Since the above functionality depends on full static configuration, we have also require that vlan filtering be enabled to take advantage of this. The reason is that the bridge has to be able to receive and process VLAN-tagged frames and the there are only 2 ways to accomplish this right now: promiscuous mode or vlan filtering. Suggested-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-16 17:59:20 +04:00
br_manage_promisc(br);
recalculate_group_addr(br);
br_recalculate_fwd_mask(br);
unlock:
rtnl_unlock();
return 0;
}
int br_vlan_set_proto(struct net_bridge *br, unsigned long val)
{
int err = 0;
struct net_bridge_port *p;
struct net_port_vlans *pv;
__be16 proto, oldproto;
u16 vid, errvid;
if (val != ETH_P_8021Q && val != ETH_P_8021AD)
return -EPROTONOSUPPORT;
if (!rtnl_trylock())
return restart_syscall();
proto = htons(val);
if (br->vlan_proto == proto)
goto unlock;
/* Add VLANs for the new proto to the device filter. */
list_for_each_entry(p, &br->port_list, list) {
pv = rtnl_dereference(p->vlan_info);
if (!pv)
continue;
for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) {
err = vlan_vid_add(p->dev, proto, vid);
if (err)
goto err_filt;
}
}
oldproto = br->vlan_proto;
br->vlan_proto = proto;
recalculate_group_addr(br);
br_recalculate_fwd_mask(br);
/* Delete VLANs for the old proto from the device filter. */
list_for_each_entry(p, &br->port_list, list) {
pv = rtnl_dereference(p->vlan_info);
if (!pv)
continue;
for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID)
vlan_vid_del(p->dev, oldproto, vid);
}
unlock:
rtnl_unlock();
return err;
err_filt:
errvid = vid;
for_each_set_bit(vid, pv->vlan_bitmap, errvid)
vlan_vid_del(p->dev, proto, vid);
list_for_each_entry_continue_reverse(p, &br->port_list, list) {
pv = rtnl_dereference(p->vlan_info);
if (!pv)
continue;
for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID)
vlan_vid_del(p->dev, proto, vid);
}
goto unlock;
}
int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val)
{
u16 pvid = val;
int err = 0;
if (!val || val >= VLAN_VID_MASK)
return -EINVAL;
if (!rtnl_trylock())
return restart_syscall();
if (pvid == br->default_pvid)
goto unlock;
/* Only allow default pvid change when filtering is disabled */
if (br->vlan_enabled) {
pr_info_once("Please disable vlan filtering to change default_pvid\n");
err = -EPERM;
goto unlock;
}
br->default_pvid = pvid;
unlock:
rtnl_unlock();
return err;
}
void br_vlan_init(struct net_bridge *br)
{
br->vlan_proto = htons(ETH_P_8021Q);
br->default_pvid = 1;
}
/* Must be protected by RTNL.
* Must be called with vid in range from 1 to 4094 inclusive.
*/
int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags)
{
struct net_port_vlans *pv = NULL;
int err;
ASSERT_RTNL();
pv = rtnl_dereference(port->vlan_info);
if (pv)
return __vlan_add(pv, vid, flags);
/* Create port vlan infomration
*/
pv = kzalloc(sizeof(*pv), GFP_KERNEL);
if (!pv) {
err = -ENOMEM;
goto clean_up;
}
pv->port_idx = port->port_no;
pv->parent.port = port;
err = __vlan_add(pv, vid, flags);
if (err)
goto clean_up;
rcu_assign_pointer(port->vlan_info, pv);
return 0;
clean_up:
kfree(pv);
return err;
}
/* Must be protected by RTNL.
* Must be called with vid in range from 1 to 4094 inclusive.
*/
int nbp_vlan_delete(struct net_bridge_port *port, u16 vid)
{
struct net_port_vlans *pv;
ASSERT_RTNL();
pv = rtnl_dereference(port->vlan_info);
if (!pv)
return -EINVAL;
br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid);
return __vlan_del(pv, vid);
}
void nbp_vlan_flush(struct net_bridge_port *port)
{
struct net_port_vlans *pv;
u16 vid;
ASSERT_RTNL();
pv = rtnl_dereference(port->vlan_info);
if (!pv)
return;
for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID)
vlan_vid_del(port->dev, port->br->vlan_proto, vid);
__vlan_flush(pv);
}
bool nbp_vlan_find(struct net_bridge_port *port, u16 vid)
{
struct net_port_vlans *pv;
bool found = false;
rcu_read_lock();
pv = rcu_dereference(port->vlan_info);
if (!pv)
goto out;
if (test_bit(vid, pv->vlan_bitmap))
found = true;
out:
rcu_read_unlock();
return found;
}