linux/drivers/net/vxlan.c

3355 lines
84 KiB
C
Raw Normal View History

/*
* VXLAN: Virtual eXtensible Local Area Network
*
* Copyright (c) 2012-2013 Vyatta Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/if_ether.h>
#include <linux/ethtool.h>
#include <net/arp.h>
#include <net/ndisc.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/rtnetlink.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/vxlan.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_tunnel.h>
#include <net/ip6_checksum.h>
#endif
#define VXLAN_VERSION "0.1"
#define PORT_HASH_BITS 8
#define PORT_HASH_SIZE (1<<PORT_HASH_BITS)
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */
/* UDP port for VXLAN traffic.
* The IANA assigned port is 4789, but the Linux default is 8472
* for compatibility with early adopters.
*/
static unsigned short vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, ushort, 0444);
MODULE_PARM_DESC(udp_port, "Destination UDP port");
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static int vxlan_net_id;
static struct rtnl_link_ops vxlan_link_ops;
vxlan: fix a out of bounds access in __vxlan_find_mac The size of all_zeros_mac is 6 byte, but eth_hash() will access the 8 byte, and KASan reported the below bug: [ 8596.479031] BUG: KASan: out of bounds access in __vxlan_find_mac+0x24/0x100 at addr ffffffff841514c0 [ 8596.487647] Read of size 8 by task ip/52820 [ 8596.490818] Address belongs to variable all_zeros_mac+0x0/0x40 [ 8596.496051] CPU: 0 PID: 52820 Comm: ip Tainted: G WC 4.1.15 #1 [ 8596.503520] Hardware name: HP ProLiant DL380p Gen8, BIOS P70 02/10/2014 [ 8596.509365] ffffffff841514c0 ffff88007450f0b8 ffffffff822fa5e1 0000000000000032 [ 8596.516112] ffff88007450f150 ffff88007450f138 ffffffff812dd58c ffff88007450f1d8 [ 8596.522856] ffffffff81113b80 0000000000000282 0000000000000001 ffffffff8101ee4d [ 8596.529599] Call Trace: [ 8596.530858] [<ffffffff822fa5e1>] dump_stack+0x4f/0x7b [ 8596.535080] [<ffffffff812dd58c>] kasan_report_error+0x3bc/0x3f0 [ 8596.540258] [<ffffffff81113b80>] ? __lock_acquire+0x90/0x2140 [ 8596.545245] [<ffffffff8101ee4d>] ? save_stack_trace+0x2d/0x80 [ 8596.550234] [<ffffffff812dda70>] kasan_report+0x40/0x50 [ 8596.554647] [<ffffffff81b211e4>] ? __vxlan_find_mac+0x24/0x100 [ 8596.559729] [<ffffffff812dc399>] __asan_load8+0x69/0xa0 [ 8596.564141] [<ffffffff81b211e4>] __vxlan_find_mac+0x24/0x100 [ 8596.569033] [<ffffffff81b2683d>] vxlan_fdb_create+0x9d/0x570 it can be fixed by enlarging the all_zeros_mac to 8 byte, although it is harmless; eth_hash() will be called in other place with the memory which is larger and equal to 8 byte. Signed-off-by: Li RongQing <roy.qing.li@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-01-29 04:43:47 +03:00
static const u8 all_zeros_mac[ETH_ALEN + 2];
static int vxlan_sock_add(struct vxlan_dev *vxlan);
/* per-network namespace private data for this module */
struct vxlan_net {
struct list_head vxlan_list;
struct hlist_head sock_list[PORT_HASH_SIZE];
spinlock_t sock_lock;
};
/* Forwarding table entry */
struct vxlan_fdb {
struct hlist_node hlist; /* linked list of entries */
struct rcu_head rcu;
unsigned long updated; /* jiffies */
unsigned long used;
struct list_head remotes;
u8 eth_addr[ETH_ALEN];
u16 state; /* see ndm_state */
u8 flags; /* see ndm_flags */
};
/* salt for hash table */
static u32 vxlan_salt __read_mostly;
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
{
return vs->flags & VXLAN_F_COLLECT_METADATA ||
ip_tunnel_collect_metadata();
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
}
#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
if (a->sa.sa_family != b->sa.sa_family)
return false;
if (a->sa.sa_family == AF_INET6)
return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
else
return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}
static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
{
if (ipa->sa.sa_family == AF_INET6)
return ipv6_addr_any(&ipa->sin6.sin6_addr);
else
return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
}
static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
{
if (ipa->sa.sa_family == AF_INET6)
return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
else
return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
}
static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
if (nla_len(nla) >= sizeof(struct in6_addr)) {
ip->sin6.sin6_addr = nla_get_in6_addr(nla);
ip->sa.sa_family = AF_INET6;
return 0;
} else if (nla_len(nla) >= sizeof(__be32)) {
ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
ip->sa.sa_family = AF_INET;
return 0;
} else {
return -EAFNOSUPPORT;
}
}
static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
const union vxlan_addr *ip)
{
if (ip->sa.sa_family == AF_INET6)
return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr);
else
return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
}
#else /* !CONFIG_IPV6 */
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}
static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
{
return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
}
static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
{
return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
}
static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
if (nla_len(nla) >= sizeof(struct in6_addr)) {
return -EAFNOSUPPORT;
} else if (nla_len(nla) >= sizeof(__be32)) {
ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
ip->sa.sa_family = AF_INET;
return 0;
} else {
return -EAFNOSUPPORT;
}
}
static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
const union vxlan_addr *ip)
{
return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
}
#endif
/* Virtual Network hash table head */
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
{
return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
}
/* Socket hash table head */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
}
/* First remote destination for a forwarding entry.
* Guaranteed to be non-NULL because remotes are never deleted.
*/
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}
static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
}
/* Find VXLAN socket based on network namespace, address family and UDP port
* and enabled unshareable flags.
*/
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
__be16 port, u32 flags)
{
struct vxlan_sock *vs;
flags &= VXLAN_F_RCV_FLAGS;
hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
if (inet_sk(vs->sock->sk)->inet_sport == port &&
vxlan_get_sk_family(vs) == family &&
vs->flags == flags)
return vs;
}
return NULL;
}
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, __be32 vni)
{
struct vxlan_dev *vxlan;
/* For flow based devices, map all packets to VNI 0 */
if (vs->flags & VXLAN_F_COLLECT_METADATA)
vni = 0;
hlist_for_each_entry_rcu(vxlan, vni_head(vs, vni), hlist) {
if (vxlan->default_dst.remote_vni == vni)
return vxlan;
}
return NULL;
}
/* Look up VNI in a per net namespace table */
static struct vxlan_dev *vxlan_find_vni(struct net *net, __be32 vni,
sa_family_t family, __be16 port,
u32 flags)
{
struct vxlan_sock *vs;
vs = vxlan_find_sock(net, family, port, flags);
if (!vs)
return NULL;
return vxlan_vs_find_vni(vs, vni);
}
/* Fill in neighbour message in skbuff. */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
const struct vxlan_fdb *fdb,
u32 portid, u32 seq, int type, unsigned int flags,
const struct vxlan_rdst *rdst)
{
unsigned long now = jiffies;
struct nda_cacheinfo ci;
struct nlmsghdr *nlh;
struct ndmsg *ndm;
bool send_ip, send_eth;
nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
if (nlh == NULL)
return -EMSGSIZE;
ndm = nlmsg_data(nlh);
memset(ndm, 0, sizeof(*ndm));
send_eth = send_ip = true;
if (type == RTM_GETNEIGH) {
ndm->ndm_family = AF_INET;
send_ip = !vxlan_addr_any(&rdst->remote_ip);
send_eth = !is_zero_ether_addr(fdb->eth_addr);
} else
ndm->ndm_family = AF_BRIDGE;
ndm->ndm_state = fdb->state;
ndm->ndm_ifindex = vxlan->dev->ifindex;
ndm->ndm_flags = fdb->flags;
ndm->ndm_type = RTN_UNICAST;
if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
nla_put_s32(skb, NDA_LINK_NETNSID,
peernet2id(dev_net(vxlan->dev), vxlan->net)))
goto nla_put_failure;
if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
goto nla_put_failure;
if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
goto nla_put_failure;
if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port &&
nla_put_be16(skb, NDA_PORT, rdst->remote_port))
goto nla_put_failure;
if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
goto nla_put_failure;
if (rdst->remote_ifindex &&
nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
goto nla_put_failure;
ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
ci.ndm_confirmed = 0;
ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated);
ci.ndm_refcnt = 0;
if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
goto nla_put_failure;
netlink: make nlmsg_end() and genlmsg_end() void Contrary to common expectations for an "int" return, these functions return only a positive value -- if used correctly they cannot even return 0 because the message header will necessarily be in the skb. This makes the very common pattern of if (genlmsg_end(...) < 0) { ... } be a whole bunch of dead code. Many places also simply do return nlmsg_end(...); and the caller is expected to deal with it. This also commonly (at least for me) causes errors, because it is very common to write if (my_function(...)) /* error condition */ and if my_function() does "return nlmsg_end()" this is of course wrong. Additionally, there's not a single place in the kernel that actually needs the message length returned, and if anyone needs it later then it'll be very easy to just use skb->len there. Remove this, and make the functions void. This removes a bunch of dead code as described above. The patch adds lines because I did - return nlmsg_end(...); + nlmsg_end(...); + return 0; I could have preserved all the function's return values by returning skb->len, but instead I've audited all the places calling the affected functions and found that none cared. A few places actually compared the return value with <= 0 in dump functionality, but that could just be changed to < 0 with no change in behaviour, so I opted for the more efficient version. One instance of the error I've made numerous times now is also present in net/phonet/pn_netlink.c in the route_dumpit() function - it didn't check for <0 or <=0 and thus broke out of the loop every single time. I've preserved this since it will (I think) have caused the messages to userspace to be formatted differently with just a single message for every SKB returned to userspace. It's possible that this isn't needed for the tools that actually use this, but I don't even know what they are so couldn't test that changing this behaviour would be acceptable. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-17 00:09:00 +03:00
nlmsg_end(skb, nlh);
return 0;
nla_put_failure:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static inline size_t vxlan_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct ndmsg))
+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
+ nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
+ nla_total_size(sizeof(struct nda_cacheinfo));
}
static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
struct vxlan_rdst *rd, int type)
{
struct net *net = dev_net(vxlan->dev);
struct sk_buff *skb;
int err = -ENOBUFS;
skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
goto errout;
err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
if (err < 0) {
/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
goto errout;
}
rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
return;
errout:
if (err < 0)
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_fdb f = {
.state = NUD_STALE,
};
struct vxlan_rdst remote = {
.remote_ip = *ipa, /* goes to NDA_DST */
.remote_vni = cpu_to_be32(VXLAN_N_VID),
};
vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH);
}
static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
struct vxlan_fdb f = {
.state = NUD_STALE,
};
struct vxlan_rdst remote = { };
memcpy(f.eth_addr, eth_addr, ETH_ALEN);
vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH);
}
/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
u64 value = get_unaligned((u64 *)addr);
/* only want 6 bytes */
#ifdef __BIG_ENDIAN
value >>= 16;
#else
value <<= 16;
#endif
return hash_64(value, FDB_HASH_BITS);
}
/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
const u8 *mac)
{
return &vxlan->fdb_head[eth_hash(mac)];
}
/* Look up Ethernet address in forwarding table */
static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
const u8 *mac)
{
struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
struct vxlan_fdb *f;
hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 05:06:00 +04:00
hlist_for_each_entry_rcu(f, head, hlist) {
if (ether_addr_equal(mac, f->eth_addr))
return f;
}
return NULL;
}
static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
const u8 *mac)
{
struct vxlan_fdb *f;
f = __vxlan_find_mac(vxlan, mac);
if (f)
f->used = jiffies;
return f;
}
/* caller should hold vxlan->hash_lock */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
union vxlan_addr *ip, __be16 port,
__be32 vni, __u32 ifindex)
{
struct vxlan_rdst *rd;
list_for_each_entry(rd, &f->remotes, list) {
if (vxlan_addr_equal(&rd->remote_ip, ip) &&
rd->remote_port == port &&
rd->remote_vni == vni &&
rd->remote_ifindex == ifindex)
return rd;
}
return NULL;
}
/* Replace destination of unicast mac */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
union vxlan_addr *ip, __be16 port, __be32 vni,
__u32 ifindex)
{
struct vxlan_rdst *rd;
rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
if (rd)
return 0;
rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
if (!rd)
return 0;
dst_cache_reset(&rd->dst_cache);
rd->remote_ip = *ip;
rd->remote_port = port;
rd->remote_vni = vni;
rd->remote_ifindex = ifindex;
return 1;
}
/* Add/update destinations for multicast */
static int vxlan_fdb_append(struct vxlan_fdb *f,
union vxlan_addr *ip, __be16 port, __be32 vni,
__u32 ifindex, struct vxlan_rdst **rdp)
{
struct vxlan_rdst *rd;
rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
if (rd)
return 0;
rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
if (rd == NULL)
return -ENOBUFS;
if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
kfree(rd);
return -ENOBUFS;
}
rd->remote_ip = *ip;
rd->remote_port = port;
rd->remote_vni = vni;
rd->remote_ifindex = ifindex;
list_add_tail_rcu(&rd->list, &f->remotes);
*rdp = rd;
return 1;
}
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
unsigned int off,
struct vxlanhdr *vh, size_t hdrlen,
__be32 vni_field,
struct gro_remcsum *grc,
bool nopartial)
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
{
size_t start, offset;
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
if (skb->remcsum_offload)
return vh;
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
if (!NAPI_GRO_CB(skb)->csum_valid)
return NULL;
start = vxlan_rco_start(vni_field);
offset = start + vxlan_rco_offset(vni_field);
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
start, offset, grc, nopartial);
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
skb->remcsum_offload = 1;
return vh;
}
static struct sk_buff **vxlan_gro_receive(struct sock *sk,
struct sk_buff **head,
struct sk_buff *skb)
{
struct sk_buff *p, **pp = NULL;
struct vxlanhdr *vh, *vh2;
unsigned int hlen, off_vx;
int flush = 1;
struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
__be32 flags;
struct gro_remcsum grc;
skb_gro_remcsum_init(&grc);
off_vx = skb_gro_offset(skb);
hlen = off_vx + sizeof(*vh);
vh = skb_gro_header_fast(skb, off_vx);
if (skb_gro_header_hard(skb, hlen)) {
vh = skb_gro_header_slow(skb, hlen, off_vx);
if (unlikely(!vh))
goto out;
}
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));
flags = vh->vx_flags;
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
vh->vx_vni, &grc,
!!(vs->flags &
VXLAN_F_REMCSUM_NOPARTIAL));
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
if (!vh)
goto out;
}
skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
for (p = *head; p; p = p->next) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
vh2 = (struct vxlanhdr *)(p->data + off_vx);
vxlan: Group Policy extension Implements supports for the Group Policy VXLAN extension [0] to provide a lightweight and simple security label mechanism across network peers based on VXLAN. The security context and associated metadata is mapped to/from skb->mark. This allows further mapping to a SELinux context using SECMARK, to implement ACLs directly with nftables, iptables, OVS, tc, etc. The group membership is defined by the lower 16 bits of skb->mark, the upper 16 bits are used for flags. SELinux allows to manage label to secure local resources. However, distributed applications require ACLs to implemented across hosts. This is typically achieved by matching on L2-L4 fields to identify the original sending host and process on the receiver. On top of that, netlabel and specifically CIPSO [1] allow to map security contexts to universal labels. However, netlabel and CIPSO are relatively complex. This patch provides a lightweight alternative for overlay network environments with a trusted underlay. No additional control protocol is required. Host 1: Host 2: Group A Group B Group B Group A +-----+ +-------------+ +-------+ +-----+ | lxc | | SELinux CTX | | httpd | | VM | +--+--+ +--+----------+ +---+---+ +--+--+ \---+---/ \----+---/ | | +---+---+ +---+---+ | vxlan | | vxlan | +---+---+ +---+---+ +------------------------------+ Backwards compatibility: A VXLAN-GBP socket can receive standard VXLAN frames and will assign the default group 0x0000 to such frames. A Linux VXLAN socket will drop VXLAN-GBP frames. The extension is therefore disabled by default and needs to be specifically enabled: ip link add [...] type vxlan [...] gbp In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket must run on a separate port number. Examples: iptables: host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200 host2# iptables -I INPUT -m mark --mark 0x200 -j DROP OVS: # ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL' # ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop' [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy [1] http://lwn.net/Articles/204905/ Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 05:53:55 +03:00
if (vh->vx_flags != vh2->vx_flags ||
vh->vx_vni != vh2->vx_vni) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
}
pp = call_gro_receive(eth_gro_receive, head, skb);
flush = 0;
out:
skb_gro_remcsum_cleanup(skb, &grc);
NAPI_GRO_CB(skb)->flush |= flush;
return pp;
}
static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
/* Sets 'skb->inner_mac_header' since we are always called with
* 'skb->encapsulation' set.
*/
return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
}
/* Add new entry to forwarding table -- assumes lock held */
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
const u8 *mac, union vxlan_addr *ip,
__u16 state, __u16 flags,
__be16 port, __be32 vni, __u32 ifindex,
__u8 ndm_flags)
{
struct vxlan_rdst *rd = NULL;
struct vxlan_fdb *f;
int notify = 0;
f = __vxlan_find_mac(vxlan, mac);
if (f) {
if (flags & NLM_F_EXCL) {
netdev_dbg(vxlan->dev,
"lost race to create %pM\n", mac);
return -EEXIST;
}
if (f->state != state) {
f->state = state;
f->updated = jiffies;
notify = 1;
}
if (f->flags != ndm_flags) {
f->flags = ndm_flags;
f->updated = jiffies;
notify = 1;
}
if ((flags & NLM_F_REPLACE)) {
/* Only change unicasts */
if (!(is_multicast_ether_addr(f->eth_addr) ||
is_zero_ether_addr(f->eth_addr))) {
notify |= vxlan_fdb_replace(f, ip, port, vni,
ifindex);
} else
return -EOPNOTSUPP;
}
if ((flags & NLM_F_APPEND) &&
(is_multicast_ether_addr(f->eth_addr) ||
is_zero_ether_addr(f->eth_addr))) {
int rc = vxlan_fdb_append(f, ip, port, vni, ifindex,
&rd);
if (rc < 0)
return rc;
notify |= rc;
}
} else {
if (!(flags & NLM_F_CREATE))
return -ENOENT;
if (vxlan->cfg.addrmax &&
vxlan->addrcnt >= vxlan->cfg.addrmax)
return -ENOSPC;
/* Disallow replace to add a multicast entry */
if ((flags & NLM_F_REPLACE) &&
(is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
return -EOPNOTSUPP;
netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
f = kmalloc(sizeof(*f), GFP_ATOMIC);
if (!f)
return -ENOMEM;
notify = 1;
f->state = state;
f->flags = ndm_flags;
f->updated = f->used = jiffies;
INIT_LIST_HEAD(&f->remotes);
memcpy(f->eth_addr, mac, ETH_ALEN);
vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
++vxlan->addrcnt;
hlist_add_head_rcu(&f->hlist,
vxlan_fdb_head(vxlan, mac));
}
if (notify) {
if (rd == NULL)
rd = first_remote_rtnl(f);
vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH);
}
return 0;
}
static void vxlan_fdb_free(struct rcu_head *head)
{
struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
struct vxlan_rdst *rd, *nd;
list_for_each_entry_safe(rd, nd, &f->remotes, list) {
dst_cache_destroy(&rd->dst_cache);
kfree(rd);
}
kfree(f);
}
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
{
netdev_dbg(vxlan->dev,
"delete %pM\n", f->eth_addr);
--vxlan->addrcnt;
vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH);
hlist_del_rcu(&f->hlist);
call_rcu(&f->rcu, vxlan_fdb_free);
}
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
union vxlan_addr *ip, __be16 *port, __be32 *vni,
u32 *ifindex)
{
struct net *net = dev_net(vxlan->dev);
int err;
if (tb[NDA_DST]) {
err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
if (err)
return err;
} else {
union vxlan_addr *remote = &vxlan->default_dst.remote_ip;
if (remote->sa.sa_family == AF_INET) {
ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
} else {
ip->sin6.sin6_addr = in6addr_any;
ip->sa.sa_family = AF_INET6;
#endif
}
}
if (tb[NDA_PORT]) {
if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
return -EINVAL;
*port = nla_get_be16(tb[NDA_PORT]);
} else {
*port = vxlan->cfg.dst_port;
}
if (tb[NDA_VNI]) {
if (nla_len(tb[NDA_VNI]) != sizeof(u32))
return -EINVAL;
*vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
} else {
*vni = vxlan->default_dst.remote_vni;
}
if (tb[NDA_IFINDEX]) {
struct net_device *tdev;
if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
return -EINVAL;
*ifindex = nla_get_u32(tb[NDA_IFINDEX]);
tdev = __dev_get_by_index(net, *ifindex);
if (!tdev)
return -EADDRNOTAVAIL;
} else {
*ifindex = 0;
}
return 0;
}
/* Add static entry (via netlink) */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr, u16 vid, u16 flags)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
/* struct net *net = dev_net(vxlan->dev); */
union vxlan_addr ip;
__be16 port;
__be32 vni;
u32 ifindex;
int err;
if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
pr_info("RTM_NEWNEIGH with invalid state %#x\n",
ndm->ndm_state);
return -EINVAL;
}
if (tb[NDA_DST] == NULL)
return -EINVAL;
err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex);
if (err)
return err;
net: vxlan: fix crash when interface is created with no group If the vxlan interface is created without explicit group definition, there are corner cases which may cause kernel panic. For instance, in the following scenario: node A: $ ip link add dev vxlan42 address 2c:c2:60:00:10:20 type vxlan id 42 $ ip addr add dev vxlan42 10.0.0.1/24 $ ip link set up dev vxlan42 $ arp -i vxlan42 -s 10.0.0.2 2c:c2:60:00:01:02 $ bridge fdb add dev vxlan42 to 2c:c2:60:00:01:02 dst <IPv4 address> $ ping 10.0.0.2 node B: $ ip link add dev vxlan42 address 2c:c2:60:00:01:02 type vxlan id 42 $ ip addr add dev vxlan42 10.0.0.2/24 $ ip link set up dev vxlan42 $ arp -i vxlan42 -s 10.0.0.1 2c:c2:60:00:10:20 node B crashes: vxlan42: 2c:c2:60:00:10:20 migrated from 4011:eca4:c0a8:6466:c0a8:6415:8e09:2118 to (invalid address) vxlan42: 2c:c2:60:00:10:20 migrated from 4011:eca4:c0a8:6466:c0a8:6415:8e09:2118 to (invalid address) BUG: unable to handle kernel NULL pointer dereference at 0000000000000046 IP: [<ffffffff8143c459>] ip6_route_output+0x58/0x82 PGD 7bd89067 PUD 7bd4e067 PMD 0 Oops: 0000 [#1] SMP Modules linked in: CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.14.0-rc8-hvx-xen-00019-g97a5221-dirty #154 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff88007c774f50 ti: ffff88007c79c000 task.ti: ffff88007c79c000 RIP: 0010:[<ffffffff8143c459>] [<ffffffff8143c459>] ip6_route_output+0x58/0x82 RSP: 0018:ffff88007fd03668 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffffff8186a000 RCX: 0000000000000040 RDX: 0000000000000000 RSI: ffff88007b0e4a80 RDI: ffff88007fd03754 RBP: ffff88007fd03688 R08: ffff88007b0e4a80 R09: 0000000000000000 R10: 0200000a0100000a R11: 0001002200000000 R12: ffff88007fd03740 R13: ffff88007b0e4a80 R14: ffff88007b0e4a80 R15: ffff88007bba0c50 FS: 0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000046 CR3: 000000007bb60000 CR4: 00000000000006e0 Stack: 0000000000000000 ffff88007fd037a0 ffffffff8186a000 ffff88007fd03740 ffff88007fd036c8 ffffffff814320bb 0000000000006e49 ffff88007b8b7360 ffff88007bdbf200 ffff88007bcbc000 ffff88007b8b7000 ffff88007b8b7360 Call Trace: <IRQ> [<ffffffff814320bb>] ip6_dst_lookup_tail+0x2d/0xa4 [<ffffffff814322a5>] ip6_dst_lookup+0x10/0x12 [<ffffffff81323b4e>] vxlan_xmit_one+0x32a/0x68c [<ffffffff814a325a>] ? _raw_spin_unlock_irqrestore+0x12/0x14 [<ffffffff8104c551>] ? lock_timer_base.isra.23+0x26/0x4b [<ffffffff8132451a>] vxlan_xmit+0x66a/0x6a8 [<ffffffff8141a365>] ? ipt_do_table+0x35f/0x37e [<ffffffff81204ba2>] ? selinux_ip_postroute+0x41/0x26e [<ffffffff8139d0c1>] dev_hard_start_xmit+0x2ce/0x3ce [<ffffffff8139d491>] __dev_queue_xmit+0x2d0/0x392 [<ffffffff813b380f>] ? eth_header+0x28/0xb5 [<ffffffff8139d569>] dev_queue_xmit+0xb/0xd [<ffffffff813a5aa6>] neigh_resolve_output+0x134/0x152 [<ffffffff813db741>] ip_finish_output2+0x236/0x299 [<ffffffff813dc074>] ip_finish_output+0x98/0x9d [<ffffffff813dc749>] ip_output+0x62/0x67 [<ffffffff813da9f2>] dst_output+0xf/0x11 [<ffffffff813dc11c>] ip_local_out+0x1b/0x1f [<ffffffff813dcf1b>] ip_send_skb+0x11/0x37 [<ffffffff813dcf70>] ip_push_pending_frames+0x2f/0x33 [<ffffffff813ff732>] icmp_push_reply+0x106/0x115 [<ffffffff813ff9e4>] icmp_reply+0x142/0x164 [<ffffffff813ffb3b>] icmp_echo.part.16+0x46/0x48 [<ffffffff813c1d30>] ? nf_iterate+0x43/0x80 [<ffffffff813d8037>] ? xfrm4_policy_check.constprop.11+0x52/0x52 [<ffffffff813ffb62>] icmp_echo+0x25/0x27 [<ffffffff814005f7>] icmp_rcv+0x1d2/0x20a [<ffffffff813d8037>] ? xfrm4_policy_check.constprop.11+0x52/0x52 [<ffffffff813d810d>] ip_local_deliver_finish+0xd6/0x14f [<ffffffff813d8037>] ? xfrm4_policy_check.constprop.11+0x52/0x52 [<ffffffff813d7fde>] NF_HOOK.constprop.10+0x4c/0x53 [<ffffffff813d82bf>] ip_local_deliver+0x4a/0x4f [<ffffffff813d7f7b>] ip_rcv_finish+0x253/0x26a [<ffffffff813d7d28>] ? inet_add_protocol+0x3e/0x3e [<ffffffff813d7fde>] NF_HOOK.constprop.10+0x4c/0x53 [<ffffffff813d856a>] ip_rcv+0x2a6/0x2ec [<ffffffff8139a9a0>] __netif_receive_skb_core+0x43e/0x478 [<ffffffff812a346f>] ? virtqueue_poll+0x16/0x27 [<ffffffff8139aa2f>] __netif_receive_skb+0x55/0x5a [<ffffffff8139aaaa>] process_backlog+0x76/0x12f [<ffffffff8139add8>] net_rx_action+0xa2/0x1ab [<ffffffff81047847>] __do_softirq+0xca/0x1d1 [<ffffffff81047ace>] irq_exit+0x3e/0x85 [<ffffffff8100b98b>] do_IRQ+0xa9/0xc4 [<ffffffff814a37ad>] common_interrupt+0x6d/0x6d <EOI> [<ffffffff810378db>] ? native_safe_halt+0x6/0x8 [<ffffffff810110c7>] default_idle+0x9/0xd [<ffffffff81011694>] arch_cpu_idle+0x13/0x1c [<ffffffff8107480d>] cpu_startup_entry+0xbc/0x137 [<ffffffff8102e741>] start_secondary+0x1a0/0x1a5 Code: 24 14 e8 f1 e5 01 00 31 d2 a8 32 0f 95 c2 49 8b 44 24 2c 49 0b 44 24 24 74 05 83 ca 04 eb 1c 4d 85 ed 74 17 49 8b 85 a8 02 00 00 <66> 8b 40 46 66 c1 e8 07 83 e0 07 c1 e0 03 09 c2 4c 89 e6 48 89 RIP [<ffffffff8143c459>] ip6_route_output+0x58/0x82 RSP <ffff88007fd03668> CR2: 0000000000000046 ---[ end trace 4612329caab37efd ]--- When vxlan interface is created without explicit group definition, the default_dst protocol family is initialiazed to AF_UNSPEC and the driver assumes IPv4 configuration. On the other side, the default_dst protocol family is used to differentiate between IPv4 and IPv6 cases and, since, AF_UNSPEC != AF_INET, the processing takes the IPv6 path. Making the IPv4 assumption explicit by settting default_dst protocol family to AF_INET4 and preventing mixing of IPv4 and IPv6 addresses in snooped fdb entries fixes the corner case crashes. Signed-off-by: Mike Rapoport <mike.rapoport@ravellosystems.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-04-01 10:23:01 +04:00
if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family)
return -EAFNOSUPPORT;
spin_lock_bh(&vxlan->hash_lock);
err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags,
port, vni, ifindex, ndm->ndm_flags);
spin_unlock_bh(&vxlan->hash_lock);
return err;
}
/* Delete entry (via netlink) */
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr, u16 vid)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_fdb *f;
struct vxlan_rdst *rd = NULL;
union vxlan_addr ip;
__be16 port;
__be32 vni;
u32 ifindex;
int err;
err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex);
if (err)
return err;
err = -ENOENT;
spin_lock_bh(&vxlan->hash_lock);
f = vxlan_find_mac(vxlan, addr);
if (!f)
goto out;
if (!vxlan_addr_any(&ip)) {
rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
if (!rd)
goto out;
}
err = 0;
/* remove a destination if it's not the only one on the list,
* otherwise destroy the fdb entry
*/
if (rd && !list_is_singular(&f->remotes)) {
list_del_rcu(&rd->list);
vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH);
kfree_rcu(rd, rcu);
goto out;
}
vxlan_fdb_destroy(vxlan, f);
out:
spin_unlock_bh(&vxlan->hash_lock);
return err;
}
/* Dump forwarding table */
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev,
rtnetlink: fdb dump: optimize by saving last interface markers fdb dumps spanning multiple skb's currently restart from the first interface again for every skb. This results in unnecessary iterations on the already visited interfaces and their fdb entries. In large scale setups, we have seen this to slow down fdb dumps considerably. On a system with 30k macs we see fdb dumps spanning across more than 300 skbs. To fix the problem, this patch replaces the existing single fdb marker with three markers: netdev hash entries, netdevs and fdb index to continue where we left off instead of restarting from the first netdev. This is consistent with link dumps. In the process of fixing the performance issue, this patch also re-implements fix done by commit 472681d57a5d ("net: ndo_fdb_dump should report -EMSGSIZE to rtnl_fdb_dump") (with an internal fix from Wilson Kok) in the following ways: - change ndo_fdb_dump handlers to return error code instead of the last fdb index - use cb->args strictly for dump frag markers and not error codes. This is consistent with other dump functions. Below results were taken on a system with 1000 netdevs and 35085 fdb entries: before patch: $time bridge fdb show | wc -l 15065 real 1m11.791s user 0m0.070s sys 1m8.395s (existing code does not return all macs) after patch: $time bridge fdb show | wc -l 35085 real 0m2.017s user 0m0.113s sys 0m1.942s Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: Wilson Kok <wkok@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-31 07:56:45 +03:00
struct net_device *filter_dev, int *idx)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
unsigned int h;
rtnetlink: fdb dump: optimize by saving last interface markers fdb dumps spanning multiple skb's currently restart from the first interface again for every skb. This results in unnecessary iterations on the already visited interfaces and their fdb entries. In large scale setups, we have seen this to slow down fdb dumps considerably. On a system with 30k macs we see fdb dumps spanning across more than 300 skbs. To fix the problem, this patch replaces the existing single fdb marker with three markers: netdev hash entries, netdevs and fdb index to continue where we left off instead of restarting from the first netdev. This is consistent with link dumps. In the process of fixing the performance issue, this patch also re-implements fix done by commit 472681d57a5d ("net: ndo_fdb_dump should report -EMSGSIZE to rtnl_fdb_dump") (with an internal fix from Wilson Kok) in the following ways: - change ndo_fdb_dump handlers to return error code instead of the last fdb index - use cb->args strictly for dump frag markers and not error codes. This is consistent with other dump functions. Below results were taken on a system with 1000 netdevs and 35085 fdb entries: before patch: $time bridge fdb show | wc -l 15065 real 1m11.791s user 0m0.070s sys 1m8.395s (existing code does not return all macs) after patch: $time bridge fdb show | wc -l 35085 real 0m2.017s user 0m0.113s sys 0m1.942s Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: Wilson Kok <wkok@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-31 07:56:45 +03:00
int err = 0;
for (h = 0; h < FDB_HASH_SIZE; ++h) {
struct vxlan_fdb *f;
hlist: drop the node parameter from iterators I'm not sure why, but the hlist for each entry iterators were conceived list_for_each_entry(pos, head, member) The hlist ones were greedy and wanted an extra parameter: hlist_for_each_entry(tpos, pos, head, member) Why did they need an extra pos parameter? I'm not quite sure. Not only they don't really need it, it also prevents the iterator from looking exactly like the list iterator, which is unfortunate. Besides the semantic patch, there was some manual work required: - Fix up the actual hlist iterators in linux/list.h - Fix up the declaration of other iterators based on the hlist ones. - A very small amount of places were using the 'node' parameter, this was modified to use 'obj->member' instead. - Coccinelle didn't handle the hlist_for_each_entry_safe iterator properly, so those had to be fixed up manually. The semantic patch which is mostly the work of Peter Senna Tschudin is here: @@ iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host; type T; expression a,c,d,e; identifier b; statement S; @@ -T b; <+... when != b ( hlist_for_each_entry(a, - b, c, d) S | hlist_for_each_entry_continue(a, - b, c) S | hlist_for_each_entry_from(a, - b, c) S | hlist_for_each_entry_rcu(a, - b, c, d) S | hlist_for_each_entry_rcu_bh(a, - b, c, d) S | hlist_for_each_entry_continue_rcu_bh(a, - b, c) S | for_each_busy_worker(a, c, - b, d) S | ax25_uid_for_each(a, - b, c) S | ax25_for_each(a, - b, c) S | inet_bind_bucket_for_each(a, - b, c) S | sctp_for_each_hentry(a, - b, c) S | sk_for_each(a, - b, c) S | sk_for_each_rcu(a, - b, c) S | sk_for_each_from -(a, b) +(a) S + sk_for_each_from(a) S | sk_for_each_safe(a, - b, c, d) S | sk_for_each_bound(a, - b, c) S | hlist_for_each_entry_safe(a, - b, c, d, e) S | hlist_for_each_entry_continue_rcu(a, - b, c) S | nr_neigh_for_each(a, - b, c) S | nr_neigh_for_each_safe(a, - b, c, d) S | nr_node_for_each(a, - b, c) S | nr_node_for_each_safe(a, - b, c, d) S | - for_each_gfn_sp(a, c, d, b) S + for_each_gfn_sp(a, c, d) S | - for_each_gfn_indirect_valid_sp(a, c, d, b) S + for_each_gfn_indirect_valid_sp(a, c, d) S | for_each_host(a, - b, c) S | for_each_host_safe(a, - b, c, d) S | for_each_mesh_entry(a, - b, c, d) S ) ...+> [akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c] [akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c] [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix warnings] [akpm@linux-foudnation.org: redo intrusive kvm changes] Tested-by: Peter Senna Tschudin <peter.senna@gmail.com> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Gleb Natapov <gleb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 05:06:00 +04:00
hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
struct vxlan_rdst *rd;
list_for_each_entry_rcu(rd, &f->remotes, list) {
rtnetlink: fdb dump: optimize by saving last interface markers fdb dumps spanning multiple skb's currently restart from the first interface again for every skb. This results in unnecessary iterations on the already visited interfaces and their fdb entries. In large scale setups, we have seen this to slow down fdb dumps considerably. On a system with 30k macs we see fdb dumps spanning across more than 300 skbs. To fix the problem, this patch replaces the existing single fdb marker with three markers: netdev hash entries, netdevs and fdb index to continue where we left off instead of restarting from the first netdev. This is consistent with link dumps. In the process of fixing the performance issue, this patch also re-implements fix done by commit 472681d57a5d ("net: ndo_fdb_dump should report -EMSGSIZE to rtnl_fdb_dump") (with an internal fix from Wilson Kok) in the following ways: - change ndo_fdb_dump handlers to return error code instead of the last fdb index - use cb->args strictly for dump frag markers and not error codes. This is consistent with other dump functions. Below results were taken on a system with 1000 netdevs and 35085 fdb entries: before patch: $time bridge fdb show | wc -l 15065 real 1m11.791s user 0m0.070s sys 1m8.395s (existing code does not return all macs) after patch: $time bridge fdb show | wc -l 35085 real 0m2.017s user 0m0.113s sys 0m1.942s Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: Wilson Kok <wkok@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-31 07:56:45 +03:00
if (*idx < cb->args[2])
goto skip;
err = vxlan_fdb_info(skb, vxlan, f,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH,
NLM_F_MULTI, rd);
rtnetlink: fdb dump: optimize by saving last interface markers fdb dumps spanning multiple skb's currently restart from the first interface again for every skb. This results in unnecessary iterations on the already visited interfaces and their fdb entries. In large scale setups, we have seen this to slow down fdb dumps considerably. On a system with 30k macs we see fdb dumps spanning across more than 300 skbs. To fix the problem, this patch replaces the existing single fdb marker with three markers: netdev hash entries, netdevs and fdb index to continue where we left off instead of restarting from the first netdev. This is consistent with link dumps. In the process of fixing the performance issue, this patch also re-implements fix done by commit 472681d57a5d ("net: ndo_fdb_dump should report -EMSGSIZE to rtnl_fdb_dump") (with an internal fix from Wilson Kok) in the following ways: - change ndo_fdb_dump handlers to return error code instead of the last fdb index - use cb->args strictly for dump frag markers and not error codes. This is consistent with other dump functions. Below results were taken on a system with 1000 netdevs and 35085 fdb entries: before patch: $time bridge fdb show | wc -l 15065 real 1m11.791s user 0m0.070s sys 1m8.395s (existing code does not return all macs) after patch: $time bridge fdb show | wc -l 35085 real 0m2.017s user 0m0.113s sys 0m1.942s Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: Wilson Kok <wkok@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-31 07:56:45 +03:00
if (err < 0)
goto out;
skip:
rtnetlink: fdb dump: optimize by saving last interface markers fdb dumps spanning multiple skb's currently restart from the first interface again for every skb. This results in unnecessary iterations on the already visited interfaces and their fdb entries. In large scale setups, we have seen this to slow down fdb dumps considerably. On a system with 30k macs we see fdb dumps spanning across more than 300 skbs. To fix the problem, this patch replaces the existing single fdb marker with three markers: netdev hash entries, netdevs and fdb index to continue where we left off instead of restarting from the first netdev. This is consistent with link dumps. In the process of fixing the performance issue, this patch also re-implements fix done by commit 472681d57a5d ("net: ndo_fdb_dump should report -EMSGSIZE to rtnl_fdb_dump") (with an internal fix from Wilson Kok) in the following ways: - change ndo_fdb_dump handlers to return error code instead of the last fdb index - use cb->args strictly for dump frag markers and not error codes. This is consistent with other dump functions. Below results were taken on a system with 1000 netdevs and 35085 fdb entries: before patch: $time bridge fdb show | wc -l 15065 real 1m11.791s user 0m0.070s sys 1m8.395s (existing code does not return all macs) after patch: $time bridge fdb show | wc -l 35085 real 0m2.017s user 0m0.113s sys 0m1.942s Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: Wilson Kok <wkok@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-31 07:56:45 +03:00
*idx += 1;
}
}
}
out:
rtnetlink: fdb dump: optimize by saving last interface markers fdb dumps spanning multiple skb's currently restart from the first interface again for every skb. This results in unnecessary iterations on the already visited interfaces and their fdb entries. In large scale setups, we have seen this to slow down fdb dumps considerably. On a system with 30k macs we see fdb dumps spanning across more than 300 skbs. To fix the problem, this patch replaces the existing single fdb marker with three markers: netdev hash entries, netdevs and fdb index to continue where we left off instead of restarting from the first netdev. This is consistent with link dumps. In the process of fixing the performance issue, this patch also re-implements fix done by commit 472681d57a5d ("net: ndo_fdb_dump should report -EMSGSIZE to rtnl_fdb_dump") (with an internal fix from Wilson Kok) in the following ways: - change ndo_fdb_dump handlers to return error code instead of the last fdb index - use cb->args strictly for dump frag markers and not error codes. This is consistent with other dump functions. Below results were taken on a system with 1000 netdevs and 35085 fdb entries: before patch: $time bridge fdb show | wc -l 15065 real 1m11.791s user 0m0.070s sys 1m8.395s (existing code does not return all macs) after patch: $time bridge fdb show | wc -l 35085 real 0m2.017s user 0m0.113s sys 0m1.942s Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: Wilson Kok <wkok@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-31 07:56:45 +03:00
return err;
}
/* Watch incoming packets to learn mapping between Ethernet address
* and Tunnel endpoint.
* Return true if packet is bogus and should be dropped.
*/
static bool vxlan_snoop(struct net_device *dev,
union vxlan_addr *src_ip, const u8 *src_mac)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_fdb *f;
f = vxlan_find_mac(vxlan, src_mac);
if (likely(f)) {
struct vxlan_rdst *rdst = first_remote_rcu(f);
if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip)))
return false;
/* Don't migrate static entries, drop packets */
if (f->state & NUD_NOARP)
return true;
if (net_ratelimit())
netdev_info(dev,
"%pM migrated from %pIS to %pIS\n",
src_mac, &rdst->remote_ip.sa, &src_ip->sa);
rdst->remote_ip = *src_ip;
f->updated = jiffies;
vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH);
} else {
/* learned new entry */
spin_lock(&vxlan->hash_lock);
/* close off race between vxlan_flush and incoming packets */
if (netif_running(dev))
vxlan_fdb_create(vxlan, src_mac, src_ip,
NUD_REACHABLE,
NLM_F_EXCL|NLM_F_CREATE,
vxlan->cfg.dst_port,
vxlan->default_dst.remote_vni,
0, NTF_SELF);
spin_unlock(&vxlan->hash_lock);
}
return false;
}
/* See if multicast group is already in use by other ID */
static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
{
struct vxlan_dev *vxlan;
struct vxlan_sock *sock4;
#if IS_ENABLED(CONFIG_IPV6)
struct vxlan_sock *sock6;
#endif
unsigned short family = dev->default_dst.remote_ip.sa.sa_family;
sock4 = rtnl_dereference(dev->vn4_sock);
/* The vxlan_sock is only used by dev, leaving group has
* no effect on other vxlan devices.
*/
if (family == AF_INET && sock4 && atomic_read(&sock4->refcnt) == 1)
return false;
#if IS_ENABLED(CONFIG_IPV6)
sock6 = rtnl_dereference(dev->vn6_sock);
if (family == AF_INET6 && sock6 && atomic_read(&sock6->refcnt) == 1)
return false;
#endif
list_for_each_entry(vxlan, &vn->vxlan_list, next) {
if (!netif_running(vxlan->dev) || vxlan == dev)
continue;
if (family == AF_INET &&
rtnl_dereference(vxlan->vn4_sock) != sock4)
continue;
#if IS_ENABLED(CONFIG_IPV6)
if (family == AF_INET6 &&
rtnl_dereference(vxlan->vn6_sock) != sock6)
continue;
#endif
if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
&dev->default_dst.remote_ip))
continue;
if (vxlan->default_dst.remote_ifindex !=
dev->default_dst.remote_ifindex)
continue;
return true;
}
return false;
}
static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
{
struct vxlan_net *vn;
if (!vs)
return false;
if (!atomic_dec_and_test(&vs->refcnt))
return false;
vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
spin_lock(&vn->sock_lock);
hlist_del_rcu(&vs->hlist);
udp_tunnel_notify_del_rx_port(vs->sock,
(vs->flags & VXLAN_F_GPE) ?
UDP_TUNNEL_TYPE_VXLAN_GPE :
UDP_TUNNEL_TYPE_VXLAN);
spin_unlock(&vn->sock_lock);
return true;
}
static void vxlan_sock_release(struct vxlan_dev *vxlan)
{
struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
#if IS_ENABLED(CONFIG_IPV6)
struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
rcu_assign_pointer(vxlan->vn6_sock, NULL);
#endif
rcu_assign_pointer(vxlan->vn4_sock, NULL);
synchronize_net();
if (__vxlan_sock_release_prep(sock4)) {
udp_tunnel_sock_release(sock4->sock);
kfree(sock4);
}
#if IS_ENABLED(CONFIG_IPV6)
if (__vxlan_sock_release_prep(sock6)) {
udp_tunnel_sock_release(sock6->sock);
kfree(sock6);
}
#endif
}
/* Update multicast group membership when first VNI on
* multicast address is brought up
*/
static int vxlan_igmp_join(struct vxlan_dev *vxlan)
{
struct sock *sk;
union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
int ifindex = vxlan->default_dst.remote_ifindex;
int ret = -EINVAL;
if (ip->sa.sa_family == AF_INET) {
struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
struct ip_mreqn mreq = {
.imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr,
.imr_ifindex = ifindex,
};
sk = sock4->sock->sk;
lock_sock(sk);
ret = ip_mc_join_group(sk, &mreq);
release_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
sk = sock6->sock->sk;
lock_sock(sk);
ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
&ip->sin6.sin6_addr);
release_sock(sk);
#endif
}
return ret;
}
/* Inverse of vxlan_igmp_join when last VNI is brought down */
static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
{
struct sock *sk;
union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
int ifindex = vxlan->default_dst.remote_ifindex;
int ret = -EINVAL;
if (ip->sa.sa_family == AF_INET) {
struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
struct ip_mreqn mreq = {
.imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr,
.imr_ifindex = ifindex,
};
sk = sock4->sock->sk;
lock_sock(sk);
ret = ip_mc_leave_group(sk, &mreq);
release_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
sk = sock6->sock->sk;
lock_sock(sk);
ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
&ip->sin6.sin6_addr);
release_sock(sk);
#endif
}
return ret;
}
static bool vxlan_remcsum(struct vxlanhdr *unparsed,
struct sk_buff *skb, u32 vxflags)
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
{
size_t start, offset;
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload)
goto out;
start = vxlan_rco_start(unparsed->vx_vni);
offset = start + vxlan_rco_offset(unparsed->vx_vni);
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
if (!pskb_may_pull(skb, offset + sizeof(u16)))
return false;
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset,
!!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
out:
unparsed->vx_flags &= ~VXLAN_HF_RCO;
unparsed->vx_vni &= VXLAN_VNI_MASK;
return true;
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
}
static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
struct sk_buff *skb, u32 vxflags,
struct vxlan_metadata *md)
{
struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;
struct metadata_dst *tun_dst;
if (!(unparsed->vx_flags & VXLAN_HF_GBP))
goto out;
md->gbp = ntohs(gbp->policy_id);
tun_dst = (struct metadata_dst *)skb_dst(skb);
if (tun_dst) {
tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
tun_dst->u.tun_info.options_len = sizeof(*md);
}
if (gbp->dont_learn)
md->gbp |= VXLAN_GBP_DONT_LEARN;
if (gbp->policy_applied)
md->gbp |= VXLAN_GBP_POLICY_APPLIED;
/* In flow-based mode, GBP is carried in dst_metadata */
if (!(vxflags & VXLAN_F_COLLECT_METADATA))
skb->mark = md->gbp;
out:
unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
}
static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
__be16 *protocol,
struct sk_buff *skb, u32 vxflags)
{
struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;
/* Need to have Next Protocol set for interfaces in GPE mode. */
if (!gpe->np_applied)
return false;
/* "The initial version is 0. If a receiver does not support the
* version indicated it MUST drop the packet.
*/
if (gpe->version != 0)
return false;
/* "When the O bit is set to 1, the packet is an OAM packet and OAM
* processing MUST occur." However, we don't implement OAM
* processing, thus drop the packet.
*/
if (gpe->oam_flag)
return false;
switch (gpe->next_protocol) {
case VXLAN_GPE_NP_IPV4:
*protocol = htons(ETH_P_IP);
break;
case VXLAN_GPE_NP_IPV6:
*protocol = htons(ETH_P_IPV6);
break;
case VXLAN_GPE_NP_ETHERNET:
*protocol = htons(ETH_P_TEB);
break;
default:
return false;
}
unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
return true;
}
static bool vxlan_set_mac(struct vxlan_dev *vxlan,
struct vxlan_sock *vs,
struct sk_buff *skb)
{
union vxlan_addr saddr;
skb_reset_mac_header(skb);
skb->protocol = eth_type_trans(skb, vxlan->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
/* Ignore packet loops (and multicast echo) */
if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
return false;
/* Get address from the outer IP header */
if (vxlan_get_sk_family(vs) == AF_INET) {
saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
saddr.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
} else {
saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
saddr.sa.sa_family = AF_INET6;
#endif
}
if ((vxlan->flags & VXLAN_F_LEARN) &&
vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
return false;
return true;
}
static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
struct sk_buff *skb)
{
int err = 0;
if (vxlan_get_sk_family(vs) == AF_INET)
err = IP_ECN_decapsulate(oiph, skb);
#if IS_ENABLED(CONFIG_IPV6)
else
err = IP6_ECN_decapsulate(oiph, skb);
#endif
if (unlikely(err) && log_ecn_error) {
if (vxlan_get_sk_family(vs) == AF_INET)
net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
&((struct iphdr *)oiph)->saddr,
((struct iphdr *)oiph)->tos);
else
net_info_ratelimited("non-ECT from %pI6\n",
&((struct ipv6hdr *)oiph)->saddr);
}
return err <= 1;
}
/* Callback from net/ipv4/udp.c to receive packets */
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
{
struct pcpu_sw_netstats *stats;
struct vxlan_dev *vxlan;
struct vxlan_sock *vs;
struct vxlanhdr unparsed;
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
struct vxlan_metadata _md;
struct vxlan_metadata *md = &_md;
__be16 protocol = htons(ETH_P_TEB);
bool raw_proto = false;
void *oiph;
/* Need UDP and VXLAN header to be present */
if (!pskb_may_pull(skb, VXLAN_HLEN))
goto drop;
unparsed = *vxlan_hdr(skb);
/* VNI flag always required to be set */
if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
ntohl(vxlan_hdr(skb)->vx_flags),
ntohl(vxlan_hdr(skb)->vx_vni));
/* Return non vxlan pkt */
goto drop;
}
unparsed.vx_flags &= ~VXLAN_HF_VNI;
unparsed.vx_vni &= ~VXLAN_VNI_MASK;
vs = rcu_dereference_sk_user_data(sk);
if (!vs)
goto drop;
vxlan = vxlan_vs_find_vni(vs, vxlan_vni(vxlan_hdr(skb)->vx_vni));
if (!vxlan)
goto drop;
/* For backwards compatibility, only allow reserved fields to be
* used by VXLAN extensions if explicitly requested.
*/
if (vs->flags & VXLAN_F_GPE) {
if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
goto drop;
raw_proto = true;
}
if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
!net_eq(vxlan->net, dev_net(vxlan->dev))))
goto drop;
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
if (vxlan_collect_metadata(vs)) {
__be32 vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);
struct metadata_dst *tun_dst;
tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
key32_to_tunnel_id(vni), sizeof(*md));
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
if (!tun_dst)
goto drop;
md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
skb_dst_set(skb, (struct dst_entry *)tun_dst);
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
} else {
memset(md, 0, sizeof(*md));
}
if (vs->flags & VXLAN_F_REMCSUM_RX)
if (!vxlan_remcsum(&unparsed, skb, vs->flags))
goto drop;
if (vs->flags & VXLAN_F_GBP)
vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
/* Note that GBP and GPE can never be active together. This is
* ensured in vxlan_dev_configure.
*/
vxlan: Group Policy extension Implements supports for the Group Policy VXLAN extension [0] to provide a lightweight and simple security label mechanism across network peers based on VXLAN. The security context and associated metadata is mapped to/from skb->mark. This allows further mapping to a SELinux context using SECMARK, to implement ACLs directly with nftables, iptables, OVS, tc, etc. The group membership is defined by the lower 16 bits of skb->mark, the upper 16 bits are used for flags. SELinux allows to manage label to secure local resources. However, distributed applications require ACLs to implemented across hosts. This is typically achieved by matching on L2-L4 fields to identify the original sending host and process on the receiver. On top of that, netlabel and specifically CIPSO [1] allow to map security contexts to universal labels. However, netlabel and CIPSO are relatively complex. This patch provides a lightweight alternative for overlay network environments with a trusted underlay. No additional control protocol is required. Host 1: Host 2: Group A Group B Group B Group A +-----+ +-------------+ +-------+ +-----+ | lxc | | SELinux CTX | | httpd | | VM | +--+--+ +--+----------+ +---+---+ +--+--+ \---+---/ \----+---/ | | +---+---+ +---+---+ | vxlan | | vxlan | +---+---+ +---+---+ +------------------------------+ Backwards compatibility: A VXLAN-GBP socket can receive standard VXLAN frames and will assign the default group 0x0000 to such frames. A Linux VXLAN socket will drop VXLAN-GBP frames. The extension is therefore disabled by default and needs to be specifically enabled: ip link add [...] type vxlan [...] gbp In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket must run on a separate port number. Examples: iptables: host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200 host2# iptables -I INPUT -m mark --mark 0x200 -j DROP OVS: # ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL' # ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop' [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy [1] http://lwn.net/Articles/204905/ Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 05:53:55 +03:00
if (unparsed.vx_flags || unparsed.vx_vni) {
/* If there are any unprocessed flags remaining treat
* this as a malformed packet. This behavior diverges from
* VXLAN RFC (RFC7348) which stipulates that bits in reserved
* in reserved fields are to be ignored. The approach here
* maintains compatibility with previous stack code, and also
* is more robust and provides a little more security in
* adding extensions to VXLAN.
*/
goto drop;
}
if (!raw_proto) {
if (!vxlan_set_mac(vxlan, vs, skb))
goto drop;
} else {
skb_reset_mac_header(skb);
skb->dev = vxlan->dev;
skb->pkt_type = PACKET_HOST;
}
oiph = skb_network_header(skb);
skb_reset_network_header(skb);
if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
++vxlan->dev->stats.rx_frame_errors;
++vxlan->dev->stats.rx_errors;
goto drop;
}
stats = this_cpu_ptr(vxlan->dev->tstats);
u64_stats_update_begin(&stats->syncp);
stats->rx_packets++;
stats->rx_bytes += skb->len;
u64_stats_update_end(&stats->syncp);
gro_cells_receive(&vxlan->gro_cells, skb);
return 0;
drop:
/* Consume bad packet */
kfree_skb(skb);
return 0;
}
static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct arphdr *parp;
u8 *arpptr, *sha;
__be32 sip, tip;
struct neighbour *n;
if (dev->flags & IFF_NOARP)
goto out;
if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
dev->stats.tx_dropped++;
goto out;
}
parp = arp_hdr(skb);
if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
parp->ar_pro != htons(ETH_P_IP) ||
parp->ar_op != htons(ARPOP_REQUEST) ||
parp->ar_hln != dev->addr_len ||
parp->ar_pln != 4)
goto out;
arpptr = (u8 *)parp + sizeof(struct arphdr);
sha = arpptr;
arpptr += dev->addr_len; /* sha */
memcpy(&sip, arpptr, sizeof(sip));
arpptr += sizeof(sip);
arpptr += dev->addr_len; /* tha */
memcpy(&tip, arpptr, sizeof(tip));
if (ipv4_is_loopback(tip) ||
ipv4_is_multicast(tip))
goto out;
n = neigh_lookup(&arp_tbl, &tip, dev);
if (n) {
struct vxlan_fdb *f;
struct sk_buff *reply;
if (!(n->nud_state & NUD_CONNECTED)) {
neigh_release(n);
goto out;
}
f = vxlan_find_mac(vxlan, n->ha);
if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
/* bridge-local neighbor */
neigh_release(n);
goto out;
}
reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
n->ha, sha);
neigh_release(n);
if (reply == NULL)
goto out;
skb_reset_mac_header(reply);
__skb_pull(reply, skb_network_offset(reply));
reply->ip_summed = CHECKSUM_UNNECESSARY;
reply->pkt_type = PACKET_HOST;
if (netif_rx_ni(reply) == NET_RX_DROP)
dev->stats.rx_dropped++;
} else if (vxlan->flags & VXLAN_F_L3MISS) {
union vxlan_addr ipa = {
.sin.sin_addr.s_addr = tip,
.sin.sin_family = AF_INET,
};
vxlan_ip_miss(dev, &ipa);
}
out:
consume_skb(skb);
return NETDEV_TX_OK;
}
#if IS_ENABLED(CONFIG_IPV6)
vxlan: fix nonfunctional neigh_reduce() The VXLAN neigh_reduce() code is completely non-functional since check-in. Specific errors: 1) The original code drops all packets with a multicast destination address, even though neighbor solicitations are sent to the solicited-node address, a multicast address. The code after this check was never run. 2) The neighbor table lookup used the IPv6 header destination, which is the solicited node address, rather than the target address from the neighbor solicitation. So neighbor lookups would always fail if it got this far. Also for L3MISSes. 3) The code calls ndisc_send_na(), which does a send on the tunnel device. The context for neigh_reduce() is the transmit path, vxlan_xmit(), where the host or a bridge-attached neighbor is trying to transmit a neighbor solicitation. To respond to it, the tunnel endpoint needs to do a *receive* of the appropriate neighbor advertisement. Doing a send, would only try to send the advertisement, encapsulated, to the remote destinations in the fdb -- hosts that definitely did not do the corresponding solicitation. 4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the isrouter flag in the advertisement. This has nothing to do with whether or not the target is a router, and generally won't be set since the tunnel endpoint is bridging, not routing, traffic. The patch below creates a proxy neighbor advertisement to respond to neighbor solicitions as intended, providing proper IPv6 support for neighbor reduction. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 18:39:58 +04:00
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
struct neighbour *n, bool isrouter)
{
struct net_device *dev = request->dev;
struct sk_buff *reply;
struct nd_msg *ns, *na;
struct ipv6hdr *pip6;
u8 *daddr;
int na_olen = 8; /* opt hdr + ETH_ALEN for target */
int ns_olen;
int i, len;
if (dev == NULL)
return NULL;
len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
sizeof(*na) + na_olen + dev->needed_tailroom;
reply = alloc_skb(len, GFP_ATOMIC);
if (reply == NULL)
return NULL;
reply->protocol = htons(ETH_P_IPV6);
reply->dev = dev;
skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
skb_push(reply, sizeof(struct ethhdr));
skb_reset_mac_header(reply);
vxlan: fix nonfunctional neigh_reduce() The VXLAN neigh_reduce() code is completely non-functional since check-in. Specific errors: 1) The original code drops all packets with a multicast destination address, even though neighbor solicitations are sent to the solicited-node address, a multicast address. The code after this check was never run. 2) The neighbor table lookup used the IPv6 header destination, which is the solicited node address, rather than the target address from the neighbor solicitation. So neighbor lookups would always fail if it got this far. Also for L3MISSes. 3) The code calls ndisc_send_na(), which does a send on the tunnel device. The context for neigh_reduce() is the transmit path, vxlan_xmit(), where the host or a bridge-attached neighbor is trying to transmit a neighbor solicitation. To respond to it, the tunnel endpoint needs to do a *receive* of the appropriate neighbor advertisement. Doing a send, would only try to send the advertisement, encapsulated, to the remote destinations in the fdb -- hosts that definitely did not do the corresponding solicitation. 4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the isrouter flag in the advertisement. This has nothing to do with whether or not the target is a router, and generally won't be set since the tunnel endpoint is bridging, not routing, traffic. The patch below creates a proxy neighbor advertisement to respond to neighbor solicitions as intended, providing proper IPv6 support for neighbor reduction. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 18:39:58 +04:00
ns = (struct nd_msg *)skb_transport_header(request);
daddr = eth_hdr(request)->h_source;
ns_olen = request->len - skb_transport_offset(request) - sizeof(*ns);
for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
break;
}
}
/* Ethernet header */
ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
reply->protocol = htons(ETH_P_IPV6);
skb_pull(reply, sizeof(struct ethhdr));
skb_reset_network_header(reply);
vxlan: fix nonfunctional neigh_reduce() The VXLAN neigh_reduce() code is completely non-functional since check-in. Specific errors: 1) The original code drops all packets with a multicast destination address, even though neighbor solicitations are sent to the solicited-node address, a multicast address. The code after this check was never run. 2) The neighbor table lookup used the IPv6 header destination, which is the solicited node address, rather than the target address from the neighbor solicitation. So neighbor lookups would always fail if it got this far. Also for L3MISSes. 3) The code calls ndisc_send_na(), which does a send on the tunnel device. The context for neigh_reduce() is the transmit path, vxlan_xmit(), where the host or a bridge-attached neighbor is trying to transmit a neighbor solicitation. To respond to it, the tunnel endpoint needs to do a *receive* of the appropriate neighbor advertisement. Doing a send, would only try to send the advertisement, encapsulated, to the remote destinations in the fdb -- hosts that definitely did not do the corresponding solicitation. 4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the isrouter flag in the advertisement. This has nothing to do with whether or not the target is a router, and generally won't be set since the tunnel endpoint is bridging, not routing, traffic. The patch below creates a proxy neighbor advertisement to respond to neighbor solicitions as intended, providing proper IPv6 support for neighbor reduction. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 18:39:58 +04:00
skb_put(reply, sizeof(struct ipv6hdr));
/* IPv6 header */
pip6 = ipv6_hdr(reply);
memset(pip6, 0, sizeof(struct ipv6hdr));
pip6->version = 6;
pip6->priority = ipv6_hdr(request)->priority;
pip6->nexthdr = IPPROTO_ICMPV6;
pip6->hop_limit = 255;
pip6->daddr = ipv6_hdr(request)->saddr;
pip6->saddr = *(struct in6_addr *)n->primary_key;
skb_pull(reply, sizeof(struct ipv6hdr));
skb_reset_transport_header(reply);
vxlan: fix nonfunctional neigh_reduce() The VXLAN neigh_reduce() code is completely non-functional since check-in. Specific errors: 1) The original code drops all packets with a multicast destination address, even though neighbor solicitations are sent to the solicited-node address, a multicast address. The code after this check was never run. 2) The neighbor table lookup used the IPv6 header destination, which is the solicited node address, rather than the target address from the neighbor solicitation. So neighbor lookups would always fail if it got this far. Also for L3MISSes. 3) The code calls ndisc_send_na(), which does a send on the tunnel device. The context for neigh_reduce() is the transmit path, vxlan_xmit(), where the host or a bridge-attached neighbor is trying to transmit a neighbor solicitation. To respond to it, the tunnel endpoint needs to do a *receive* of the appropriate neighbor advertisement. Doing a send, would only try to send the advertisement, encapsulated, to the remote destinations in the fdb -- hosts that definitely did not do the corresponding solicitation. 4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the isrouter flag in the advertisement. This has nothing to do with whether or not the target is a router, and generally won't be set since the tunnel endpoint is bridging, not routing, traffic. The patch below creates a proxy neighbor advertisement to respond to neighbor solicitions as intended, providing proper IPv6 support for neighbor reduction. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 18:39:58 +04:00
na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen);
/* Neighbor Advertisement */
memset(na, 0, sizeof(*na)+na_olen);
na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
na->icmph.icmp6_router = isrouter;
na->icmph.icmp6_override = 1;
na->icmph.icmp6_solicited = 1;
na->target = ns->target;
ether_addr_copy(&na->opt[2], n->ha);
na->opt[0] = ND_OPT_TARGET_LL_ADDR;
na->opt[1] = na_olen >> 3;
na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
&pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
csum_partial(na, sizeof(*na)+na_olen, 0));
pip6->payload_len = htons(sizeof(*na)+na_olen);
skb_push(reply, sizeof(struct ipv6hdr));
reply->ip_summed = CHECKSUM_UNNECESSARY;
return reply;
}
static int neigh_reduce(struct net_device *dev, struct sk_buff *skb)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
vxlan: fix nonfunctional neigh_reduce() The VXLAN neigh_reduce() code is completely non-functional since check-in. Specific errors: 1) The original code drops all packets with a multicast destination address, even though neighbor solicitations are sent to the solicited-node address, a multicast address. The code after this check was never run. 2) The neighbor table lookup used the IPv6 header destination, which is the solicited node address, rather than the target address from the neighbor solicitation. So neighbor lookups would always fail if it got this far. Also for L3MISSes. 3) The code calls ndisc_send_na(), which does a send on the tunnel device. The context for neigh_reduce() is the transmit path, vxlan_xmit(), where the host or a bridge-attached neighbor is trying to transmit a neighbor solicitation. To respond to it, the tunnel endpoint needs to do a *receive* of the appropriate neighbor advertisement. Doing a send, would only try to send the advertisement, encapsulated, to the remote destinations in the fdb -- hosts that definitely did not do the corresponding solicitation. 4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the isrouter flag in the advertisement. This has nothing to do with whether or not the target is a router, and generally won't be set since the tunnel endpoint is bridging, not routing, traffic. The patch below creates a proxy neighbor advertisement to respond to neighbor solicitions as intended, providing proper IPv6 support for neighbor reduction. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 18:39:58 +04:00
struct nd_msg *msg;
const struct ipv6hdr *iphdr;
const struct in6_addr *saddr, *daddr;
vxlan: fix nonfunctional neigh_reduce() The VXLAN neigh_reduce() code is completely non-functional since check-in. Specific errors: 1) The original code drops all packets with a multicast destination address, even though neighbor solicitations are sent to the solicited-node address, a multicast address. The code after this check was never run. 2) The neighbor table lookup used the IPv6 header destination, which is the solicited node address, rather than the target address from the neighbor solicitation. So neighbor lookups would always fail if it got this far. Also for L3MISSes. 3) The code calls ndisc_send_na(), which does a send on the tunnel device. The context for neigh_reduce() is the transmit path, vxlan_xmit(), where the host or a bridge-attached neighbor is trying to transmit a neighbor solicitation. To respond to it, the tunnel endpoint needs to do a *receive* of the appropriate neighbor advertisement. Doing a send, would only try to send the advertisement, encapsulated, to the remote destinations in the fdb -- hosts that definitely did not do the corresponding solicitation. 4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the isrouter flag in the advertisement. This has nothing to do with whether or not the target is a router, and generally won't be set since the tunnel endpoint is bridging, not routing, traffic. The patch below creates a proxy neighbor advertisement to respond to neighbor solicitions as intended, providing proper IPv6 support for neighbor reduction. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 18:39:58 +04:00
struct neighbour *n;
struct inet6_dev *in6_dev;
in6_dev = __in6_dev_get(dev);
if (!in6_dev)
goto out;
iphdr = ipv6_hdr(skb);
saddr = &iphdr->saddr;
daddr = &iphdr->daddr;
msg = (struct nd_msg *)skb_transport_header(skb);
if (msg->icmph.icmp6_code != 0 ||
msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
goto out;
vxlan: fix nonfunctional neigh_reduce() The VXLAN neigh_reduce() code is completely non-functional since check-in. Specific errors: 1) The original code drops all packets with a multicast destination address, even though neighbor solicitations are sent to the solicited-node address, a multicast address. The code after this check was never run. 2) The neighbor table lookup used the IPv6 header destination, which is the solicited node address, rather than the target address from the neighbor solicitation. So neighbor lookups would always fail if it got this far. Also for L3MISSes. 3) The code calls ndisc_send_na(), which does a send on the tunnel device. The context for neigh_reduce() is the transmit path, vxlan_xmit(), where the host or a bridge-attached neighbor is trying to transmit a neighbor solicitation. To respond to it, the tunnel endpoint needs to do a *receive* of the appropriate neighbor advertisement. Doing a send, would only try to send the advertisement, encapsulated, to the remote destinations in the fdb -- hosts that definitely did not do the corresponding solicitation. 4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the isrouter flag in the advertisement. This has nothing to do with whether or not the target is a router, and generally won't be set since the tunnel endpoint is bridging, not routing, traffic. The patch below creates a proxy neighbor advertisement to respond to neighbor solicitions as intended, providing proper IPv6 support for neighbor reduction. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 18:39:58 +04:00
if (ipv6_addr_loopback(daddr) ||
ipv6_addr_is_multicast(&msg->target))
goto out;
n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);
if (n) {
struct vxlan_fdb *f;
vxlan: fix nonfunctional neigh_reduce() The VXLAN neigh_reduce() code is completely non-functional since check-in. Specific errors: 1) The original code drops all packets with a multicast destination address, even though neighbor solicitations are sent to the solicited-node address, a multicast address. The code after this check was never run. 2) The neighbor table lookup used the IPv6 header destination, which is the solicited node address, rather than the target address from the neighbor solicitation. So neighbor lookups would always fail if it got this far. Also for L3MISSes. 3) The code calls ndisc_send_na(), which does a send on the tunnel device. The context for neigh_reduce() is the transmit path, vxlan_xmit(), where the host or a bridge-attached neighbor is trying to transmit a neighbor solicitation. To respond to it, the tunnel endpoint needs to do a *receive* of the appropriate neighbor advertisement. Doing a send, would only try to send the advertisement, encapsulated, to the remote destinations in the fdb -- hosts that definitely did not do the corresponding solicitation. 4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the isrouter flag in the advertisement. This has nothing to do with whether or not the target is a router, and generally won't be set since the tunnel endpoint is bridging, not routing, traffic. The patch below creates a proxy neighbor advertisement to respond to neighbor solicitions as intended, providing proper IPv6 support for neighbor reduction. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 18:39:58 +04:00
struct sk_buff *reply;
if (!(n->nud_state & NUD_CONNECTED)) {
neigh_release(n);
goto out;
}
f = vxlan_find_mac(vxlan, n->ha);
if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
/* bridge-local neighbor */
neigh_release(n);
goto out;
}
vxlan: fix nonfunctional neigh_reduce() The VXLAN neigh_reduce() code is completely non-functional since check-in. Specific errors: 1) The original code drops all packets with a multicast destination address, even though neighbor solicitations are sent to the solicited-node address, a multicast address. The code after this check was never run. 2) The neighbor table lookup used the IPv6 header destination, which is the solicited node address, rather than the target address from the neighbor solicitation. So neighbor lookups would always fail if it got this far. Also for L3MISSes. 3) The code calls ndisc_send_na(), which does a send on the tunnel device. The context for neigh_reduce() is the transmit path, vxlan_xmit(), where the host or a bridge-attached neighbor is trying to transmit a neighbor solicitation. To respond to it, the tunnel endpoint needs to do a *receive* of the appropriate neighbor advertisement. Doing a send, would only try to send the advertisement, encapsulated, to the remote destinations in the fdb -- hosts that definitely did not do the corresponding solicitation. 4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the isrouter flag in the advertisement. This has nothing to do with whether or not the target is a router, and generally won't be set since the tunnel endpoint is bridging, not routing, traffic. The patch below creates a proxy neighbor advertisement to respond to neighbor solicitions as intended, providing proper IPv6 support for neighbor reduction. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 18:39:58 +04:00
reply = vxlan_na_create(skb, n,
!!(f ? f->flags & NTF_ROUTER : 0));
neigh_release(n);
vxlan: fix nonfunctional neigh_reduce() The VXLAN neigh_reduce() code is completely non-functional since check-in. Specific errors: 1) The original code drops all packets with a multicast destination address, even though neighbor solicitations are sent to the solicited-node address, a multicast address. The code after this check was never run. 2) The neighbor table lookup used the IPv6 header destination, which is the solicited node address, rather than the target address from the neighbor solicitation. So neighbor lookups would always fail if it got this far. Also for L3MISSes. 3) The code calls ndisc_send_na(), which does a send on the tunnel device. The context for neigh_reduce() is the transmit path, vxlan_xmit(), where the host or a bridge-attached neighbor is trying to transmit a neighbor solicitation. To respond to it, the tunnel endpoint needs to do a *receive* of the appropriate neighbor advertisement. Doing a send, would only try to send the advertisement, encapsulated, to the remote destinations in the fdb -- hosts that definitely did not do the corresponding solicitation. 4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the isrouter flag in the advertisement. This has nothing to do with whether or not the target is a router, and generally won't be set since the tunnel endpoint is bridging, not routing, traffic. The patch below creates a proxy neighbor advertisement to respond to neighbor solicitions as intended, providing proper IPv6 support for neighbor reduction. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 18:39:58 +04:00
if (reply == NULL)
goto out;
if (netif_rx_ni(reply) == NET_RX_DROP)
dev->stats.rx_dropped++;
} else if (vxlan->flags & VXLAN_F_L3MISS) {
vxlan: fix nonfunctional neigh_reduce() The VXLAN neigh_reduce() code is completely non-functional since check-in. Specific errors: 1) The original code drops all packets with a multicast destination address, even though neighbor solicitations are sent to the solicited-node address, a multicast address. The code after this check was never run. 2) The neighbor table lookup used the IPv6 header destination, which is the solicited node address, rather than the target address from the neighbor solicitation. So neighbor lookups would always fail if it got this far. Also for L3MISSes. 3) The code calls ndisc_send_na(), which does a send on the tunnel device. The context for neigh_reduce() is the transmit path, vxlan_xmit(), where the host or a bridge-attached neighbor is trying to transmit a neighbor solicitation. To respond to it, the tunnel endpoint needs to do a *receive* of the appropriate neighbor advertisement. Doing a send, would only try to send the advertisement, encapsulated, to the remote destinations in the fdb -- hosts that definitely did not do the corresponding solicitation. 4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the isrouter flag in the advertisement. This has nothing to do with whether or not the target is a router, and generally won't be set since the tunnel endpoint is bridging, not routing, traffic. The patch below creates a proxy neighbor advertisement to respond to neighbor solicitions as intended, providing proper IPv6 support for neighbor reduction. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 18:39:58 +04:00
union vxlan_addr ipa = {
.sin6.sin6_addr = msg->target,
.sin6.sin6_family = AF_INET6,
vxlan: fix nonfunctional neigh_reduce() The VXLAN neigh_reduce() code is completely non-functional since check-in. Specific errors: 1) The original code drops all packets with a multicast destination address, even though neighbor solicitations are sent to the solicited-node address, a multicast address. The code after this check was never run. 2) The neighbor table lookup used the IPv6 header destination, which is the solicited node address, rather than the target address from the neighbor solicitation. So neighbor lookups would always fail if it got this far. Also for L3MISSes. 3) The code calls ndisc_send_na(), which does a send on the tunnel device. The context for neigh_reduce() is the transmit path, vxlan_xmit(), where the host or a bridge-attached neighbor is trying to transmit a neighbor solicitation. To respond to it, the tunnel endpoint needs to do a *receive* of the appropriate neighbor advertisement. Doing a send, would only try to send the advertisement, encapsulated, to the remote destinations in the fdb -- hosts that definitely did not do the corresponding solicitation. 4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the isrouter flag in the advertisement. This has nothing to do with whether or not the target is a router, and generally won't be set since the tunnel endpoint is bridging, not routing, traffic. The patch below creates a proxy neighbor advertisement to respond to neighbor solicitions as intended, providing proper IPv6 support for neighbor reduction. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 18:39:58 +04:00
};
vxlan_ip_miss(dev, &ipa);
}
out:
consume_skb(skb);
return NETDEV_TX_OK;
}
#endif
static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct neighbour *n;
if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
return false;
n = NULL;
switch (ntohs(eth_hdr(skb)->h_proto)) {
case ETH_P_IP:
{
struct iphdr *pip;
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
return false;
pip = ip_hdr(skb);
n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
union vxlan_addr ipa = {
.sin.sin_addr.s_addr = pip->daddr,
.sin.sin_family = AF_INET,
};
vxlan_ip_miss(dev, &ipa);
return false;
}
break;
}
#if IS_ENABLED(CONFIG_IPV6)
case ETH_P_IPV6:
{
struct ipv6hdr *pip6;
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
return false;
pip6 = ipv6_hdr(skb);
n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
union vxlan_addr ipa = {
.sin6.sin6_addr = pip6->daddr,
.sin6.sin6_family = AF_INET6,
};
vxlan_ip_miss(dev, &ipa);
return false;
}
break;
}
#endif
default:
return false;
}
if (n) {
bool diff;
diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
if (diff) {
memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
dev->addr_len);
memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
}
neigh_release(n);
return diff;
}
return false;
}
static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
vxlan: Group Policy extension Implements supports for the Group Policy VXLAN extension [0] to provide a lightweight and simple security label mechanism across network peers based on VXLAN. The security context and associated metadata is mapped to/from skb->mark. This allows further mapping to a SELinux context using SECMARK, to implement ACLs directly with nftables, iptables, OVS, tc, etc. The group membership is defined by the lower 16 bits of skb->mark, the upper 16 bits are used for flags. SELinux allows to manage label to secure local resources. However, distributed applications require ACLs to implemented across hosts. This is typically achieved by matching on L2-L4 fields to identify the original sending host and process on the receiver. On top of that, netlabel and specifically CIPSO [1] allow to map security contexts to universal labels. However, netlabel and CIPSO are relatively complex. This patch provides a lightweight alternative for overlay network environments with a trusted underlay. No additional control protocol is required. Host 1: Host 2: Group A Group B Group B Group A +-----+ +-------------+ +-------+ +-----+ | lxc | | SELinux CTX | | httpd | | VM | +--+--+ +--+----------+ +---+---+ +--+--+ \---+---/ \----+---/ | | +---+---+ +---+---+ | vxlan | | vxlan | +---+---+ +---+---+ +------------------------------+ Backwards compatibility: A VXLAN-GBP socket can receive standard VXLAN frames and will assign the default group 0x0000 to such frames. A Linux VXLAN socket will drop VXLAN-GBP frames. The extension is therefore disabled by default and needs to be specifically enabled: ip link add [...] type vxlan [...] gbp In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket must run on a separate port number. Examples: iptables: host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200 host2# iptables -I INPUT -m mark --mark 0x200 -j DROP OVS: # ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL' # ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop' [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy [1] http://lwn.net/Articles/204905/ Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 05:53:55 +03:00
struct vxlan_metadata *md)
{
struct vxlanhdr_gbp *gbp;
if (!md->gbp)
return;
vxlan: Group Policy extension Implements supports for the Group Policy VXLAN extension [0] to provide a lightweight and simple security label mechanism across network peers based on VXLAN. The security context and associated metadata is mapped to/from skb->mark. This allows further mapping to a SELinux context using SECMARK, to implement ACLs directly with nftables, iptables, OVS, tc, etc. The group membership is defined by the lower 16 bits of skb->mark, the upper 16 bits are used for flags. SELinux allows to manage label to secure local resources. However, distributed applications require ACLs to implemented across hosts. This is typically achieved by matching on L2-L4 fields to identify the original sending host and process on the receiver. On top of that, netlabel and specifically CIPSO [1] allow to map security contexts to universal labels. However, netlabel and CIPSO are relatively complex. This patch provides a lightweight alternative for overlay network environments with a trusted underlay. No additional control protocol is required. Host 1: Host 2: Group A Group B Group B Group A +-----+ +-------------+ +-------+ +-----+ | lxc | | SELinux CTX | | httpd | | VM | +--+--+ +--+----------+ +---+---+ +--+--+ \---+---/ \----+---/ | | +---+---+ +---+---+ | vxlan | | vxlan | +---+---+ +---+---+ +------------------------------+ Backwards compatibility: A VXLAN-GBP socket can receive standard VXLAN frames and will assign the default group 0x0000 to such frames. A Linux VXLAN socket will drop VXLAN-GBP frames. The extension is therefore disabled by default and needs to be specifically enabled: ip link add [...] type vxlan [...] gbp In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket must run on a separate port number. Examples: iptables: host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200 host2# iptables -I INPUT -m mark --mark 0x200 -j DROP OVS: # ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL' # ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop' [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy [1] http://lwn.net/Articles/204905/ Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 05:53:55 +03:00
gbp = (struct vxlanhdr_gbp *)vxh;
vxh->vx_flags |= VXLAN_HF_GBP;
vxlan: Group Policy extension Implements supports for the Group Policy VXLAN extension [0] to provide a lightweight and simple security label mechanism across network peers based on VXLAN. The security context and associated metadata is mapped to/from skb->mark. This allows further mapping to a SELinux context using SECMARK, to implement ACLs directly with nftables, iptables, OVS, tc, etc. The group membership is defined by the lower 16 bits of skb->mark, the upper 16 bits are used for flags. SELinux allows to manage label to secure local resources. However, distributed applications require ACLs to implemented across hosts. This is typically achieved by matching on L2-L4 fields to identify the original sending host and process on the receiver. On top of that, netlabel and specifically CIPSO [1] allow to map security contexts to universal labels. However, netlabel and CIPSO are relatively complex. This patch provides a lightweight alternative for overlay network environments with a trusted underlay. No additional control protocol is required. Host 1: Host 2: Group A Group B Group B Group A +-----+ +-------------+ +-------+ +-----+ | lxc | | SELinux CTX | | httpd | | VM | +--+--+ +--+----------+ +---+---+ +--+--+ \---+---/ \----+---/ | | +---+---+ +---+---+ | vxlan | | vxlan | +---+---+ +---+---+ +------------------------------+ Backwards compatibility: A VXLAN-GBP socket can receive standard VXLAN frames and will assign the default group 0x0000 to such frames. A Linux VXLAN socket will drop VXLAN-GBP frames. The extension is therefore disabled by default and needs to be specifically enabled: ip link add [...] type vxlan [...] gbp In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket must run on a separate port number. Examples: iptables: host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200 host2# iptables -I INPUT -m mark --mark 0x200 -j DROP OVS: # ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL' # ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop' [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy [1] http://lwn.net/Articles/204905/ Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 05:53:55 +03:00
if (md->gbp & VXLAN_GBP_DONT_LEARN)
gbp->dont_learn = 1;
if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
gbp->policy_applied = 1;
gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
}
static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
__be16 protocol)
{
struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;
gpe->np_applied = 1;
switch (protocol) {
case htons(ETH_P_IP):
gpe->next_protocol = VXLAN_GPE_NP_IPV4;
return 0;
case htons(ETH_P_IPV6):
gpe->next_protocol = VXLAN_GPE_NP_IPV6;
return 0;
case htons(ETH_P_TEB):
gpe->next_protocol = VXLAN_GPE_NP_ETHERNET;
return 0;
}
return -EPFNOSUPPORT;
}
static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
int iphdr_len, __be32 vni,
struct vxlan_metadata *md, u32 vxflags,
bool udp_sum)
{
struct vxlanhdr *vxh;
int min_headroom;
int err;
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
__be16 inner_protocol = htons(ETH_P_TEB);
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
if ((vxflags & VXLAN_F_REMCSUM_TX) &&
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
skb->ip_summed == CHECKSUM_PARTIAL) {
int csum_start = skb_checksum_start_offset(skb);
if (csum_start <= VXLAN_MAX_REMCSUM_START &&
!(csum_start & VXLAN_RCO_SHIFT_MASK) &&
(skb->csum_offset == offsetof(struct udphdr, check) ||
skb->csum_offset == offsetof(struct tcphdr, check)))
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
type |= SKB_GSO_TUNNEL_REMCSUM;
}
min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
+ VXLAN_HLEN + iphdr_len;
/* Need space for new headers (invalidates iph ptr) */
err = skb_cow_head(skb, min_headroom);
if (unlikely(err))
return err;
err = iptunnel_handle_offloads(skb, type);
if (err)
return err;
vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
vxh->vx_flags = VXLAN_HF_VNI;
vxh->vx_vni = vxlan_vni_field(vni);
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
if (type & SKB_GSO_TUNNEL_REMCSUM) {
unsigned int start;
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr);
vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset);
vxh->vx_flags |= VXLAN_HF_RCO;
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
if (!skb_is_gso(skb)) {
skb->ip_summed = CHECKSUM_NONE;
skb->encapsulation = 0;
}
}
if (vxflags & VXLAN_F_GBP)
vxlan_build_gbp_hdr(vxh, vxflags, md);
if (vxflags & VXLAN_F_GPE) {
err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
if (err < 0)
return err;
inner_protocol = skb->protocol;
}
vxlan: Group Policy extension Implements supports for the Group Policy VXLAN extension [0] to provide a lightweight and simple security label mechanism across network peers based on VXLAN. The security context and associated metadata is mapped to/from skb->mark. This allows further mapping to a SELinux context using SECMARK, to implement ACLs directly with nftables, iptables, OVS, tc, etc. The group membership is defined by the lower 16 bits of skb->mark, the upper 16 bits are used for flags. SELinux allows to manage label to secure local resources. However, distributed applications require ACLs to implemented across hosts. This is typically achieved by matching on L2-L4 fields to identify the original sending host and process on the receiver. On top of that, netlabel and specifically CIPSO [1] allow to map security contexts to universal labels. However, netlabel and CIPSO are relatively complex. This patch provides a lightweight alternative for overlay network environments with a trusted underlay. No additional control protocol is required. Host 1: Host 2: Group A Group B Group B Group A +-----+ +-------------+ +-------+ +-----+ | lxc | | SELinux CTX | | httpd | | VM | +--+--+ +--+----------+ +---+---+ +--+--+ \---+---/ \----+---/ | | +---+---+ +---+---+ | vxlan | | vxlan | +---+---+ +---+---+ +------------------------------+ Backwards compatibility: A VXLAN-GBP socket can receive standard VXLAN frames and will assign the default group 0x0000 to such frames. A Linux VXLAN socket will drop VXLAN-GBP frames. The extension is therefore disabled by default and needs to be specifically enabled: ip link add [...] type vxlan [...] gbp In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket must run on a separate port number. Examples: iptables: host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200 host2# iptables -I INPUT -m mark --mark 0x200 -j DROP OVS: # ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL' # ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop' [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy [1] http://lwn.net/Articles/204905/ Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 05:53:55 +03:00
skb_set_inner_protocol(skb, inner_protocol);
return 0;
}
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev,
struct vxlan_sock *sock4,
struct sk_buff *skb, int oif, u8 tos,
__be32 daddr, __be32 *saddr,
struct dst_cache *dst_cache,
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used is unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test in vxlan needs to be handeled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Paolo Abeni <pabeni@redhat.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 17:15:07 +03:00
const struct ip_tunnel_info *info)
{
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used is unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test in vxlan needs to be handeled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Paolo Abeni <pabeni@redhat.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 17:15:07 +03:00
bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
struct rtable *rt = NULL;
struct flowi4 fl4;
if (!sock4)
return ERR_PTR(-EIO);
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used is unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test in vxlan needs to be handeled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Paolo Abeni <pabeni@redhat.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 17:15:07 +03:00
if (tos && !info)
use_cache = false;
if (use_cache) {
rt = dst_cache_get_ip4(dst_cache, saddr);
if (rt)
return rt;
}
memset(&fl4, 0, sizeof(fl4));
fl4.flowi4_oif = oif;
fl4.flowi4_tos = RT_TOS(tos);
fl4.flowi4_mark = skb->mark;
fl4.flowi4_proto = IPPROTO_UDP;
fl4.daddr = daddr;
fl4.saddr = *saddr;
rt = ip_route_output_key(vxlan->net, &fl4);
if (likely(!IS_ERR(rt))) {
if (rt->dst.dev == dev) {
netdev_dbg(dev, "circular route to %pI4\n", &daddr);
ip_rt_put(rt);
return ERR_PTR(-ELOOP);
}
*saddr = fl4.saddr;
if (use_cache)
dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
} else {
netdev_dbg(dev, "no route to %pI4\n", &daddr);
return ERR_PTR(-ENETUNREACH);
}
return rt;
}
#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
struct net_device *dev,
struct vxlan_sock *sock6,
struct sk_buff *skb, int oif, u8 tos,
__be32 label,
const struct in6_addr *daddr,
struct in6_addr *saddr,
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used is unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test in vxlan needs to be handeled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Paolo Abeni <pabeni@redhat.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 17:15:07 +03:00
struct dst_cache *dst_cache,
const struct ip_tunnel_info *info)
{
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used is unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test in vxlan needs to be handeled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Paolo Abeni <pabeni@redhat.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 17:15:07 +03:00
bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
struct dst_entry *ndst;
struct flowi6 fl6;
int err;
if (!sock6)
return ERR_PTR(-EIO);
if (tos && !info)
use_cache = false;
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used is unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test in vxlan needs to be handeled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Paolo Abeni <pabeni@redhat.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 17:15:07 +03:00
if (use_cache) {
ndst = dst_cache_get_ip6(dst_cache, saddr);
if (ndst)
return ndst;
}
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_oif = oif;
fl6.daddr = *daddr;
fl6.saddr = *saddr;
fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label);
fl6.flowi6_mark = skb->mark;
fl6.flowi6_proto = IPPROTO_UDP;
err = ipv6_stub->ipv6_dst_lookup(vxlan->net,
sock6->sock->sk,
&ndst, &fl6);
if (unlikely(err < 0)) {
netdev_dbg(dev, "no route to %pI6\n", daddr);
return ERR_PTR(-ENETUNREACH);
}
if (unlikely(ndst->dev == dev)) {
netdev_dbg(dev, "circular route to %pI6\n", daddr);
dst_release(ndst);
return ERR_PTR(-ELOOP);
}
*saddr = fl6.saddr;
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used is unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test in vxlan needs to be handeled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Paolo Abeni <pabeni@redhat.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 17:15:07 +03:00
if (use_cache)
dst_cache_set_ip6(dst_cache, ndst, saddr);
return ndst;
}
#endif
/* Bypass encapsulation if the destination is local */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
struct vxlan_dev *dst_vxlan)
{
struct pcpu_sw_netstats *tx_stats, *rx_stats;
union vxlan_addr loopback;
union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
struct net_device *dev = skb->dev;
int len = skb->len;
tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
skb->pkt_type = PACKET_HOST;
skb->encapsulation = 0;
skb->dev = dst_vxlan->dev;
__skb_pull(skb, skb_network_offset(skb));
if (remote_ip->sa.sa_family == AF_INET) {
loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
loopback.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
} else {
loopback.sin6.sin6_addr = in6addr_loopback;
loopback.sa.sa_family = AF_INET6;
#endif
}
if (dst_vxlan->flags & VXLAN_F_LEARN)
vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source);
u64_stats_update_begin(&tx_stats->syncp);
tx_stats->tx_packets++;
tx_stats->tx_bytes += len;
u64_stats_update_end(&tx_stats->syncp);
if (netif_rx(skb) == NET_RX_SUCCESS) {
u64_stats_update_begin(&rx_stats->syncp);
rx_stats->rx_packets++;
rx_stats->rx_bytes += len;
u64_stats_update_end(&rx_stats->syncp);
} else {
dev->stats.rx_dropped++;
}
}
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
struct vxlan_rdst *rdst, bool did_rsc)
{
struct dst_cache *dst_cache;
struct ip_tunnel_info *info;
struct vxlan_dev *vxlan = netdev_priv(dev);
struct sock *sk;
const struct iphdr *old_iph;
union vxlan_addr *dst;
union vxlan_addr remote_ip, local_ip;
union vxlan_addr *src;
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
struct vxlan_metadata _md;
struct vxlan_metadata *md = &_md;
__be16 src_port = 0, dst_port;
struct dst_entry *ndst = NULL;
__be32 vni, label;
__be16 df = 0;
__u8 tos, ttl;
int err;
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
u32 flags = vxlan->flags;
bool udp_sum = false;
bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
info = skb_tunnel_info(skb);
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
if (rdst) {
dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
vni = rdst->remote_vni;
dst = &rdst->remote_ip;
src = &vxlan->cfg.saddr;
dst_cache = &rdst->dst_cache;
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
} else {
if (!info) {
WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
dev->name);
goto drop;
}
dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
vni = tunnel_id_to_key32(info->key.tun_id);
remote_ip.sa.sa_family = ip_tunnel_info_af(info);
if (remote_ip.sa.sa_family == AF_INET) {
remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src;
} else {
remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
}
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
dst = &remote_ip;
src = &local_ip;
dst_cache = &info->dst_cache;
vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: Pravin B Shelar <pshelar@nicira.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-21 11:43:58 +03:00
}
if (vxlan_addr_any(dst)) {
if (did_rsc) {
/* short-circuited back to local bridge */
vxlan_encap_bypass(skb, vxlan, vxlan);
return;
}
goto drop;
}
old_iph = ip_hdr(skb);
ttl = vxlan->cfg.ttl;
if (!ttl && vxlan_addr_multicast(dst))
ttl = 1;
tos = vxlan->cfg.tos;
if (tos == 1)
tos = ip_tunnel_get_dsfield(old_iph, skb);
label = vxlan->cfg.label;
src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
vxlan->cfg.port_max, true);
if (info) {
ttl = info->key.ttl;
tos = info->key.tos;
label = info->key.label;
udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
if (info->options_len)
md = ip_tunnel_info_opts(info);
} else {
md->gbp = skb->mark;
}
if (dst->sa.sa_family == AF_INET) {
struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
struct rtable *rt;
rt = vxlan_get_route(vxlan, dev, sock4, skb,
rdst ? rdst->remote_ifindex : 0, tos,
dst->sin.sin_addr.s_addr,
&src->sin.sin_addr.s_addr,
dst_cache, info);
if (IS_ERR(rt))
goto tx_error;
sk = sock4->sock->sk;
/* Bypass encapsulation if the destination is local */
if (!info && rt->rt_flags & RTCF_LOCAL &&
!(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
struct vxlan_dev *dst_vxlan;
ip_rt_put(rt);
dst_vxlan = vxlan_find_vni(vxlan->net, vni,
dst->sa.sa_family, dst_port,
vxlan->flags);
if (!dst_vxlan)
goto tx_error;
vxlan_encap_bypass(skb, vxlan, dst_vxlan);
return;
}
if (!info)
udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
df = htons(IP_DF);
ndst = &rt->dst;
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
vni, md, flags, udp_sum);
if (err < 0)
goto tx_error;
udp_tunnel_xmit_skb(rt, sk, skb, src->sin.sin_addr.s_addr,
dst->sin.sin_addr.s_addr, tos, ttl, df,
src_port, dst_port, xnet, !udp_sum);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
u32 rt6i_flags;
ndst = vxlan6_get_route(vxlan, dev, sock6, skb,
rdst ? rdst->remote_ifindex : 0, tos,
label, &dst->sin6.sin6_addr,
&src->sin6.sin6_addr,
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used is unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test in vxlan needs to be handeled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Paolo Abeni <pabeni@redhat.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 17:15:07 +03:00
dst_cache, info);
if (IS_ERR(ndst)) {
ndst = NULL;
goto tx_error;
}
sk = sock6->sock->sk;
/* Bypass encapsulation if the destination is local */
rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
if (!info && rt6i_flags & RTF_LOCAL &&
!(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
struct vxlan_dev *dst_vxlan;
dst_vxlan = vxlan_find_vni(vxlan->net, vni,
dst->sa.sa_family, dst_port,
vxlan->flags);
if (!dst_vxlan)
goto tx_error;
dst_release(ndst);
vxlan_encap_bypass(skb, vxlan, dst_vxlan);
return;
}
if (!info)
udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
ttl = ttl ? : ip6_dst_hoplimit(ndst);
skb_scrub_packet(skb, xnet);
err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
vni, md, flags, udp_sum);
if (err < 0)
goto tx_error;
udp_tunnel6_xmit_skb(ndst, sk, skb, dev,
&src->sin6.sin6_addr,
&dst->sin6.sin6_addr, tos, ttl,
label, src_port, dst_port, !udp_sum);
#endif
}
return;
drop:
dev->stats.tx_dropped++;
dev_kfree_skb(skb);
return;
tx_error:
if (err == -ELOOP)
dev->stats.collisions++;
else if (err == -ENETUNREACH)
dev->stats.tx_carrier_errors++;
dst_release(ndst);
dev->stats.tx_errors++;
kfree_skb(skb);
}
/* Transmit local packets over Vxlan
*
* Outer IP header inherits ECN and DF from inner header.
* Outer UDP destination is the VXLAN assigned port.
* source port is based on hash of flow
*/
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
const struct ip_tunnel_info *info;
struct ethhdr *eth;
bool did_rsc = false;
vxlan: keep original skb ownership Sathya Perla posted a patch trying to address following problem : <quote> The vxlan driver sets itself as the socket owner for all the TX flows it encapsulates (using vxlan_set_owner()) and assigns it's own skb destructor. This causes all tunneled traffic to land up on only one TXQ as all encapsulated skbs refer to the vxlan socket and not the original socket. Also, the vxlan skb destructor breaks some functionality for tunneled traffic like wmem accounting and as TCP small queues and FQ/pacing packet scheduler. </quote> I reworked Sathya patch and added some explanations. vxlan_xmit() can avoid one skb_clone()/dev_kfree_skb() pair and gain better drop monitor accuracy, by calling kfree_skb() when appropriate. The UDP socket used by vxlan to perform encapsulation of xmit packets do not need to be alive while packets leave vxlan code. Its better to keep original socket ownership to get proper feedback from qdisc and NIC layers. We use skb->sk to A) control amount of bytes/packets queued on behalf of a socket, but prior vxlan code did the skb->sk transfert without any limit/control on vxlan socket sk_sndbuf. B) security purposes (as selinux) or netfilter uses, and I do not think anything is prepared to handle vxlan stacked case in this area. By not changing ownership, vxlan tunnels behave like other tunnels. As Stephen mentioned, we might do the same change in L2TP. Reported-by: Sathya Perla <sathya.perla@emulex.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-06 21:54:31 +04:00
struct vxlan_rdst *rdst, *fdst = NULL;
struct vxlan_fdb *f;
info = skb_tunnel_info(skb);
skb_reset_mac_header(skb);
if (vxlan->flags & VXLAN_F_COLLECT_METADATA) {
if (info && info->mode & IP_TUNNEL_INFO_TX)
vxlan_xmit_one(skb, dev, NULL, false);
else
kfree_skb(skb);
return NETDEV_TX_OK;
}
if (vxlan->flags & VXLAN_F_PROXY) {
eth = eth_hdr(skb);
if (ntohs(eth->h_proto) == ETH_P_ARP)
return arp_reduce(dev, skb);
#if IS_ENABLED(CONFIG_IPV6)
else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
pskb_may_pull(skb, sizeof(struct ipv6hdr)
+ sizeof(struct nd_msg)) &&
ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
struct nd_msg *msg;
msg = (struct nd_msg *)skb_transport_header(skb);
if (msg->icmph.icmp6_code == 0 &&
msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
return neigh_reduce(dev, skb);
}
#endif
}
eth = eth_hdr(skb);
f = vxlan_find_mac(vxlan, eth->h_dest);
did_rsc = false;
if (f && (f->flags & NTF_ROUTER) && (vxlan->flags & VXLAN_F_RSC) &&
(ntohs(eth->h_proto) == ETH_P_IP ||
ntohs(eth->h_proto) == ETH_P_IPV6)) {
did_rsc = route_shortcircuit(dev, skb);
if (did_rsc)
f = vxlan_find_mac(vxlan, eth->h_dest);
}
if (f == NULL) {
f = vxlan_find_mac(vxlan, all_zeros_mac);
if (f == NULL) {
if ((vxlan->flags & VXLAN_F_L2MISS) &&
!is_multicast_ether_addr(eth->h_dest))
vxlan_fdb_miss(vxlan, eth->h_dest);
dev->stats.tx_dropped++;
vxlan: keep original skb ownership Sathya Perla posted a patch trying to address following problem : <quote> The vxlan driver sets itself as the socket owner for all the TX flows it encapsulates (using vxlan_set_owner()) and assigns it's own skb destructor. This causes all tunneled traffic to land up on only one TXQ as all encapsulated skbs refer to the vxlan socket and not the original socket. Also, the vxlan skb destructor breaks some functionality for tunneled traffic like wmem accounting and as TCP small queues and FQ/pacing packet scheduler. </quote> I reworked Sathya patch and added some explanations. vxlan_xmit() can avoid one skb_clone()/dev_kfree_skb() pair and gain better drop monitor accuracy, by calling kfree_skb() when appropriate. The UDP socket used by vxlan to perform encapsulation of xmit packets do not need to be alive while packets leave vxlan code. Its better to keep original socket ownership to get proper feedback from qdisc and NIC layers. We use skb->sk to A) control amount of bytes/packets queued on behalf of a socket, but prior vxlan code did the skb->sk transfert without any limit/control on vxlan socket sk_sndbuf. B) security purposes (as selinux) or netfilter uses, and I do not think anything is prepared to handle vxlan stacked case in this area. By not changing ownership, vxlan tunnels behave like other tunnels. As Stephen mentioned, we might do the same change in L2TP. Reported-by: Sathya Perla <sathya.perla@emulex.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-06 21:54:31 +04:00
kfree_skb(skb);
return NETDEV_TX_OK;
}
}
list_for_each_entry_rcu(rdst, &f->remotes, list) {
struct sk_buff *skb1;
vxlan: keep original skb ownership Sathya Perla posted a patch trying to address following problem : <quote> The vxlan driver sets itself as the socket owner for all the TX flows it encapsulates (using vxlan_set_owner()) and assigns it's own skb destructor. This causes all tunneled traffic to land up on only one TXQ as all encapsulated skbs refer to the vxlan socket and not the original socket. Also, the vxlan skb destructor breaks some functionality for tunneled traffic like wmem accounting and as TCP small queues and FQ/pacing packet scheduler. </quote> I reworked Sathya patch and added some explanations. vxlan_xmit() can avoid one skb_clone()/dev_kfree_skb() pair and gain better drop monitor accuracy, by calling kfree_skb() when appropriate. The UDP socket used by vxlan to perform encapsulation of xmit packets do not need to be alive while packets leave vxlan code. Its better to keep original socket ownership to get proper feedback from qdisc and NIC layers. We use skb->sk to A) control amount of bytes/packets queued on behalf of a socket, but prior vxlan code did the skb->sk transfert without any limit/control on vxlan socket sk_sndbuf. B) security purposes (as selinux) or netfilter uses, and I do not think anything is prepared to handle vxlan stacked case in this area. By not changing ownership, vxlan tunnels behave like other tunnels. As Stephen mentioned, we might do the same change in L2TP. Reported-by: Sathya Perla <sathya.perla@emulex.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-06 21:54:31 +04:00
if (!fdst) {
fdst = rdst;
continue;
}
skb1 = skb_clone(skb, GFP_ATOMIC);
if (skb1)
vxlan_xmit_one(skb1, dev, rdst, did_rsc);
}
vxlan: keep original skb ownership Sathya Perla posted a patch trying to address following problem : <quote> The vxlan driver sets itself as the socket owner for all the TX flows it encapsulates (using vxlan_set_owner()) and assigns it's own skb destructor. This causes all tunneled traffic to land up on only one TXQ as all encapsulated skbs refer to the vxlan socket and not the original socket. Also, the vxlan skb destructor breaks some functionality for tunneled traffic like wmem accounting and as TCP small queues and FQ/pacing packet scheduler. </quote> I reworked Sathya patch and added some explanations. vxlan_xmit() can avoid one skb_clone()/dev_kfree_skb() pair and gain better drop monitor accuracy, by calling kfree_skb() when appropriate. The UDP socket used by vxlan to perform encapsulation of xmit packets do not need to be alive while packets leave vxlan code. Its better to keep original socket ownership to get proper feedback from qdisc and NIC layers. We use skb->sk to A) control amount of bytes/packets queued on behalf of a socket, but prior vxlan code did the skb->sk transfert without any limit/control on vxlan socket sk_sndbuf. B) security purposes (as selinux) or netfilter uses, and I do not think anything is prepared to handle vxlan stacked case in this area. By not changing ownership, vxlan tunnels behave like other tunnels. As Stephen mentioned, we might do the same change in L2TP. Reported-by: Sathya Perla <sathya.perla@emulex.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-06 21:54:31 +04:00
if (fdst)
vxlan_xmit_one(skb, dev, fdst, did_rsc);
else
kfree_skb(skb);
return NETDEV_TX_OK;
}
/* Walk the forwarding table and purge stale entries */
static void vxlan_cleanup(unsigned long arg)
{
struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
unsigned int h;
if (!netif_running(vxlan->dev))
return;
for (h = 0; h < FDB_HASH_SIZE; ++h) {
struct hlist_node *p, *n;
spin_lock_bh(&vxlan->hash_lock);
hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
struct vxlan_fdb *f
= container_of(p, struct vxlan_fdb, hlist);
unsigned long timeout;
if (f->state & NUD_PERMANENT)
continue;
timeout = f->used + vxlan->cfg.age_interval * HZ;
if (time_before_eq(timeout, jiffies)) {
netdev_dbg(vxlan->dev,
"garbage collect %pM\n",
f->eth_addr);
f->state = NUD_STALE;
vxlan_fdb_destroy(vxlan, f);
} else if (time_before(timeout, next_timer))
next_timer = timeout;
}
spin_unlock_bh(&vxlan->hash_lock);
}
mod_timer(&vxlan->age_timer, next_timer);
}
static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan)
{
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
__be32 vni = vxlan->default_dst.remote_vni;
spin_lock(&vn->sock_lock);
hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
spin_unlock(&vn->sock_lock);
}
/* Setup stats when device is created */
static int vxlan_init(struct net_device *dev)
{
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats)
return -ENOMEM;
return 0;
}
static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan)
{
struct vxlan_fdb *f;
spin_lock_bh(&vxlan->hash_lock);
f = __vxlan_find_mac(vxlan, all_zeros_mac);
if (f)
vxlan_fdb_destroy(vxlan, f);
spin_unlock_bh(&vxlan->hash_lock);
}
static void vxlan_uninit(struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
vxlan_fdb_delete_default(vxlan);
free_percpu(dev->tstats);
}
/* Start ageing timer and join group when device is brought up */
static int vxlan_open(struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
int ret;
ret = vxlan_sock_add(vxlan);
if (ret < 0)
return ret;
if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
ret = vxlan_igmp_join(vxlan);
if (ret == -EADDRINUSE)
ret = 0;
if (ret) {
vxlan_sock_release(vxlan);
return ret;
}
}
if (vxlan->cfg.age_interval)
mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
return ret;
}
/* Purge the forwarding table */
static void vxlan_flush(struct vxlan_dev *vxlan)
{
unsigned int h;
spin_lock_bh(&vxlan->hash_lock);
for (h = 0; h < FDB_HASH_SIZE; ++h) {
struct hlist_node *p, *n;
hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
struct vxlan_fdb *f
= container_of(p, struct vxlan_fdb, hlist);
/* the all_zeros_mac entry is deleted at vxlan_uninit */
if (!is_zero_ether_addr(f->eth_addr))
vxlan_fdb_destroy(vxlan, f);
}
}
spin_unlock_bh(&vxlan->hash_lock);
}
/* Cleanup timer and forwarding table on shutdown */
static int vxlan_stop(struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
int ret = 0;
if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
vxlan: do not exit on error in vxlan_stop() We need to clean up vxlan despite vxlan_igmp_leave() fails. This fixes the following kernel warning: WARNING: CPU: 0 PID: 6 at lib/debugobjects.c:263 debug_print_object+0x7c/0x8d() ODEBUG: free active (active state 0) object type: timer_list hint: vxlan_cleanup+0x0/0xd0 CPU: 0 PID: 6 Comm: kworker/u8:0 Not tainted 4.0.0-rc7+ #953 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Workqueue: netns cleanup_net 0000000000000009 ffff88011955f948 ffffffff81a25f5a 00000000253f253e ffff88011955f998 ffff88011955f988 ffffffff8107608e 0000000000000000 ffffffff814deba2 ffff8800d4e94000 ffffffff82254c30 ffffffff81fbe455 Call Trace: [<ffffffff81a25f5a>] dump_stack+0x4c/0x65 [<ffffffff8107608e>] warn_slowpath_common+0x9c/0xb6 [<ffffffff814deba2>] ? debug_print_object+0x7c/0x8d [<ffffffff81076116>] warn_slowpath_fmt+0x46/0x48 [<ffffffff814deba2>] debug_print_object+0x7c/0x8d [<ffffffff81666bf1>] ? vxlan_fdb_destroy+0x5b/0x5b [<ffffffff814dee02>] __debug_check_no_obj_freed+0xc3/0x15f [<ffffffff814df728>] debug_check_no_obj_freed+0x12/0x16 [<ffffffff8117ae4e>] slab_free_hook+0x64/0x6c [<ffffffff8114deaa>] ? kvfree+0x31/0x33 [<ffffffff8117dc66>] kfree+0x101/0x1ac [<ffffffff8114deaa>] kvfree+0x31/0x33 [<ffffffff817d4137>] netdev_freemem+0x18/0x1a [<ffffffff817e8b52>] netdev_release+0x2e/0x32 [<ffffffff815b4163>] device_release+0x5a/0x92 [<ffffffff814bd4dd>] kobject_cleanup+0x49/0x5e [<ffffffff814bd3ff>] kobject_put+0x45/0x49 [<ffffffff817d3fc1>] netdev_run_todo+0x26f/0x283 [<ffffffff817d4873>] ? rollback_registered_many+0x20f/0x23b [<ffffffff817e0c80>] rtnl_unlock+0xe/0x10 [<ffffffff817d4af0>] default_device_exit_batch+0x12a/0x139 [<ffffffff810aadfa>] ? wait_woken+0x8f/0x8f [<ffffffff817c8e14>] ops_exit_list+0x2b/0x57 [<ffffffff817c9b21>] cleanup_net+0x154/0x1e7 [<ffffffff8108b05d>] process_one_work+0x255/0x4ad [<ffffffff8108af69>] ? process_one_work+0x161/0x4ad [<ffffffff8108b4b1>] worker_thread+0x1cd/0x2ab [<ffffffff8108b2e4>] ? process_scheduled_works+0x2f/0x2f [<ffffffff81090686>] kthread+0xd4/0xdc [<ffffffff8109eca3>] ? local_clock+0x19/0x22 [<ffffffff810905b2>] ? __kthread_parkme+0x83/0x83 [<ffffffff81a31c48>] ret_from_fork+0x58/0x90 [<ffffffff810905b2>] ? __kthread_parkme+0x83/0x83 For the long-term, we should handle NETDEV_{UP,DOWN} event from the lower device of a tunnel device. Fixes: 56ef9c909b40 ("vxlan: Move socket initialization to within rtnl scope") Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-09 00:48:30 +03:00
!vxlan_group_used(vn, vxlan))
ret = vxlan_igmp_leave(vxlan);
del_timer_sync(&vxlan->age_timer);
vxlan_flush(vxlan);
vxlan_sock_release(vxlan);
return ret;
}
/* Stub, nothing needs to be done. */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}
net: use core MTU range checking in core net infra geneve: - Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu - This one isn't quite as straight-forward as others, could use some closer inspection and testing macvlan: - set min/max_mtu tun: - set min/max_mtu, remove tun_net_change_mtu vxlan: - Merge __vxlan_change_mtu back into vxlan_change_mtu - Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function - This one is also not as straight-forward and could use closer inspection and testing from vxlan folks bridge: - set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function openvswitch: - set min/max_mtu, remove internal_dev_change_mtu - note: max_mtu wasn't checked previously, it's been set to 65535, which is the largest possible size supported sch_teql: - set min/max_mtu (note: max_mtu previously unchecked, used max of 65535) macsec: - min_mtu = 0, max_mtu = 65535 macvlan: - min_mtu = 0, max_mtu = 65535 ntb_netdev: - min_mtu = 0, max_mtu = 65535 veth: - min_mtu = 68, max_mtu = 65535 8021q: - min_mtu = 0, max_mtu = 65535 CC: netdev@vger.kernel.org CC: Nicolas Dichtel <nicolas.dichtel@6wind.com> CC: Hannes Frederic Sowa <hannes@stressinduktion.org> CC: Tom Herbert <tom@herbertland.com> CC: Daniel Borkmann <daniel@iogearbox.net> CC: Alexander Duyck <alexander.h.duyck@intel.com> CC: Paolo Abeni <pabeni@redhat.com> CC: Jiri Benc <jbenc@redhat.com> CC: WANG Cong <xiyou.wangcong@gmail.com> CC: Roopa Prabhu <roopa@cumulusnetworks.com> CC: Pravin B Shelar <pshelar@ovn.org> CC: Sabrina Dubroca <sd@queasysnail.net> CC: Patrick McHardy <kaber@trash.net> CC: Stephen Hemminger <stephen@networkplumber.org> CC: Pravin Shelar <pshelar@nicira.com> CC: Maxim Krasnyansky <maxk@qti.qualcomm.com> Signed-off-by: Jarod Wilson <jarod@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 20:55:20 +03:00
static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
{
net: use core MTU range checking in core net infra geneve: - Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu - This one isn't quite as straight-forward as others, could use some closer inspection and testing macvlan: - set min/max_mtu tun: - set min/max_mtu, remove tun_net_change_mtu vxlan: - Merge __vxlan_change_mtu back into vxlan_change_mtu - Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function - This one is also not as straight-forward and could use closer inspection and testing from vxlan folks bridge: - set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function openvswitch: - set min/max_mtu, remove internal_dev_change_mtu - note: max_mtu wasn't checked previously, it's been set to 65535, which is the largest possible size supported sch_teql: - set min/max_mtu (note: max_mtu previously unchecked, used max of 65535) macsec: - min_mtu = 0, max_mtu = 65535 macvlan: - min_mtu = 0, max_mtu = 65535 ntb_netdev: - min_mtu = 0, max_mtu = 65535 veth: - min_mtu = 68, max_mtu = 65535 8021q: - min_mtu = 0, max_mtu = 65535 CC: netdev@vger.kernel.org CC: Nicolas Dichtel <nicolas.dichtel@6wind.com> CC: Hannes Frederic Sowa <hannes@stressinduktion.org> CC: Tom Herbert <tom@herbertland.com> CC: Daniel Borkmann <daniel@iogearbox.net> CC: Alexander Duyck <alexander.h.duyck@intel.com> CC: Paolo Abeni <pabeni@redhat.com> CC: Jiri Benc <jbenc@redhat.com> CC: WANG Cong <xiyou.wangcong@gmail.com> CC: Roopa Prabhu <roopa@cumulusnetworks.com> CC: Pravin B Shelar <pshelar@ovn.org> CC: Sabrina Dubroca <sd@queasysnail.net> CC: Patrick McHardy <kaber@trash.net> CC: Stephen Hemminger <stephen@networkplumber.org> CC: Pravin Shelar <pshelar@nicira.com> CC: Maxim Krasnyansky <maxk@qti.qualcomm.com> Signed-off-by: Jarod Wilson <jarod@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 20:55:20 +03:00
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_rdst *dst = &vxlan->default_dst;
struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
dst->remote_ifindex);
bool use_ipv6 = false;
if (dst->remote_ip.sa.sa_family == AF_INET6)
net: use core MTU range checking in core net infra geneve: - Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu - This one isn't quite as straight-forward as others, could use some closer inspection and testing macvlan: - set min/max_mtu tun: - set min/max_mtu, remove tun_net_change_mtu vxlan: - Merge __vxlan_change_mtu back into vxlan_change_mtu - Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function - This one is also not as straight-forward and could use closer inspection and testing from vxlan folks bridge: - set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function openvswitch: - set min/max_mtu, remove internal_dev_change_mtu - note: max_mtu wasn't checked previously, it's been set to 65535, which is the largest possible size supported sch_teql: - set min/max_mtu (note: max_mtu previously unchecked, used max of 65535) macsec: - min_mtu = 0, max_mtu = 65535 macvlan: - min_mtu = 0, max_mtu = 65535 ntb_netdev: - min_mtu = 0, max_mtu = 65535 veth: - min_mtu = 68, max_mtu = 65535 8021q: - min_mtu = 0, max_mtu = 65535 CC: netdev@vger.kernel.org CC: Nicolas Dichtel <nicolas.dichtel@6wind.com> CC: Hannes Frederic Sowa <hannes@stressinduktion.org> CC: Tom Herbert <tom@herbertland.com> CC: Daniel Borkmann <daniel@iogearbox.net> CC: Alexander Duyck <alexander.h.duyck@intel.com> CC: Paolo Abeni <pabeni@redhat.com> CC: Jiri Benc <jbenc@redhat.com> CC: WANG Cong <xiyou.wangcong@gmail.com> CC: Roopa Prabhu <roopa@cumulusnetworks.com> CC: Pravin B Shelar <pshelar@ovn.org> CC: Sabrina Dubroca <sd@queasysnail.net> CC: Patrick McHardy <kaber@trash.net> CC: Stephen Hemminger <stephen@networkplumber.org> CC: Pravin Shelar <pshelar@nicira.com> CC: Maxim Krasnyansky <maxk@qti.qualcomm.com> Signed-off-by: Jarod Wilson <jarod@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 20:55:20 +03:00
use_ipv6 = true;
net: use core MTU range checking in core net infra geneve: - Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu - This one isn't quite as straight-forward as others, could use some closer inspection and testing macvlan: - set min/max_mtu tun: - set min/max_mtu, remove tun_net_change_mtu vxlan: - Merge __vxlan_change_mtu back into vxlan_change_mtu - Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function - This one is also not as straight-forward and could use closer inspection and testing from vxlan folks bridge: - set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function openvswitch: - set min/max_mtu, remove internal_dev_change_mtu - note: max_mtu wasn't checked previously, it's been set to 65535, which is the largest possible size supported sch_teql: - set min/max_mtu (note: max_mtu previously unchecked, used max of 65535) macsec: - min_mtu = 0, max_mtu = 65535 macvlan: - min_mtu = 0, max_mtu = 65535 ntb_netdev: - min_mtu = 0, max_mtu = 65535 veth: - min_mtu = 68, max_mtu = 65535 8021q: - min_mtu = 0, max_mtu = 65535 CC: netdev@vger.kernel.org CC: Nicolas Dichtel <nicolas.dichtel@6wind.com> CC: Hannes Frederic Sowa <hannes@stressinduktion.org> CC: Tom Herbert <tom@herbertland.com> CC: Daniel Borkmann <daniel@iogearbox.net> CC: Alexander Duyck <alexander.h.duyck@intel.com> CC: Paolo Abeni <pabeni@redhat.com> CC: Jiri Benc <jbenc@redhat.com> CC: WANG Cong <xiyou.wangcong@gmail.com> CC: Roopa Prabhu <roopa@cumulusnetworks.com> CC: Pravin B Shelar <pshelar@ovn.org> CC: Sabrina Dubroca <sd@queasysnail.net> CC: Patrick McHardy <kaber@trash.net> CC: Stephen Hemminger <stephen@networkplumber.org> CC: Pravin Shelar <pshelar@nicira.com> CC: Maxim Krasnyansky <maxk@qti.qualcomm.com> Signed-off-by: Jarod Wilson <jarod@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 20:55:20 +03:00
/* This check is different than dev->max_mtu, because it looks at
* the lowerdev->mtu, rather than the static dev->max_mtu
*/
if (lowerdev) {
int max_mtu = lowerdev->mtu -
(use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
if (new_mtu > max_mtu)
return -EINVAL;
}
dev->mtu = new_mtu;
return 0;
}
static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct ip_tunnel_info *info = skb_tunnel_info(skb);
__be16 sport, dport;
sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
vxlan->cfg.port_max, true);
dport = info->key.tp_dst ? : vxlan->cfg.dst_port;
if (ip_tunnel_info_af(info) == AF_INET) {
struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
struct rtable *rt;
rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos,
info->key.u.ipv4.dst,
&info->key.u.ipv4.src, NULL, info);
if (IS_ERR(rt))
return PTR_ERR(rt);
ip_rt_put(rt);
} else {
#if IS_ENABLED(CONFIG_IPV6)
struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
struct dst_entry *ndst;
ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos,
info->key.label, &info->key.u.ipv6.dst,
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used is unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test in vxlan needs to be handeled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Paolo Abeni <pabeni@redhat.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 17:15:07 +03:00
&info->key.u.ipv6.src, NULL, info);
if (IS_ERR(ndst))
return PTR_ERR(ndst);
dst_release(ndst);
#else /* !CONFIG_IPV6 */
return -EPFNOSUPPORT;
#endif
}
info->key.tp_src = sport;
info->key.tp_dst = dport;
return 0;
}
static const struct net_device_ops vxlan_netdev_ether_ops = {
.ndo_init = vxlan_init,
.ndo_uninit = vxlan_uninit,
.ndo_open = vxlan_open,
.ndo_stop = vxlan_stop,
.ndo_start_xmit = vxlan_xmit,
.ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_set_rx_mode = vxlan_set_multicast_list,
.ndo_change_mtu = vxlan_change_mtu,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_mac_address = eth_mac_addr,
.ndo_fdb_add = vxlan_fdb_add,
.ndo_fdb_del = vxlan_fdb_delete,
.ndo_fdb_dump = vxlan_fdb_dump,
.ndo_fill_metadata_dst = vxlan_fill_metadata_dst,
};
static const struct net_device_ops vxlan_netdev_raw_ops = {
.ndo_init = vxlan_init,
.ndo_uninit = vxlan_uninit,
.ndo_open = vxlan_open,
.ndo_stop = vxlan_stop,
.ndo_start_xmit = vxlan_xmit,
.ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_change_mtu = vxlan_change_mtu,
.ndo_fill_metadata_dst = vxlan_fill_metadata_dst,
};
/* Info for udev, that this is a virtual tunnel endpoint */
static struct device_type vxlan_type = {
.name = "vxlan",
};
/* Calls the ndo_udp_tunnel_add of the caller in order to
* supply the listening VXLAN udp ports. Callers are expected
* to implement the ndo_udp_tunnel_add.
*/
static void vxlan_push_rx_ports(struct net_device *dev)
{
struct vxlan_sock *vs;
struct net *net = dev_net(dev);
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
unsigned int i;
spin_lock(&vn->sock_lock);
for (i = 0; i < PORT_HASH_SIZE; ++i) {
hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist)
udp_tunnel_push_rx_port(dev, vs->sock,
(vs->flags & VXLAN_F_GPE) ?
UDP_TUNNEL_TYPE_VXLAN_GPE :
UDP_TUNNEL_TYPE_VXLAN);
}
spin_unlock(&vn->sock_lock);
}
/* Initialize the device structure. */
static void vxlan_setup(struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
unsigned int h;
eth_hw_addr_random(dev);
ether_setup(dev);
dev->destructor = free_netdev;
SET_NETDEV_DEVTYPE(dev, &vxlan_type);
dev->features |= NETIF_F_LLTX;
dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM;
dev->features |= NETIF_F_RXCSUM;
dev->features |= NETIF_F_GSO_SOFTWARE;
dev->vlan_features = dev->features;
dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
netif_keep_dst(dev);
dev->priv_flags |= IFF_NO_QUEUE;
INIT_LIST_HEAD(&vxlan->next);
spin_lock_init(&vxlan->hash_lock);
init_timer_deferrable(&vxlan->age_timer);
vxlan->age_timer.function = vxlan_cleanup;
vxlan->age_timer.data = (unsigned long) vxlan;
vxlan->cfg.dst_port = htons(vxlan_port);
vxlan->dev = dev;
gro_cells_init(&vxlan->gro_cells, dev);
for (h = 0; h < FDB_HASH_SIZE; ++h)
INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
}
static void vxlan_ether_setup(struct net_device *dev)
{
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
dev->netdev_ops = &vxlan_netdev_ether_ops;
}
static void vxlan_raw_setup(struct net_device *dev)
{
dev->header_ops = NULL;
dev->type = ARPHRD_NONE;
dev->hard_header_len = 0;
dev->addr_len = 0;
dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
dev->netdev_ops = &vxlan_netdev_raw_ops;
}
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
[IFLA_VXLAN_ID] = { .type = NLA_U32 },
[IFLA_VXLAN_GROUP] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
[IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) },
[IFLA_VXLAN_LINK] = { .type = NLA_U32 },
[IFLA_VXLAN_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
[IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) },
[IFLA_VXLAN_TOS] = { .type = NLA_U8 },
[IFLA_VXLAN_TTL] = { .type = NLA_U8 },
[IFLA_VXLAN_LABEL] = { .type = NLA_U32 },
[IFLA_VXLAN_LEARNING] = { .type = NLA_U8 },
[IFLA_VXLAN_AGEING] = { .type = NLA_U32 },
[IFLA_VXLAN_LIMIT] = { .type = NLA_U32 },
[IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) },
[IFLA_VXLAN_PROXY] = { .type = NLA_U8 },
[IFLA_VXLAN_RSC] = { .type = NLA_U8 },
[IFLA_VXLAN_L2MISS] = { .type = NLA_U8 },
[IFLA_VXLAN_L3MISS] = { .type = NLA_U8 },
[IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 },
[IFLA_VXLAN_PORT] = { .type = NLA_U16 },
[IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 },
[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 },
[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 },
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
[IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 },
[IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 },
vxlan: Group Policy extension Implements supports for the Group Policy VXLAN extension [0] to provide a lightweight and simple security label mechanism across network peers based on VXLAN. The security context and associated metadata is mapped to/from skb->mark. This allows further mapping to a SELinux context using SECMARK, to implement ACLs directly with nftables, iptables, OVS, tc, etc. The group membership is defined by the lower 16 bits of skb->mark, the upper 16 bits are used for flags. SELinux allows to manage label to secure local resources. However, distributed applications require ACLs to implemented across hosts. This is typically achieved by matching on L2-L4 fields to identify the original sending host and process on the receiver. On top of that, netlabel and specifically CIPSO [1] allow to map security contexts to universal labels. However, netlabel and CIPSO are relatively complex. This patch provides a lightweight alternative for overlay network environments with a trusted underlay. No additional control protocol is required. Host 1: Host 2: Group A Group B Group B Group A +-----+ +-------------+ +-------+ +-----+ | lxc | | SELinux CTX | | httpd | | VM | +--+--+ +--+----------+ +---+---+ +--+--+ \---+---/ \----+---/ | | +---+---+ +---+---+ | vxlan | | vxlan | +---+---+ +---+---+ +------------------------------+ Backwards compatibility: A VXLAN-GBP socket can receive standard VXLAN frames and will assign the default group 0x0000 to such frames. A Linux VXLAN socket will drop VXLAN-GBP frames. The extension is therefore disabled by default and needs to be specifically enabled: ip link add [...] type vxlan [...] gbp In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket must run on a separate port number. Examples: iptables: host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200 host2# iptables -I INPUT -m mark --mark 0x200 -j DROP OVS: # ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL' # ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop' [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy [1] http://lwn.net/Articles/204905/ Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 05:53:55 +03:00
[IFLA_VXLAN_GBP] = { .type = NLA_FLAG, },
[IFLA_VXLAN_GPE] = { .type = NLA_FLAG, },
[IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG },
};
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
{
if (tb[IFLA_ADDRESS]) {
if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
pr_debug("invalid link address (not ethernet)\n");
return -EINVAL;
}
if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
pr_debug("invalid all zero ethernet address\n");
return -EADDRNOTAVAIL;
}
}
if (!data)
return -EINVAL;
if (data[IFLA_VXLAN_ID]) {
__u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
if (id >= VXLAN_VID_MASK)
return -ERANGE;
}
if (data[IFLA_VXLAN_PORT_RANGE]) {
const struct ifla_vxlan_port_range *p
= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
if (ntohs(p->high) < ntohs(p->low)) {
pr_debug("port range %u .. %u not valid\n",
ntohs(p->low), ntohs(p->high));
return -EINVAL;
}
}
return 0;
}
static void vxlan_get_drvinfo(struct net_device *netdev,
struct ethtool_drvinfo *drvinfo)
{
strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
}
static const struct ethtool_ops vxlan_ethtool_ops = {
.get_drvinfo = vxlan_get_drvinfo,
.get_link = ethtool_op_get_link,
};
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
__be16 port, u32 flags)
{
struct socket *sock;
struct udp_port_cfg udp_conf;
int err;
memset(&udp_conf, 0, sizeof(udp_conf));
if (ipv6) {
udp_conf.family = AF_INET6;
udp_conf.use_udp6_rx_checksums =
!(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
udp_conf.ipv6_v6only = 1;
} else {
udp_conf.family = AF_INET;
}
udp_conf.local_udp_port = port;
/* Open UDP socket */
err = udp_sock_create(net, &udp_conf, &sock);
if (err < 0)
return ERR_PTR(err);
return sock;
}
/* Create new listen socket if needed */
static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
__be16 port, u32 flags)
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
struct vxlan_sock *vs;
struct socket *sock;
unsigned int h;
struct udp_tunnel_sock_cfg tunnel_cfg;
vs = kzalloc(sizeof(*vs), GFP_KERNEL);
if (!vs)
return ERR_PTR(-ENOMEM);
for (h = 0; h < VNI_HASH_SIZE; ++h)
INIT_HLIST_HEAD(&vs->vni_list[h]);
sock = vxlan_create_sock(net, ipv6, port, flags);
if (IS_ERR(sock)) {
pr_info("Cannot bind port %d, err=%ld\n", ntohs(port),
PTR_ERR(sock));
kfree(vs);
return ERR_CAST(sock);
}
vs->sock = sock;
atomic_set(&vs->refcnt, 1);
vs->flags = (flags & VXLAN_F_RCV_FLAGS);
spin_lock(&vn->sock_lock);
hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
udp_tunnel_notify_add_rx_port(sock,
(vs->flags & VXLAN_F_GPE) ?
UDP_TUNNEL_TYPE_VXLAN_GPE :
UDP_TUNNEL_TYPE_VXLAN);
spin_unlock(&vn->sock_lock);
/* Mark socket as an encapsulation socket. */
memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
tunnel_cfg.sk_user_data = vs;
tunnel_cfg.encap_type = 1;
tunnel_cfg.encap_rcv = vxlan_rcv;
tunnel_cfg.encap_destroy = NULL;
tunnel_cfg.gro_receive = vxlan_gro_receive;
tunnel_cfg.gro_complete = vxlan_gro_complete;
setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
return vs;
}
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
struct vxlan_sock *vs = NULL;
if (!vxlan->cfg.no_share) {
spin_lock(&vn->sock_lock);
vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
vxlan->cfg.dst_port, vxlan->flags);
if (vs && !atomic_add_unless(&vs->refcnt, 1, 0)) {
spin_unlock(&vn->sock_lock);
return -EBUSY;
}
spin_unlock(&vn->sock_lock);
}
if (!vs)
vs = vxlan_socket_create(vxlan->net, ipv6,
vxlan->cfg.dst_port, vxlan->flags);
if (IS_ERR(vs))
return PTR_ERR(vs);
#if IS_ENABLED(CONFIG_IPV6)
if (ipv6)
rcu_assign_pointer(vxlan->vn6_sock, vs);
else
#endif
rcu_assign_pointer(vxlan->vn4_sock, vs);
vxlan_vs_add_dev(vs, vxlan);
return 0;
}
static int vxlan_sock_add(struct vxlan_dev *vxlan)
{
bool ipv6 = vxlan->flags & VXLAN_F_IPV6;
bool metadata = vxlan->flags & VXLAN_F_COLLECT_METADATA;
int ret = 0;
RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
#if IS_ENABLED(CONFIG_IPV6)
RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
if (ipv6 || metadata)
ret = __vxlan_sock_add(vxlan, true);
#endif
if (!ret && (!ipv6 || metadata))
ret = __vxlan_sock_add(vxlan, false);
if (ret < 0)
vxlan_sock_release(vxlan);
return ret;
}
static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
struct vxlan_config *conf)
{
struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
struct vxlan_dev *vxlan = netdev_priv(dev), *tmp;
struct vxlan_rdst *dst = &vxlan->default_dst;
unsigned short needed_headroom = ETH_HLEN;
int err;
bool use_ipv6 = false;
__be16 default_port = vxlan->cfg.dst_port;
struct net_device *lowerdev = NULL;
if (conf->flags & VXLAN_F_GPE) {
/* For now, allow GPE only together with COLLECT_METADATA.
* This can be relaxed later; in such case, the other side
* of the PtP link will have to be provided.
*/
if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
pr_info("unsupported combination of extensions\n");
return -EINVAL;
}
vxlan_raw_setup(dev);
} else {
vxlan_ether_setup(dev);
}
net: use core MTU range checking in core net infra geneve: - Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu - This one isn't quite as straight-forward as others, could use some closer inspection and testing macvlan: - set min/max_mtu tun: - set min/max_mtu, remove tun_net_change_mtu vxlan: - Merge __vxlan_change_mtu back into vxlan_change_mtu - Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function - This one is also not as straight-forward and could use closer inspection and testing from vxlan folks bridge: - set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function openvswitch: - set min/max_mtu, remove internal_dev_change_mtu - note: max_mtu wasn't checked previously, it's been set to 65535, which is the largest possible size supported sch_teql: - set min/max_mtu (note: max_mtu previously unchecked, used max of 65535) macsec: - min_mtu = 0, max_mtu = 65535 macvlan: - min_mtu = 0, max_mtu = 65535 ntb_netdev: - min_mtu = 0, max_mtu = 65535 veth: - min_mtu = 68, max_mtu = 65535 8021q: - min_mtu = 0, max_mtu = 65535 CC: netdev@vger.kernel.org CC: Nicolas Dichtel <nicolas.dichtel@6wind.com> CC: Hannes Frederic Sowa <hannes@stressinduktion.org> CC: Tom Herbert <tom@herbertland.com> CC: Daniel Borkmann <daniel@iogearbox.net> CC: Alexander Duyck <alexander.h.duyck@intel.com> CC: Paolo Abeni <pabeni@redhat.com> CC: Jiri Benc <jbenc@redhat.com> CC: WANG Cong <xiyou.wangcong@gmail.com> CC: Roopa Prabhu <roopa@cumulusnetworks.com> CC: Pravin B Shelar <pshelar@ovn.org> CC: Sabrina Dubroca <sd@queasysnail.net> CC: Patrick McHardy <kaber@trash.net> CC: Stephen Hemminger <stephen@networkplumber.org> CC: Pravin Shelar <pshelar@nicira.com> CC: Maxim Krasnyansky <maxk@qti.qualcomm.com> Signed-off-by: Jarod Wilson <jarod@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 20:55:20 +03:00
/* MTU range: 68 - 65535 */
dev->min_mtu = ETH_MIN_MTU;
dev->max_mtu = ETH_MAX_MTU;
vxlan->net = src_net;
dst->remote_vni = conf->vni;
memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));
/* Unless IPv6 is explicitly requested, assume IPv4 */
if (!dst->remote_ip.sa.sa_family)
dst->remote_ip.sa.sa_family = AF_INET;
if (dst->remote_ip.sa.sa_family == AF_INET6 ||
vxlan->cfg.saddr.sa.sa_family == AF_INET6) {
if (!IS_ENABLED(CONFIG_IPV6))
return -EPFNOSUPPORT;
use_ipv6 = true;
vxlan->flags |= VXLAN_F_IPV6;
}
if (conf->label && !use_ipv6) {
pr_info("label only supported in use with IPv6\n");
return -EINVAL;
}
if (conf->remote_ifindex) {
lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
dst->remote_ifindex = conf->remote_ifindex;
if (!lowerdev) {
pr_info("ifindex %d does not exist\n", dst->remote_ifindex);
return -ENODEV;
}
#if IS_ENABLED(CONFIG_IPV6)
if (use_ipv6) {
struct inet6_dev *idev = __in6_dev_get(lowerdev);
if (idev && idev->cnf.disable_ipv6) {
pr_info("IPv6 is disabled via sysctl\n");
return -EPERM;
}
}
#endif
if (!conf->mtu)
net: use core MTU range checking in core net infra geneve: - Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu - This one isn't quite as straight-forward as others, could use some closer inspection and testing macvlan: - set min/max_mtu tun: - set min/max_mtu, remove tun_net_change_mtu vxlan: - Merge __vxlan_change_mtu back into vxlan_change_mtu - Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function - This one is also not as straight-forward and could use closer inspection and testing from vxlan folks bridge: - set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function openvswitch: - set min/max_mtu, remove internal_dev_change_mtu - note: max_mtu wasn't checked previously, it's been set to 65535, which is the largest possible size supported sch_teql: - set min/max_mtu (note: max_mtu previously unchecked, used max of 65535) macsec: - min_mtu = 0, max_mtu = 65535 macvlan: - min_mtu = 0, max_mtu = 65535 ntb_netdev: - min_mtu = 0, max_mtu = 65535 veth: - min_mtu = 68, max_mtu = 65535 8021q: - min_mtu = 0, max_mtu = 65535 CC: netdev@vger.kernel.org CC: Nicolas Dichtel <nicolas.dichtel@6wind.com> CC: Hannes Frederic Sowa <hannes@stressinduktion.org> CC: Tom Herbert <tom@herbertland.com> CC: Daniel Borkmann <daniel@iogearbox.net> CC: Alexander Duyck <alexander.h.duyck@intel.com> CC: Paolo Abeni <pabeni@redhat.com> CC: Jiri Benc <jbenc@redhat.com> CC: WANG Cong <xiyou.wangcong@gmail.com> CC: Roopa Prabhu <roopa@cumulusnetworks.com> CC: Pravin B Shelar <pshelar@ovn.org> CC: Sabrina Dubroca <sd@queasysnail.net> CC: Patrick McHardy <kaber@trash.net> CC: Stephen Hemminger <stephen@networkplumber.org> CC: Pravin Shelar <pshelar@nicira.com> CC: Maxim Krasnyansky <maxk@qti.qualcomm.com> Signed-off-by: Jarod Wilson <jarod@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 20:55:20 +03:00
dev->mtu = lowerdev->mtu -
(use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
needed_headroom = lowerdev->hard_header_len;
} else if (vxlan_addr_multicast(&dst->remote_ip)) {
pr_info("multicast destination requires interface to be specified\n");
return -EINVAL;
}
if (conf->mtu) {
net: use core MTU range checking in core net infra geneve: - Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu - This one isn't quite as straight-forward as others, could use some closer inspection and testing macvlan: - set min/max_mtu tun: - set min/max_mtu, remove tun_net_change_mtu vxlan: - Merge __vxlan_change_mtu back into vxlan_change_mtu - Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function - This one is also not as straight-forward and could use closer inspection and testing from vxlan folks bridge: - set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in change_mtu function openvswitch: - set min/max_mtu, remove internal_dev_change_mtu - note: max_mtu wasn't checked previously, it's been set to 65535, which is the largest possible size supported sch_teql: - set min/max_mtu (note: max_mtu previously unchecked, used max of 65535) macsec: - min_mtu = 0, max_mtu = 65535 macvlan: - min_mtu = 0, max_mtu = 65535 ntb_netdev: - min_mtu = 0, max_mtu = 65535 veth: - min_mtu = 68, max_mtu = 65535 8021q: - min_mtu = 0, max_mtu = 65535 CC: netdev@vger.kernel.org CC: Nicolas Dichtel <nicolas.dichtel@6wind.com> CC: Hannes Frederic Sowa <hannes@stressinduktion.org> CC: Tom Herbert <tom@herbertland.com> CC: Daniel Borkmann <daniel@iogearbox.net> CC: Alexander Duyck <alexander.h.duyck@intel.com> CC: Paolo Abeni <pabeni@redhat.com> CC: Jiri Benc <jbenc@redhat.com> CC: WANG Cong <xiyou.wangcong@gmail.com> CC: Roopa Prabhu <roopa@cumulusnetworks.com> CC: Pravin B Shelar <pshelar@ovn.org> CC: Sabrina Dubroca <sd@queasysnail.net> CC: Patrick McHardy <kaber@trash.net> CC: Stephen Hemminger <stephen@networkplumber.org> CC: Pravin Shelar <pshelar@nicira.com> CC: Maxim Krasnyansky <maxk@qti.qualcomm.com> Signed-off-by: Jarod Wilson <jarod@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 20:55:20 +03:00
int max_mtu = ETH_MAX_MTU;
if (lowerdev)
max_mtu = lowerdev->mtu;
max_mtu -= (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
if (conf->mtu < dev->min_mtu || conf->mtu > dev->max_mtu)
return -EINVAL;
dev->mtu = conf->mtu;
if (conf->mtu > max_mtu)
dev->mtu = max_mtu;
}
if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA)
needed_headroom += VXLAN6_HEADROOM;
else
needed_headroom += VXLAN_HEADROOM;
dev->needed_headroom = needed_headroom;
memcpy(&vxlan->cfg, conf, sizeof(*conf));
if (!vxlan->cfg.dst_port) {
if (conf->flags & VXLAN_F_GPE)
vxlan->cfg.dst_port = 4790; /* IANA assigned VXLAN-GPE port */
else
vxlan->cfg.dst_port = default_port;
}
vxlan->flags |= conf->flags;
if (!vxlan->cfg.age_interval)
vxlan->cfg.age_interval = FDB_AGE_DEFAULT;
list_for_each_entry(tmp, &vn->vxlan_list, next) {
if (tmp->cfg.vni == conf->vni &&
(tmp->default_dst.remote_ip.sa.sa_family == AF_INET6 ||
tmp->cfg.saddr.sa.sa_family == AF_INET6) == use_ipv6 &&
tmp->cfg.dst_port == vxlan->cfg.dst_port &&
(tmp->flags & VXLAN_F_RCV_FLAGS) ==
(vxlan->flags & VXLAN_F_RCV_FLAGS)) {
pr_info("duplicate VNI %u\n", be32_to_cpu(conf->vni));
return -EEXIST;
}
}
dev->ethtool_ops = &vxlan_ethtool_ops;
/* create an fdb entry for a valid default destination */
if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) {
err = vxlan_fdb_create(vxlan, all_zeros_mac,
&vxlan->default_dst.remote_ip,
NUD_REACHABLE|NUD_PERMANENT,
NLM_F_EXCL|NLM_F_CREATE,
vxlan->cfg.dst_port,
vxlan->default_dst.remote_vni,
vxlan->default_dst.remote_ifindex,
NTF_SELF);
if (err)
return err;
}
err = register_netdevice(dev);
if (err) {
vxlan_fdb_delete_default(vxlan);
return err;
}
list_add(&vxlan->next, &vn->vxlan_list);
return 0;
}
static int vxlan_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct vxlan_config conf;
memset(&conf, 0, sizeof(conf));
if (data[IFLA_VXLAN_ID])
conf.vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
if (data[IFLA_VXLAN_GROUP]) {
conf.remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
} else if (data[IFLA_VXLAN_GROUP6]) {
if (!IS_ENABLED(CONFIG_IPV6))
return -EPFNOSUPPORT;
conf.remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
conf.remote_ip.sa.sa_family = AF_INET6;
}
if (data[IFLA_VXLAN_LOCAL]) {
conf.saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
conf.saddr.sa.sa_family = AF_INET;
} else if (data[IFLA_VXLAN_LOCAL6]) {
if (!IS_ENABLED(CONFIG_IPV6))
return -EPFNOSUPPORT;
/* TODO: respect scope id */
conf.saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
conf.saddr.sa.sa_family = AF_INET6;
}
if (data[IFLA_VXLAN_LINK])
conf.remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);
if (data[IFLA_VXLAN_TOS])
conf.tos = nla_get_u8(data[IFLA_VXLAN_TOS]);
if (data[IFLA_VXLAN_TTL])
conf.ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
if (data[IFLA_VXLAN_LABEL])
conf.label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
IPV6_FLOWLABEL_MASK;
if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
conf.flags |= VXLAN_F_LEARN;
if (data[IFLA_VXLAN_AGEING])
conf.age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
conf.flags |= VXLAN_F_PROXY;
if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
conf.flags |= VXLAN_F_RSC;
if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
conf.flags |= VXLAN_F_L2MISS;
if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
conf.flags |= VXLAN_F_L3MISS;
if (data[IFLA_VXLAN_LIMIT])
conf.addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
if (data[IFLA_VXLAN_COLLECT_METADATA] &&
nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA]))
conf.flags |= VXLAN_F_COLLECT_METADATA;
if (data[IFLA_VXLAN_PORT_RANGE]) {
const struct ifla_vxlan_port_range *p
= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
conf.port_min = ntohs(p->low);
conf.port_max = ntohs(p->high);
}
if (data[IFLA_VXLAN_PORT])
conf.dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
if (data[IFLA_VXLAN_UDP_CSUM] &&
!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
conf.flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] &&
nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]))
conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_TX;
if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] &&
nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
conf.flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
if (data[IFLA_VXLAN_REMCSUM_TX] &&
nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX]))
conf.flags |= VXLAN_F_REMCSUM_TX;
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
if (data[IFLA_VXLAN_REMCSUM_RX] &&
nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX]))
conf.flags |= VXLAN_F_REMCSUM_RX;
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
vxlan: Group Policy extension Implements supports for the Group Policy VXLAN extension [0] to provide a lightweight and simple security label mechanism across network peers based on VXLAN. The security context and associated metadata is mapped to/from skb->mark. This allows further mapping to a SELinux context using SECMARK, to implement ACLs directly with nftables, iptables, OVS, tc, etc. The group membership is defined by the lower 16 bits of skb->mark, the upper 16 bits are used for flags. SELinux allows to manage label to secure local resources. However, distributed applications require ACLs to implemented across hosts. This is typically achieved by matching on L2-L4 fields to identify the original sending host and process on the receiver. On top of that, netlabel and specifically CIPSO [1] allow to map security contexts to universal labels. However, netlabel and CIPSO are relatively complex. This patch provides a lightweight alternative for overlay network environments with a trusted underlay. No additional control protocol is required. Host 1: Host 2: Group A Group B Group B Group A +-----+ +-------------+ +-------+ +-----+ | lxc | | SELinux CTX | | httpd | | VM | +--+--+ +--+----------+ +---+---+ +--+--+ \---+---/ \----+---/ | | +---+---+ +---+---+ | vxlan | | vxlan | +---+---+ +---+---+ +------------------------------+ Backwards compatibility: A VXLAN-GBP socket can receive standard VXLAN frames and will assign the default group 0x0000 to such frames. A Linux VXLAN socket will drop VXLAN-GBP frames. The extension is therefore disabled by default and needs to be specifically enabled: ip link add [...] type vxlan [...] gbp In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket must run on a separate port number. Examples: iptables: host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200 host2# iptables -I INPUT -m mark --mark 0x200 -j DROP OVS: # ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL' # ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop' [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy [1] http://lwn.net/Articles/204905/ Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 05:53:55 +03:00
if (data[IFLA_VXLAN_GBP])
conf.flags |= VXLAN_F_GBP;
vxlan: Group Policy extension Implements supports for the Group Policy VXLAN extension [0] to provide a lightweight and simple security label mechanism across network peers based on VXLAN. The security context and associated metadata is mapped to/from skb->mark. This allows further mapping to a SELinux context using SECMARK, to implement ACLs directly with nftables, iptables, OVS, tc, etc. The group membership is defined by the lower 16 bits of skb->mark, the upper 16 bits are used for flags. SELinux allows to manage label to secure local resources. However, distributed applications require ACLs to implemented across hosts. This is typically achieved by matching on L2-L4 fields to identify the original sending host and process on the receiver. On top of that, netlabel and specifically CIPSO [1] allow to map security contexts to universal labels. However, netlabel and CIPSO are relatively complex. This patch provides a lightweight alternative for overlay network environments with a trusted underlay. No additional control protocol is required. Host 1: Host 2: Group A Group B Group B Group A +-----+ +-------------+ +-------+ +-----+ | lxc | | SELinux CTX | | httpd | | VM | +--+--+ +--+----------+ +---+---+ +--+--+ \---+---/ \----+---/ | | +---+---+ +---+---+ | vxlan | | vxlan | +---+---+ +---+---+ +------------------------------+ Backwards compatibility: A VXLAN-GBP socket can receive standard VXLAN frames and will assign the default group 0x0000 to such frames. A Linux VXLAN socket will drop VXLAN-GBP frames. The extension is therefore disabled by default and needs to be specifically enabled: ip link add [...] type vxlan [...] gbp In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket must run on a separate port number. Examples: iptables: host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200 host2# iptables -I INPUT -m mark --mark 0x200 -j DROP OVS: # ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL' # ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop' [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy [1] http://lwn.net/Articles/204905/ Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 05:53:55 +03:00
if (data[IFLA_VXLAN_GPE])
conf.flags |= VXLAN_F_GPE;
if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL])
conf.flags |= VXLAN_F_REMCSUM_NOPARTIAL;
if (tb[IFLA_MTU])
conf.mtu = nla_get_u32(tb[IFLA_MTU]);
return vxlan_dev_configure(src_net, dev, &conf);
}
static void vxlan_dellink(struct net_device *dev, struct list_head *head)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
spin_lock(&vn->sock_lock);
if (!hlist_unhashed(&vxlan->hlist))
hlist_del_rcu(&vxlan->hlist);
spin_unlock(&vn->sock_lock);
gro_cells_destroy(&vxlan->gro_cells);
list_del(&vxlan->next);
unregister_netdevice_queue(dev, head);
}
static size_t vxlan_get_size(const struct net_device *dev)
{
return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */
nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */
nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */
nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
0;
}
static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
const struct vxlan_dev *vxlan = netdev_priv(dev);
const struct vxlan_rdst *dst = &vxlan->default_dst;
struct ifla_vxlan_port_range ports = {
.low = htons(vxlan->cfg.port_min),
.high = htons(vxlan->cfg.port_max),
};
if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
goto nla_put_failure;
if (!vxlan_addr_any(&dst->remote_ip)) {
if (dst->remote_ip.sa.sa_family == AF_INET) {
if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
dst->remote_ip.sin.sin_addr.s_addr))
goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
} else {
if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
&dst->remote_ip.sin6.sin6_addr))
goto nla_put_failure;
#endif
}
}
if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
goto nla_put_failure;
if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
vxlan->cfg.saddr.sin.sin_addr.s_addr))
goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
} else {
if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
&vxlan->cfg.saddr.sin6.sin6_addr))
goto nla_put_failure;
#endif
}
}
if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
nla_put_u8(skb, IFLA_VXLAN_LEARNING,
!!(vxlan->flags & VXLAN_F_LEARN)) ||
nla_put_u8(skb, IFLA_VXLAN_PROXY,
!!(vxlan->flags & VXLAN_F_PROXY)) ||
nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
nla_put_u8(skb, IFLA_VXLAN_L2MISS,
!!(vxlan->flags & VXLAN_F_L2MISS)) ||
nla_put_u8(skb, IFLA_VXLAN_L3MISS,
!!(vxlan->flags & VXLAN_F_L3MISS)) ||
nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
!!(vxlan->flags & VXLAN_F_COLLECT_METADATA)) ||
nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
vxlan: Remote checksum offload Add support for remote checksum offload in VXLAN. This uses a reserved bit to indicate that RCO is being done, and uses the low order reserved eight bits of the VNI to hold the start and offset values in a compressed manner. Start is encoded in the low order seven bits of VNI. This is start >> 1 so that the checksum start offset is 0-254 using even values only. Checksum offset (transport checksum field) is indicated in the high order bit in the low order byte of the VNI. If the bit is set, the checksum field is for UDP (so offset = start + 6), else checksum field is for TCP (so offset = start + 16). Only TCP and UDP are supported in this implementation. Remote checksum offload for VXLAN is described in: https://tools.ietf.org/html/draft-herbert-vxlan-rco-00 Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4). With UDP checksums and Remote Checksum Offload IPv4 Client 11.84% CPU utilization Server 12.96% CPU utilization 9197 Mbps IPv6 Client 12.46% CPU utilization Server 14.48% CPU utilization 8963 Mbps With UDP checksums, no remote checksum offload IPv4 Client 15.67% CPU utilization Server 14.83% CPU utilization 9094 Mbps IPv6 Client 16.21% CPU utilization Server 14.32% CPU utilization 9058 Mbps No UDP checksums IPv4 Client 15.03% CPU utilization Server 23.09% CPU utilization 9089 Mbps IPv6 Client 16.18% CPU utilization Server 26.57% CPU utilization 8954 Mbps Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-13 04:00:38 +03:00
!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
!!(vxlan->flags & VXLAN_F_REMCSUM_TX)) ||
nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
!!(vxlan->flags & VXLAN_F_REMCSUM_RX)))
goto nla_put_failure;
if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
goto nla_put_failure;
vxlan: Group Policy extension Implements supports for the Group Policy VXLAN extension [0] to provide a lightweight and simple security label mechanism across network peers based on VXLAN. The security context and associated metadata is mapped to/from skb->mark. This allows further mapping to a SELinux context using SECMARK, to implement ACLs directly with nftables, iptables, OVS, tc, etc. The group membership is defined by the lower 16 bits of skb->mark, the upper 16 bits are used for flags. SELinux allows to manage label to secure local resources. However, distributed applications require ACLs to implemented across hosts. This is typically achieved by matching on L2-L4 fields to identify the original sending host and process on the receiver. On top of that, netlabel and specifically CIPSO [1] allow to map security contexts to universal labels. However, netlabel and CIPSO are relatively complex. This patch provides a lightweight alternative for overlay network environments with a trusted underlay. No additional control protocol is required. Host 1: Host 2: Group A Group B Group B Group A +-----+ +-------------+ +-------+ +-----+ | lxc | | SELinux CTX | | httpd | | VM | +--+--+ +--+----------+ +---+---+ +--+--+ \---+---/ \----+---/ | | +---+---+ +---+---+ | vxlan | | vxlan | +---+---+ +---+---+ +------------------------------+ Backwards compatibility: A VXLAN-GBP socket can receive standard VXLAN frames and will assign the default group 0x0000 to such frames. A Linux VXLAN socket will drop VXLAN-GBP frames. The extension is therefore disabled by default and needs to be specifically enabled: ip link add [...] type vxlan [...] gbp In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket must run on a separate port number. Examples: iptables: host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200 host2# iptables -I INPUT -m mark --mark 0x200 -j DROP OVS: # ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL' # ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop' [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy [1] http://lwn.net/Articles/204905/ Signed-off-by: Thomas Graf <tgraf@suug.ch> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 05:53:55 +03:00
if (vxlan->flags & VXLAN_F_GBP &&
nla_put_flag(skb, IFLA_VXLAN_GBP))
goto nla_put_failure;
if (vxlan->flags & VXLAN_F_GPE &&
nla_put_flag(skb, IFLA_VXLAN_GPE))
goto nla_put_failure;
if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL &&
nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
goto nla_put_failure;
return 0;
nla_put_failure:
return -EMSGSIZE;
}
static struct net *vxlan_get_link_net(const struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
return vxlan->net;
}
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
.kind = "vxlan",
.maxtype = IFLA_VXLAN_MAX,
.policy = vxlan_policy,
.priv_size = sizeof(struct vxlan_dev),
.setup = vxlan_setup,
.validate = vxlan_validate,
.newlink = vxlan_newlink,
.dellink = vxlan_dellink,
.get_size = vxlan_get_size,
.fill_info = vxlan_fill_info,
.get_link_net = vxlan_get_link_net,
};
struct net_device *vxlan_dev_create(struct net *net, const char *name,
u8 name_assign_type,
struct vxlan_config *conf)
{
struct nlattr *tb[IFLA_MAX + 1];
struct net_device *dev;
int err;
memset(&tb, 0, sizeof(tb));
dev = rtnl_create_link(net, name, name_assign_type,
&vxlan_link_ops, tb);
if (IS_ERR(dev))
return dev;
err = vxlan_dev_configure(net, dev, conf);
if (err < 0) {
free_netdev(dev);
return ERR_PTR(err);
}
err = rtnl_configure_link(dev, NULL);
if (err < 0) {
LIST_HEAD(list_kill);
vxlan_dellink(dev, &list_kill);
unregister_netdevice_many(&list_kill);
return ERR_PTR(err);
}
return dev;
}
EXPORT_SYMBOL_GPL(vxlan_dev_create);
net: vxlan: when lower dev unregisters remove vxlan dev as well We can create a vxlan device with an explicit underlying carrier. In that case, when the carrier link is being deleted from the system (e.g. due to module unload) we should also clean up all created vxlan devices on top of it since otherwise we're in an inconsistent state in vxlan device. In that case, the user needs to remove all such devices, while in case of other virtual devs that sit on top of physical ones, it is usually the case that these devices do unregister automatically as well and do not leave the burden on the user. This work is not necessary when vxlan device was not created with a real underlying device, as connections can resume in that case when driver is plugged again. But at least for the other cases, we should go ahead and do the cleanup on removal. We don't register the notifier during vxlan_newlink() here since I consider this event rather rare, and therefore we should not bloat vxlan's core structure unecessary. Also, we can simply make use of unregister_netdevice_many() to batch that. fdb is flushed upon ndo_stop(). E.g. `ip -d link show vxlan13` after carrier removal before this patch: 5: vxlan13: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN mode DEFAULT group default link/ether 1e:47:da:6d:4d:99 brd ff:ff:ff:ff:ff:ff promiscuity 0 vxlan id 13 group 239.0.0.10 dev 2 port 32768 61000 ageing 300 ^^^^^ Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-13 21:41:19 +04:00
static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
struct net_device *dev)
{
struct vxlan_dev *vxlan, *next;
LIST_HEAD(list_kill);
list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
struct vxlan_rdst *dst = &vxlan->default_dst;
/* In case we created vxlan device with carrier
* and we loose the carrier due to module unload
* we also need to remove vxlan device. In other
* cases, it's not necessary and remote_ifindex
* is 0 here, so no matches.
*/
if (dst->remote_ifindex == dev->ifindex)
vxlan_dellink(vxlan->dev, &list_kill);
}
unregister_netdevice_many(&list_kill);
}
static int vxlan_netdevice_event(struct notifier_block *unused,
unsigned long event, void *ptr)
net: vxlan: when lower dev unregisters remove vxlan dev as well We can create a vxlan device with an explicit underlying carrier. In that case, when the carrier link is being deleted from the system (e.g. due to module unload) we should also clean up all created vxlan devices on top of it since otherwise we're in an inconsistent state in vxlan device. In that case, the user needs to remove all such devices, while in case of other virtual devs that sit on top of physical ones, it is usually the case that these devices do unregister automatically as well and do not leave the burden on the user. This work is not necessary when vxlan device was not created with a real underlying device, as connections can resume in that case when driver is plugged again. But at least for the other cases, we should go ahead and do the cleanup on removal. We don't register the notifier during vxlan_newlink() here since I consider this event rather rare, and therefore we should not bloat vxlan's core structure unecessary. Also, we can simply make use of unregister_netdevice_many() to batch that. fdb is flushed upon ndo_stop(). E.g. `ip -d link show vxlan13` after carrier removal before this patch: 5: vxlan13: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN mode DEFAULT group default link/ether 1e:47:da:6d:4d:99 brd ff:ff:ff:ff:ff:ff promiscuity 0 vxlan id 13 group 239.0.0.10 dev 2 port 32768 61000 ageing 300 ^^^^^ Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-13 21:41:19 +04:00
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
net: vxlan: when lower dev unregisters remove vxlan dev as well We can create a vxlan device with an explicit underlying carrier. In that case, when the carrier link is being deleted from the system (e.g. due to module unload) we should also clean up all created vxlan devices on top of it since otherwise we're in an inconsistent state in vxlan device. In that case, the user needs to remove all such devices, while in case of other virtual devs that sit on top of physical ones, it is usually the case that these devices do unregister automatically as well and do not leave the burden on the user. This work is not necessary when vxlan device was not created with a real underlying device, as connections can resume in that case when driver is plugged again. But at least for the other cases, we should go ahead and do the cleanup on removal. We don't register the notifier during vxlan_newlink() here since I consider this event rather rare, and therefore we should not bloat vxlan's core structure unecessary. Also, we can simply make use of unregister_netdevice_many() to batch that. fdb is flushed upon ndo_stop(). E.g. `ip -d link show vxlan13` after carrier removal before this patch: 5: vxlan13: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN mode DEFAULT group default link/ether 1e:47:da:6d:4d:99 brd ff:ff:ff:ff:ff:ff promiscuity 0 vxlan id 13 group 239.0.0.10 dev 2 port 32768 61000 ageing 300 ^^^^^ Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-13 21:41:19 +04:00
if (event == NETDEV_UNREGISTER)
net: vxlan: when lower dev unregisters remove vxlan dev as well We can create a vxlan device with an explicit underlying carrier. In that case, when the carrier link is being deleted from the system (e.g. due to module unload) we should also clean up all created vxlan devices on top of it since otherwise we're in an inconsistent state in vxlan device. In that case, the user needs to remove all such devices, while in case of other virtual devs that sit on top of physical ones, it is usually the case that these devices do unregister automatically as well and do not leave the burden on the user. This work is not necessary when vxlan device was not created with a real underlying device, as connections can resume in that case when driver is plugged again. But at least for the other cases, we should go ahead and do the cleanup on removal. We don't register the notifier during vxlan_newlink() here since I consider this event rather rare, and therefore we should not bloat vxlan's core structure unecessary. Also, we can simply make use of unregister_netdevice_many() to batch that. fdb is flushed upon ndo_stop(). E.g. `ip -d link show vxlan13` after carrier removal before this patch: 5: vxlan13: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN mode DEFAULT group default link/ether 1e:47:da:6d:4d:99 brd ff:ff:ff:ff:ff:ff promiscuity 0 vxlan id 13 group 239.0.0.10 dev 2 port 32768 61000 ageing 300 ^^^^^ Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-13 21:41:19 +04:00
vxlan_handle_lowerdev_unregister(vn, dev);
else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO)
vxlan_push_rx_ports(dev);
net: vxlan: when lower dev unregisters remove vxlan dev as well We can create a vxlan device with an explicit underlying carrier. In that case, when the carrier link is being deleted from the system (e.g. due to module unload) we should also clean up all created vxlan devices on top of it since otherwise we're in an inconsistent state in vxlan device. In that case, the user needs to remove all such devices, while in case of other virtual devs that sit on top of physical ones, it is usually the case that these devices do unregister automatically as well and do not leave the burden on the user. This work is not necessary when vxlan device was not created with a real underlying device, as connections can resume in that case when driver is plugged again. But at least for the other cases, we should go ahead and do the cleanup on removal. We don't register the notifier during vxlan_newlink() here since I consider this event rather rare, and therefore we should not bloat vxlan's core structure unecessary. Also, we can simply make use of unregister_netdevice_many() to batch that. fdb is flushed upon ndo_stop(). E.g. `ip -d link show vxlan13` after carrier removal before this patch: 5: vxlan13: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN mode DEFAULT group default link/ether 1e:47:da:6d:4d:99 brd ff:ff:ff:ff:ff:ff promiscuity 0 vxlan id 13 group 239.0.0.10 dev 2 port 32768 61000 ageing 300 ^^^^^ Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-13 21:41:19 +04:00
return NOTIFY_DONE;
}
static struct notifier_block vxlan_notifier_block __read_mostly = {
.notifier_call = vxlan_netdevice_event,
net: vxlan: when lower dev unregisters remove vxlan dev as well We can create a vxlan device with an explicit underlying carrier. In that case, when the carrier link is being deleted from the system (e.g. due to module unload) we should also clean up all created vxlan devices on top of it since otherwise we're in an inconsistent state in vxlan device. In that case, the user needs to remove all such devices, while in case of other virtual devs that sit on top of physical ones, it is usually the case that these devices do unregister automatically as well and do not leave the burden on the user. This work is not necessary when vxlan device was not created with a real underlying device, as connections can resume in that case when driver is plugged again. But at least for the other cases, we should go ahead and do the cleanup on removal. We don't register the notifier during vxlan_newlink() here since I consider this event rather rare, and therefore we should not bloat vxlan's core structure unecessary. Also, we can simply make use of unregister_netdevice_many() to batch that. fdb is flushed upon ndo_stop(). E.g. `ip -d link show vxlan13` after carrier removal before this patch: 5: vxlan13: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN mode DEFAULT group default link/ether 1e:47:da:6d:4d:99 brd ff:ff:ff:ff:ff:ff promiscuity 0 vxlan id 13 group 239.0.0.10 dev 2 port 32768 61000 ageing 300 ^^^^^ Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-13 21:41:19 +04:00
};
static __net_init int vxlan_init_net(struct net *net)
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
unsigned int h;
INIT_LIST_HEAD(&vn->vxlan_list);
spin_lock_init(&vn->sock_lock);
for (h = 0; h < PORT_HASH_SIZE; ++h)
INIT_HLIST_HEAD(&vn->sock_list[h]);
return 0;
}
static void __net_exit vxlan_exit_net(struct net *net)
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
struct vxlan_dev *vxlan, *next;
struct net_device *dev, *aux;
LIST_HEAD(list);
rtnl_lock();
for_each_netdev_safe(net, dev, aux)
if (dev->rtnl_link_ops == &vxlan_link_ops)
unregister_netdevice_queue(dev, &list);
list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
/* If vxlan->dev is in the same netns, it has already been added
* to the list by the previous loop.
*/
if (!net_eq(dev_net(vxlan->dev), net)) {
gro_cells_destroy(&vxlan->gro_cells);
unregister_netdevice_queue(vxlan->dev, &list);
}
}
unregister_netdevice_many(&list);
rtnl_unlock();
}
static struct pernet_operations vxlan_net_ops = {
.init = vxlan_init_net,
.exit = vxlan_exit_net,
.id = &vxlan_net_id,
.size = sizeof(struct vxlan_net),
};
static int __init vxlan_init_module(void)
{
int rc;
get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
rc = register_pernet_subsys(&vxlan_net_ops);
if (rc)
goto out1;
net: vxlan: when lower dev unregisters remove vxlan dev as well We can create a vxlan device with an explicit underlying carrier. In that case, when the carrier link is being deleted from the system (e.g. due to module unload) we should also clean up all created vxlan devices on top of it since otherwise we're in an inconsistent state in vxlan device. In that case, the user needs to remove all such devices, while in case of other virtual devs that sit on top of physical ones, it is usually the case that these devices do unregister automatically as well and do not leave the burden on the user. This work is not necessary when vxlan device was not created with a real underlying device, as connections can resume in that case when driver is plugged again. But at least for the other cases, we should go ahead and do the cleanup on removal. We don't register the notifier during vxlan_newlink() here since I consider this event rather rare, and therefore we should not bloat vxlan's core structure unecessary. Also, we can simply make use of unregister_netdevice_many() to batch that. fdb is flushed upon ndo_stop(). E.g. `ip -d link show vxlan13` after carrier removal before this patch: 5: vxlan13: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN mode DEFAULT group default link/ether 1e:47:da:6d:4d:99 brd ff:ff:ff:ff:ff:ff promiscuity 0 vxlan id 13 group 239.0.0.10 dev 2 port 32768 61000 ageing 300 ^^^^^ Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-13 21:41:19 +04:00
rc = register_netdevice_notifier(&vxlan_notifier_block);
if (rc)
goto out2;
net: vxlan: when lower dev unregisters remove vxlan dev as well We can create a vxlan device with an explicit underlying carrier. In that case, when the carrier link is being deleted from the system (e.g. due to module unload) we should also clean up all created vxlan devices on top of it since otherwise we're in an inconsistent state in vxlan device. In that case, the user needs to remove all such devices, while in case of other virtual devs that sit on top of physical ones, it is usually the case that these devices do unregister automatically as well and do not leave the burden on the user. This work is not necessary when vxlan device was not created with a real underlying device, as connections can resume in that case when driver is plugged again. But at least for the other cases, we should go ahead and do the cleanup on removal. We don't register the notifier during vxlan_newlink() here since I consider this event rather rare, and therefore we should not bloat vxlan's core structure unecessary. Also, we can simply make use of unregister_netdevice_many() to batch that. fdb is flushed upon ndo_stop(). E.g. `ip -d link show vxlan13` after carrier removal before this patch: 5: vxlan13: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN mode DEFAULT group default link/ether 1e:47:da:6d:4d:99 brd ff:ff:ff:ff:ff:ff promiscuity 0 vxlan id 13 group 239.0.0.10 dev 2 port 32768 61000 ageing 300 ^^^^^ Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-13 21:41:19 +04:00
rc = rtnl_link_register(&vxlan_link_ops);
if (rc)
goto out3;
net: vxlan: when lower dev unregisters remove vxlan dev as well We can create a vxlan device with an explicit underlying carrier. In that case, when the carrier link is being deleted from the system (e.g. due to module unload) we should also clean up all created vxlan devices on top of it since otherwise we're in an inconsistent state in vxlan device. In that case, the user needs to remove all such devices, while in case of other virtual devs that sit on top of physical ones, it is usually the case that these devices do unregister automatically as well and do not leave the burden on the user. This work is not necessary when vxlan device was not created with a real underlying device, as connections can resume in that case when driver is plugged again. But at least for the other cases, we should go ahead and do the cleanup on removal. We don't register the notifier during vxlan_newlink() here since I consider this event rather rare, and therefore we should not bloat vxlan's core structure unecessary. Also, we can simply make use of unregister_netdevice_many() to batch that. fdb is flushed upon ndo_stop(). E.g. `ip -d link show vxlan13` after carrier removal before this patch: 5: vxlan13: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN mode DEFAULT group default link/ether 1e:47:da:6d:4d:99 brd ff:ff:ff:ff:ff:ff promiscuity 0 vxlan id 13 group 239.0.0.10 dev 2 port 32768 61000 ageing 300 ^^^^^ Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-13 21:41:19 +04:00
return 0;
out3:
unregister_netdevice_notifier(&vxlan_notifier_block);
out2:
unregister_pernet_subsys(&vxlan_net_ops);
out1:
return rc;
}
late_initcall(vxlan_init_module);
static void __exit vxlan_cleanup_module(void)
{
rtnl_link_unregister(&vxlan_link_ops);
net: vxlan: when lower dev unregisters remove vxlan dev as well We can create a vxlan device with an explicit underlying carrier. In that case, when the carrier link is being deleted from the system (e.g. due to module unload) we should also clean up all created vxlan devices on top of it since otherwise we're in an inconsistent state in vxlan device. In that case, the user needs to remove all such devices, while in case of other virtual devs that sit on top of physical ones, it is usually the case that these devices do unregister automatically as well and do not leave the burden on the user. This work is not necessary when vxlan device was not created with a real underlying device, as connections can resume in that case when driver is plugged again. But at least for the other cases, we should go ahead and do the cleanup on removal. We don't register the notifier during vxlan_newlink() here since I consider this event rather rare, and therefore we should not bloat vxlan's core structure unecessary. Also, we can simply make use of unregister_netdevice_many() to batch that. fdb is flushed upon ndo_stop(). E.g. `ip -d link show vxlan13` after carrier removal before this patch: 5: vxlan13: <BROADCAST,MULTICAST> mtu 1450 qdisc noop state DOWN mode DEFAULT group default link/ether 1e:47:da:6d:4d:99 brd ff:ff:ff:ff:ff:ff promiscuity 0 vxlan id 13 group 239.0.0.10 dev 2 port 32768 61000 ageing 300 ^^^^^ Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-13 21:41:19 +04:00
unregister_netdevice_notifier(&vxlan_notifier_block);
unregister_pernet_subsys(&vxlan_net_ops);
/* rcu_barrier() is called by netns */
}
module_exit(vxlan_cleanup_module);
MODULE_LICENSE("GPL");
MODULE_VERSION(VXLAN_VERSION);
MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("vxlan");