2012-10-01 12:32:35 +00:00
/*
2012-11-13 13:29:15 +00:00
* VXLAN : Virtual eXtensible Local Area Network
2012-10-01 12:32:35 +00:00
*
2013-04-27 11:31:52 +00:00
* Copyright ( c ) 2012 - 2013 Vyatta Inc .
2012-10-01 12:32:35 +00:00
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation .
*/
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
# include <linux/kernel.h>
# include <linux/module.h>
# include <linux/errno.h>
# include <linux/slab.h>
# include <linux/udp.h>
# include <linux/igmp.h>
# include <linux/if_ether.h>
2013-01-29 23:43:07 +00:00
# include <linux/ethtool.h>
2012-11-20 02:50:14 +00:00
# include <net/arp.h>
# include <net/ndisc.h>
2012-10-01 12:32:35 +00:00
# include <net/ip.h>
# include <net/icmp.h>
# include <net/rtnetlink.h>
# include <net/inet_ecn.h>
# include <net/net_namespace.h>
# include <net/netns/generic.h>
2013-08-19 11:23:07 -07:00
# include <net/vxlan.h>
2016-02-09 22:07:29 -08:00
2013-08-31 13:44:33 +08:00
# if IS_ENABLED(CONFIG_IPV6)
# include <net/ip6_tunnel.h>
2013-09-02 10:06:52 +08:00
# include <net/ip6_checksum.h>
2013-08-31 13:44:33 +08:00
# endif
2012-10-01 12:32:35 +00:00
# define VXLAN_VERSION "0.1"
2013-05-16 11:35:20 +00:00
# define PORT_HASH_BITS 8
# define PORT_HASH_SIZE (1<<PORT_HASH_BITS)
2012-10-01 12:32:35 +00:00
# define FDB_AGE_DEFAULT 300 /* 5 min */
# define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */
2013-04-27 11:31:53 +00:00
/* UDP port for VXLAN traffic.
* The IANA assigned port is 4789 , but the Linux default is 8472
2013-06-17 14:16:41 -07:00
* for compatibility with early adopters .
2013-04-27 11:31:53 +00:00
*/
2013-06-17 14:16:12 -07:00
static unsigned short vxlan_port __read_mostly = 8472 ;
module_param_named ( udp_port , vxlan_port , ushort , 0444 ) ;
2012-10-01 12:32:35 +00:00
MODULE_PARM_DESC ( udp_port , " Destination UDP port " ) ;
static bool log_ecn_error = true ;
module_param ( log_ecn_error , bool , 0644 ) ;
MODULE_PARM_DESC ( log_ecn_error , " Log packets received with corrupted ECN " ) ;
netns: make struct pernet_operations::id unsigned int
Make struct pernet_operations::id unsigned.
There are 2 reasons to do so:
1)
This field is really an index into a zero-based array and
thus is an unsigned entity. Using negative value is out-of-bound
access by definition.
2)
On x86_64 unsigned 32-bit data which are mixed with pointers
via array indexing or offsets added or subtracted to pointers
are preferred to signed 32-bit data.
"int" being used as an array index needs to be sign-extended
to 64-bit before being used.
void f(long *p, int i)
{
g(p[i]);
}
roughly translates to
movsx rsi, esi
mov rdi, [rsi+...]
call g
MOVSX is 3 byte instruction which isn't necessary if the variable is
unsigned because x86_64 is zero extending by default.
Now, there is net_generic() function which, you guessed it right, uses
"int" as an array index:
static inline void *net_generic(const struct net *net, int id)
{
...
ptr = ng->ptr[id - 1];
...
}
And this function is used a lot, so those sign extensions add up.
Patch snipes ~1730 bytes on allyesconfig kernel (without all junk
messing with code generation):
add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
Unfortunately some functions actually grow bigger.
This is a seemingly random artefact of code generation with register
allocator being used differently. gcc decides that some variable
needs to live in new r8+ registers and every access now requires REX
prefix. Or it is shifted into r12, so [r12+0] addressing mode has to be
used which is longer than [r8]
However, overall balance is in negative direction:
add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
function old new delta
nfsd4_lock 3886 3959 +73
tipc_link_build_proto_msg 1096 1140 +44
mac80211_hwsim_new_radio 2776 2808 +32
tipc_mon_rcv 1032 1058 +26
svcauth_gss_legacy_init 1413 1429 +16
tipc_bcbase_select_primary 379 392 +13
nfsd4_exchange_id 1247 1260 +13
nfsd4_setclientid_confirm 782 793 +11
...
put_client_renew_locked 494 480 -14
ip_set_sockfn_get 730 716 -14
geneve_sock_add 829 813 -16
nfsd4_sequence_done 721 703 -18
nlmclnt_lookup_host 708 686 -22
nfsd4_lockt 1085 1063 -22
nfs_get_client 1077 1050 -27
tcf_bpf_init 1106 1076 -30
nfsd4_encode_fattr 5997 5930 -67
Total: Before=154856051, After=154854321, chg -0.00%
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-17 04:58:21 +03:00
static unsigned int vxlan_net_id ;
2015-07-21 10:44:02 +02:00
static struct rtnl_link_ops vxlan_link_ops ;
2013-05-16 11:35:20 +00:00
2016-01-29 09:43:47 +08:00
static const u8 all_zeros_mac [ ETH_ALEN + 2 ] ;
2013-06-25 16:01:51 +03:00
2015-09-24 13:50:01 +02:00
static int vxlan_sock_add ( struct vxlan_dev * vxlan ) ;
2015-07-21 10:44:06 +02:00
2013-05-16 11:35:20 +00:00
/* per-network namespace private data for this module */
struct vxlan_net {
struct list_head vxlan_list ;
struct hlist_head sock_list [ PORT_HASH_SIZE ] ;
2013-06-17 14:16:11 -07:00
spinlock_t sock_lock ;
2013-05-16 11:35:20 +00:00
} ;
2012-10-01 12:32:35 +00:00
/* Forwarding table entry */
struct vxlan_fdb {
struct hlist_node hlist ; /* linked list of entries */
struct rcu_head rcu ;
unsigned long updated ; /* jiffies */
unsigned long used ;
2013-06-17 14:16:12 -07:00
struct list_head remotes ;
2015-07-20 09:54:50 +02:00
u8 eth_addr [ ETH_ALEN ] ;
2012-10-01 12:32:35 +00:00
u16 state ; /* see ndm_state */
2017-01-31 22:59:52 -08:00
__be32 vni ;
2013-04-19 00:36:26 +00:00
u8 flags ; /* see ndm_flags */
2012-10-01 12:32:35 +00:00
} ;
/* salt for hash table; mixed into the hash by eth_vni_hash() */
static u32 vxlan_salt __read_mostly;
2015-07-21 10:43:58 +02:00
static inline bool vxlan_collect_metadata ( struct vxlan_sock * vs )
{
2015-07-21 10:44:01 +02:00
return vs - > flags & VXLAN_F_COLLECT_METADATA | |
ip_tunnel_collect_metadata ( ) ;
2015-07-21 10:43:58 +02:00
}
2013-08-31 13:44:33 +08:00
# if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal ( const union vxlan_addr * a , const union vxlan_addr * b )
{
2015-03-29 16:17:37 +02:00
if ( a - > sa . sa_family ! = b - > sa . sa_family )
return false ;
if ( a - > sa . sa_family = = AF_INET6 )
return ipv6_addr_equal ( & a - > sin6 . sin6_addr , & b - > sin6 . sin6_addr ) ;
else
return a - > sin . sin_addr . s_addr = = b - > sin . sin_addr . s_addr ;
2013-08-31 13:44:33 +08:00
}
static inline bool vxlan_addr_any ( const union vxlan_addr * ipa )
{
2015-03-29 16:17:37 +02:00
if ( ipa - > sa . sa_family = = AF_INET6 )
return ipv6_addr_any ( & ipa - > sin6 . sin6_addr ) ;
else
return ipa - > sin . sin_addr . s_addr = = htonl ( INADDR_ANY ) ;
2013-08-31 13:44:33 +08:00
}
static inline bool vxlan_addr_multicast ( const union vxlan_addr * ipa )
{
2015-03-29 16:17:37 +02:00
if ( ipa - > sa . sa_family = = AF_INET6 )
return ipv6_addr_is_multicast ( & ipa - > sin6 . sin6_addr ) ;
else
return IN_MULTICAST ( ntohl ( ipa - > sin . sin_addr . s_addr ) ) ;
2013-08-31 13:44:33 +08:00
}
static int vxlan_nla_get_addr ( union vxlan_addr * ip , struct nlattr * nla )
{
2015-03-29 16:17:37 +02:00
if ( nla_len ( nla ) > = sizeof ( struct in6_addr ) ) {
2015-03-29 16:59:26 +02:00
ip - > sin6 . sin6_addr = nla_get_in6_addr ( nla ) ;
2015-03-29 16:17:37 +02:00
ip - > sa . sa_family = AF_INET6 ;
return 0 ;
} else if ( nla_len ( nla ) > = sizeof ( __be32 ) ) {
2015-03-29 16:59:26 +02:00
ip - > sin . sin_addr . s_addr = nla_get_in_addr ( nla ) ;
2015-03-29 16:17:37 +02:00
ip - > sa . sa_family = AF_INET ;
return 0 ;
} else {
return - EAFNOSUPPORT ;
}
2013-08-31 13:44:33 +08:00
}
static int vxlan_nla_put_addr ( struct sk_buff * skb , int attr ,
2015-03-29 16:17:37 +02:00
const union vxlan_addr * ip )
2013-08-31 13:44:33 +08:00
{
2015-03-29 16:17:37 +02:00
if ( ip - > sa . sa_family = = AF_INET6 )
2015-03-29 16:59:25 +02:00
return nla_put_in6_addr ( skb , attr , & ip - > sin6 . sin6_addr ) ;
2015-03-29 16:17:37 +02:00
else
2015-03-29 16:59:25 +02:00
return nla_put_in_addr ( skb , attr , ip - > sin . sin_addr . s_addr ) ;
2013-08-31 13:44:33 +08:00
}
# else /* !CONFIG_IPV6 */
static inline
bool vxlan_addr_equal ( const union vxlan_addr * a , const union vxlan_addr * b )
{
2015-03-29 16:17:37 +02:00
return a - > sin . sin_addr . s_addr = = b - > sin . sin_addr . s_addr ;
2013-08-31 13:44:33 +08:00
}
static inline bool vxlan_addr_any ( const union vxlan_addr * ipa )
{
2015-03-29 16:17:37 +02:00
return ipa - > sin . sin_addr . s_addr = = htonl ( INADDR_ANY ) ;
2013-08-31 13:44:33 +08:00
}
static inline bool vxlan_addr_multicast ( const union vxlan_addr * ipa )
{
2015-03-29 16:17:37 +02:00
return IN_MULTICAST ( ntohl ( ipa - > sin . sin_addr . s_addr ) ) ;
2013-08-31 13:44:33 +08:00
}
static int vxlan_nla_get_addr ( union vxlan_addr * ip , struct nlattr * nla )
{
2015-03-29 16:17:37 +02:00
if ( nla_len ( nla ) > = sizeof ( struct in6_addr ) ) {
return - EAFNOSUPPORT ;
} else if ( nla_len ( nla ) > = sizeof ( __be32 ) ) {
2015-03-29 16:59:26 +02:00
ip - > sin . sin_addr . s_addr = nla_get_in_addr ( nla ) ;
2015-03-29 16:17:37 +02:00
ip - > sa . sa_family = AF_INET ;
return 0 ;
} else {
return - EAFNOSUPPORT ;
}
2013-08-31 13:44:33 +08:00
}
static int vxlan_nla_put_addr ( struct sk_buff * skb , int attr ,
2015-03-29 16:17:37 +02:00
const union vxlan_addr * ip )
2013-08-31 13:44:33 +08:00
{
2015-03-29 16:59:25 +02:00
return nla_put_in_addr ( skb , attr , ip - > sin . sin_addr . s_addr ) ;
2013-08-31 13:44:33 +08:00
}
# endif
2013-05-16 11:35:20 +00:00
/* Virtual Network hash table head */
2016-02-16 21:58:58 +01:00
static inline struct hlist_head * vni_head ( struct vxlan_sock * vs , __be32 vni )
2013-05-16 11:35:20 +00:00
{
2016-02-16 21:58:58 +01:00
return & vs - > vni_list [ hash_32 ( ( __force u32 ) vni , VNI_HASH_BITS ) ] ;
2013-05-16 11:35:20 +00:00
}
/* Socket hash table head: bucket of the per-namespace sock_list for @port */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	const u32 bucket = hash_32(ntohs(port), PORT_HASH_BITS);

	return &vn->sock_list[bucket];
}
2013-06-17 14:16:12 -07:00
/* First remote destination for a forwarding entry.
* Guaranteed to be non - NULL because remotes are never deleted .
*/
2013-08-04 17:17:39 -07:00
static inline struct vxlan_rdst * first_remote_rcu ( struct vxlan_fdb * fdb )
2013-06-17 14:16:12 -07:00
{
2013-08-04 17:17:39 -07:00
return list_entry_rcu ( fdb - > remotes . next , struct vxlan_rdst , list ) ;
}
static inline struct vxlan_rdst * first_remote_rtnl ( struct vxlan_fdb * fdb )
{
return list_first_entry ( & fdb - > remotes , struct vxlan_rdst , list ) ;
2013-06-17 14:16:12 -07:00
}
2015-01-15 03:53:56 +01:00
/* Find VXLAN socket based on network namespace, address family and UDP port
 * and enabled unshareable flags.
 *
 * @flags is masked with VXLAN_F_RCV_FLAGS before being compared against
 * each candidate socket's flags. Returns NULL when nothing matches.
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
					  __be16 port, u32 flags)
{
	struct hlist_head *bucket = vs_head(net, port);
	struct vxlan_sock *vs;

	flags &= VXLAN_F_RCV_FLAGS;

	hlist_for_each_entry_rcu(vs, bucket, hlist) {
		if (inet_sk(vs->sock->sk)->inet_sport != port)
			continue;
		if (vxlan_get_sk_family(vs) != family)
			continue;
		if (vs->flags != flags)
			continue;

		return vs;
	}

	return NULL;
}
2016-02-16 21:58:58 +01:00
/* Find the device on socket @vs whose default destination VNI equals @vni.
 * Returns NULL if there is none.
 */
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, __be32 vni)
{
	struct hlist_head *bucket;
	struct vxlan_dev *dev;

	/* For flow based devices, map all packets to VNI 0 */
	if (vs->flags & VXLAN_F_COLLECT_METADATA)
		vni = 0;

	bucket = vni_head(vs, vni);
	hlist_for_each_entry_rcu(dev, bucket, hlist) {
		if (dev->default_dst.remote_vni == vni)
			return dev;
	}

	return NULL;
}
2013-08-19 11:23:02 -07:00
/* Look up VNI in a per net namespace table */
2016-02-16 21:58:58 +01:00
static struct vxlan_dev * vxlan_find_vni ( struct net * net , __be32 vni ,
2015-01-15 03:53:56 +01:00
sa_family_t family , __be16 port ,
u32 flags )
2013-08-19 11:23:02 -07:00
{
struct vxlan_sock * vs ;
2015-01-15 03:53:56 +01:00
vs = vxlan_find_sock ( net , family , port , flags ) ;
2013-08-19 11:23:02 -07:00
if ( ! vs )
return NULL ;
2016-02-16 21:58:58 +01:00
return vxlan_vs_find_vni ( vs , vni ) ;
2013-08-19 11:23:02 -07:00
}
2012-10-01 12:32:35 +00:00
/* Fill in neighbour message in skbuff.
 *
 * Builds one netlink message of @type describing @fdb and its remote
 * @rdst. For RTM_GETNEIGH the link-layer address and the destination are
 * only included when they are non-zero / not the "any" address; for every
 * other type the family is AF_BRIDGE and both are always included.
 *
 * Returns 0 on success, -EMSGSIZE if the message does not fit in @skb (a
 * partially built message is cancelled first).
 */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	unsigned long now = jiffies;
	struct nda_cacheinfo ci;
	struct nlmsghdr *nlh;
	struct ndmsg *ndm;
	bool send_ip = true;
	bool send_eth = true;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (!nlh)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	if (type != RTM_GETNEIGH) {
		ndm->ndm_family = AF_BRIDGE;
	} else {
		send_ip = !vxlan_addr_any(&rdst->remote_ip);
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
		if (send_ip)
			ndm->ndm_family = rdst->remote_ip.sa.sa_family;
		else
			ndm->ndm_family = AF_INET;
	}

	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	ndm->ndm_type = RTN_UNICAST;

	/* Only report a netns id when the device lives in another namespace
	 * than vxlan->net.
	 */
	if (!net_eq(dev_net(vxlan->dev), vxlan->net)) {
		int nsid = peernet2id(dev_net(vxlan->dev), vxlan->net);

		if (nla_put_s32(skb, NDA_LINK_NETNSID, nsid))
			goto nla_put_failure;
	}

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;

	if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
		goto nla_put_failure;

	/* Port and VNI are only reported when they differ from the device's
	 * configured defaults.
	 */
	if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port) {
		if (nla_put_be16(skb, NDA_PORT, rdst->remote_port))
			goto nla_put_failure;
	}

	if (rdst->remote_vni != vxlan->default_dst.remote_vni) {
		if (nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
			goto nla_put_failure;
	}

	if ((vxlan->flags & VXLAN_F_COLLECT_METADATA) && fdb->vni) {
		if (nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni)))
			goto nla_put_failure;
	}

	if (rdst->remote_ifindex) {
		if (nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
			goto nla_put_failure;
	}

	/* Ages are relative to the jiffies snapshot taken on entry. */
	ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_confirmed = 0;
	ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated);
	ci.ndm_refcnt = 0;

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static inline size_t vxlan_nlmsg_size ( void )
{
return NLMSG_ALIGN ( sizeof ( struct ndmsg ) )
+ nla_total_size ( ETH_ALEN ) /* NDA_LLADDR */
2013-08-31 13:44:33 +08:00
+ nla_total_size ( sizeof ( struct in6_addr ) ) /* NDA_DST */
2013-04-27 11:31:54 +00:00
+ nla_total_size ( sizeof ( __be16 ) ) /* NDA_PORT */
2013-03-15 04:35:51 +00:00
+ nla_total_size ( sizeof ( __be32 ) ) /* NDA_VNI */
+ nla_total_size ( sizeof ( __u32 ) ) /* NDA_IFINDEX */
2015-01-26 14:10:53 +01:00
+ nla_total_size ( sizeof ( __s32 ) ) /* NDA_LINK_NETNSID */
2012-10-01 12:32:35 +00:00
+ nla_total_size ( sizeof ( struct nda_cacheinfo ) ) ;
}
2014-04-22 15:01:30 +02:00
static void vxlan_fdb_notify ( struct vxlan_dev * vxlan , struct vxlan_fdb * fdb ,
struct vxlan_rdst * rd , int type )
2012-10-01 12:32:35 +00:00
{
struct net * net = dev_net ( vxlan - > dev ) ;
struct sk_buff * skb ;
int err = - ENOBUFS ;
skb = nlmsg_new ( vxlan_nlmsg_size ( ) , GFP_ATOMIC ) ;
if ( skb = = NULL )
goto errout ;
2014-04-22 15:01:30 +02:00
err = vxlan_fdb_info ( skb , vxlan , fdb , 0 , 0 , type , 0 , rd ) ;
2012-10-01 12:32:35 +00:00
if ( err < 0 ) {
/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
WARN_ON ( err = = - EMSGSIZE ) ;
kfree_skb ( skb ) ;
goto errout ;
}
rtnl_notify ( skb , net , 0 , RTNLGRP_NEIGH , NULL , GFP_ATOMIC ) ;
return ;
errout :
if ( err < 0 )
rtnl_set_sk_err ( net , RTNLGRP_NEIGH , err ) ;
}
2013-08-31 13:44:33 +08:00
static void vxlan_ip_miss ( struct net_device * dev , union vxlan_addr * ipa )
2012-11-20 02:50:14 +00:00
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2013-06-17 14:16:40 -07:00
struct vxlan_fdb f = {
. state = NUD_STALE ,
} ;
struct vxlan_rdst remote = {
2013-08-31 13:44:33 +08:00
. remote_ip = * ipa , /* goes to NDA_DST */
2016-02-16 21:58:58 +01:00
. remote_vni = cpu_to_be32 ( VXLAN_N_VID ) ,
2013-06-17 14:16:40 -07:00
} ;
2013-06-17 14:16:12 -07:00
2014-04-22 15:01:30 +02:00
vxlan_fdb_notify ( vxlan , & f , & remote , RTM_GETNEIGH ) ;
2012-11-20 02:50:14 +00:00
}
static void vxlan_fdb_miss ( struct vxlan_dev * vxlan , const u8 eth_addr [ ETH_ALEN ] )
{
2013-06-17 14:16:40 -07:00
struct vxlan_fdb f = {
. state = NUD_STALE ,
} ;
2014-04-22 15:01:30 +02:00
struct vxlan_rdst remote = { } ;
2012-11-20 02:50:14 +00:00
memcpy ( f . eth_addr , eth_addr , ETH_ALEN ) ;
2014-04-22 15:01:30 +02:00
vxlan_fdb_notify ( vxlan , & f , & remote , RTM_GETNEIGH ) ;
2012-11-20 02:50:14 +00:00
}
2012-10-01 12:32:35 +00:00
/* Hash Ethernet address */
static u32 eth_hash ( const unsigned char * addr )
{
u64 value = get_unaligned ( ( u64 * ) addr ) ;
/* only want 6 bytes */
# ifdef __BIG_ENDIAN
value > > = 16 ;
2012-10-09 20:35:47 +00:00
# else
value < < = 16 ;
2012-10-01 12:32:35 +00:00
# endif
return hash_64 ( value , FDB_HASH_BITS ) ;
}
2017-01-31 22:59:52 -08:00
/* Hash an Ethernet address together with a VNI, salted with vxlan_salt,
 * and reduce the result to a table index below FDB_HASH_SIZE.
 */
static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
	/* use 1 byte of OUI and 3 bytes of NIC */
	const u32 mac_tail = get_unaligned((u32 *)(addr + 2));
	const u32 mixed = jhash_2words(mac_tail, vni, vxlan_salt);

	return mixed & (FDB_HASH_SIZE - 1);
}
2012-10-01 12:32:35 +00:00
/* Hash chain to use given mac address */
static inline struct hlist_head * vxlan_fdb_head ( struct vxlan_dev * vxlan ,
2017-01-31 22:59:52 -08:00
const u8 * mac , __be32 vni )
2012-10-01 12:32:35 +00:00
{
2017-01-31 22:59:52 -08:00
if ( vxlan - > flags & VXLAN_F_COLLECT_METADATA )
return & vxlan - > fdb_head [ eth_vni_hash ( mac , vni ) ] ;
else
return & vxlan - > fdb_head [ eth_hash ( mac ) ] ;
2012-10-01 12:32:35 +00:00
}
/* Look up Ethernet address in forwarding table */
2013-05-17 06:39:07 +00:00
static struct vxlan_fdb * __vxlan_find_mac ( struct vxlan_dev * vxlan ,
2017-01-31 22:59:52 -08:00
const u8 * mac , __be32 vni )
2012-10-01 12:32:35 +00:00
{
2017-01-31 22:59:52 -08:00
struct hlist_head * head = vxlan_fdb_head ( vxlan , mac , vni ) ;
2012-10-01 12:32:35 +00:00
struct vxlan_fdb * f ;
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-27 17:06:00 -08:00
hlist_for_each_entry_rcu ( f , head , hlist ) {
2017-01-31 22:59:52 -08:00
if ( ether_addr_equal ( mac , f - > eth_addr ) ) {
if ( vxlan - > flags & VXLAN_F_COLLECT_METADATA ) {
if ( vni = = f - > vni )
return f ;
} else {
return f ;
}
}
2012-10-01 12:32:35 +00:00
}
return NULL ;
}
2013-05-17 06:39:07 +00:00
static struct vxlan_fdb * vxlan_find_mac ( struct vxlan_dev * vxlan ,
2017-01-31 22:59:52 -08:00
const u8 * mac , __be32 vni )
2013-05-17 06:39:07 +00:00
{
struct vxlan_fdb * f ;
2017-01-31 22:59:52 -08:00
f = __vxlan_find_mac ( vxlan , mac , vni ) ;
2013-05-17 06:39:07 +00:00
if ( f )
f - > used = jiffies ;
return f ;
}
2013-06-25 16:01:52 +03:00
/* caller should hold vxlan->hash_lock */
static struct vxlan_rdst * vxlan_fdb_find_rdst ( struct vxlan_fdb * f ,
2013-08-31 13:44:33 +08:00
union vxlan_addr * ip , __be16 port ,
2016-02-16 21:58:58 +01:00
__be32 vni , __u32 ifindex )
2013-03-15 04:35:51 +00:00
{
2013-06-17 14:16:12 -07:00
struct vxlan_rdst * rd ;
2013-03-15 04:35:51 +00:00
2013-06-17 14:16:12 -07:00
list_for_each_entry ( rd , & f - > remotes , list ) {
2013-08-31 13:44:33 +08:00
if ( vxlan_addr_equal ( & rd - > remote_ip , ip ) & &
2013-03-15 04:35:51 +00:00
rd - > remote_port = = port & &
rd - > remote_vni = = vni & &
rd - > remote_ifindex = = ifindex )
2013-06-25 16:01:52 +03:00
return rd ;
2013-03-15 04:35:51 +00:00
}
2013-06-17 14:16:12 -07:00
2013-06-25 16:01:52 +03:00
return NULL ;
}
2013-07-19 17:20:07 +02:00
/* Replace destination of unicast mac.
 *
 * Returns 0 when nothing changed (an identical remote already exists, or
 * the entry has no remote at all) and 1 when the first remote was
 * overwritten; its dst cache is reset before the overwrite.
 */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
			     union vxlan_addr *ip, __be16 port, __be32 vni,
			     __u32 ifindex)
{
	struct vxlan_rdst *first;

	if (vxlan_fdb_find_rdst(f, ip, port, vni, ifindex))
		return 0;

	first = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!first)
		return 0;

	dst_cache_reset(&first->dst_cache);
	first->remote_ip = *ip;
	first->remote_port = port;
	first->remote_vni = vni;
	first->remote_ifindex = ifindex;

	return 1;
}
2013-06-25 16:01:52 +03:00
/* Add/update destinations for multicast */
static int vxlan_fdb_append ( struct vxlan_fdb * f ,
2016-02-16 21:58:58 +01:00
union vxlan_addr * ip , __be16 port , __be32 vni ,
2014-04-22 15:01:30 +02:00
__u32 ifindex , struct vxlan_rdst * * rdp )
2013-06-25 16:01:52 +03:00
{
struct vxlan_rdst * rd ;
rd = vxlan_fdb_find_rdst ( f , ip , port , vni , ifindex ) ;
if ( rd )
return 0 ;
2013-03-15 04:35:51 +00:00
rd = kmalloc ( sizeof ( * rd ) , GFP_ATOMIC ) ;
if ( rd = = NULL )
return - ENOBUFS ;
2016-02-12 15:43:56 +01:00
if ( dst_cache_init ( & rd - > dst_cache , GFP_ATOMIC ) ) {
kfree ( rd ) ;
return - ENOBUFS ;
}
2013-08-31 13:44:33 +08:00
rd - > remote_ip = * ip ;
2013-03-15 04:35:51 +00:00
rd - > remote_port = port ;
rd - > remote_vni = vni ;
rd - > remote_ifindex = ifindex ;
2013-06-17 14:16:12 -07:00
list_add_tail_rcu ( & rd - > list , & f - > remotes ) ;
2014-04-22 15:01:30 +02:00
* rdp = rd ;
2013-03-15 04:35:51 +00:00
return 1 ;
}
2015-01-12 17:00:38 -08:00
static struct vxlanhdr * vxlan_gro_remcsum ( struct sk_buff * skb ,
unsigned int off ,
struct vxlanhdr * vh , size_t hdrlen ,
2016-02-16 21:58:58 +01:00
__be32 vni_field ,
struct gro_remcsum * grc ,
2015-02-10 16:30:32 -08:00
bool nopartial )
2015-01-12 17:00:38 -08:00
{
2015-08-19 17:07:32 -07:00
size_t start , offset ;
2015-01-12 17:00:38 -08:00
if ( skb - > remcsum_offload )
2015-08-19 17:07:32 -07:00
return vh ;
2015-01-12 17:00:38 -08:00
if ( ! NAPI_GRO_CB ( skb ) - > csum_valid )
return NULL ;
2016-02-16 21:58:58 +01:00
start = vxlan_rco_start ( vni_field ) ;
offset = start + vxlan_rco_offset ( vni_field ) ;
2015-01-12 17:00:38 -08:00
2015-08-19 17:07:32 -07:00
vh = skb_gro_remcsum_process ( skb , ( void * ) vh , off , hdrlen ,
start , offset , grc , nopartial ) ;
2015-01-12 17:00:38 -08:00
skb - > remcsum_offload = 1 ;
return vh ;
}
2016-04-05 08:22:53 -07:00
static struct sk_buff * * vxlan_gro_receive ( struct sock * sk ,
struct sk_buff * * head ,
struct sk_buff * skb )
2014-01-20 13:59:21 +02:00
{
struct sk_buff * p , * * pp = NULL ;
struct vxlanhdr * vh , * vh2 ;
2014-12-30 19:10:15 -08:00
unsigned int hlen , off_vx ;
2014-01-20 13:59:21 +02:00
int flush = 1 ;
2016-04-05 08:22:53 -07:00
struct vxlan_sock * vs = rcu_dereference_sk_user_data ( sk ) ;
2016-02-16 21:58:58 +01:00
__be32 flags ;
2015-02-10 16:30:27 -08:00
struct gro_remcsum grc ;
skb_gro_remcsum_init ( & grc ) ;
2014-01-20 13:59:21 +02:00
off_vx = skb_gro_offset ( skb ) ;
hlen = off_vx + sizeof ( * vh ) ;
vh = skb_gro_header_fast ( skb , off_vx ) ;
if ( skb_gro_header_hard ( skb , hlen ) ) {
vh = skb_gro_header_slow ( skb , hlen , off_vx ) ;
if ( unlikely ( ! vh ) )
goto out ;
}
2015-01-12 17:00:38 -08:00
skb_gro_postpull_rcsum ( skb , vh , sizeof ( struct vxlanhdr ) ) ;
2016-02-16 21:58:58 +01:00
flags = vh - > vx_flags ;
2015-01-12 17:00:38 -08:00
if ( ( flags & VXLAN_HF_RCO ) & & ( vs - > flags & VXLAN_F_REMCSUM_RX ) ) {
vh = vxlan_gro_remcsum ( skb , off_vx , vh , sizeof ( struct vxlanhdr ) ,
2016-02-16 21:58:58 +01:00
vh - > vx_vni , & grc ,
2015-02-10 16:30:32 -08:00
! ! ( vs - > flags &
VXLAN_F_REMCSUM_NOPARTIAL ) ) ;
2015-01-12 17:00:38 -08:00
if ( ! vh )
goto out ;
}
2015-08-19 17:07:32 -07:00
skb_gro_pull ( skb , sizeof ( struct vxlanhdr ) ) ; /* pull vxlan header */
2014-01-20 13:59:21 +02:00
for ( p = * head ; p ; p = p - > next ) {
if ( ! NAPI_GRO_CB ( p ) - > same_flow )
continue ;
vh2 = ( struct vxlanhdr * ) ( p - > data + off_vx ) ;
vxlan: Group Policy extension
Implements supports for the Group Policy VXLAN extension [0] to provide
a lightweight and simple security label mechanism across network peers
based on VXLAN. The security context and associated metadata is mapped
to/from skb->mark. This allows further mapping to a SELinux context
using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
tc, etc.
The group membership is defined by the lower 16 bits of skb->mark, the
upper 16 bits are used for flags.
SELinux allows to manage label to secure local resources. However,
distributed applications require ACLs to implemented across hosts. This
is typically achieved by matching on L2-L4 fields to identify the
original sending host and process on the receiver. On top of that,
netlabel and specifically CIPSO [1] allow to map security contexts to
universal labels. However, netlabel and CIPSO are relatively complex.
This patch provides a lightweight alternative for overlay network
environments with a trusted underlay. No additional control protocol
is required.
Host 1: Host 2:
Group A Group B Group B Group A
+-----+ +-------------+ +-------+ +-----+
| lxc | | SELinux CTX | | httpd | | VM |
+--+--+ +--+----------+ +---+---+ +--+--+
\---+---/ \----+---/
| |
+---+---+ +---+---+
| vxlan | | vxlan |
+---+---+ +---+---+
+------------------------------+
Backwards compatibility:
A VXLAN-GBP socket can receive standard VXLAN frames and will assign
the default group 0x0000 to such frames. A Linux VXLAN socket will
drop VXLAN-GBP frames. The extension is therefore disabled by default
and needs to be specifically enabled:
ip link add [...] type vxlan [...] gbp
In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
must run on a separate port number.
Examples:
iptables:
host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200
host2# iptables -I INPUT -m mark --mark 0x200 -j DROP
OVS:
# ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL'
# ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop'
[0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
[1] http://lwn.net/Articles/204905/
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 03:53:55 +01:00
if ( vh - > vx_flags ! = vh2 - > vx_flags | |
vh - > vx_vni ! = vh2 - > vx_vni ) {
2014-01-20 13:59:21 +02:00
NAPI_GRO_CB ( p ) - > same_flow = 0 ;
continue ;
}
}
2016-10-20 15:58:02 +02:00
pp = call_gro_receive ( eth_gro_receive , head , skb ) ;
2016-03-09 09:24:23 -08:00
flush = 0 ;
2014-01-20 13:59:21 +02:00
out :
2015-02-10 16:30:27 -08:00
skb_gro_remcsum_cleanup ( skb , & grc ) ;
2014-01-20 13:59:21 +02:00
NAPI_GRO_CB ( skb ) - > flush | = flush ;
return pp ;
}
2016-04-05 08:22:53 -07:00
static int vxlan_gro_complete ( struct sock * sk , struct sk_buff * skb , int nhoff )
2014-01-20 13:59:21 +02:00
{
2016-05-03 16:10:21 -07:00
/* Sets 'skb->inner_mac_header' since we are always called with
* ' skb - > encapsulation ' set .
*/
2014-12-30 19:10:15 -08:00
return eth_gro_complete ( skb , nhoff + sizeof ( struct vxlanhdr ) ) ;
2014-01-20 13:59:21 +02:00
}
2012-10-01 12:32:35 +00:00
/* Add new entry to forwarding table -- assumes lock held */
static int vxlan_fdb_create ( struct vxlan_dev * vxlan ,
2013-08-31 13:44:33 +08:00
const u8 * mac , union vxlan_addr * ip ,
2013-03-15 04:35:51 +00:00
__u16 state , __u16 flags ,
2017-01-31 22:59:52 -08:00
__be16 port , __be32 src_vni , __be32 vni ,
__u32 ifindex , __u8 ndm_flags )
2012-10-01 12:32:35 +00:00
{
2014-04-22 15:01:30 +02:00
struct vxlan_rdst * rd = NULL ;
2012-10-01 12:32:35 +00:00
struct vxlan_fdb * f ;
int notify = 0 ;
2016-11-29 09:59:36 +08:00
int rc ;
2012-10-01 12:32:35 +00:00
2017-01-31 22:59:52 -08:00
f = __vxlan_find_mac ( vxlan , mac , src_vni ) ;
2012-10-01 12:32:35 +00:00
if ( f ) {
if ( flags & NLM_F_EXCL ) {
netdev_dbg ( vxlan - > dev ,
" lost race to create %pM \n " , mac ) ;
return - EEXIST ;
}
if ( f - > state ! = state ) {
f - > state = state ;
f - > updated = jiffies ;
notify = 1 ;
}
2013-04-19 00:36:26 +00:00
if ( f - > flags ! = ndm_flags ) {
f - > flags = ndm_flags ;
f - > updated = jiffies ;
notify = 1 ;
}
2013-07-19 17:20:07 +02:00
if ( ( flags & NLM_F_REPLACE ) ) {
/* Only change unicasts */
if ( ! ( is_multicast_ether_addr ( f - > eth_addr ) | |
is_zero_ether_addr ( f - > eth_addr ) ) ) {
2015-04-22 15:49:10 +08:00
notify | = vxlan_fdb_replace ( f , ip , port , vni ,
2013-07-19 17:20:07 +02:00
ifindex ) ;
} else
return - EOPNOTSUPP ;
}
2013-03-15 04:35:51 +00:00
if ( ( flags & NLM_F_APPEND ) & &
2013-06-25 16:01:56 +03:00
( is_multicast_ether_addr ( f - > eth_addr ) | |
is_zero_ether_addr ( f - > eth_addr ) ) ) {
2016-11-29 09:59:36 +08:00
rc = vxlan_fdb_append ( f , ip , port , vni , ifindex , & rd ) ;
2013-03-15 04:35:51 +00:00
if ( rc < 0 )
return rc ;
notify | = rc ;
}
2012-10-01 12:32:35 +00:00
} else {
if ( ! ( flags & NLM_F_CREATE ) )
return - ENOENT ;
2015-07-21 10:44:02 +02:00
if ( vxlan - > cfg . addrmax & &
vxlan - > addrcnt > = vxlan - > cfg . addrmax )
2012-10-01 12:32:35 +00:00
return - ENOSPC ;
2013-07-19 17:20:07 +02:00
/* Disallow replace to add a multicast entry */
if ( ( flags & NLM_F_REPLACE ) & &
( is_multicast_ether_addr ( mac ) | | is_zero_ether_addr ( mac ) ) )
return - EOPNOTSUPP ;
2013-08-31 13:44:33 +08:00
netdev_dbg ( vxlan - > dev , " add %pM -> %pIS \n " , mac , ip ) ;
2012-10-01 12:32:35 +00:00
f = kmalloc ( sizeof ( * f ) , GFP_ATOMIC ) ;
if ( ! f )
return - ENOMEM ;
notify = 1 ;
f - > state = state ;
2013-04-19 00:36:26 +00:00
f - > flags = ndm_flags ;
2012-10-01 12:32:35 +00:00
f - > updated = f - > used = jiffies ;
2017-01-31 22:59:52 -08:00
f - > vni = src_vni ;
2013-06-17 14:16:12 -07:00
INIT_LIST_HEAD ( & f - > remotes ) ;
2012-10-01 12:32:35 +00:00
memcpy ( f - > eth_addr , mac , ETH_ALEN ) ;
2016-11-29 09:59:36 +08:00
rc = vxlan_fdb_append ( f , ip , port , vni , ifindex , & rd ) ;
if ( rc < 0 ) {
kfree ( f ) ;
return rc ;
}
2013-06-17 14:16:12 -07:00
2012-10-01 12:32:35 +00:00
+ + vxlan - > addrcnt ;
hlist_add_head_rcu ( & f - > hlist ,
2017-01-31 22:59:52 -08:00
vxlan_fdb_head ( vxlan , mac , src_vni ) ) ;
2012-10-01 12:32:35 +00:00
}
2014-04-22 15:01:30 +02:00
if ( notify ) {
if ( rd = = NULL )
rd = first_remote_rtnl ( f ) ;
vxlan_fdb_notify ( vxlan , f , rd , RTM_NEWNEIGH ) ;
}
2012-10-01 12:32:35 +00:00
return 0 ;
}
2013-04-11 19:00:35 +00:00
static void vxlan_fdb_free ( struct rcu_head * head )
2013-03-15 04:35:51 +00:00
{
struct vxlan_fdb * f = container_of ( head , struct vxlan_fdb , rcu ) ;
2013-06-17 14:16:12 -07:00
struct vxlan_rdst * rd , * nd ;
2013-03-15 04:35:51 +00:00
2016-02-12 15:43:56 +01:00
list_for_each_entry_safe ( rd , nd , & f - > remotes , list ) {
dst_cache_destroy ( & rd - > dst_cache ) ;
2013-03-15 04:35:51 +00:00
kfree ( rd ) ;
2016-02-12 15:43:56 +01:00
}
2013-03-15 04:35:51 +00:00
kfree ( f ) ;
}
2012-10-01 12:32:35 +00:00
static void vxlan_fdb_destroy ( struct vxlan_dev * vxlan , struct vxlan_fdb * f )
{
netdev_dbg ( vxlan - > dev ,
" delete %pM \n " , f - > eth_addr ) ;
- - vxlan - > addrcnt ;
2014-04-22 15:01:30 +02:00
vxlan_fdb_notify ( vxlan , f , first_remote_rtnl ( f ) , RTM_DELNEIGH ) ;
2012-10-01 12:32:35 +00:00
hlist_del_rcu ( & f - > hlist ) ;
2013-03-15 04:35:51 +00:00
call_rcu ( & f - > rcu , vxlan_fdb_free ) ;
2012-10-01 12:32:35 +00:00
}
2013-06-25 16:01:53 +03:00
/* Decode the netlink attributes of an fdb add/delete request.
 *
 * Each output falls back to a per-device default when its attribute is
 * absent:
 *   @ip       NDA_DST, else the wildcard address of the device's default
 *             remote address family
 *   @port     NDA_PORT, else vxlan->cfg.dst_port
 *   @vni      NDA_VNI, else vxlan->default_dst.remote_vni
 *   @src_vni  NDA_SRC_VNI, else vxlan->default_dst.remote_vni
 *   @ifindex  NDA_IFINDEX (must name an existing device), else 0
 *
 * Returns 0 on success, -EINVAL for an attribute of the wrong length,
 * -EADDRNOTAVAIL when NDA_IFINDEX does not resolve to a device, or the
 * error returned by vxlan_nla_get_addr().
 */
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
			   union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
			   __be32 *vni, u32 *ifindex)
{
	if (!tb[NDA_DST]) {
		/* No destination supplied: use the "any" address */
		if (vxlan->default_dst.remote_ip.sa.sa_family == AF_INET) {
			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
			ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			ip->sin6.sin6_addr = in6addr_any;
			ip->sa.sa_family = AF_INET6;
#endif
		}
	} else {
		int err = vxlan_nla_get_addr(ip, tb[NDA_DST]);

		if (err)
			return err;
	}

	if (!tb[NDA_PORT]) {
		*port = vxlan->cfg.dst_port;
	} else {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
			return -EINVAL;
		*port = nla_get_be16(tb[NDA_PORT]);
	}

	if (!tb[NDA_VNI]) {
		*vni = vxlan->default_dst.remote_vni;
	} else {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
			return -EINVAL;
		*vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
	}

	if (!tb[NDA_SRC_VNI]) {
		*src_vni = vxlan->default_dst.remote_vni;
	} else {
		if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32))
			return -EINVAL;
		*src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
	}

	if (!tb[NDA_IFINDEX]) {
		*ifindex = 0;
		return 0;
	}

	if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
		return -EINVAL;

	*ifindex = nla_get_u32(tb[NDA_IFINDEX]);

	/* The index must refer to a device in the vxlan device's netns */
	if (!__dev_get_by_index(dev_net(vxlan->dev), *ifindex))
		return -EADDRNOTAVAIL;

	return 0;
}
/* Add static entry (via netlink).
 *
 * Validates the request, decodes its attributes with vxlan_fdb_parse() and
 * creates or updates the entry under vxlan->hash_lock.  @vid is not used.
 *
 * Returns 0 on success, -EINVAL when the neighbour state is neither
 * NUD_PERMANENT nor NUD_REACHABLE or NDA_DST is missing, -EAFNOSUPPORT when
 * the destination's address family differs from the device's default
 * remote, or the error from vxlan_fdb_parse() / vxlan_fdb_create().
 */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr, u16 vid, u16 flags)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	union vxlan_addr ip;
	__be32 src_vni, vni;
	__be16 port;
	u32 ifindex;
	int err;

	if (!(ndm->ndm_state & (NUD_PERMANENT | NUD_REACHABLE))) {
		pr_info("RTM_NEWNEIGH with invalid state %#x\n",
			ndm->ndm_state);
		return -EINVAL;
	}

	if (!tb[NDA_DST])
		return -EINVAL;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
	if (err)
		return err;

	/* The remote must use the same address family as the device */
	if (ip.sa.sa_family != vxlan->default_dst.remote_ip.sa.sa_family)
		return -EAFNOSUPPORT;

	spin_lock_bh(&vxlan->hash_lock);
	err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags,
			       port, src_vni, vni, ifindex, ndm->ndm_flags);
	spin_unlock_bh(&vxlan->hash_lock);

	return err;
}
2017-01-31 22:59:52 -08:00
static int __vxlan_fdb_delete ( struct vxlan_dev * vxlan ,
const unsigned char * addr , union vxlan_addr ip ,
__be16 port , __be32 src_vni , u32 vni , u32 ifindex ,
u16 vid )
2012-10-01 12:32:35 +00:00
{
struct vxlan_fdb * f ;
2013-06-25 16:01:54 +03:00
struct vxlan_rdst * rd = NULL ;
2017-01-31 22:59:52 -08:00
int err = - ENOENT ;
2013-06-25 16:01:54 +03:00
2017-01-31 22:59:52 -08:00
f = vxlan_find_mac ( vxlan , addr , src_vni ) ;
2013-06-25 16:01:54 +03:00
if ( ! f )
2017-01-31 22:59:52 -08:00
return err ;
2013-06-25 16:01:54 +03:00
2013-08-31 13:44:33 +08:00
if ( ! vxlan_addr_any ( & ip ) ) {
rd = vxlan_fdb_find_rdst ( f , & ip , port , vni , ifindex ) ;
2013-06-25 16:01:54 +03:00
if ( ! rd )
goto out ;
}
/* remove a destination if it's not the only one on the list,
* otherwise destroy the fdb entry
*/
if ( rd & & ! list_is_singular ( & f - > remotes ) ) {
list_del_rcu ( & rd - > list ) ;
2014-04-22 15:01:30 +02:00
vxlan_fdb_notify ( vxlan , f , rd , RTM_DELNEIGH ) ;
2013-08-17 07:32:09 +08:00
kfree_rcu ( rd , rcu ) ;
2013-06-25 16:01:54 +03:00
goto out ;
2012-10-01 12:32:35 +00:00
}
2013-06-25 16:01:54 +03:00
vxlan_fdb_destroy ( vxlan , f ) ;
out :
2017-01-31 22:59:52 -08:00
return 0 ;
}
/* Delete entry (via netlink).
 *
 * Decodes the request attributes with vxlan_fdb_parse() and performs the
 * deletion through __vxlan_fdb_delete() while holding vxlan->hash_lock.
 * Returns the first error reported by either helper, 0 otherwise.
 */
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
			    struct net_device *dev,
			    const unsigned char *addr, u16 vid)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	union vxlan_addr ip;
	__be32 src_vni, vni;
	__be16 port;
	u32 ifindex;
	int ret;

	ret = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
	if (ret)
		return ret;

	spin_lock_bh(&vxlan->hash_lock);
	ret = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex,
				 vid);
	spin_unlock_bh(&vxlan->hash_lock);

	return ret;
}
/* Dump forwarding table */
static int vxlan_fdb_dump ( struct sk_buff * skb , struct netlink_callback * cb ,
2014-07-10 07:01:58 -04:00
struct net_device * dev ,
2016-08-30 21:56:45 -07:00
struct net_device * filter_dev , int * idx )
2012-10-01 12:32:35 +00:00
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
unsigned int h ;
2016-08-30 21:56:45 -07:00
int err = 0 ;
2012-10-01 12:32:35 +00:00
for ( h = 0 ; h < FDB_HASH_SIZE ; + + h ) {
struct vxlan_fdb * f ;
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-27 17:06:00 -08:00
hlist_for_each_entry_rcu ( f , & vxlan - > fdb_head [ h ] , hlist ) {
2013-03-15 04:35:51 +00:00
struct vxlan_rdst * rd ;
2013-06-17 14:16:12 -07:00
list_for_each_entry_rcu ( rd , & f - > remotes , list ) {
2016-08-30 21:56:45 -07:00
if ( * idx < cb - > args [ 2 ] )
2015-08-10 23:39:09 +09:00
goto skip ;
2013-03-15 04:35:51 +00:00
err = vxlan_fdb_info ( skb , vxlan , f ,
NETLINK_CB ( cb - > skb ) . portid ,
cb - > nlh - > nlmsg_seq ,
RTM_NEWNEIGH ,
NLM_F_MULTI , rd ) ;
2016-08-30 21:56:45 -07:00
if ( err < 0 )
2013-06-17 14:16:12 -07:00
goto out ;
skip :
2016-08-30 21:56:45 -07:00
* idx + = 1 ;
2015-08-10 23:39:09 +09:00
}
2012-10-01 12:32:35 +00:00
}
}
2013-06-17 14:16:12 -07:00
out :
2016-08-30 21:56:45 -07:00
return err ;
2012-10-01 12:32:35 +00:00
}
/* Watch incoming packets to learn mapping between Ethernet address
* and Tunnel endpoint .
2015-04-02 11:17:58 +09:00
* Return true if packet is bogus and should be dropped .
2012-10-01 12:32:35 +00:00
*/
2013-06-17 12:09:58 -07:00
static bool vxlan_snoop ( struct net_device * dev ,
2017-01-31 22:59:52 -08:00
union vxlan_addr * src_ip , const u8 * src_mac ,
__be32 vni )
2012-10-01 12:32:35 +00:00
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct vxlan_fdb * f ;
2017-01-31 22:59:52 -08:00
f = vxlan_find_mac ( vxlan , src_mac , vni ) ;
2012-10-01 12:32:35 +00:00
if ( likely ( f ) ) {
2013-08-04 17:17:39 -07:00
struct vxlan_rdst * rdst = first_remote_rcu ( f ) ;
2013-06-17 14:16:12 -07:00
2013-08-31 13:44:33 +08:00
if ( likely ( vxlan_addr_equal ( & rdst - > remote_ip , src_ip ) ) )
2013-06-17 12:09:58 -07:00
return false ;
/* Don't migrate static entries, drop packets */
2013-06-18 14:27:01 -07:00
if ( f - > state & NUD_NOARP )
2013-06-17 12:09:58 -07:00
return true ;
2012-10-01 12:32:35 +00:00
if ( net_ratelimit ( ) )
netdev_info ( dev ,
2013-08-31 13:44:33 +08:00
" %pM migrated from %pIS to %pIS \n " ,
2015-02-07 03:17:31 +01:00
src_mac , & rdst - > remote_ip . sa , & src_ip - > sa ) ;
2012-10-01 12:32:35 +00:00
2013-08-31 13:44:33 +08:00
rdst - > remote_ip = * src_ip ;
2012-10-01 12:32:35 +00:00
f - > updated = jiffies ;
2014-04-22 15:01:30 +02:00
vxlan_fdb_notify ( vxlan , f , rdst , RTM_NEWNEIGH ) ;
2012-10-01 12:32:35 +00:00
} else {
/* learned new entry */
spin_lock ( & vxlan - > hash_lock ) ;
2013-06-17 12:09:57 -07:00
/* close off race between vxlan_flush and incoming packets */
if ( netif_running ( dev ) )
vxlan_fdb_create ( vxlan , src_mac , src_ip ,
NUD_REACHABLE ,
NLM_F_EXCL | NLM_F_CREATE ,
2015-07-21 10:44:02 +02:00
vxlan - > cfg . dst_port ,
2017-01-31 22:59:52 -08:00
vni ,
2013-06-17 12:09:57 -07:00
vxlan - > default_dst . remote_vni ,
0 , NTF_SELF ) ;
2012-10-01 12:32:35 +00:00
spin_unlock ( & vxlan - > hash_lock ) ;
}
2013-06-17 12:09:58 -07:00
return false ;
2012-10-01 12:32:35 +00:00
}
/* See if multicast group is already in use by other ID */
2013-12-10 16:37:33 +08:00
static bool vxlan_group_used ( struct vxlan_net * vn , struct vxlan_dev * dev )
2012-10-01 12:32:35 +00:00
{
2013-05-16 11:35:20 +00:00
struct vxlan_dev * vxlan ;
2016-10-28 09:59:15 -07:00
struct vxlan_sock * sock4 ;
2016-11-07 22:09:07 +01:00
# if IS_ENABLED(CONFIG_IPV6)
struct vxlan_sock * sock6 ;
# endif
2015-09-24 13:50:02 +02:00
unsigned short family = dev - > default_dst . remote_ip . sa . sa_family ;
2012-10-01 12:32:35 +00:00
2016-10-28 09:59:15 -07:00
sock4 = rtnl_dereference ( dev - > vn4_sock ) ;
2013-12-10 16:37:33 +08:00
/* The vxlan_sock is only used by dev, leaving group has
* no effect on other vxlan devices .
*/
2016-10-28 09:59:15 -07:00
if ( family = = AF_INET & & sock4 & & atomic_read ( & sock4 - > refcnt ) = = 1 )
2013-12-10 16:37:33 +08:00
return false ;
2015-09-24 13:50:02 +02:00
# if IS_ENABLED(CONFIG_IPV6)
2016-10-28 09:59:15 -07:00
sock6 = rtnl_dereference ( dev - > vn6_sock ) ;
if ( family = = AF_INET6 & & sock6 & & atomic_read ( & sock6 - > refcnt ) = = 1 )
2015-09-24 13:50:02 +02:00
return false ;
# endif
2013-12-10 16:37:33 +08:00
2013-05-16 11:35:20 +00:00
list_for_each_entry ( vxlan , & vn - > vxlan_list , next ) {
2013-12-10 16:37:33 +08:00
if ( ! netif_running ( vxlan - > dev ) | | vxlan = = dev )
2013-05-16 11:35:20 +00:00
continue ;
2012-10-01 12:32:35 +00:00
2016-10-28 09:59:15 -07:00
if ( family = = AF_INET & &
rtnl_dereference ( vxlan - > vn4_sock ) ! = sock4 )
2013-12-10 16:37:33 +08:00
continue ;
2015-09-24 13:50:02 +02:00
# if IS_ENABLED(CONFIG_IPV6)
2016-10-28 09:59:15 -07:00
if ( family = = AF_INET6 & &
rtnl_dereference ( vxlan - > vn6_sock ) ! = sock6 )
2015-09-24 13:50:02 +02:00
continue ;
# endif
2013-12-10 16:37:33 +08:00
if ( ! vxlan_addr_equal ( & vxlan - > default_dst . remote_ip ,
& dev - > default_dst . remote_ip ) )
continue ;
if ( vxlan - > default_dst . remote_ifindex ! =
dev - > default_dst . remote_ifindex )
continue ;
return true ;
2013-05-16 11:35:20 +00:00
}
2012-10-01 12:32:35 +00:00
return false ;
}
2016-04-09 12:46:23 +02:00
static bool __vxlan_sock_release_prep ( struct vxlan_sock * vs )
2013-06-17 14:16:10 -07:00
{
2015-09-24 13:50:02 +02:00
struct vxlan_net * vn ;
2013-08-19 11:23:07 -07:00
2015-09-24 13:50:02 +02:00
if ( ! vs )
2016-04-09 12:46:23 +02:00
return false ;
2013-06-17 14:16:10 -07:00
if ( ! atomic_dec_and_test ( & vs - > refcnt ) )
2016-04-09 12:46:23 +02:00
return false ;
2012-10-01 12:32:35 +00:00
2015-09-24 13:50:02 +02:00
vn = net_generic ( sock_net ( vs - > sock - > sk ) , vxlan_net_id ) ;
2013-06-17 14:16:11 -07:00
spin_lock ( & vn - > sock_lock ) ;
2013-06-17 14:16:10 -07:00
hlist_del_rcu ( & vs - > hlist ) ;
2016-06-16 12:20:52 -07:00
udp_tunnel_notify_del_rx_port ( vs - > sock ,
2016-06-16 12:23:19 -07:00
( vs - > flags & VXLAN_F_GPE ) ?
UDP_TUNNEL_TYPE_VXLAN_GPE :
2016-06-16 12:20:52 -07:00
UDP_TUNNEL_TYPE_VXLAN ) ;
2013-06-17 14:16:11 -07:00
spin_unlock ( & vn - > sock_lock ) ;
2016-04-09 12:46:23 +02:00
return true ;
2012-10-01 12:32:35 +00:00
}
2015-09-24 13:50:02 +02:00
static void vxlan_sock_release ( struct vxlan_dev * vxlan )
{
2016-10-28 09:59:15 -07:00
struct vxlan_sock * sock4 = rtnl_dereference ( vxlan - > vn4_sock ) ;
2015-09-24 13:50:02 +02:00
# if IS_ENABLED(CONFIG_IPV6)
2016-10-28 09:59:15 -07:00
struct vxlan_sock * sock6 = rtnl_dereference ( vxlan - > vn6_sock ) ;
rcu_assign_pointer ( vxlan - > vn6_sock , NULL ) ;
2016-04-09 12:46:23 +02:00
# endif
2016-10-28 09:59:15 -07:00
rcu_assign_pointer ( vxlan - > vn4_sock , NULL ) ;
2016-04-09 12:46:23 +02:00
synchronize_net ( ) ;
2016-10-28 09:59:15 -07:00
if ( __vxlan_sock_release_prep ( sock4 ) ) {
udp_tunnel_sock_release ( sock4 - > sock ) ;
kfree ( sock4 ) ;
2016-04-09 12:46:23 +02:00
}
# if IS_ENABLED(CONFIG_IPV6)
2016-10-28 09:59:15 -07:00
if ( __vxlan_sock_release_prep ( sock6 ) ) {
udp_tunnel_sock_release ( sock6 - > sock ) ;
kfree ( sock6 ) ;
2016-04-09 12:46:23 +02:00
}
2015-09-24 13:50:02 +02:00
# endif
}
2015-03-18 14:50:44 -03:00
/* Update multicast group membership when first VNI on
2015-04-02 11:17:58 +09:00
* multicast address is brought up
2013-06-17 14:16:10 -07:00
*/
2015-03-18 14:50:44 -03:00
static int vxlan_igmp_join ( struct vxlan_dev * vxlan )
2012-10-01 12:32:35 +00:00
{
2015-09-24 13:50:02 +02:00
struct sock * sk ;
2013-08-31 13:44:33 +08:00
union vxlan_addr * ip = & vxlan - > default_dst . remote_ip ;
int ifindex = vxlan - > default_dst . remote_ifindex ;
2015-03-20 10:26:21 -03:00
int ret = - EINVAL ;
2012-10-01 12:32:35 +00:00
2013-08-31 13:44:33 +08:00
if ( ip - > sa . sa_family = = AF_INET ) {
2016-10-28 09:59:15 -07:00
struct vxlan_sock * sock4 = rtnl_dereference ( vxlan - > vn4_sock ) ;
2013-08-31 13:44:33 +08:00
struct ip_mreqn mreq = {
. imr_multiaddr . s_addr = ip - > sin . sin_addr . s_addr ,
. imr_ifindex = ifindex ,
} ;
2016-10-28 09:59:15 -07:00
sk = sock4 - > sock - > sk ;
2015-09-24 13:50:02 +02:00
lock_sock ( sk ) ;
2015-03-18 14:50:44 -03:00
ret = ip_mc_join_group ( sk , & mreq ) ;
2015-09-24 13:50:02 +02:00
release_sock ( sk ) ;
2013-08-31 13:44:33 +08:00
# if IS_ENABLED(CONFIG_IPV6)
} else {
2016-10-28 09:59:15 -07:00
struct vxlan_sock * sock6 = rtnl_dereference ( vxlan - > vn6_sock ) ;
sk = sock6 - > sock - > sk ;
2015-09-24 13:50:02 +02:00
lock_sock ( sk ) ;
2015-03-18 14:50:44 -03:00
ret = ipv6_stub - > ipv6_sock_mc_join ( sk , ifindex ,
& ip - > sin6 . sin6_addr ) ;
2015-09-24 13:50:02 +02:00
release_sock ( sk ) ;
2013-08-31 13:44:33 +08:00
# endif
}
2013-07-18 08:40:15 -07:00
2015-03-18 14:50:44 -03:00
return ret ;
2013-07-18 08:40:15 -07:00
}
/* Inverse of vxlan_igmp_join when last VNI is brought down */
2015-03-18 14:50:44 -03:00
static int vxlan_igmp_leave ( struct vxlan_dev * vxlan )
2013-07-18 08:40:15 -07:00
{
2015-09-24 13:50:02 +02:00
struct sock * sk ;
2013-08-31 13:44:33 +08:00
union vxlan_addr * ip = & vxlan - > default_dst . remote_ip ;
int ifindex = vxlan - > default_dst . remote_ifindex ;
2015-03-20 10:26:21 -03:00
int ret = - EINVAL ;
2013-07-18 08:40:15 -07:00
2013-08-31 13:44:33 +08:00
if ( ip - > sa . sa_family = = AF_INET ) {
2016-10-28 09:59:15 -07:00
struct vxlan_sock * sock4 = rtnl_dereference ( vxlan - > vn4_sock ) ;
2013-08-31 13:44:33 +08:00
struct ip_mreqn mreq = {
. imr_multiaddr . s_addr = ip - > sin . sin_addr . s_addr ,
. imr_ifindex = ifindex ,
} ;
2016-10-28 09:59:15 -07:00
sk = sock4 - > sock - > sk ;
2015-09-24 13:50:02 +02:00
lock_sock ( sk ) ;
2015-03-18 14:50:44 -03:00
ret = ip_mc_leave_group ( sk , & mreq ) ;
2015-09-24 13:50:02 +02:00
release_sock ( sk ) ;
2013-08-31 13:44:33 +08:00
# if IS_ENABLED(CONFIG_IPV6)
} else {
2016-10-28 09:59:15 -07:00
struct vxlan_sock * sock6 = rtnl_dereference ( vxlan - > vn6_sock ) ;
sk = sock6 - > sock - > sk ;
2015-09-24 13:50:02 +02:00
lock_sock ( sk ) ;
2015-03-18 14:50:44 -03:00
ret = ipv6_stub - > ipv6_sock_mc_drop ( sk , ifindex ,
& ip - > sin6 . sin6_addr ) ;
2015-09-24 13:50:02 +02:00
release_sock ( sk ) ;
2013-08-31 13:44:33 +08:00
# endif
}
2012-10-01 12:32:35 +00:00
2015-03-18 14:50:44 -03:00
return ret ;
2012-10-01 12:32:35 +00:00
}
2016-02-16 21:59:01 +01:00
/* Handle the remote checksum offload (RCO) bits of a received header.
 *
 * When VXLAN_HF_RCO is set in @unparsed and the skb is not already marked
 * remcsum_offload, the checksum start/offset encoded in the VNI field are
 * extracted and applied with skb_remcsum_process().  In all non-failing
 * cases the RCO flag is then cleared from @unparsed and its VNI field is
 * masked with VXLAN_VNI_MASK.
 *
 * Returns false only when the skb cannot be pulled far enough to cover the
 * checksum field; true otherwise.
 */
static bool vxlan_remcsum(struct vxlanhdr *unparsed,
			  struct sk_buff *skb, u32 vxflags)
{
	if ((unparsed->vx_flags & VXLAN_HF_RCO) && !skb->remcsum_offload) {
		size_t start = vxlan_rco_start(unparsed->vx_vni);
		size_t offset = start + vxlan_rco_offset(unparsed->vx_vni);
		bool nopartial = !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL);

		/* The 16-bit checksum at @offset must be in linear data */
		if (!pskb_may_pull(skb, offset + sizeof(u16)))
			return false;

		skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1),
				    start, offset, nopartial);
	}

	unparsed->vx_flags &= ~VXLAN_HF_RCO;
	unparsed->vx_vni &= VXLAN_VNI_MASK;

	return true;
}
2016-02-16 21:59:01 +01:00
/* Extract Group Based Policy (GBP) information from a received header.
 *
 * When VXLAN_HF_GBP is set, @md->gbp receives the policy id plus the
 * DONT_LEARN / POLICY_APPLIED bits, the skb's metadata dst (if any) is
 * flagged as carrying VXLAN options, and -- unless the socket runs with
 * VXLAN_F_COLLECT_METADATA -- skb->mark is set to the GBP value.
 * The GBP bits are always cleared from @unparsed afterwards.
 */
static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
				struct sk_buff *skb, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;

	if (unparsed->vx_flags & VXLAN_HF_GBP) {
		struct metadata_dst *tun_dst;

		md->gbp = ntohs(gbp->policy_id);

		tun_dst = (struct metadata_dst *)skb_dst(skb);
		if (tun_dst) {
			tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
			tun_dst->u.tun_info.options_len = sizeof(*md);
		}

		if (gbp->dont_learn)
			md->gbp |= VXLAN_GBP_DONT_LEARN;

		if (gbp->policy_applied)
			md->gbp |= VXLAN_GBP_POLICY_APPLIED;

		/* In flow-based mode, GBP is carried in dst_metadata */
		if (!(vxflags & VXLAN_F_COLLECT_METADATA))
			skb->mark = md->gbp;
	}

	unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
}
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
static bool vxlan_parse_gpe_hdr ( struct vxlanhdr * unparsed ,
2016-04-11 17:06:08 +02:00
__be16 * protocol ,
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
struct sk_buff * skb , u32 vxflags )
{
struct vxlanhdr_gpe * gpe = ( struct vxlanhdr_gpe * ) unparsed ;
/* Need to have Next Protocol set for interfaces in GPE mode. */
if ( ! gpe - > np_applied )
return false ;
/* "The initial version is 0. If a receiver does not support the
* version indicated it MUST drop the packet .
*/
if ( gpe - > version ! = 0 )
return false ;
/* "When the O bit is set to 1, the packet is an OAM packet and OAM
* processing MUST occur . " However, we don't implement OAM
* processing , thus drop the packet .
*/
if ( gpe - > oam_flag )
return false ;
switch ( gpe - > next_protocol ) {
case VXLAN_GPE_NP_IPV4 :
* protocol = htons ( ETH_P_IP ) ;
break ;
case VXLAN_GPE_NP_IPV6 :
* protocol = htons ( ETH_P_IPV6 ) ;
break ;
case VXLAN_GPE_NP_ETHERNET :
* protocol = htons ( ETH_P_TEB ) ;
break ;
default :
return false ;
}
unparsed - > vx_flags & = ~ VXLAN_GPE_USED_BITS ;
return true ;
}
2016-02-23 18:02:56 +01:00
/* Set up the inner Ethernet header of a decapsulated frame and learn from it.
 *
 * Resets the MAC header, sets skb->protocol via eth_type_trans() and
 * adjusts the receive checksum for the pulled Ethernet header.  When the
 * device has VXLAN_F_LEARN set, the inner source MAC is fed to
 * vxlan_snoop() together with the outer source IP address.
 *
 * Returns false when the frame must be dropped: its source MAC equals the
 * device's own address, or vxlan_snoop() rejected it.  True otherwise.
 */
static bool vxlan_set_mac(struct vxlan_dev *vxlan,
			  struct vxlan_sock *vs,
			  struct sk_buff *skb, __be32 vni)
{
	union vxlan_addr saddr;
	const u8 *inner_src;

	skb_reset_mac_header(skb);
	skb->protocol = eth_type_trans(skb, vxlan->dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	inner_src = eth_hdr(skb)->h_source;

	/* Ignore packet loops (and multicast echo) */
	if (ether_addr_equal(inner_src, vxlan->dev->dev_addr))
		return false;

	/* Get address from the outer IP header */
	if (vxlan_get_sk_family(vs) == AF_INET) {
		saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
		saddr.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
		saddr.sa.sa_family = AF_INET6;
#endif
	}

	if (!(vxlan->flags & VXLAN_F_LEARN))
		return true;

	return !vxlan_snoop(skb->dev, &saddr, inner_src, vni);
}
2016-02-23 18:02:57 +01:00
static bool vxlan_ecn_decapsulate ( struct vxlan_sock * vs , void * oiph ,
struct sk_buff * skb )
{
int err = 0 ;
if ( vxlan_get_sk_family ( vs ) = = AF_INET )
err = IP_ECN_decapsulate ( oiph , skb ) ;
# if IS_ENABLED(CONFIG_IPV6)
else
err = IP6_ECN_decapsulate ( oiph , skb ) ;
# endif
if ( unlikely ( err ) & & log_ecn_error ) {
if ( vxlan_get_sk_family ( vs ) = = AF_INET )
net_info_ratelimited ( " non-ECT from %pI4 with TOS=%#x \n " ,
& ( ( struct iphdr * ) oiph ) - > saddr ,
( ( struct iphdr * ) oiph ) - > tos ) ;
else
net_info_ratelimited ( " non-ECT from %pI6 \n " ,
& ( ( struct ipv6hdr * ) oiph ) - > saddr ) ;
}
return err < = 1 ;
}
2012-10-01 12:32:35 +00:00
/* Callback from net/ipv4/udp.c to receive packets */
2016-02-23 18:02:58 +01:00
static int vxlan_rcv ( struct sock * sk , struct sk_buff * skb )
2012-10-01 12:32:35 +00:00
{
2016-02-23 18:02:58 +01:00
struct pcpu_sw_netstats * stats ;
2016-02-18 11:22:51 +01:00
struct vxlan_dev * vxlan ;
2013-08-19 11:23:02 -07:00
struct vxlan_sock * vs ;
2016-02-16 21:59:01 +01:00
struct vxlanhdr unparsed ;
2015-07-21 10:43:58 +02:00
struct vxlan_metadata _md ;
struct vxlan_metadata * md = & _md ;
2016-04-11 17:06:08 +02:00
__be16 protocol = htons ( ETH_P_TEB ) ;
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
bool raw_proto = false ;
2016-02-23 18:02:58 +01:00
void * oiph ;
2017-01-31 22:59:52 -08:00
__be32 vni = 0 ;
2012-10-01 12:32:35 +00:00
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
/* Need UDP and VXLAN header to be present */
2013-08-19 11:22:54 -07:00
if ( ! pskb_may_pull ( skb , VXLAN_HLEN ) )
2016-05-19 15:58:33 +02:00
goto drop ;
2012-10-01 12:32:35 +00:00
2016-02-16 21:59:01 +01:00
unparsed = * vxlan_hdr ( skb ) ;
2016-02-16 21:59:02 +01:00
/* VNI flag always required to be set */
if ( ! ( unparsed . vx_flags & VXLAN_HF_VNI ) ) {
netdev_dbg ( skb - > dev , " invalid vxlan flags=%#x vni=%#x \n " ,
ntohl ( vxlan_hdr ( skb ) - > vx_flags ) ,
ntohl ( vxlan_hdr ( skb ) - > vx_vni ) ) ;
/* Return non vxlan pkt */
2016-05-19 15:58:33 +02:00
goto drop ;
2012-10-01 12:32:35 +00:00
}
2016-02-16 21:59:02 +01:00
unparsed . vx_flags & = ~ VXLAN_HF_VNI ;
unparsed . vx_vni & = ~ VXLAN_VNI_MASK ;
2012-10-01 12:32:35 +00:00
2013-09-24 10:25:40 -07:00
vs = rcu_dereference_sk_user_data ( sk ) ;
2013-08-19 11:23:02 -07:00
if ( ! vs )
2012-10-01 12:32:35 +00:00
goto drop ;
2017-01-31 22:59:52 -08:00
vni = vxlan_vni ( vxlan_hdr ( skb ) - > vx_vni ) ;
vxlan = vxlan_vs_find_vni ( vs , vni ) ;
2016-02-18 11:22:51 +01:00
if ( ! vxlan )
goto drop ;
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
/* For backwards compatibility, only allow reserved fields to be
* used by VXLAN extensions if explicitly requested .
*/
if ( vs - > flags & VXLAN_F_GPE ) {
if ( ! vxlan_parse_gpe_hdr ( & unparsed , & protocol , skb , vs - > flags ) )
goto drop ;
raw_proto = true ;
}
if ( __iptunnel_pull_header ( skb , VXLAN_HLEN , protocol , raw_proto ,
! net_eq ( vxlan - > net , dev_net ( vxlan - > dev ) ) ) )
goto drop ;
2016-02-18 11:22:51 +01:00
2015-07-21 10:43:58 +02:00
if ( vxlan_collect_metadata ( vs ) ) {
2016-02-23 18:02:59 +01:00
struct metadata_dst * tun_dst ;
2016-02-18 19:19:29 +01:00
2015-08-26 23:46:50 -07:00
tun_dst = udp_tun_rx_dst ( skb , vxlan_get_sk_family ( vs ) , TUNNEL_KEY ,
2016-09-08 16:23:45 +03:00
key32_to_tunnel_id ( vni ) , sizeof ( * md ) ) ;
2015-08-26 23:46:50 -07:00
2015-07-21 10:43:58 +02:00
if ( ! tun_dst )
goto drop ;
2015-09-04 12:49:32 +02:00
md = ip_tunnel_info_opts ( & tun_dst - > u . tun_info ) ;
2016-02-23 18:02:59 +01:00
skb_dst_set ( skb , ( struct dst_entry * ) tun_dst ) ;
2015-07-21 10:43:58 +02:00
} else {
memset ( md , 0 , sizeof ( * md ) ) ;
}
2016-02-16 21:59:01 +01:00
if ( vs - > flags & VXLAN_F_REMCSUM_RX )
if ( ! vxlan_remcsum ( & unparsed , skb , vs - > flags ) )
goto drop ;
if ( vs - > flags & VXLAN_F_GBP )
2016-02-23 18:02:59 +01:00
vxlan_parse_gbp_hdr ( & unparsed , skb , vs - > flags , md ) ;
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
/* Note that GBP and GPE can never be active together. This is
* ensured in vxlan_dev_configure .
*/
vxlan: Group Policy extension
Implements supports for the Group Policy VXLAN extension [0] to provide
a lightweight and simple security label mechanism across network peers
based on VXLAN. The security context and associated metadata is mapped
to/from skb->mark. This allows further mapping to a SELinux context
using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
tc, etc.
The group membership is defined by the lower 16 bits of skb->mark, the
upper 16 bits are used for flags.
SELinux allows to manage label to secure local resources. However,
distributed applications require ACLs to implemented across hosts. This
is typically achieved by matching on L2-L4 fields to identify the
original sending host and process on the receiver. On top of that,
netlabel and specifically CIPSO [1] allow to map security contexts to
universal labels. However, netlabel and CIPSO are relatively complex.
This patch provides a lightweight alternative for overlay network
environments with a trusted underlay. No additional control protocol
is required.
Host 1: Host 2:
Group A Group B Group B Group A
+-----+ +-------------+ +-------+ +-----+
| lxc | | SELinux CTX | | httpd | | VM |
+--+--+ +--+----------+ +---+---+ +--+--+
\---+---/ \----+---/
| |
+---+---+ +---+---+
| vxlan | | vxlan |
+---+---+ +---+---+
+------------------------------+
Backwards compatibility:
A VXLAN-GBP socket can receive standard VXLAN frames and will assign
the default group 0x0000 to such frames. A Linux VXLAN socket will
drop VXLAN-GBP frames. The extension is therefore disabled by default
and needs to be specifically enabled:
ip link add [...] type vxlan [...] gbp
In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
must run on a separate port number.
Examples:
iptables:
host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200
host2# iptables -I INPUT -m mark --mark 0x200 -j DROP
OVS:
# ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL'
# ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop'
[0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
[1] http://lwn.net/Articles/204905/
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 03:53:55 +01:00
2016-02-16 21:59:01 +01:00
if ( unparsed . vx_flags | | unparsed . vx_vni ) {
2015-01-08 12:31:18 -08:00
/* If there are any unprocessed flags remaining treat
* this as a malformed packet . This behavior diverges from
* VXLAN RFC ( RFC7348 ) which stipulates that bits in reserved
* in reserved fields are to be ignored . The approach here
2015-04-02 11:17:58 +09:00
* maintains compatibility with previous stack code , and also
2015-01-08 12:31:18 -08:00
* is more robust and provides a little more security in
* adding extensions to VXLAN .
*/
2016-02-16 21:59:02 +01:00
goto drop ;
2015-01-08 12:31:18 -08:00
}
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
if ( ! raw_proto ) {
2017-01-31 22:59:52 -08:00
if ( ! vxlan_set_mac ( vxlan , vs , skb , vni ) )
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
goto drop ;
} else {
2016-05-13 10:48:42 +02:00
skb_reset_mac_header ( skb ) ;
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
skb - > dev = vxlan - > dev ;
skb - > pkt_type = PACKET_HOST ;
}
2016-02-23 18:02:58 +01:00
oiph = skb_network_header ( skb ) ;
skb_reset_network_header ( skb ) ;
if ( ! vxlan_ecn_decapsulate ( vs , oiph , skb ) ) {
+ + vxlan - > dev - > stats . rx_frame_errors ;
+ + vxlan - > dev - > stats . rx_errors ;
goto drop ;
}
stats = this_cpu_ptr ( vxlan - > dev - > tstats ) ;
u64_stats_update_begin ( & stats - > syncp ) ;
stats - > rx_packets + + ;
stats - > rx_bytes + = skb - > len ;
u64_stats_update_end ( & stats - > syncp ) ;
gro_cells_receive ( & vxlan - > gro_cells , skb ) ;
2013-08-19 11:23:02 -07:00
return 0 ;
drop :
2016-02-16 21:59:02 +01:00
/* Consume bad packet */
kfree_skb ( skb ) ;
return 0 ;
2013-08-19 11:23:02 -07:00
}
2017-01-31 22:59:52 -08:00
/* Handle an ARP packet in @skb for vxlan device @dev on @vni.
 *
 * For an IPv4-over-Ethernet/IEEE802 ARP request whose target has a
 * connected entry in the ARP table, an ARP reply is built and injected
 * into the receive path, unless the fdb marks the neighbour's MAC as
 * bridge-local.  When no neighbour entry exists and VXLAN_F_L3MISS is
 * set, an L3 miss is reported through vxlan_ip_miss().
 *
 * The skb is consumed on every path and NETDEV_TX_OK is returned.
 */
static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct neighbour *neigh;
	struct sk_buff *reply;
	struct vxlan_fdb *fdb;
	struct arphdr *arp;
	__be32 sip, tip;
	u8 *sha, *pos;

	if (dev->flags & IFF_NOARP)
		goto out;

	if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
		dev->stats.tx_dropped++;
		goto out;
	}
	arp = arp_hdr(skb);

	/* Only plain IPv4 requests with the device's address length. */
	if (arp->ar_hrd != htons(ARPHRD_ETHER) &&
	    arp->ar_hrd != htons(ARPHRD_IEEE802))
		goto out;
	if (arp->ar_pro != htons(ETH_P_IP) ||
	    arp->ar_op != htons(ARPOP_REQUEST))
		goto out;
	if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
		goto out;

	/* Payload layout after the fixed header: sha, sip, tha, tip. */
	sha = (u8 *)(arp + 1);
	pos = sha + dev->addr_len;
	memcpy(&sip, pos, sizeof(sip));
	pos += sizeof(sip) + dev->addr_len;
	memcpy(&tip, pos, sizeof(tip));

	if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
		goto out;

	neigh = neigh_lookup(&arp_tbl, &tip, dev);
	if (!neigh) {
		if (vxlan->flags & VXLAN_F_L3MISS) {
			union vxlan_addr ipa = {
				.sin.sin_addr.s_addr = tip,
				.sin.sin_family = AF_INET,
			};

			vxlan_ip_miss(dev, &ipa);
		}
		goto out;
	}

	if (!(neigh->nud_state & NUD_CONNECTED))
		goto release;

	fdb = vxlan_find_mac(vxlan, neigh->ha, vni);
	if (fdb && vxlan_addr_any(&(first_remote_rcu(fdb)->remote_ip)))
		goto release;	/* bridge-local neighbor */

	reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
			   neigh->ha, sha);
	neigh_release(neigh);
	if (!reply)
		goto out;

	skb_reset_mac_header(reply);
	__skb_pull(reply, skb_network_offset(reply));
	reply->ip_summed = CHECKSUM_UNNECESSARY;
	reply->pkt_type = PACKET_HOST;

	if (netif_rx_ni(reply) == NET_RX_DROP)
		dev->stats.rx_dropped++;
	goto out;

release:
	neigh_release(neigh);
out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}
2013-08-31 13:44:36 +08:00
# if IS_ENABLED(CONFIG_IPV6)
vxlan: fix nonfunctional neigh_reduce()
The VXLAN neigh_reduce() code is completely non-functional since
check-in. Specific errors:
1) The original code drops all packets with a multicast destination address,
even though neighbor solicitations are sent to the solicited-node
address, a multicast address. The code after this check was never run.
2) The neighbor table lookup used the IPv6 header destination, which is the
solicited node address, rather than the target address from the
neighbor solicitation. So neighbor lookups would always fail if it
got this far. Also for L3MISSes.
3) The code calls ndisc_send_na(), which does a send on the tunnel device.
The context for neigh_reduce() is the transmit path, vxlan_xmit(),
where the host or a bridge-attached neighbor is trying to transmit
a neighbor solicitation. To respond to it, the tunnel endpoint needs
to do a *receive* of the appropriate neighbor advertisement. Doing a
send, would only try to send the advertisement, encapsulated, to the
remote destinations in the fdb -- hosts that definitely did not do the
corresponding solicitation.
4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the
isrouter flag in the advertisement. This has nothing to do with whether
or not the target is a router, and generally won't be set since the
tunnel endpoint is bridging, not routing, traffic.
The patch below creates a proxy neighbor advertisement to respond to
neighbor solicitations as intended, providing proper IPv6 support for neighbor
reduction.
Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 10:39:58 -04:00
/* Build a proxy neighbour advertisement answering the neighbour
 * solicitation carried in @request on behalf of neighbour @n.
 *
 * @request:  the solicitation; it is linearised here so its options can
 *            be walked directly.
 * @n:        neighbour entry supplying the advertised link-layer address
 *            (n->ha) and the IPv6 source address (n->primary_key).
 * @isrouter: value placed in the advertisement's router flag.
 *
 * The reply is addressed to the link-layer address found in the
 * solicitation's source link-layer address option, or to the
 * solicitation's Ethernet source address when no such option is present.
 *
 * Returns the new skb with data positioned at the IPv6 header, or NULL
 * when @request has no device, cannot be linearised, carries a malformed
 * option, or when the allocation fails.
 */
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
	struct neighbour *n, bool isrouter)
{
	struct net_device *dev = request->dev;
	struct sk_buff *reply;
	struct nd_msg *ns, *na;
	struct ipv6hdr *pip6;
	u8 *daddr;
	int na_olen = 8; /* opt hdr + ETH_ALEN for target */
	int ns_olen;
	int opt_len;
	int i, len;

	if (dev == NULL || !pskb_may_pull(request, request->len))
		return NULL;

	ns = (struct nd_msg *)(ipv6_hdr(request) + 1);

	/* Scan the solicitation's options before allocating anything, so
	 * a malformed packet is rejected without any cleanup.
	 */
	daddr = eth_hdr(request)->h_source;
	ns_olen = request->len - skb_network_offset(request) -
		sizeof(struct ipv6hdr) - sizeof(*ns);
	for (i = 0; i < ns_olen - 1; i += opt_len) {
		opt_len = ns->opt[i + 1] << 3;

		/* A zero-length option would never advance i and spin
		 * here forever; an option running past the end of the
		 * packet would be read out of bounds below.
		 */
		if (opt_len == 0 || i + opt_len > ns_olen)
			return NULL;

		if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
			daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
			break;
		}
	}

	len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
		sizeof(*na) + na_olen + dev->needed_tailroom;
	reply = alloc_skb(len, GFP_ATOMIC);
	if (reply == NULL)
		return NULL;

	reply->protocol = htons(ETH_P_IPV6);
	reply->dev = dev;
	skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
	skb_push(reply, sizeof(struct ethhdr));
	skb_reset_mac_header(reply);

	/* Ethernet header */
	ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
	ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
	eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
	reply->protocol = htons(ETH_P_IPV6);

	skb_pull(reply, sizeof(struct ethhdr));
	skb_reset_network_header(reply);
	skb_put(reply, sizeof(struct ipv6hdr));

	/* IPv6 header */
	pip6 = ipv6_hdr(reply);
	memset(pip6, 0, sizeof(struct ipv6hdr));
	pip6->version = 6;
	pip6->priority = ipv6_hdr(request)->priority;
	pip6->nexthdr = IPPROTO_ICMPV6;
	pip6->hop_limit = 255;
	pip6->daddr = ipv6_hdr(request)->saddr;
	pip6->saddr = *(struct in6_addr *)n->primary_key;

	skb_pull(reply, sizeof(struct ipv6hdr));
	skb_reset_transport_header(reply);

	na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen);

	/* Neighbor Advertisement */
	memset(na, 0, sizeof(*na) + na_olen);
	na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
	na->icmph.icmp6_router = isrouter;
	na->icmph.icmp6_override = 1;
	na->icmph.icmp6_solicited = 1;
	na->target = ns->target;
	ether_addr_copy(&na->opt[2], n->ha);
	na->opt[0] = ND_OPT_TARGET_LL_ADDR;
	na->opt[1] = na_olen >> 3;

	na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
		&pip6->daddr, sizeof(*na) + na_olen, IPPROTO_ICMPV6,
		csum_partial(na, sizeof(*na) + na_olen, 0));

	pip6->payload_len = htons(sizeof(*na) + na_olen);

	/* Expose the IPv6 header again before handing the reply back. */
	skb_push(reply, sizeof(struct ipv6hdr));

	reply->ip_summed = CHECKSUM_UNNECESSARY;

	return reply;
}
2017-01-31 22:59:52 -08:00
/*
 * neigh_reduce - answer an IPv6 neighbor solicitation locally (proxy ND)
 * @dev: VXLAN device the solicitation is being transmitted on
 * @skb: frame carrying the solicitation; consumed on every path
 * @vni: VNI handed to the forwarding-database lookup
 *
 * The neighbour table is searched by the *target* address inside the
 * solicitation, not by the IPv6 destination (which is only the
 * solicited-node multicast address).  When a connected, non-bridge-local
 * neighbour is found, a neighbor advertisement is built and injected on the
 * receive path with netif_rx_ni(): this runs in the transmit path, so the
 * solicitor has to *receive* the answer rather than have it sent out over
 * the tunnel.  The advertisement's router flag is taken from NTF_ROUTER on
 * the matching fdb entry, if there is one.
 *
 * When no neighbour entry exists and VXLAN_F_L3MISS is set, an L3 miss is
 * reported for the target address.
 *
 * Return: always NETDEV_TX_OK.
 */
static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct ipv6hdr *ip6h;
	struct sk_buff *na_skb;
	struct vxlan_fdb *fdb;
	struct neighbour *neigh;
	struct nd_msg *ns;
	bool is_router;

	/* Nothing to do if the device has no IPv6 state attached. */
	if (!__in6_dev_get(dev))
		goto out;

	/* IPv6 header plus the fixed part of the ND message must be linear. */
	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)))
		goto out;

	ip6h = ipv6_hdr(skb);
	ns = (struct nd_msg *)(ip6h + 1);

	/* Only plain neighbor solicitations are handled here. */
	if (ns->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION ||
	    ns->icmph.icmp6_code != 0)
		goto out;

	if (ipv6_addr_is_multicast(&ns->target) ||
	    ipv6_addr_loopback(&ip6h->daddr))
		goto out;

	neigh = neigh_lookup(ipv6_stub->nd_tbl, &ns->target, dev);
	if (!neigh) {
		/* Unknown target: optionally report the miss. */
		if (vxlan->flags & VXLAN_F_L3MISS) {
			union vxlan_addr ipa = {
				.sin6.sin6_addr = ns->target,
				.sin6.sin6_family = AF_INET6,
			};

			vxlan_ip_miss(dev, &ipa);
		}
		goto out;
	}

	if (!(neigh->nud_state & NUD_CONNECTED))
		goto out_release;

	fdb = vxlan_find_mac(vxlan, neigh->ha, vni);
	if (fdb && vxlan_addr_any(&first_remote_rcu(fdb)->remote_ip))
		goto out_release;	/* bridge-local neighbor */

	is_router = fdb && (fdb->flags & NTF_ROUTER);
	na_skb = vxlan_na_create(skb, neigh, is_router);
	neigh_release(neigh);

	/* Deliver the proxy advertisement as if it had been received. */
	if (na_skb && netif_rx_ni(na_skb) == NET_RX_DROP)
		dev->stats.rx_dropped++;
	goto out;

out_release:
	neigh_release(neigh);
out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}
# endif
2012-11-20 02:50:14 +00:00
static bool route_shortcircuit ( struct net_device * dev , struct sk_buff * skb )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct neighbour * n ;
if ( is_multicast_ether_addr ( eth_hdr ( skb ) - > h_dest ) )
return false ;
n = NULL ;
switch ( ntohs ( eth_hdr ( skb ) - > h_proto ) ) {
case ETH_P_IP :
2013-08-31 13:44:34 +08:00
{
struct iphdr * pip ;
2012-11-20 02:50:14 +00:00
if ( ! pskb_may_pull ( skb , sizeof ( struct iphdr ) ) )
return false ;
pip = ip_hdr ( skb ) ;
n = neigh_lookup ( & arp_tbl , & pip - > daddr , dev ) ;
2013-08-31 13:44:33 +08:00
if ( ! n & & ( vxlan - > flags & VXLAN_F_L3MISS ) ) {
union vxlan_addr ipa = {
. sin . sin_addr . s_addr = pip - > daddr ,
2014-08-22 21:34:16 +02:00
. sin . sin_family = AF_INET ,
2013-08-31 13:44:33 +08:00
} ;
vxlan_ip_miss ( dev , & ipa ) ;
return false ;
}
2012-11-20 02:50:14 +00:00
break ;
2013-08-31 13:44:34 +08:00
}
# if IS_ENABLED(CONFIG_IPV6)
case ETH_P_IPV6 :
{
struct ipv6hdr * pip6 ;
if ( ! pskb_may_pull ( skb , sizeof ( struct ipv6hdr ) ) )
return false ;
pip6 = ipv6_hdr ( skb ) ;
n = neigh_lookup ( ipv6_stub - > nd_tbl , & pip6 - > daddr , dev ) ;
if ( ! n & & ( vxlan - > flags & VXLAN_F_L3MISS ) ) {
union vxlan_addr ipa = {
. sin6 . sin6_addr = pip6 - > daddr ,
2014-08-22 21:34:16 +02:00
. sin6 . sin6_family = AF_INET6 ,
2013-08-31 13:44:34 +08:00
} ;
vxlan_ip_miss ( dev , & ipa ) ;
return false ;
}
break ;
}
# endif
2012-11-20 02:50:14 +00:00
default :
return false ;
}
if ( n ) {
bool diff ;
drivers/net: Convert uses of compare_ether_addr to ether_addr_equal
Use the new bool function ether_addr_equal to add
some clarity and reduce the likelihood for misuse
of compare_ether_addr for sorting.
Done via cocci script: (and a little typing)
$ cat compare_ether_addr.cocci
@@
expression a,b;
@@
- !compare_ether_addr(a, b)
+ ether_addr_equal(a, b)
@@
expression a,b;
@@
- compare_ether_addr(a, b)
+ !ether_addr_equal(a, b)
@@
expression a,b;
@@
- !ether_addr_equal(a, b) == 0
+ ether_addr_equal(a, b)
@@
expression a,b;
@@
- !ether_addr_equal(a, b) != 0
+ !ether_addr_equal(a, b)
@@
expression a,b;
@@
- ether_addr_equal(a, b) == 0
+ !ether_addr_equal(a, b)
@@
expression a,b;
@@
- ether_addr_equal(a, b) != 0
+ ether_addr_equal(a, b)
@@
expression a,b;
@@
- !!ether_addr_equal(a, b)
+ ether_addr_equal(a, b)
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-09-01 11:51:23 -07:00
diff = ! ether_addr_equal ( eth_hdr ( skb ) - > h_dest , n - > ha ) ;
2012-11-20 02:50:14 +00:00
if ( diff ) {
memcpy ( eth_hdr ( skb ) - > h_source , eth_hdr ( skb ) - > h_dest ,
dev - > addr_len ) ;
memcpy ( eth_hdr ( skb ) - > h_dest , n - > ha , dev - > addr_len ) ;
}
neigh_release ( n ) ;
return diff ;
2013-08-31 13:44:33 +08:00
}
2012-11-20 02:50:14 +00:00
return false ;
}
2015-01-20 11:23:05 -08:00
/*
 * vxlan_build_gbp_hdr - fill in the Group Policy fields of a VXLAN header
 * @vxh: VXLAN header being built; also viewed as a struct vxlanhdr_gbp
 * @vxflags: device flags (not used by this function)
 * @md: metadata whose ->gbp value carries the policy ID and flag bits
 *
 * Does nothing when md->gbp is zero.  Otherwise marks the header with
 * VXLAN_HF_GBP, stores the policy ID (md->gbp masked with
 * VXLAN_GBP_ID_MASK) in network byte order, and sets the policy_applied
 * and dont_learn bits when the corresponding VXLAN_GBP_* bits are present.
 */
static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)vxh;

	if (!md->gbp)
		return;

	vxh->vx_flags |= VXLAN_HF_GBP;

	gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);

	if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
		gbp->policy_applied = 1;

	if (md->gbp & VXLAN_GBP_DONT_LEARN)
		gbp->dont_learn = 1;
}
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
/*
 * vxlan_build_gpe_hdr - fill in the VXLAN-GPE next-protocol fields
 * @vxh: VXLAN header being built; viewed as a struct vxlanhdr_gpe
 * @vxflags: device flags (not used by this function)
 * @protocol: inner protocol in network byte order
 *
 * Sets np_applied and translates @protocol into the matching
 * VXLAN_GPE_NP_* value.  IPv4, IPv6 and Ethernet (ETH_P_TEB) are supported.
 *
 * Return: 0 on success, -EPFNOSUPPORT for any other protocol.
 */
static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
			       __be16 protocol)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;

	/* Set before the protocol check, so it is set on the error path too. */
	gpe->np_applied = 1;

	if (protocol == htons(ETH_P_IP))
		gpe->next_protocol = VXLAN_GPE_NP_IPV4;
	else if (protocol == htons(ETH_P_IPV6))
		gpe->next_protocol = VXLAN_GPE_NP_IPV6;
	else if (protocol == htons(ETH_P_TEB))
		gpe->next_protocol = VXLAN_GPE_NP_ETHERNET;
	else
		return -EPFNOSUPPORT;

	return 0;
}
2016-02-02 18:09:16 +01:00
static int vxlan_build_skb ( struct sk_buff * skb , struct dst_entry * dst ,
int iphdr_len , __be32 vni ,
struct vxlan_metadata * md , u32 vxflags ,
2016-02-02 18:09:15 +01:00
bool udp_sum )
2013-08-31 13:44:33 +08:00
{
struct vxlanhdr * vxh ;
int min_headroom ;
int err ;
2015-01-12 17:00:38 -08:00
int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL ;
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
__be16 inner_protocol = htons ( ETH_P_TEB ) ;
2015-01-12 17:00:38 -08:00
2015-01-20 11:23:05 -08:00
if ( ( vxflags & VXLAN_F_REMCSUM_TX ) & &
2015-01-12 17:00:38 -08:00
skb - > ip_summed = = CHECKSUM_PARTIAL ) {
int csum_start = skb_checksum_start_offset ( skb ) ;
if ( csum_start < = VXLAN_MAX_REMCSUM_START & &
! ( csum_start & VXLAN_RCO_SHIFT_MASK ) & &
( skb - > csum_offset = = offsetof ( struct udphdr , check ) | |
2016-02-11 20:57:17 +00:00
skb - > csum_offset = = offsetof ( struct tcphdr , check ) ) )
2015-01-12 17:00:38 -08:00
type | = SKB_GSO_TUNNEL_REMCSUM ;
}
2013-08-31 13:44:33 +08:00
min_headroom = LL_RESERVED_SPACE ( dst - > dev ) + dst - > header_len
2016-11-13 20:43:52 -08:00
+ VXLAN_HLEN + iphdr_len ;
2013-08-19 11:23:22 -07:00
/* Need space for new headers (invalidates iph ptr) */
err = skb_cow_head ( skb , min_headroom ) ;
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
if ( unlikely ( err ) )
2016-11-13 20:43:54 -08:00
return err ;
2013-08-19 11:23:22 -07:00
2016-04-14 15:33:37 -04:00
err = iptunnel_handle_offloads ( skb , type ) ;
if ( err )
2016-11-13 20:43:54 -08:00
return err ;
2015-04-09 11:19:14 -07:00
2013-08-19 11:23:17 -07:00
vxh = ( struct vxlanhdr * ) __skb_push ( skb , sizeof ( * vxh ) ) ;
2016-02-16 21:58:58 +01:00
vxh - > vx_flags = VXLAN_HF_VNI ;
vxh - > vx_vni = vxlan_vni_field ( vni ) ;
2013-08-19 11:23:17 -07:00
2015-01-12 17:00:38 -08:00
if ( type & SKB_GSO_TUNNEL_REMCSUM ) {
2016-02-16 21:58:58 +01:00
unsigned int start ;
2015-01-12 17:00:38 -08:00
2016-02-16 21:58:58 +01:00
start = skb_checksum_start_offset ( skb ) - sizeof ( struct vxlanhdr ) ;
vxh - > vx_vni | = vxlan_compute_rco ( start , skb - > csum_offset ) ;
vxh - > vx_flags | = VXLAN_HF_RCO ;
2015-01-12 17:00:38 -08:00
if ( ! skb_is_gso ( skb ) ) {
skb - > ip_summed = CHECKSUM_NONE ;
skb - > encapsulation = 0 ;
}
}
2015-01-20 11:23:05 -08:00
if ( vxflags & VXLAN_F_GBP )
vxlan_build_gbp_hdr ( vxh , vxflags , md ) ;
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
if ( vxflags & VXLAN_F_GPE ) {
err = vxlan_build_gpe_hdr ( vxh , vxflags , skb - > protocol ) ;
if ( err < 0 )
2016-11-13 20:43:54 -08:00
return err ;
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
inner_protocol = skb - > protocol ;
}
vxlan: Group Policy extension
Implements supports for the Group Policy VXLAN extension [0] to provide
a lightweight and simple security label mechanism across network peers
based on VXLAN. The security context and associated metadata is mapped
to/from skb->mark. This allows further mapping to a SELinux context
using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
tc, etc.
The group membership is defined by the lower 16 bits of skb->mark, the
upper 16 bits are used for flags.
SELinux allows to manage label to secure local resources. However,
distributed applications require ACLs to implemented across hosts. This
is typically achieved by matching on L2-L4 fields to identify the
original sending host and process on the receiver. On top of that,
netlabel and specifically CIPSO [1] allow to map security contexts to
universal labels. However, netlabel and CIPSO are relatively complex.
This patch provides a lightweight alternative for overlay network
environments with a trusted underlay. No additional control protocol
is required.
Host 1: Host 2:
Group A Group B Group B Group A
+-----+ +-------------+ +-------+ +-----+
| lxc | | SELinux CTX | | httpd | | VM |
+--+--+ +--+----------+ +---+---+ +--+--+
\---+---/ \----+---/
| |
+---+---+ +---+---+
| vxlan | | vxlan |
+---+---+ +---+---+
+------------------------------+
Backwards compatibility:
A VXLAN-GBP socket can receive standard VXLAN frames and will assign
the default group 0x0000 to such frames. A Linux VXLAN socket will
drop VXLAN-GBP frames. The extension is therefore disabled by default
and needs to be specifically enabled:
ip link add [...] type vxlan [...] gbp
In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
must run on a separate port number.
Examples:
iptables:
host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200
host2# iptables -I INPUT -m mark --mark 0x200 -j DROP
OVS:
# ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL'
# ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop'
[0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
[1] http://lwn.net/Articles/204905/
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 03:53:55 +01:00
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
skb_set_inner_protocol ( skb , inner_protocol ) ;
2015-12-24 14:34:54 -08:00
return 0 ;
2013-08-19 11:23:17 -07:00
}
2016-11-13 20:43:55 -08:00
/* Look up the IPv4 route used to send an encapsulated packet to @daddr.
 *
 * @vxlan:     tunnel private data; the lookup is done in vxlan->net
 * @dev:       the VXLAN net_device; a route whose output device is @dev is
 *             rejected as circular
 * @sock4:     IPv4 tunnel socket; NULL makes the lookup fail with -EIO
 * @skb:       packet being sent; its mark is used as the flow mark
 * @oif:       output interface index placed in the flow key
 * @tos:       type of service; masked with RT_TOS() for the flow key
 * @daddr:     outer destination address
 * @saddr:     in: requested outer source address; out: overwritten with the
 *             source address of the flow after a successful lookup
 * @dport:     outer UDP destination port
 * @sport:     outer UDP source port
 * @dst_cache: route cache consulted and refreshed when caching is usable
 * @info:      tunnel metadata attached to the packet, may be NULL
 *
 * Returns the route on success, or ERR_PTR(-EIO), ERR_PTR(-ENETUNREACH) or
 * ERR_PTR(-ELOOP) on failure.
 */
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev,
				      struct vxlan_sock *sock4,
				      struct sk_buff *skb, int oif, u8 tos,
				      __be32 daddr, __be32 *saddr, __be16 dport, __be16 sport,
				      struct dst_cache *dst_cache,
				      const struct ip_tunnel_info *info)
{
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
	struct flowi4 fl4;
	struct rtable *rt;

	if (!sock4)
		return ERR_PTR(-EIO);

	/* A non-zero TOS without tunnel metadata bypasses the cache */
	if (tos && !info)
		use_cache = false;

	if (use_cache) {
		rt = dst_cache_get_ip4(dst_cache, saddr);
		if (rt)
			return rt;
	}

	memset(&fl4, 0, sizeof(fl4));
	fl4.flowi4_oif = oif;
	fl4.flowi4_tos = RT_TOS(tos);
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_proto = IPPROTO_UDP;
	fl4.daddr = daddr;
	fl4.saddr = *saddr;
	fl4.fl4_dport = dport;
	fl4.fl4_sport = sport;

	rt = ip_route_output_key(vxlan->net, &fl4);
	if (IS_ERR(rt)) {
		/* Every lookup failure is reported to the caller as unreachable */
		netdev_dbg(dev, "no route to %pI4\n", &daddr);
		return ERR_PTR(-ENETUNREACH);
	}

	if (rt->dst.dev == dev) {
		/* The route points back at this tunnel device */
		netdev_dbg(dev, "circular route to %pI4\n", &daddr);
		ip_rt_put(rt);
		return ERR_PTR(-ELOOP);
	}

	*saddr = fl4.saddr;
	if (use_cache)
		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);

	return rt;
}
2015-12-07 13:04:30 +01:00
# if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry * vxlan6_get_route ( struct vxlan_dev * vxlan ,
2016-11-13 20:43:55 -08:00
struct net_device * dev ,
2016-11-13 20:43:53 -08:00
struct vxlan_sock * sock6 ,
2016-03-04 15:15:08 +01:00
struct sk_buff * skb , int oif , u8 tos ,
2016-03-09 03:00:03 +01:00
__be32 label ,
2015-12-07 13:04:30 +01:00
const struct in6_addr * daddr ,
2016-02-12 15:43:56 +01:00
struct in6_addr * saddr ,
2017-01-11 15:18:53 +00:00
__be16 dport , __be16 sport ,
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit
The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan
device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c
("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage
when ip_tunnel_info is used is unfortunately not always valid as assumed.
While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF
however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre
with different remote dsts, tos, etc, therefore they cannot be assumed as
packet independent.
Right now vxlan, geneve, gre would cache the dst for eBPF and every packet
would reuse the same entry that was first created on the initial route
lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have
a different one.
Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test
in vxlan needs to be handeled differently in this context as it is currently
inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable()
helper is added for the three tunnel cases, which checks if we can use dst
cache.
Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device")
Fixes: 468dfffcd762 ("geneve: add dst caching support")
Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 15:15:07 +01:00
struct dst_cache * dst_cache ,
const struct ip_tunnel_info * info )
2015-12-07 13:04:30 +01:00
{
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit
The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan
device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c
("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage
when ip_tunnel_info is used is unfortunately not always valid as assumed.
While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF
however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre
with different remote dsts, tos, etc, therefore they cannot be assumed as
packet independent.
Right now vxlan, geneve, gre would cache the dst for eBPF and every packet
would reuse the same entry that was first created on the initial route
lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have
a different one.
Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test
in vxlan needs to be handeled differently in this context as it is currently
inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable()
helper is added for the three tunnel cases, which checks if we can use dst
cache.
Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device")
Fixes: 468dfffcd762 ("geneve: add dst caching support")
Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 15:15:07 +01:00
bool use_cache = ip_tunnel_dst_cache_usable ( skb , info ) ;
2015-12-07 13:04:30 +01:00
struct dst_entry * ndst ;
struct flowi6 fl6 ;
int err ;
2016-10-28 09:59:15 -07:00
if ( ! sock6 )
return ERR_PTR ( - EIO ) ;
2016-03-04 15:15:08 +01:00
if ( tos & & ! info )
use_cache = false ;
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit
The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan
device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c
("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage
when ip_tunnel_info is used is unfortunately not always valid as assumed.
While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF
however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre
with different remote dsts, tos, etc, therefore they cannot be assumed as
packet independent.
Right now vxlan, geneve, gre would cache the dst for eBPF and every packet
would reuse the same entry that was first created on the initial route
lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have
a different one.
Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test
in vxlan needs to be handeled differently in this context as it is currently
inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable()
helper is added for the three tunnel cases, which checks if we can use dst
cache.
Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device")
Fixes: 468dfffcd762 ("geneve: add dst caching support")
Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 15:15:07 +01:00
if ( use_cache ) {
2016-02-12 15:43:56 +01:00
ndst = dst_cache_get_ip6 ( dst_cache , saddr ) ;
if ( ndst )
return ndst ;
}
2015-12-07 13:04:30 +01:00
memset ( & fl6 , 0 , sizeof ( fl6 ) ) ;
fl6 . flowi6_oif = oif ;
fl6 . daddr = * daddr ;
2016-08-05 17:45:36 -07:00
fl6 . saddr = * saddr ;
2016-03-18 18:37:57 +01:00
fl6 . flowlabel = ip6_make_flowinfo ( RT_TOS ( tos ) , label ) ;
2015-12-07 13:04:30 +01:00
fl6 . flowi6_mark = skb - > mark ;
fl6 . flowi6_proto = IPPROTO_UDP ;
2017-01-11 15:18:53 +00:00
fl6 . fl6_dport = dport ;
fl6 . fl6_sport = sport ;
2015-12-07 13:04:30 +01:00
err = ipv6_stub - > ipv6_dst_lookup ( vxlan - > net ,
2016-10-28 09:59:15 -07:00
sock6 - > sock - > sk ,
2015-12-07 13:04:30 +01:00
& ndst , & fl6 ) ;
2016-11-13 20:43:55 -08:00
if ( unlikely ( err < 0 ) ) {
netdev_dbg ( dev , " no route to %pI6 \n " , daddr ) ;
return ERR_PTR ( - ENETUNREACH ) ;
}
if ( unlikely ( ndst - > dev = = dev ) ) {
netdev_dbg ( dev , " circular route to %pI6 \n " , daddr ) ;
dst_release ( ndst ) ;
return ERR_PTR ( - ELOOP ) ;
}
2015-12-07 13:04:30 +01:00
* saddr = fl6 . saddr ;
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit
The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan
device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c
("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage
when ip_tunnel_info is used is unfortunately not always valid as assumed.
While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF
however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre
with different remote dsts, tos, etc, therefore they cannot be assumed as
packet independent.
Right now vxlan, geneve, gre would cache the dst for eBPF and every packet
would reuse the same entry that was first created on the initial route
lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have
a different one.
Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test
in vxlan needs to be handeled differently in this context as it is currently
inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable()
helper is added for the three tunnel cases, which checks if we can use dst
cache.
Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device")
Fixes: 468dfffcd762 ("geneve: add dst caching support")
Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 15:15:07 +01:00
if ( use_cache )
2016-02-12 15:43:56 +01:00
dst_cache_set_ip6 ( dst_cache , ndst , saddr ) ;
2015-12-07 13:04:30 +01:00
return ndst ;
}
# endif
2013-04-02 12:31:52 +00:00
/* Bypass encapsulation if the destination is local.
 *
 * Hands @skb straight to the receive path of @dst_vxlan instead of
 * encapsulating it: the packet is retargeted at dst_vxlan->dev, pulled up to
 * its network offset and passed to netif_rx().
 *
 * @skb:       packet to deliver locally
 * @src_vxlan: sending tunnel; its per-cpu tx counters are updated
 * @dst_vxlan: receiving tunnel; its per-cpu rx counters (or rx_dropped) are
 *             updated depending on the netif_rx() result
 * @vni:       VNI passed to vxlan_snoop() when @dst_vxlan has learning enabled
 */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
			       struct vxlan_dev *dst_vxlan, __be32 vni)
{
	struct pcpu_sw_netstats *tx_stats, *rx_stats;
	union vxlan_addr loopback;
	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
	struct net_device *rx_dev = dst_vxlan->dev;
	/* Length is sampled before the skb is handed to netif_rx() */
	int len = skb->len;

	tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
	rx_stats = this_cpu_ptr(rx_dev->tstats);

	skb->pkt_type = PACKET_HOST;
	skb->encapsulation = 0;
	skb->dev = rx_dev;
	__skb_pull(skb, skb_network_offset(skb));

	/* Learned entries for bypassed traffic point at the loopback address
	 * of the receiving tunnel's address family.
	 */
	if (remote_ip->sa.sa_family == AF_INET) {
		loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		loopback.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		loopback.sin6.sin6_addr = in6addr_loopback;
		loopback.sa.sa_family = AF_INET6;
#endif
	}

	if (dst_vxlan->flags & VXLAN_F_LEARN)
		vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source, vni);

	u64_stats_update_begin(&tx_stats->syncp);
	tx_stats->tx_packets++;
	tx_stats->tx_bytes += len;
	u64_stats_update_end(&tx_stats->syncp);

	if (netif_rx(skb) == NET_RX_SUCCESS) {
		u64_stats_update_begin(&rx_stats->syncp);
		rx_stats->rx_packets++;
		rx_stats->rx_bytes += len;
		u64_stats_update_end(&rx_stats->syncp);
	} else {
		/* Charge the drop to the receiving device, the same one whose
		 * rx counters would have been bumped on success.
		 */
		rx_dev->stats.rx_dropped++;
	}
}
2016-11-13 20:43:56 -08:00
static int encap_bypass_if_local ( struct sk_buff * skb , struct net_device * dev ,
struct vxlan_dev * vxlan , union vxlan_addr * daddr ,
2017-01-18 15:24:57 -05:00
__be16 dst_port , __be32 vni , struct dst_entry * dst ,
2016-11-13 20:43:56 -08:00
u32 rt_flags )
{
# if IS_ENABLED(CONFIG_IPV6)
/* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
* RTF_LOCAL is equal to RTCF_LOCAL . So to keep code simple
* we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry .
*/
BUILD_BUG_ON ( RTCF_LOCAL ! = RTF_LOCAL ) ;
# endif
/* Bypass encapsulation if the destination is local */
if ( rt_flags & RTCF_LOCAL & &
! ( rt_flags & ( RTCF_BROADCAST | RTCF_MULTICAST ) ) ) {
struct vxlan_dev * dst_vxlan ;
dst_release ( dst ) ;
dst_vxlan = vxlan_find_vni ( vxlan - > net , vni ,
daddr - > sa . sa_family , dst_port ,
vxlan - > flags ) ;
if ( ! dst_vxlan ) {
dev - > stats . tx_errors + + ;
kfree_skb ( skb ) ;
return - ENOENT ;
}
2017-01-31 22:59:52 -08:00
vxlan_encap_bypass ( skb , vxlan , dst_vxlan , vni ) ;
2016-11-13 20:43:56 -08:00
return 1 ;
}
return 0 ;
}
2013-06-17 14:16:11 -07:00
static void vxlan_xmit_one ( struct sk_buff * skb , struct net_device * dev ,
2017-01-31 22:59:52 -08:00
__be32 default_vni , struct vxlan_rdst * rdst ,
bool did_rsc )
2012-10-01 12:32:35 +00:00
{
2016-02-12 15:43:57 +01:00
struct dst_cache * dst_cache ;
2015-07-21 10:44:00 +02:00
struct ip_tunnel_info * info ;
2012-10-01 12:32:35 +00:00
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2016-11-13 20:43:57 -08:00
const struct iphdr * old_iph = ip_hdr ( skb ) ;
2013-08-31 13:44:33 +08:00
union vxlan_addr * dst ;
2016-08-05 17:45:36 -07:00
union vxlan_addr remote_ip , local_ip ;
2015-07-21 10:43:58 +02:00
struct vxlan_metadata _md ;
struct vxlan_metadata * md = & _md ;
2013-08-31 13:44:33 +08:00
__be16 src_port = 0 , dst_port ;
2016-11-13 20:43:55 -08:00
struct dst_entry * ndst = NULL ;
2016-03-09 03:00:03 +01:00
__be32 vni , label ;
2012-10-01 12:32:35 +00:00
__u8 tos , ttl ;
2013-06-17 17:49:56 -07:00
int err ;
2015-07-21 10:43:58 +02:00
u32 flags = vxlan - > flags ;
2016-02-02 18:09:15 +01:00
bool udp_sum = false ;
2016-02-02 18:09:16 +01:00
bool xnet = ! net_eq ( vxlan - > net , dev_net ( vxlan - > dev ) ) ;
2012-10-01 12:32:35 +00:00
2015-08-20 13:56:25 +02:00
info = skb_tunnel_info ( skb ) ;
2015-07-21 10:44:00 +02:00
2015-07-21 10:43:58 +02:00
if ( rdst ) {
2016-11-13 20:43:57 -08:00
dst = & rdst - > remote_ip ;
if ( vxlan_addr_any ( dst ) ) {
if ( did_rsc ) {
/* short-circuited back to local bridge */
2017-01-31 22:59:52 -08:00
vxlan_encap_bypass ( skb , vxlan , vxlan , default_vni ) ;
2016-11-13 20:43:57 -08:00
return ;
}
goto drop ;
}
2015-07-21 10:44:02 +02:00
dst_port = rdst - > remote_port ? rdst - > remote_port : vxlan - > cfg . dst_port ;
2017-01-31 22:59:52 -08:00
vni = ( rdst - > remote_vni ) ? : default_vni ;
2017-02-24 17:47:11 +00:00
local_ip = vxlan - > cfg . saddr ;
2016-02-12 15:43:57 +01:00
dst_cache = & rdst - > dst_cache ;
2016-11-13 20:43:57 -08:00
md - > gbp = skb - > mark ;
ttl = vxlan - > cfg . ttl ;
if ( ! ttl & & vxlan_addr_multicast ( dst ) )
ttl = 1 ;
tos = vxlan - > cfg . tos ;
if ( tos = = 1 )
tos = ip_tunnel_get_dsfield ( old_iph , skb ) ;
if ( dst - > sa . sa_family = = AF_INET )
udp_sum = ! ( flags & VXLAN_F_UDP_ZERO_CSUM_TX ) ;
else
udp_sum = ! ( flags & VXLAN_F_UDP_ZERO_CSUM6_TX ) ;
label = vxlan - > cfg . label ;
2015-07-21 10:43:58 +02:00
} else {
if ( ! info ) {
WARN_ONCE ( 1 , " %s: Missing encapsulation instructions \n " ,
dev - > name ) ;
goto drop ;
}
2015-09-24 13:50:02 +02:00
remote_ip . sa . sa_family = ip_tunnel_info_af ( info ) ;
2016-08-05 17:45:36 -07:00
if ( remote_ip . sa . sa_family = = AF_INET ) {
2015-08-20 13:56:30 +02:00
remote_ip . sin . sin_addr . s_addr = info - > key . u . ipv4 . dst ;
2016-08-05 17:45:36 -07:00
local_ip . sin . sin_addr . s_addr = info - > key . u . ipv4 . src ;
} else {
2015-08-20 13:56:30 +02:00
remote_ip . sin6 . sin6_addr = info - > key . u . ipv6 . dst ;
2016-08-05 17:45:36 -07:00
local_ip . sin6 . sin6_addr = info - > key . u . ipv6 . src ;
}
2015-07-21 10:43:58 +02:00
dst = & remote_ip ;
2016-11-13 20:43:57 -08:00
dst_port = info - > key . tp_dst ? : vxlan - > cfg . dst_port ;
vni = tunnel_id_to_key32 ( info - > key . tun_id ) ;
2016-02-12 15:43:57 +01:00
dst_cache = & info - > dst_cache ;
2016-11-13 20:43:57 -08:00
if ( info - > options_len )
md = ip_tunnel_info_opts ( info ) ;
2015-08-20 13:56:30 +02:00
ttl = info - > key . ttl ;
tos = info - > key . tos ;
2016-03-09 03:00:03 +01:00
label = info - > key . label ;
2016-02-02 18:09:15 +01:00
udp_sum = ! ! ( info - > key . tun_flags & TUNNEL_CSUM ) ;
2015-08-20 13:56:30 +02:00
}
2016-11-13 20:43:57 -08:00
src_port = udp_flow_src_port ( dev_net ( dev ) , skb , vxlan - > cfg . port_min ,
vxlan - > cfg . port_max , true ) ;
2015-08-20 13:56:30 +02:00
2017-02-24 11:43:36 -08:00
rcu_read_lock ( ) ;
2013-08-31 13:44:33 +08:00
if ( dst - > sa . sa_family = = AF_INET ) {
2016-10-28 09:59:15 -07:00
struct vxlan_sock * sock4 = rcu_dereference ( vxlan - > vn4_sock ) ;
2016-11-13 20:43:54 -08:00
struct rtable * rt ;
2016-11-13 20:43:57 -08:00
__be16 df = 0 ;
2016-10-28 09:59:15 -07:00
2016-11-13 20:43:55 -08:00
rt = vxlan_get_route ( vxlan , dev , sock4 , skb ,
2016-02-02 18:09:14 +01:00
rdst ? rdst - > remote_ifindex : 0 , tos ,
2016-08-05 17:45:36 -07:00
dst - > sin . sin_addr . s_addr ,
2017-02-24 17:47:11 +00:00
& local_ip . sin . sin_addr . s_addr ,
2017-01-11 15:18:53 +00:00
dst_port , src_port ,
2016-02-12 15:43:57 +01:00
dst_cache , info ) ;
2016-11-15 16:32:11 -05:00
if ( IS_ERR ( rt ) ) {
err = PTR_ERR ( rt ) ;
2016-11-13 20:43:54 -08:00
goto tx_error ;
2016-11-15 16:32:11 -05:00
}
2013-08-31 13:44:33 +08:00
/* Bypass encapsulation if the destination is local */
2016-11-13 20:43:56 -08:00
if ( ! info ) {
err = encap_bypass_if_local ( skb , dev , vxlan , dst ,
dst_port , vni , & rt - > dst ,
rt - > rt_flags ) ;
if ( err )
2017-02-24 11:43:36 -08:00
goto out_unlock ;
2016-11-13 20:43:56 -08:00
} else if ( info - > key . tun_flags & TUNNEL_DONT_FRAGMENT ) {
2016-02-19 11:26:31 -08:00
df = htons ( IP_DF ) ;
2016-11-13 20:43:56 -08:00
}
2016-02-19 11:26:31 -08:00
2016-11-13 20:43:54 -08:00
ndst = & rt - > dst ;
2013-08-31 13:44:33 +08:00
tos = ip_tunnel_ecn_encap ( tos , old_iph , skb ) ;
ttl = ttl ? : ip4_dst_hoplimit ( & rt - > dst ) ;
2016-11-13 20:43:54 -08:00
err = vxlan_build_skb ( skb , ndst , sizeof ( struct iphdr ) ,
2016-02-16 21:58:58 +01:00
vni , md , flags , udp_sum ) ;
2016-02-02 18:09:16 +01:00
if ( err < 0 )
2016-11-13 20:43:54 -08:00
goto tx_error ;
2016-02-02 18:09:16 +01:00
2017-02-24 17:47:11 +00:00
udp_tunnel_xmit_skb ( rt , sock4 - > sock - > sk , skb , local_ip . sin . sin_addr . s_addr ,
2016-02-02 18:09:16 +01:00
dst - > sin . sin_addr . s_addr , tos , ttl , df ,
src_port , dst_port , xnet , ! udp_sum ) ;
2013-08-31 13:44:33 +08:00
# if IS_ENABLED(CONFIG_IPV6)
} else {
2016-10-28 09:59:15 -07:00
struct vxlan_sock * sock6 = rcu_dereference ( vxlan - > vn6_sock ) ;
2013-08-31 13:44:33 +08:00
2016-11-13 20:43:55 -08:00
ndst = vxlan6_get_route ( vxlan , dev , sock6 , skb ,
2016-03-04 15:15:08 +01:00
rdst ? rdst - > remote_ifindex : 0 , tos ,
2016-08-05 17:45:36 -07:00
label , & dst - > sin6 . sin6_addr ,
2017-02-24 17:47:11 +00:00
& local_ip . sin6 . sin6_addr ,
2017-01-11 15:18:53 +00:00
dst_port , src_port ,
bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit
The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan
device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c
("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage
when ip_tunnel_info is used is unfortunately not always valid as assumed.
While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF
however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre
with different remote dsts, tos, etc, therefore they cannot be assumed as
packet independent.
Right now vxlan, geneve, gre would cache the dst for eBPF and every packet
would reuse the same entry that was first created on the initial route
lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have
a different one.
Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test
in vxlan needs to be handled differently in this context as it is currently
inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable()
helper is added for the three tunnel cases, which checks if we can use dst
cache.
Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device")
Fixes: 468dfffcd762 ("geneve: add dst caching support")
Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-04 15:15:07 +01:00
dst_cache , info ) ;
2015-12-07 13:04:30 +01:00
if ( IS_ERR ( ndst ) ) {
2016-11-15 16:32:11 -05:00
err = PTR_ERR ( ndst ) ;
2016-11-13 20:43:54 -08:00
ndst = NULL ;
2013-04-02 12:31:52 +00:00
goto tx_error ;
2013-08-31 13:44:33 +08:00
}
2016-11-13 20:43:55 -08:00
2016-11-13 20:43:56 -08:00
if ( ! info ) {
u32 rt6i_flags = ( ( struct rt6_info * ) ndst ) - > rt6i_flags ;
2013-08-19 11:23:17 -07:00
2016-11-13 20:43:56 -08:00
err = encap_bypass_if_local ( skb , dev , vxlan , dst ,
dst_port , vni , ndst ,
rt6i_flags ) ;
if ( err )
2017-02-24 11:43:36 -08:00
goto out_unlock ;
2016-11-13 20:43:56 -08:00
}
2016-01-20 16:22:47 -08:00
2016-03-04 15:15:08 +01:00
tos = ip_tunnel_ecn_encap ( tos , old_iph , skb ) ;
2013-08-31 13:44:33 +08:00
ttl = ttl ? : ip6_dst_hoplimit ( ndst ) ;
2016-02-02 18:09:16 +01:00
skb_scrub_packet ( skb , xnet ) ;
err = vxlan_build_skb ( skb , ndst , sizeof ( struct ipv6hdr ) ,
2016-02-16 21:58:58 +01:00
vni , md , flags , udp_sum ) ;
2016-11-13 20:43:54 -08:00
if ( err < 0 )
goto tx_error ;
2016-11-13 20:43:57 -08:00
udp_tunnel6_xmit_skb ( ndst , sock6 - > sock - > sk , skb , dev ,
2017-02-24 17:47:11 +00:00
& local_ip . sin6 . sin6_addr ,
2016-08-05 17:45:36 -07:00
& dst - > sin6 . sin6_addr , tos , ttl ,
2016-03-09 03:00:03 +01:00
label , src_port , dst_port , ! udp_sum ) ;
2013-08-31 13:44:33 +08:00
# endif
}
2017-02-24 11:43:36 -08:00
out_unlock :
rcu_read_unlock ( ) ;
2013-06-17 14:16:11 -07:00
return ;
2012-10-01 12:32:35 +00:00
drop :
dev - > stats . tx_dropped + + ;
2016-11-13 20:43:54 -08:00
dev_kfree_skb ( skb ) ;
return ;
2012-10-01 12:32:35 +00:00
tx_error :
2017-02-24 11:43:36 -08:00
rcu_read_unlock ( ) ;
2016-11-13 20:43:55 -08:00
if ( err = = - ELOOP )
dev - > stats . collisions + + ;
else if ( err = = - ENETUNREACH )
dev - > stats . tx_carrier_errors + + ;
2016-11-13 20:43:54 -08:00
dst_release ( ndst ) ;
2012-10-01 12:32:35 +00:00
dev - > stats . tx_errors + + ;
2016-11-13 20:43:54 -08:00
kfree_skb ( skb ) ;
2012-10-01 12:32:35 +00:00
}
2013-03-15 04:35:51 +00:00
/* Transmit local packets over Vxlan
*
* Outer IP header inherits ECN and DF from inner header .
* Outer UDP destination is the VXLAN assigned port .
* source port is based on hash of flow
*/
static netdev_tx_t vxlan_xmit ( struct sk_buff * skb , struct net_device * dev )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2015-07-21 10:44:00 +02:00
const struct ip_tunnel_info * info ;
2013-03-15 04:35:51 +00:00
struct ethhdr * eth ;
bool did_rsc = false ;
2014-01-06 09:54:31 -08:00
struct vxlan_rdst * rdst , * fdst = NULL ;
2013-03-15 04:35:51 +00:00
struct vxlan_fdb * f ;
2017-01-31 22:59:52 -08:00
__be32 vni = 0 ;
2013-03-15 04:35:51 +00:00
2015-08-20 13:56:25 +02:00
info = skb_tunnel_info ( skb ) ;
2015-07-21 10:44:00 +02:00
2013-03-15 04:35:51 +00:00
skb_reset_mac_header ( skb ) ;
2016-04-05 14:47:11 +02:00
if ( vxlan - > flags & VXLAN_F_COLLECT_METADATA ) {
2017-01-31 22:59:52 -08:00
if ( info & & info - > mode & IP_TUNNEL_INFO_BRIDGE & &
info - > mode & IP_TUNNEL_INFO_TX ) {
vni = tunnel_id_to_key32 ( info - > key . tun_id ) ;
} else {
if ( info & & info - > mode & IP_TUNNEL_INFO_TX )
vxlan_xmit_one ( skb , dev , vni , NULL , false ) ;
else
kfree_skb ( skb ) ;
return NETDEV_TX_OK ;
}
2016-04-05 14:47:11 +02:00
}
if ( vxlan - > flags & VXLAN_F_PROXY ) {
eth = eth_hdr ( skb ) ;
2013-08-31 13:44:36 +08:00
if ( ntohs ( eth - > h_proto ) = = ETH_P_ARP )
2017-01-31 22:59:52 -08:00
return arp_reduce ( dev , skb , vni ) ;
2013-08-31 13:44:36 +08:00
# if IS_ENABLED(CONFIG_IPV6)
2017-04-02 11:00:06 +02:00
else if ( ntohs ( eth - > h_proto ) = = ETH_P_IPV6 ) {
struct ipv6hdr * hdr , _hdr ;
if ( ( hdr = skb_header_pointer ( skb ,
skb_network_offset ( skb ) ,
sizeof ( _hdr ) , & _hdr ) ) & &
hdr - > nexthdr = = IPPROTO_ICMPV6 )
return neigh_reduce ( dev , skb , vni ) ;
2013-08-31 13:44:36 +08:00
}
# endif
}
2013-03-15 04:35:51 +00:00
2016-04-05 14:47:11 +02:00
eth = eth_hdr ( skb ) ;
2017-01-31 22:59:52 -08:00
f = vxlan_find_mac ( vxlan , eth - > h_dest , vni ) ;
2013-04-19 00:36:26 +00:00
did_rsc = false ;
if ( f & & ( f - > flags & NTF_ROUTER ) & & ( vxlan - > flags & VXLAN_F_RSC ) & &
2013-08-31 13:44:34 +08:00
( ntohs ( eth - > h_proto ) = = ETH_P_IP | |
ntohs ( eth - > h_proto ) = = ETH_P_IPV6 ) ) {
2013-04-19 00:36:26 +00:00
did_rsc = route_shortcircuit ( dev , skb ) ;
if ( did_rsc )
2017-01-31 22:59:52 -08:00
f = vxlan_find_mac ( vxlan , eth - > h_dest , vni ) ;
2013-04-19 00:36:26 +00:00
}
2013-03-15 04:35:51 +00:00
if ( f = = NULL ) {
2017-01-31 22:59:52 -08:00
f = vxlan_find_mac ( vxlan , all_zeros_mac , vni ) ;
2013-06-25 16:01:51 +03:00
if ( f = = NULL ) {
if ( ( vxlan - > flags & VXLAN_F_L2MISS ) & &
! is_multicast_ether_addr ( eth - > h_dest ) )
vxlan_fdb_miss ( vxlan , eth - > h_dest ) ;
dev - > stats . tx_dropped + + ;
2014-01-06 09:54:31 -08:00
kfree_skb ( skb ) ;
2013-06-25 16:01:51 +03:00
return NETDEV_TX_OK ;
}
}
2013-03-15 04:35:51 +00:00
2013-06-25 16:01:51 +03:00
list_for_each_entry_rcu ( rdst , & f - > remotes , list ) {
struct sk_buff * skb1 ;
2013-03-15 04:35:51 +00:00
2014-01-06 09:54:31 -08:00
if ( ! fdst ) {
fdst = rdst ;
continue ;
}
2013-06-25 16:01:51 +03:00
skb1 = skb_clone ( skb , GFP_ATOMIC ) ;
if ( skb1 )
2017-01-31 22:59:52 -08:00
vxlan_xmit_one ( skb1 , dev , vni , rdst , did_rsc ) ;
2013-03-15 04:35:51 +00:00
}
2014-01-06 09:54:31 -08:00
if ( fdst )
2017-01-31 22:59:52 -08:00
vxlan_xmit_one ( skb , dev , vni , fdst , did_rsc ) ;
2014-01-06 09:54:31 -08:00
else
kfree_skb ( skb ) ;
2013-06-17 14:16:11 -07:00
return NETDEV_TX_OK ;
2013-03-15 04:35:51 +00:00
}
2012-10-01 12:32:35 +00:00
/* Walk the forwarding table and purge stale entries */
static void vxlan_cleanup ( unsigned long arg )
{
struct vxlan_dev * vxlan = ( struct vxlan_dev * ) arg ;
unsigned long next_timer = jiffies + FDB_AGE_INTERVAL ;
unsigned int h ;
if ( ! netif_running ( vxlan - > dev ) )
return ;
for ( h = 0 ; h < FDB_HASH_SIZE ; + + h ) {
struct hlist_node * p , * n ;
2015-05-26 10:42:04 +03:00
spin_lock_bh ( & vxlan - > hash_lock ) ;
2012-10-01 12:32:35 +00:00
hlist_for_each_safe ( p , n , & vxlan - > fdb_head [ h ] ) {
struct vxlan_fdb * f
= container_of ( p , struct vxlan_fdb , hlist ) ;
unsigned long timeout ;
2017-01-23 20:44:33 -08:00
if ( f - > state & ( NUD_PERMANENT | NUD_NOARP ) )
2012-10-01 12:32:35 +00:00
continue ;
2017-03-27 15:46:41 -07:00
if ( f - > flags & NTF_EXT_LEARNED )
continue ;
2015-07-21 10:44:02 +02:00
timeout = f - > used + vxlan - > cfg . age_interval * HZ ;
2012-10-01 12:32:35 +00:00
if ( time_before_eq ( timeout , jiffies ) ) {
netdev_dbg ( vxlan - > dev ,
" garbage collect %pM \n " ,
f - > eth_addr ) ;
f - > state = NUD_STALE ;
vxlan_fdb_destroy ( vxlan , f ) ;
} else if ( time_before ( timeout , next_timer ) )
next_timer = timeout ;
}
2015-05-26 10:42:04 +03:00
spin_unlock_bh ( & vxlan - > hash_lock ) ;
2012-10-01 12:32:35 +00:00
}
mod_timer ( & vxlan - > age_timer , next_timer ) ;
}
2013-08-19 11:22:48 -07:00
static void vxlan_vs_add_dev ( struct vxlan_sock * vs , struct vxlan_dev * vxlan )
{
2015-03-18 14:50:44 -03:00
struct vxlan_net * vn = net_generic ( vxlan - > net , vxlan_net_id ) ;
2016-02-16 21:58:58 +01:00
__be32 vni = vxlan - > default_dst . remote_vni ;
2013-08-19 11:22:48 -07:00
2015-03-18 14:50:44 -03:00
spin_lock ( & vn - > sock_lock ) ;
2013-08-19 11:22:48 -07:00
hlist_add_head_rcu ( & vxlan - > hlist , vni_head ( vs , vni ) ) ;
2015-03-18 14:50:44 -03:00
spin_unlock ( & vn - > sock_lock ) ;
2013-08-19 11:22:48 -07:00
}
2012-10-01 12:32:35 +00:00
/* Setup stats when device is created */
static int vxlan_init ( struct net_device * dev )
{
2014-02-13 11:46:28 -08:00
dev - > tstats = netdev_alloc_pcpu_stats ( struct pcpu_sw_netstats ) ;
2013-03-25 14:49:46 +00:00
if ( ! dev - > tstats )
2012-10-01 12:32:35 +00:00
return - ENOMEM ;
return 0 ;
}
2017-01-31 22:59:52 -08:00
/* Destroy the all-zeros-MAC forwarding entry for @vni, if one exists.
 * Lookup and destruction both happen under vxlan->hash_lock.
 */
static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni)
{
	struct vxlan_fdb *def_entry;

	spin_lock_bh(&vxlan->hash_lock);

	def_entry = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
	if (def_entry != NULL)
		vxlan_fdb_destroy(vxlan, def_entry);

	spin_unlock_bh(&vxlan->hash_lock);
}
2013-06-17 14:16:11 -07:00
static void vxlan_uninit ( struct net_device * dev )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2017-01-31 22:59:52 -08:00
vxlan_fdb_delete_default ( vxlan , vxlan - > cfg . vni ) ;
2013-06-25 16:01:51 +03:00
2013-06-17 14:16:11 -07:00
free_percpu ( dev - > tstats ) ;
}
2012-10-01 12:32:35 +00:00
/* Start ageing timer and join group when device is brought up */
static int vxlan_open ( struct net_device * dev )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2015-09-24 13:50:01 +02:00
int ret ;
2015-03-18 14:50:44 -03:00
2015-09-24 13:50:01 +02:00
ret = vxlan_sock_add ( vxlan ) ;
if ( ret < 0 )
return ret ;
2012-10-01 12:32:35 +00:00
2013-12-10 16:37:32 +08:00
if ( vxlan_addr_multicast ( & vxlan - > default_dst . remote_ip ) ) {
2015-03-18 14:50:44 -03:00
ret = vxlan_igmp_join ( vxlan ) ;
2015-08-25 20:22:35 -03:00
if ( ret = = - EADDRINUSE )
ret = 0 ;
2015-03-18 14:50:44 -03:00
if ( ret ) {
2015-09-24 13:50:01 +02:00
vxlan_sock_release ( vxlan ) ;
2015-03-18 14:50:44 -03:00
return ret ;
}
2012-10-01 12:32:35 +00:00
}
2015-07-21 10:44:02 +02:00
if ( vxlan - > cfg . age_interval )
2012-10-01 12:32:35 +00:00
mod_timer ( & vxlan - > age_timer , jiffies + FDB_AGE_INTERVAL ) ;
2015-03-18 14:50:44 -03:00
return ret ;
2012-10-01 12:32:35 +00:00
}
/* Purge the forwarding table */
2017-01-23 20:44:32 -08:00
static void vxlan_flush ( struct vxlan_dev * vxlan , bool do_all )
2012-10-01 12:32:35 +00:00
{
2013-05-27 22:35:52 +00:00
unsigned int h ;
2012-10-01 12:32:35 +00:00
spin_lock_bh ( & vxlan - > hash_lock ) ;
for ( h = 0 ; h < FDB_HASH_SIZE ; + + h ) {
struct hlist_node * p , * n ;
hlist_for_each_safe ( p , n , & vxlan - > fdb_head [ h ] ) {
struct vxlan_fdb * f
= container_of ( p , struct vxlan_fdb , hlist ) ;
2017-01-23 20:44:32 -08:00
if ( ! do_all & & ( f - > state & ( NUD_PERMANENT | NUD_NOARP ) ) )
continue ;
2013-06-25 16:01:51 +03:00
/* the all_zeros_mac entry is deleted at vxlan_uninit */
if ( ! is_zero_ether_addr ( f - > eth_addr ) )
vxlan_fdb_destroy ( vxlan , f ) ;
2012-10-01 12:32:35 +00:00
}
}
spin_unlock_bh ( & vxlan - > hash_lock ) ;
}
/* Cleanup timer and forwarding table on shutdown */
static int vxlan_stop ( struct net_device * dev )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2014-04-24 10:02:49 +02:00
struct vxlan_net * vn = net_generic ( vxlan - > net , vxlan_net_id ) ;
2015-03-18 14:50:44 -03:00
int ret = 0 ;
2012-10-01 12:32:35 +00:00
2015-03-23 16:23:12 -03:00
if ( vxlan_addr_multicast ( & vxlan - > default_dst . remote_ip ) & &
2015-04-08 14:48:30 -07:00
! vxlan_group_used ( vn , vxlan ) )
2015-03-18 14:50:44 -03:00
ret = vxlan_igmp_leave ( vxlan ) ;
2012-10-01 12:32:35 +00:00
del_timer_sync ( & vxlan - > age_timer ) ;
2017-01-23 20:44:32 -08:00
vxlan_flush ( vxlan , false ) ;
2015-09-24 13:50:01 +02:00
vxlan_sock_release ( vxlan ) ;
2012-10-01 12:32:35 +00:00
2015-03-18 14:50:44 -03:00
return ret ;
2012-10-01 12:32:35 +00:00
}
/* Stub, nothing needs to be done.
 *
 * Installed as .ndo_set_rx_mode in vxlan_netdev_ether_ops; the body is
 * empty and @dev is not used.
 */
static void vxlan_set_multicast_list ( struct net_device * dev )
{
}
net: use core MTU range checking in core net infra
geneve:
- Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu
- This one isn't quite as straight-forward as others, could use some
closer inspection and testing
macvlan:
- set min/max_mtu
tun:
- set min/max_mtu, remove tun_net_change_mtu
vxlan:
- Merge __vxlan_change_mtu back into vxlan_change_mtu
- Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
- This one is also not as straight-forward and could use closer inspection
and testing from vxlan folks
bridge:
- set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
openvswitch:
- set min/max_mtu, remove internal_dev_change_mtu
- note: max_mtu wasn't checked previously, it's been set to 65535, which
is the largest possible size supported
sch_teql:
- set min/max_mtu (note: max_mtu previously unchecked, used max of 65535)
macsec:
- min_mtu = 0, max_mtu = 65535
macvlan:
- min_mtu = 0, max_mtu = 65535
ntb_netdev:
- min_mtu = 0, max_mtu = 65535
veth:
- min_mtu = 68, max_mtu = 65535
8021q:
- min_mtu = 0, max_mtu = 65535
CC: netdev@vger.kernel.org
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
CC: Tom Herbert <tom@herbertland.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Paolo Abeni <pabeni@redhat.com>
CC: Jiri Benc <jbenc@redhat.com>
CC: WANG Cong <xiyou.wangcong@gmail.com>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
CC: Pravin B Shelar <pshelar@ovn.org>
CC: Sabrina Dubroca <sd@queasysnail.net>
CC: Patrick McHardy <kaber@trash.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
CC: Pravin Shelar <pshelar@nicira.com>
CC: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 13:55:20 -04:00
static int vxlan_change_mtu ( struct net_device * dev , int new_mtu )
2013-12-18 00:21:08 +01:00
{
net: use core MTU range checking in core net infra
geneve:
- Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu
- This one isn't quite as straight-forward as others, could use some
closer inspection and testing
macvlan:
- set min/max_mtu
tun:
- set min/max_mtu, remove tun_net_change_mtu
vxlan:
- Merge __vxlan_change_mtu back into vxlan_change_mtu
- Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
- This one is also not as straight-forward and could use closer inspection
and testing from vxlan folks
bridge:
- set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
openvswitch:
- set min/max_mtu, remove internal_dev_change_mtu
- note: max_mtu wasn't checked previously, it's been set to 65535, which
is the largest possible size supported
sch_teql:
- set min/max_mtu (note: max_mtu previously unchecked, used max of 65535)
macsec:
- min_mtu = 0, max_mtu = 65535
macvlan:
- min_mtu = 0, max_mtu = 65535
ntb_netdev:
- min_mtu = 0, max_mtu = 65535
veth:
- min_mtu = 68, max_mtu = 65535
8021q:
- min_mtu = 0, max_mtu = 65535
CC: netdev@vger.kernel.org
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
CC: Tom Herbert <tom@herbertland.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Paolo Abeni <pabeni@redhat.com>
CC: Jiri Benc <jbenc@redhat.com>
CC: WANG Cong <xiyou.wangcong@gmail.com>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
CC: Pravin B Shelar <pshelar@ovn.org>
CC: Sabrina Dubroca <sd@queasysnail.net>
CC: Patrick McHardy <kaber@trash.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
CC: Pravin Shelar <pshelar@nicira.com>
CC: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 13:55:20 -04:00
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct vxlan_rdst * dst = & vxlan - > default_dst ;
struct net_device * lowerdev = __dev_get_by_index ( vxlan - > net ,
dst - > remote_ifindex ) ;
bool use_ipv6 = false ;
2013-12-18 00:21:08 +01:00
if ( dst - > remote_ip . sa . sa_family = = AF_INET6 )
net: use core MTU range checking in core net infra
geneve:
- Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu
- This one isn't quite as straight-forward as others, could use some
closer inspection and testing
macvlan:
- set min/max_mtu
tun:
- set min/max_mtu, remove tun_net_change_mtu
vxlan:
- Merge __vxlan_change_mtu back into vxlan_change_mtu
- Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
- This one is also not as straight-forward and could use closer inspection
and testing from vxlan folks
bridge:
- set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
openvswitch:
- set min/max_mtu, remove internal_dev_change_mtu
- note: max_mtu wasn't checked previously, it's been set to 65535, which
is the largest possible size supported
sch_teql:
- set min/max_mtu (note: max_mtu previously unchecked, used max of 65535)
macsec:
- min_mtu = 0, max_mtu = 65535
macvlan:
- min_mtu = 0, max_mtu = 65535
ntb_netdev:
- min_mtu = 0, max_mtu = 65535
veth:
- min_mtu = 68, max_mtu = 65535
8021q:
- min_mtu = 0, max_mtu = 65535
CC: netdev@vger.kernel.org
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
CC: Tom Herbert <tom@herbertland.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Paolo Abeni <pabeni@redhat.com>
CC: Jiri Benc <jbenc@redhat.com>
CC: WANG Cong <xiyou.wangcong@gmail.com>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
CC: Pravin B Shelar <pshelar@ovn.org>
CC: Sabrina Dubroca <sd@queasysnail.net>
CC: Patrick McHardy <kaber@trash.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
CC: Pravin Shelar <pshelar@nicira.com>
CC: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 13:55:20 -04:00
use_ipv6 = true ;
2013-12-18 00:21:08 +01:00
net: use core MTU range checking in core net infra
geneve:
- Merge __geneve_change_mtu back into geneve_change_mtu, set max_mtu
- This one isn't quite as straight-forward as others, could use some
closer inspection and testing
macvlan:
- set min/max_mtu
tun:
- set min/max_mtu, remove tun_net_change_mtu
vxlan:
- Merge __vxlan_change_mtu back into vxlan_change_mtu
- Set max_mtu to IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
- This one is also not as straight-forward and could use closer inspection
and testing from vxlan folks
bridge:
- set max_mtu of IP_MAX_MTU and retain dynamic MTU range checks in
change_mtu function
openvswitch:
- set min/max_mtu, remove internal_dev_change_mtu
- note: max_mtu wasn't checked previously, it's been set to 65535, which
is the largest possible size supported
sch_teql:
- set min/max_mtu (note: max_mtu previously unchecked, used max of 65535)
macsec:
- min_mtu = 0, max_mtu = 65535
macvlan:
- min_mtu = 0, max_mtu = 65535
ntb_netdev:
- min_mtu = 0, max_mtu = 65535
veth:
- min_mtu = 68, max_mtu = 65535
8021q:
- min_mtu = 0, max_mtu = 65535
CC: netdev@vger.kernel.org
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
CC: Tom Herbert <tom@herbertland.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Alexander Duyck <alexander.h.duyck@intel.com>
CC: Paolo Abeni <pabeni@redhat.com>
CC: Jiri Benc <jbenc@redhat.com>
CC: WANG Cong <xiyou.wangcong@gmail.com>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
CC: Pravin B Shelar <pshelar@ovn.org>
CC: Sabrina Dubroca <sd@queasysnail.net>
CC: Patrick McHardy <kaber@trash.net>
CC: Stephen Hemminger <stephen@networkplumber.org>
CC: Pravin Shelar <pshelar@nicira.com>
CC: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-10-20 13:55:20 -04:00
/* This check is different than dev->max_mtu, because it looks at
* the lowerdev - > mtu , rather than the static dev - > max_mtu
*/
if ( lowerdev ) {
int max_mtu = lowerdev - > mtu -
( use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM ) ;
if ( new_mtu > max_mtu )
2016-02-10 00:05:55 +00:00
return - EINVAL ;
}
2013-12-18 00:21:08 +01:00
dev - > mtu = new_mtu ;
return 0 ;
}
2015-10-22 18:17:16 -07:00
static int vxlan_fill_metadata_dst ( struct net_device * dev , struct sk_buff * skb )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct ip_tunnel_info * info = skb_tunnel_info ( skb ) ;
__be16 sport , dport ;
sport = udp_flow_src_port ( dev_net ( dev ) , skb , vxlan - > cfg . port_min ,
vxlan - > cfg . port_max , true ) ;
dport = info - > key . tp_dst ? : vxlan - > cfg . dst_port ;
2015-12-07 13:04:31 +01:00
if ( ip_tunnel_info_af ( info ) = = AF_INET ) {
2016-10-28 09:59:15 -07:00
struct vxlan_sock * sock4 = rcu_dereference ( vxlan - > vn4_sock ) ;
2016-02-02 18:09:14 +01:00
struct rtable * rt ;
2016-11-13 20:43:55 -08:00
rt = vxlan_get_route ( vxlan , dev , sock4 , skb , 0 , info - > key . tos ,
2016-02-02 18:09:14 +01:00
info - > key . u . ipv4 . dst ,
2017-02-17 19:14:27 +01:00
& info - > key . u . ipv4 . src , dport , sport ,
& info - > dst_cache , info ) ;
2016-02-02 18:09:14 +01:00
if ( IS_ERR ( rt ) )
return PTR_ERR ( rt ) ;
ip_rt_put ( rt ) ;
2015-12-07 13:04:31 +01:00
} else {
# if IS_ENABLED(CONFIG_IPV6)
2016-11-13 20:43:53 -08:00
struct vxlan_sock * sock6 = rcu_dereference ( vxlan - > vn6_sock ) ;
2015-12-07 13:04:31 +01:00
struct dst_entry * ndst ;
2016-11-13 20:43:55 -08:00
ndst = vxlan6_get_route ( vxlan , dev , sock6 , skb , 0 , info - > key . tos ,
2016-03-09 03:00:03 +01:00
info - > key . label , & info - > key . u . ipv6 . dst ,
2017-02-17 19:14:27 +01:00
& info - > key . u . ipv6 . src , dport , sport ,
& info - > dst_cache , info ) ;
2015-12-07 13:04:31 +01:00
if ( IS_ERR ( ndst ) )
return PTR_ERR ( ndst ) ;
dst_release ( ndst ) ;
# else /* !CONFIG_IPV6 */
return - EPFNOSUPPORT ;
# endif
}
2016-02-02 18:09:14 +01:00
info - > key . tp_src = sport ;
info - > key . tp_dst = dport ;
2015-12-07 13:04:31 +01:00
return 0 ;
2015-10-22 18:17:16 -07:00
}
2016-04-05 14:47:10 +02:00
static const struct net_device_ops vxlan_netdev_ether_ops = {
2012-10-01 12:32:35 +00:00
. ndo_init = vxlan_init ,
2013-06-17 14:16:11 -07:00
. ndo_uninit = vxlan_uninit ,
2012-10-01 12:32:35 +00:00
. ndo_open = vxlan_open ,
. ndo_stop = vxlan_stop ,
. ndo_start_xmit = vxlan_xmit ,
2013-03-25 14:49:46 +00:00
. ndo_get_stats64 = ip_tunnel_get_stats64 ,
2012-10-01 12:32:35 +00:00
. ndo_set_rx_mode = vxlan_set_multicast_list ,
2013-12-18 00:21:08 +01:00
. ndo_change_mtu = vxlan_change_mtu ,
2012-10-01 12:32:35 +00:00
. ndo_validate_addr = eth_validate_addr ,
. ndo_set_mac_address = eth_mac_addr ,
. ndo_fdb_add = vxlan_fdb_add ,
. ndo_fdb_del = vxlan_fdb_delete ,
. ndo_fdb_dump = vxlan_fdb_dump ,
2015-10-22 18:17:16 -07:00
. ndo_fill_metadata_dst = vxlan_fill_metadata_dst ,
2012-10-01 12:32:35 +00:00
} ;
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
/* Device operations selected by vxlan_raw_setup().  Compared with
 * vxlan_netdev_ether_ops this table has no rx-mode, address or FDB hooks.
 */
static const struct net_device_ops vxlan_netdev_raw_ops = {
	/* device lifecycle */
	.ndo_init		= vxlan_init,
	.ndo_uninit		= vxlan_uninit,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,

	/* data path and statistics */
	.ndo_start_xmit		= vxlan_xmit,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,

	/* configuration */
	.ndo_change_mtu		= vxlan_change_mtu,
};
2012-10-01 12:32:35 +00:00
/* Info for udev, that this is a virtual tunnel endpoint.
 * Attached to the device in vxlan_setup() via SET_NETDEV_DEVTYPE().
 */
static struct device_type vxlan_type = {
. name = " vxlan " ,
} ;
2016-07-11 13:12:28 +02:00
/* Calls the ndo_udp_tunnel_add of the caller in order to
2013-09-13 07:34:13 -07:00
* supply the listening VXLAN udp ports . Callers are expected
2016-07-11 13:12:28 +02:00
* to implement the ndo_udp_tunnel_add .
2013-09-04 02:13:38 -07:00
*/
2016-04-18 21:19:47 +02:00
static void vxlan_push_rx_ports ( struct net_device * dev )
2013-09-04 02:13:38 -07:00
{
struct vxlan_sock * vs ;
struct net * net = dev_net ( dev ) ;
struct vxlan_net * vn = net_generic ( net , vxlan_net_id ) ;
2013-09-13 07:34:13 -07:00
unsigned int i ;
2013-09-04 02:13:38 -07:00
spin_lock ( & vn - > sock_lock ) ;
for ( i = 0 ; i < PORT_HASH_SIZE ; + + i ) {
2016-06-16 12:20:52 -07:00
hlist_for_each_entry_rcu ( vs , & vn - > sock_list [ i ] , hlist )
udp_tunnel_push_rx_port ( dev , vs - > sock ,
2016-06-16 12:23:19 -07:00
( vs - > flags & VXLAN_F_GPE ) ?
UDP_TUNNEL_TYPE_VXLAN_GPE :
2016-06-16 12:20:52 -07:00
UDP_TUNNEL_TYPE_VXLAN ) ;
2013-09-04 02:13:38 -07:00
}
spin_unlock ( & vn - > sock_lock ) ;
}
2012-10-01 12:32:35 +00:00
/* Initialize the device structure. */
static void vxlan_setup ( struct net_device * dev )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2013-05-27 22:35:52 +00:00
unsigned int h ;
2012-10-01 12:32:35 +00:00
2016-04-28 16:36:30 +02:00
eth_hw_addr_random ( dev ) ;
ether_setup ( dev ) ;
2013-06-17 14:16:11 -07:00
dev - > destructor = free_netdev ;
2012-10-01 12:32:35 +00:00
SET_NETDEV_DEVTYPE ( dev , & vxlan_type ) ;
dev - > features | = NETIF_F_LLTX ;
2012-12-07 14:14:16 +00:00
dev - > features | = NETIF_F_SG | NETIF_F_HW_CSUM ;
2012-12-07 14:14:18 +00:00
dev - > features | = NETIF_F_RXCSUM ;
2013-03-07 13:22:36 +00:00
dev - > features | = NETIF_F_GSO_SOFTWARE ;
2012-12-07 14:14:18 +00:00
2013-08-19 11:23:29 -07:00
dev - > vlan_features = dev - > features ;
2012-12-07 14:14:18 +00:00
dev - > hw_features | = NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM ;
2013-03-07 13:22:36 +00:00
dev - > hw_features | = NETIF_F_GSO_SOFTWARE ;
2014-10-05 18:38:35 -07:00
netif_keep_dst ( dev ) ;
2016-04-05 14:47:10 +02:00
dev - > priv_flags | = IFF_NO_QUEUE ;
2012-10-01 12:32:35 +00:00
2013-05-16 11:35:20 +00:00
INIT_LIST_HEAD ( & vxlan - > next ) ;
2012-10-01 12:32:35 +00:00
spin_lock_init ( & vxlan - > hash_lock ) ;
init_timer_deferrable ( & vxlan - > age_timer ) ;
vxlan - > age_timer . function = vxlan_cleanup ;
vxlan - > age_timer . data = ( unsigned long ) vxlan ;
2015-07-21 10:44:02 +02:00
vxlan - > cfg . dst_port = htons ( vxlan_port ) ;
2012-10-09 20:35:50 +00:00
2012-10-01 12:32:35 +00:00
vxlan - > dev = dev ;
2015-08-19 17:07:33 -07:00
gro_cells_init ( & vxlan - > gro_cells , dev ) ;
2012-10-01 12:32:35 +00:00
for ( h = 0 ; h < FDB_HASH_SIZE ; + + h )
INIT_HLIST_HEAD ( & vxlan - > fdb_head [ h ] ) ;
}
2016-04-05 14:47:10 +02:00
static void vxlan_ether_setup ( struct net_device * dev )
{
dev - > priv_flags & = ~ IFF_TX_SKB_SHARING ;
dev - > priv_flags | = IFF_LIVE_ADDR_CHANGE ;
dev - > netdev_ops = & vxlan_netdev_ether_ops ;
}
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
static void vxlan_raw_setup ( struct net_device * dev )
{
2016-04-28 16:36:30 +02:00
dev - > header_ops = NULL ;
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
dev - > type = ARPHRD_NONE ;
dev - > hard_header_len = 0 ;
dev - > addr_len = 0 ;
dev - > flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST ;
dev - > netdev_ops = & vxlan_netdev_raw_ops ;
}
2012-10-01 12:32:35 +00:00
static const struct nla_policy vxlan_policy [ IFLA_VXLAN_MAX + 1 ] = {
[ IFLA_VXLAN_ID ] = { . type = NLA_U32 } ,
2013-04-27 11:31:55 +00:00
[ IFLA_VXLAN_GROUP ] = { . len = FIELD_SIZEOF ( struct iphdr , daddr ) } ,
2013-08-31 13:44:33 +08:00
[ IFLA_VXLAN_GROUP6 ] = { . len = sizeof ( struct in6_addr ) } ,
2012-10-01 12:32:35 +00:00
[ IFLA_VXLAN_LINK ] = { . type = NLA_U32 } ,
[ IFLA_VXLAN_LOCAL ] = { . len = FIELD_SIZEOF ( struct iphdr , saddr ) } ,
2013-08-31 13:44:33 +08:00
[ IFLA_VXLAN_LOCAL6 ] = { . len = sizeof ( struct in6_addr ) } ,
2012-10-01 12:32:35 +00:00
[ IFLA_VXLAN_TOS ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_TTL ] = { . type = NLA_U8 } ,
2016-03-09 03:00:03 +01:00
[ IFLA_VXLAN_LABEL ] = { . type = NLA_U32 } ,
2012-10-01 12:32:35 +00:00
[ IFLA_VXLAN_LEARNING ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_AGEING ] = { . type = NLA_U32 } ,
[ IFLA_VXLAN_LIMIT ] = { . type = NLA_U32 } ,
2012-10-09 20:35:50 +00:00
[ IFLA_VXLAN_PORT_RANGE ] = { . len = sizeof ( struct ifla_vxlan_port_range ) } ,
2012-11-20 02:50:14 +00:00
[ IFLA_VXLAN_PROXY ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_RSC ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_L2MISS ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_L3MISS ] = { . type = NLA_U8 } ,
2015-07-30 20:10:22 -07:00
[ IFLA_VXLAN_COLLECT_METADATA ] = { . type = NLA_U8 } ,
2013-04-27 11:31:57 +00:00
[ IFLA_VXLAN_PORT ] = { . type = NLA_U16 } ,
2014-11-06 18:06:01 -08:00
[ IFLA_VXLAN_UDP_CSUM ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_UDP_ZERO_CSUM6_TX ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_UDP_ZERO_CSUM6_RX ] = { . type = NLA_U8 } ,
2015-01-12 17:00:38 -08:00
[ IFLA_VXLAN_REMCSUM_TX ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_REMCSUM_RX ] = { . type = NLA_U8 } ,
vxlan: Group Policy extension
Implements supports for the Group Policy VXLAN extension [0] to provide
a lightweight and simple security label mechanism across network peers
based on VXLAN. The security context and associated metadata is mapped
to/from skb->mark. This allows further mapping to a SELinux context
using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
tc, etc.
The group membership is defined by the lower 16 bits of skb->mark, the
upper 16 bits are used for flags.
SELinux allows to manage label to secure local resources. However,
distributed applications require ACLs to implemented across hosts. This
is typically achieved by matching on L2-L4 fields to identify the
original sending host and process on the receiver. On top of that,
netlabel and specifically CIPSO [1] allow to map security contexts to
universal labels. However, netlabel and CIPSO are relatively complex.
This patch provides a lightweight alternative for overlay network
environments with a trusted underlay. No additional control protocol
is required.
Host 1: Host 2:
Group A Group B Group B Group A
+-----+ +-------------+ +-------+ +-----+
| lxc | | SELinux CTX | | httpd | | VM |
+--+--+ +--+----------+ +---+---+ +--+--+
\---+---/ \----+---/
| |
+---+---+ +---+---+
| vxlan | | vxlan |
+---+---+ +---+---+
+------------------------------+
Backwards compatibility:
A VXLAN-GBP socket can receive standard VXLAN frames and will assign
the default group 0x0000 to such frames. A Linux VXLAN socket will
drop VXLAN-GBP frames. The extension is therefore disabled by default
and needs to be specifically enabled:
ip link add [...] type vxlan [...] gbp
In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
must run on a separate port number.
Examples:
iptables:
host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200
host2# iptables -I INPUT -m mark --mark 0x200 -j DROP
OVS:
# ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL'
# ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop'
[0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
[1] http://lwn.net/Articles/204905/
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 03:53:55 +01:00
[ IFLA_VXLAN_GBP ] = { . type = NLA_FLAG , } ,
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
[ IFLA_VXLAN_GPE ] = { . type = NLA_FLAG , } ,
2015-02-10 16:30:32 -08:00
[ IFLA_VXLAN_REMCSUM_NOPARTIAL ] = { . type = NLA_FLAG } ,
2012-10-01 12:32:35 +00:00
} ;
static int vxlan_validate ( struct nlattr * tb [ ] , struct nlattr * data [ ] )
{
if ( tb [ IFLA_ADDRESS ] ) {
if ( nla_len ( tb [ IFLA_ADDRESS ] ) ! = ETH_ALEN ) {
pr_debug ( " invalid link address (not ethernet) \n " ) ;
return - EINVAL ;
}
if ( ! is_valid_ether_addr ( nla_data ( tb [ IFLA_ADDRESS ] ) ) ) {
pr_debug ( " invalid all zero ethernet address \n " ) ;
return - EADDRNOTAVAIL ;
}
}
if ( ! data )
return - EINVAL ;
if ( data [ IFLA_VXLAN_ID ] ) {
__u32 id = nla_get_u32 ( data [ IFLA_VXLAN_ID ] ) ;
2017-02-23 17:19:41 +01:00
if ( id > = VXLAN_N_VID )
2012-10-01 12:32:35 +00:00
return - ERANGE ;
}
2012-10-09 20:35:50 +00:00
if ( data [ IFLA_VXLAN_PORT_RANGE ] ) {
const struct ifla_vxlan_port_range * p
= nla_data ( data [ IFLA_VXLAN_PORT_RANGE ] ) ;
if ( ntohs ( p - > high ) < ntohs ( p - > low ) ) {
pr_debug ( " port range %u .. %u not valid \n " ,
ntohs ( p - > low ) , ntohs ( p - > high ) ) ;
return - EINVAL ;
}
}
2012-10-01 12:32:35 +00:00
return 0 ;
}
2013-01-29 23:43:07 +00:00
static void vxlan_get_drvinfo ( struct net_device * netdev ,
struct ethtool_drvinfo * drvinfo )
{
strlcpy ( drvinfo - > version , VXLAN_VERSION , sizeof ( drvinfo - > version ) ) ;
strlcpy ( drvinfo - > driver , " vxlan " , sizeof ( drvinfo - > driver ) ) ;
}
/* ethtool operations for vxlan devices: link state comes from the
 * generic ethtool_op_get_link helper, driver info from
 * vxlan_get_drvinfo().
 */
static const struct ethtool_ops vxlan_ethtool_ops = {
	.get_link	= ethtool_op_get_link,
	.get_drvinfo	= vxlan_get_drvinfo,
};
2014-07-13 19:49:42 -07:00
/* Open the UDP socket used to carry VXLAN traffic.
 *
 * @net:   network namespace in which the socket is created
 * @ipv6:  true for an AF_INET6 socket, false for AF_INET
 * @port:  local UDP port, passed through unchanged as a __be16
 * @flags: VXLAN_F_* flags; only VXLAN_F_UDP_ZERO_CSUM6_RX is consulted
 *
 * Returns the new socket, or an ERR_PTR() carrying the error reported
 * by udp_sock_create().
 */
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
					__be16 port, u32 flags)
{
	struct udp_port_cfg cfg;
	struct socket *sock;
	int rc;

	memset(&cfg, 0, sizeof(cfg));
	cfg.local_udp_port = port;
	cfg.family = ipv6 ? AF_INET6 : AF_INET;

	if (ipv6) {
		/* Socket is v6-only; RX checksums are requested unless
		 * zero-checksum reception was explicitly enabled.
		 */
		cfg.ipv6_v6only = 1;
		cfg.use_udp6_rx_checksums =
			!(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
	}

	/* Open UDP socket */
	rc = udp_sock_create(net, &cfg, &sock);
	if (rc < 0)
		return ERR_PTR(rc);

	return sock;
}
/* Create new listen socket if needed */
2015-09-24 13:50:02 +02:00
static struct vxlan_sock * vxlan_socket_create ( struct net * net , bool ipv6 ,
__be16 port , u32 flags )
2013-08-31 13:44:33 +08:00
{
struct vxlan_net * vn = net_generic ( net , vxlan_net_id ) ;
struct vxlan_sock * vs ;
struct socket * sock ;
unsigned int h ;
2014-09-16 17:31:18 -07:00
struct udp_tunnel_sock_cfg tunnel_cfg ;
2013-08-31 13:44:33 +08:00
2014-01-20 13:59:21 +02:00
vs = kzalloc ( sizeof ( * vs ) , GFP_KERNEL ) ;
2013-08-31 13:44:33 +08:00
if ( ! vs )
return ERR_PTR ( - ENOMEM ) ;
for ( h = 0 ; h < VNI_HASH_SIZE ; + + h )
INIT_HLIST_HEAD ( & vs - > vni_list [ h ] ) ;
2014-07-13 19:49:42 -07:00
sock = vxlan_create_sock ( net , ipv6 , port , flags ) ;
2013-10-28 14:01:48 +08:00
if ( IS_ERR ( sock ) ) {
2013-05-16 11:35:20 +00:00
kfree ( vs ) ;
2013-11-01 13:09:43 +08:00
return ERR_CAST ( sock ) ;
2013-05-16 11:35:20 +00:00
}
2013-08-31 13:44:33 +08:00
vs - > sock = sock ;
2013-08-19 11:22:48 -07:00
atomic_set ( & vs - > refcnt , 1 ) ;
2015-01-20 11:23:05 -08:00
vs - > flags = ( flags & VXLAN_F_RCV_FLAGS ) ;
2013-05-16 11:35:20 +00:00
2013-08-19 11:22:48 -07:00
spin_lock ( & vn - > sock_lock ) ;
hlist_add_head_rcu ( & vs - > hlist , vs_head ( net , port ) ) ;
2016-06-16 12:20:52 -07:00
udp_tunnel_notify_add_rx_port ( sock ,
2016-06-16 12:23:19 -07:00
( vs - > flags & VXLAN_F_GPE ) ?
UDP_TUNNEL_TYPE_VXLAN_GPE :
2016-06-16 12:20:52 -07:00
UDP_TUNNEL_TYPE_VXLAN ) ;
2013-08-19 11:22:48 -07:00
spin_unlock ( & vn - > sock_lock ) ;
2013-05-16 11:35:20 +00:00
/* Mark socket as an encapsulation socket. */
2016-04-05 08:22:53 -07:00
memset ( & tunnel_cfg , 0 , sizeof ( tunnel_cfg ) ) ;
2014-09-16 17:31:18 -07:00
tunnel_cfg . sk_user_data = vs ;
tunnel_cfg . encap_type = 1 ;
2016-02-23 18:02:58 +01:00
tunnel_cfg . encap_rcv = vxlan_rcv ;
2014-09-16 17:31:18 -07:00
tunnel_cfg . encap_destroy = NULL ;
2016-04-05 08:22:53 -07:00
tunnel_cfg . gro_receive = vxlan_gro_receive ;
tunnel_cfg . gro_complete = vxlan_gro_complete ;
2014-09-16 17:31:18 -07:00
setup_udp_tunnel_sock ( net , sock , & tunnel_cfg ) ;
2013-08-31 13:44:33 +08:00
2013-08-19 11:22:48 -07:00
return vs ;
}
2015-09-24 13:50:02 +02:00
/* Attach @vxlan to a vxlan_sock for one address family.
 *
 * Unless vxlan->cfg.no_share is set, an existing socket matching the
 * family, destination port and flags is looked up under vn->sock_lock
 * and a reference is taken on it.  If none is found, a new one is
 * created.  The result is stored in vxlan->vn6_sock or vxlan->vn4_sock
 * and the device is added to that socket.
 *
 * @vxlan: device to attach
 * @ipv6:  true to handle the IPv6 socket, false for IPv4
 *
 * Returns 0 on success, -EBUSY when a matching socket was found but its
 * reference count had already reached zero, or the error from
 * vxlan_socket_create().
 */
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	struct vxlan_sock *vs = NULL;

	if (!vxlan->cfg.no_share) {
		bool gone;

		spin_lock(&vn->sock_lock);
		vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
				     vxlan->cfg.dst_port, vxlan->flags);
		/* Only take a reference while the count is still non-zero. */
		gone = vs && !atomic_add_unless(&vs->refcnt, 1, 0);
		spin_unlock(&vn->sock_lock);

		if (gone)
			return -EBUSY;
	}

	if (!vs)
		vs = vxlan_socket_create(vxlan->net, ipv6,
					 vxlan->cfg.dst_port, vxlan->flags);
	if (IS_ERR(vs))
		return PTR_ERR(vs);

#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6)
		rcu_assign_pointer(vxlan->vn6_sock, vs);
	else
#endif
		rcu_assign_pointer(vxlan->vn4_sock, vs);

	vxlan_vs_add_dev(vs, vxlan);
	return 0;
}
2015-09-24 13:50:02 +02:00
static int vxlan_sock_add ( struct vxlan_dev * vxlan )
{
bool metadata = vxlan - > flags & VXLAN_F_COLLECT_METADATA ;
2017-04-27 21:24:35 +02:00
bool ipv6 = vxlan - > flags & VXLAN_F_IPV6 | | metadata ;
bool ipv4 = ! ipv6 | | metadata ;
2015-09-24 13:50:02 +02:00
int ret = 0 ;
2016-10-28 09:59:15 -07:00
RCU_INIT_POINTER ( vxlan - > vn4_sock , NULL ) ;
2015-09-24 13:50:02 +02:00
# if IS_ENABLED(CONFIG_IPV6)
2016-10-28 09:59:15 -07:00
RCU_INIT_POINTER ( vxlan - > vn6_sock , NULL ) ;
2017-04-27 21:24:35 +02:00
if ( ipv6 ) {
2015-09-24 13:50:02 +02:00
ret = __vxlan_sock_add ( vxlan , true ) ;
2017-04-27 21:24:35 +02:00
if ( ret < 0 & & ret ! = - EAFNOSUPPORT )
ipv4 = false ;
}
2015-09-24 13:50:02 +02:00
# endif
2017-04-27 21:24:35 +02:00
if ( ipv4 )
2015-09-24 13:50:02 +02:00
ret = __vxlan_sock_add ( vxlan , false ) ;
if ( ret < 0 )
vxlan_sock_release ( vxlan ) ;
return ret ;
}
2015-07-21 10:44:02 +02:00
/* Validate @conf and apply it to @dev and its vxlan private data.
 *
 * @src_net:    namespace used to resolve conf->remote_ifindex and to
 *              locate the per-namespace list of vxlan devices
 * @dev:        vxlan net_device being created or changed
 * @conf:       requested configuration; copied into vxlan->cfg
 * @changelink: true when reconfiguring an existing device; the one-time
 *              device setup and the duplicate-VNI check are then skipped
 *
 * Returns 0 on success, or:
 *   -EINVAL        unsupported GPE flag combination, flow label without
 *                  IPv6, multicast destination without an interface, or
 *                  conf->mtu outside [dev->min_mtu, dev->max_mtu]
 *   -EPFNOSUPPORT  IPv6 address requested but CONFIG_IPV6 is disabled
 *   -ENODEV        conf->remote_ifindex does not name a device
 *   -EPERM         IPv6 is disabled on the lower device
 *   -EEXIST        another device has the same VNI, address family,
 *                  destination port and receive flags
 *
 * NOTE(review): validation and application are interleaved, so an error
 * return can leave parts of @dev / vxlan->default_dst already modified.
 */
static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
			       struct vxlan_config *conf,
			       bool changelink)
{
	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
	struct vxlan_dev *vxlan = netdev_priv(dev), *tmp;
	struct vxlan_rdst *dst = &vxlan->default_dst;
	unsigned short needed_headroom = ETH_HLEN;
	bool use_ipv6 = false;
	/* Port held in vxlan->cfg before @conf overwrites it below. */
	__be16 default_port = vxlan->cfg.dst_port;
	struct net_device *lowerdev = NULL;

	if (!changelink) {
		if (conf->flags & VXLAN_F_GPE) {
			/* For now, allow GPE only together with
			 * COLLECT_METADATA. This can be relaxed later; in such
			 * case, the other side of the PtP link will have to be
			 * provided.
			 */
			if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
			    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
				pr_info(" unsupported combination of extensions \n ");
				return -EINVAL;
			}

			vxlan_raw_setup(dev);
		} else {
			vxlan_ether_setup(dev);
		}

		/* MTU range: 68 - 65535 */
		dev->min_mtu = ETH_MIN_MTU;
		dev->max_mtu = ETH_MAX_MTU;
		vxlan->net = src_net;
	}

	dst->remote_vni = conf->vni;

	memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));

	/* Unless IPv6 is explicitly requested, assume IPv4 */
	if (!dst->remote_ip.sa.sa_family)
		dst->remote_ip.sa.sa_family = AF_INET;

	/* Test the *requested* source address.  vxlan->cfg is only
	 * overwritten from @conf further down, so looking at
	 * vxlan->cfg.saddr here would consult the previous configuration
	 * and miss an IPv6 local address supplied with this request.
	 */
	if (dst->remote_ip.sa.sa_family == AF_INET6 ||
	    conf->saddr.sa.sa_family == AF_INET6) {
		if (!IS_ENABLED(CONFIG_IPV6))
			return -EPFNOSUPPORT;
		use_ipv6 = true;
		vxlan->flags |= VXLAN_F_IPV6;
	}

	if (conf->label && !use_ipv6) {
		pr_info(" label only supported in use with IPv6 \n ");
		return -EINVAL;
	}

	if (conf->remote_ifindex &&
	    conf->remote_ifindex != vxlan->cfg.remote_ifindex) {
		lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
		dst->remote_ifindex = conf->remote_ifindex;

		if (!lowerdev) {
			pr_info(" ifindex %d does not exist \n ",
				dst->remote_ifindex);
			return -ENODEV;
		}

#if IS_ENABLED(CONFIG_IPV6)
		if (use_ipv6) {
			struct inet6_dev *idev = __in6_dev_get(lowerdev);

			if (idev && idev->cnf.disable_ipv6) {
				pr_info(" IPv6 is disabled via sysctl \n ");
				return -EPERM;
			}
		}
#endif

		/* No explicit MTU: derive it from the lower device. */
		if (!conf->mtu)
			dev->mtu = lowerdev->mtu -
				(use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);

		needed_headroom = lowerdev->hard_header_len;
	} else if (!conf->remote_ifindex &&
		   vxlan_addr_multicast(&dst->remote_ip)) {
		pr_info(" multicast destination requires interface to be specified \n ");
		return -EINVAL;
	}

	if (lowerdev) {
		dev->gso_max_size = lowerdev->gso_max_size;
		dev->gso_max_segs = lowerdev->gso_max_segs;
	}

	if (conf->mtu) {
		int max_mtu = ETH_MAX_MTU;

		if (lowerdev)
			max_mtu = lowerdev->mtu;

		max_mtu -= (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);

		if (conf->mtu < dev->min_mtu || conf->mtu > dev->max_mtu)
			return -EINVAL;

		dev->mtu = conf->mtu;

		/* Clamp to what fits once the encapsulation is added. */
		if (conf->mtu > max_mtu)
			dev->mtu = max_mtu;
	}

	/* Metadata-mode devices reserve the larger (IPv6) headroom. */
	if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA)
		needed_headroom += VXLAN6_HEADROOM;
	else
		needed_headroom += VXLAN_HEADROOM;
	dev->needed_headroom = needed_headroom;

	memcpy(&vxlan->cfg, conf, sizeof(*conf));
	if (!vxlan->cfg.dst_port) {
		if (conf->flags & VXLAN_F_GPE)
			vxlan->cfg.dst_port = htons(4790); /* IANA VXLAN-GPE port */
		else
			vxlan->cfg.dst_port = default_port;
	}
	vxlan->flags |= conf->flags;

	if (!vxlan->cfg.age_interval)
		vxlan->cfg.age_interval = FDB_AGE_DEFAULT;

	if (changelink)
		return 0;

	/* Reject a second device with the same VNI, address family,
	 * destination port and receive flags.
	 */
	list_for_each_entry(tmp, &vn->vxlan_list, next) {
		if (tmp->cfg.vni == conf->vni &&
		    (tmp->default_dst.remote_ip.sa.sa_family == AF_INET6 ||
		     tmp->cfg.saddr.sa.sa_family == AF_INET6) == use_ipv6 &&
		    tmp->cfg.dst_port == vxlan->cfg.dst_port &&
		    (tmp->flags & VXLAN_F_RCV_FLAGS) ==
		    (vxlan->flags & VXLAN_F_RCV_FLAGS)) {
			pr_info(" duplicate VNI %u \n ", be32_to_cpu(conf->vni));
			return -EEXIST;
		}
	}

	return 0;
}
2017-03-13 16:24:03 +01:00
static int __vxlan_dev_create ( struct net * net , struct net_device * dev ,
struct vxlan_config * conf )
{
struct vxlan_net * vn = net_generic ( net , vxlan_net_id ) ;
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
int err ;
err = vxlan_dev_configure ( net , dev , conf , false ) ;
if ( err )
return err ;
dev - > ethtool_ops = & vxlan_ethtool_ops ;
/* create an fdb entry for a valid default destination */
if ( ! vxlan_addr_any ( & vxlan - > default_dst . remote_ip ) ) {
err = vxlan_fdb_create ( vxlan , all_zeros_mac ,
& vxlan - > default_dst . remote_ip ,
NUD_REACHABLE | NUD_PERMANENT ,
NLM_F_EXCL | NLM_F_CREATE ,
vxlan - > cfg . dst_port ,
vxlan - > default_dst . remote_vni ,
vxlan - > default_dst . remote_vni ,
vxlan - > default_dst . remote_ifindex ,
NTF_SELF ) ;
if ( err )
return err ;
}
err = register_netdevice ( dev ) ;
if ( err ) {
vxlan_fdb_delete_default ( vxlan , vxlan - > default_dst . remote_vni ) ;
return err ;
}
list_add ( & vxlan - > next , & vn - > vxlan_list ) ;
return 0 ;
}
2017-02-20 08:29:19 -08:00
/* Translate netlink attributes into a struct vxlan_config.
 *
 * @tb:         generic link attributes; only IFLA_MTU is read here
 * @data:       IFLA_VXLAN_* attributes
 * @dev:        device being created or changed
 * @conf:       output; zeroed first, then seeded from the device's current
 *              cfg when @changelink is set
 * @changelink: true when modifying an existing device
 *
 * On changelink most attributes may not be supplied at all and the VNI may
 * not change; such requests fail with -EOPNOTSUPP.  IPv6 addresses fail with
 * -EPFNOSUPPORT when CONFIG_IPV6 is disabled.  Returns 0 on success.
 *
 * The live device is only touched once every attribute has been accepted,
 * so a rejected request leaves vxlan->flags unchanged.
 */
static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
			 struct net_device *dev, struct vxlan_config *conf,
			 bool changelink)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	bool clear_learn = false;

	memset(conf, 0, sizeof(*conf));

	/* if changelink operation, start with old existing cfg */
	if (changelink)
		memcpy(conf, &vxlan->cfg, sizeof(*conf));

	if (data[IFLA_VXLAN_ID]) {
		__be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));

		if (changelink && (vni != conf->vni))
			return -EOPNOTSUPP;
		conf->vni = vni;
	}

	if (data[IFLA_VXLAN_GROUP]) {
		/* This branch writes only the IPv4 address, never sa_family.
		 * A conf seeded from an IPv6 device would therefore stay
		 * AF_INET6 with a partly overwritten sockaddr; refuse the
		 * family switch instead of producing that state.
		 */
		if (changelink && conf->remote_ip.sa.sa_family == AF_INET6)
			return -EOPNOTSUPP;

		conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
	} else if (data[IFLA_VXLAN_GROUP6]) {
		if (!IS_ENABLED(CONFIG_IPV6))
			return -EPFNOSUPPORT;

		conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
		conf->remote_ip.sa.sa_family = AF_INET6;
	}

	if (data[IFLA_VXLAN_LOCAL]) {
		conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
		conf->saddr.sa.sa_family = AF_INET;
	} else if (data[IFLA_VXLAN_LOCAL6]) {
		if (!IS_ENABLED(CONFIG_IPV6))
			return -EPFNOSUPPORT;

		/* TODO: respect scope id */
		conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
		conf->saddr.sa.sa_family = AF_INET6;
	}

	if (data[IFLA_VXLAN_LINK])
		conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);

	if (data[IFLA_VXLAN_TOS])
		conf->tos = nla_get_u8(data[IFLA_VXLAN_TOS]);

	if (data[IFLA_VXLAN_TTL])
		conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);

	if (data[IFLA_VXLAN_LABEL])
		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
			      IPV6_FLOWLABEL_MASK;

	if (data[IFLA_VXLAN_LEARNING]) {
		if (nla_get_u8(data[IFLA_VXLAN_LEARNING])) {
			conf->flags |= VXLAN_F_LEARN;
		} else {
			conf->flags &= ~VXLAN_F_LEARN;
			/* The device flag is cleared at the end, after all
			 * remaining attributes have been validated.
			 */
			clear_learn = true;
		}
	} else if (!changelink) {
		/* default to learn on a new device */
		conf->flags |= VXLAN_F_LEARN;
	}

	if (data[IFLA_VXLAN_AGEING]) {
		if (changelink)
			return -EOPNOTSUPP;
		conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
	}

	if (data[IFLA_VXLAN_PROXY]) {
		if (changelink)
			return -EOPNOTSUPP;
		if (nla_get_u8(data[IFLA_VXLAN_PROXY]))
			conf->flags |= VXLAN_F_PROXY;
	}

	if (data[IFLA_VXLAN_RSC]) {
		if (changelink)
			return -EOPNOTSUPP;
		if (nla_get_u8(data[IFLA_VXLAN_RSC]))
			conf->flags |= VXLAN_F_RSC;
	}

	if (data[IFLA_VXLAN_L2MISS]) {
		if (changelink)
			return -EOPNOTSUPP;
		if (nla_get_u8(data[IFLA_VXLAN_L2MISS]))
			conf->flags |= VXLAN_F_L2MISS;
	}

	if (data[IFLA_VXLAN_L3MISS]) {
		if (changelink)
			return -EOPNOTSUPP;
		if (nla_get_u8(data[IFLA_VXLAN_L3MISS]))
			conf->flags |= VXLAN_F_L3MISS;
	}

	if (data[IFLA_VXLAN_LIMIT]) {
		if (changelink)
			return -EOPNOTSUPP;
		conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
	}

	if (data[IFLA_VXLAN_COLLECT_METADATA]) {
		if (changelink)
			return -EOPNOTSUPP;
		if (nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA]))
			conf->flags |= VXLAN_F_COLLECT_METADATA;
	}

	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p;

		if (changelink)
			return -EOPNOTSUPP;

		p = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
		conf->port_min = ntohs(p->low);
		conf->port_max = ntohs(p->high);
	}

	if (data[IFLA_VXLAN_PORT]) {
		if (changelink)
			return -EOPNOTSUPP;
		conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
	}

	if (data[IFLA_VXLAN_UDP_CSUM]) {
		if (changelink)
			return -EOPNOTSUPP;
		/* Attribute value 0 means "transmit with zero checksum". */
		if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
			conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
	}

	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
		if (changelink)
			return -EOPNOTSUPP;
		if (nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]))
			conf->flags |= VXLAN_F_UDP_ZERO_CSUM6_TX;
	}

	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) {
		if (changelink)
			return -EOPNOTSUPP;
		if (nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
			conf->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;
	}

	if (data[IFLA_VXLAN_REMCSUM_TX]) {
		if (changelink)
			return -EOPNOTSUPP;
		if (nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX]))
			conf->flags |= VXLAN_F_REMCSUM_TX;
	}

	if (data[IFLA_VXLAN_REMCSUM_RX]) {
		if (changelink)
			return -EOPNOTSUPP;
		if (nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX]))
			conf->flags |= VXLAN_F_REMCSUM_RX;
	}

	/* The next three are presence-only flags: no payload is read. */
	if (data[IFLA_VXLAN_GBP]) {
		if (changelink)
			return -EOPNOTSUPP;
		conf->flags |= VXLAN_F_GBP;
	}

	if (data[IFLA_VXLAN_GPE]) {
		if (changelink)
			return -EOPNOTSUPP;
		conf->flags |= VXLAN_F_GPE;
	}

	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
		if (changelink)
			return -EOPNOTSUPP;
		conf->flags |= VXLAN_F_REMCSUM_NOPARTIAL;
	}

	if (tb[IFLA_MTU]) {
		if (changelink)
			return -EOPNOTSUPP;
		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
	}

	/* Every attribute was accepted; only now update the live device.
	 * NOTE(review): presumably needed because the configure step cannot
	 * clear an already-set flag from conf alone - confirm against
	 * vxlan_dev_configure().
	 */
	if (clear_learn)
		vxlan->flags &= ~VXLAN_F_LEARN;

	return 0;
}
/* rtnl newlink handler: parse the netlink attributes into a fresh
 * configuration and create the device from it.
 *
 * Returns 0 on success or the negative error from attribute parsing or
 * device creation.
 */
static int vxlan_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[])
{
	struct vxlan_config conf;
	int err = vxlan_nl2conf(tb, data, dev, &conf, false);

	return err ? err : __vxlan_dev_create(src_net, dev, &conf);
}
2015-01-12 17:00:38 -08:00
2017-02-20 08:29:19 -08:00
static int vxlan_changelink ( struct net_device * dev , struct nlattr * tb [ ] ,
struct nlattr * data [ ] )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct vxlan_rdst * dst = & vxlan - > default_dst ;
struct vxlan_rdst old_dst ;
struct vxlan_config conf ;
int err ;
err = vxlan_nl2conf ( tb , data ,
dev , & conf , true ) ;
if ( err )
return err ;
vxlan: Group Policy extension
Implements supports for the Group Policy VXLAN extension [0] to provide
a lightweight and simple security label mechanism across network peers
based on VXLAN. The security context and associated metadata is mapped
to/from skb->mark. This allows further mapping to a SELinux context
using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
tc, etc.
The group membership is defined by the lower 16 bits of skb->mark, the
upper 16 bits are used for flags.
SELinux allows to manage label to secure local resources. However,
distributed applications require ACLs to implemented across hosts. This
is typically achieved by matching on L2-L4 fields to identify the
original sending host and process on the receiver. On top of that,
netlabel and specifically CIPSO [1] allow to map security contexts to
universal labels. However, netlabel and CIPSO are relatively complex.
This patch provides a lightweight alternative for overlay network
environments with a trusted underlay. No additional control protocol
is required.
Host 1: Host 2:
Group A Group B Group B Group A
+-----+ +-------------+ +-------+ +-----+
| lxc | | SELinux CTX | | httpd | | VM |
+--+--+ +--+----------+ +---+---+ +--+--+
\---+---/ \----+---/
| |
+---+---+ +---+---+
| vxlan | | vxlan |
+---+---+ +---+---+
+------------------------------+
Backwards compatibility:
A VXLAN-GBP socket can receive standard VXLAN frames and will assign
the default group 0x0000 to such frames. A Linux VXLAN socket will
drop VXLAN-GBP frames. The extension is therefore disabled by default
and needs to be specifically enabled:
ip link add [...] type vxlan [...] gbp
In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
must run on a separate port number.
Examples:
iptables:
host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200
host2# iptables -I INPUT -m mark --mark 0x200 -j DROP
OVS:
# ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL'
# ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop'
[0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
[1] http://lwn.net/Articles/204905/
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 03:53:55 +01:00
2017-02-20 08:29:19 -08:00
memcpy ( & old_dst , dst , sizeof ( struct vxlan_rdst ) ) ;
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
2017-02-20 08:29:19 -08:00
err = vxlan_dev_configure ( vxlan - > net , dev , & conf , true ) ;
if ( err )
return err ;
2015-02-10 16:30:32 -08:00
2017-02-20 08:29:19 -08:00
/* handle default dst entry */
if ( ! vxlan_addr_equal ( & dst - > remote_ip , & old_dst . remote_ip ) ) {
spin_lock_bh ( & vxlan - > hash_lock ) ;
if ( ! vxlan_addr_any ( & old_dst . remote_ip ) )
__vxlan_fdb_delete ( vxlan , all_zeros_mac ,
old_dst . remote_ip ,
vxlan - > cfg . dst_port ,
old_dst . remote_vni ,
old_dst . remote_vni ,
old_dst . remote_ifindex , 0 ) ;
if ( ! vxlan_addr_any ( & dst - > remote_ip ) ) {
err = vxlan_fdb_create ( vxlan , all_zeros_mac ,
& dst - > remote_ip ,
NUD_REACHABLE | NUD_PERMANENT ,
NLM_F_CREATE | NLM_F_APPEND ,
vxlan - > cfg . dst_port ,
dst - > remote_vni ,
dst - > remote_vni ,
dst - > remote_ifindex ,
NTF_SELF ) ;
if ( err ) {
spin_unlock_bh ( & vxlan - > hash_lock ) ;
return err ;
}
}
spin_unlock_bh ( & vxlan - > hash_lock ) ;
}
2016-05-27 10:49:11 +08:00
2017-02-20 08:29:19 -08:00
return 0 ;
2012-10-01 12:32:35 +00:00
}
static void vxlan_dellink ( struct net_device * dev , struct list_head * head )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2014-04-24 10:02:49 +02:00
struct vxlan_net * vn = net_generic ( vxlan - > net , vxlan_net_id ) ;
2012-10-01 12:32:35 +00:00
2017-01-23 20:44:32 -08:00
vxlan_flush ( vxlan , true ) ;
2013-07-13 10:18:18 -07:00
spin_lock ( & vn - > sock_lock ) ;
2013-08-19 11:22:48 -07:00
if ( ! hlist_unhashed ( & vxlan - > hlist ) )
hlist_del_rcu ( & vxlan - > hlist ) ;
2013-07-13 10:18:18 -07:00
spin_unlock ( & vn - > sock_lock ) ;
2015-08-19 17:07:33 -07:00
gro_cells_destroy ( & vxlan - > gro_cells ) ;
2013-05-16 11:35:20 +00:00
list_del ( & vxlan - > next ) ;
2012-10-01 12:32:35 +00:00
unregister_netdevice_queue ( dev , head ) ;
}
static size_t vxlan_get_size ( const struct net_device * dev )
{
return nla_total_size ( sizeof ( __u32 ) ) + /* IFLA_VXLAN_ID */
2013-08-31 13:44:33 +08:00
nla_total_size ( sizeof ( struct in6_addr ) ) + /* IFLA_VXLAN_GROUP{6} */
2012-10-01 12:32:35 +00:00
nla_total_size ( sizeof ( __u32 ) ) + /* IFLA_VXLAN_LINK */
2013-08-31 13:44:33 +08:00
nla_total_size ( sizeof ( struct in6_addr ) ) + /* IFLA_VXLAN_LOCAL{6} */
2012-10-01 12:32:35 +00:00
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_TTL */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_TOS */
2016-03-09 03:00:03 +01:00
nla_total_size ( sizeof ( __be32 ) ) + /* IFLA_VXLAN_LABEL */
2012-10-01 12:32:35 +00:00
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_LEARNING */
2012-11-20 02:50:14 +00:00
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_PROXY */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_RSC */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_L2MISS */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_L3MISS */
2015-08-04 22:51:07 -07:00
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_COLLECT_METADATA */
2012-10-01 12:32:35 +00:00
nla_total_size ( sizeof ( __u32 ) ) + /* IFLA_VXLAN_AGEING */
nla_total_size ( sizeof ( __u32 ) ) + /* IFLA_VXLAN_LIMIT */
2012-10-09 20:35:50 +00:00
nla_total_size ( sizeof ( struct ifla_vxlan_port_range ) ) +
2014-06-04 17:20:29 -07:00
nla_total_size ( sizeof ( __be16 ) ) + /* IFLA_VXLAN_PORT */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_UDP_CSUM */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
2015-01-12 17:00:38 -08:00
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_REMCSUM_TX */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_REMCSUM_RX */
2012-10-01 12:32:35 +00:00
0 ;
}
static int vxlan_fill_info ( struct sk_buff * skb , const struct net_device * dev )
{
const struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2013-04-16 02:50:52 +00:00
const struct vxlan_rdst * dst = & vxlan - > default_dst ;
2012-10-09 20:35:50 +00:00
struct ifla_vxlan_port_range ports = {
2015-07-21 10:44:02 +02:00
. low = htons ( vxlan - > cfg . port_min ) ,
. high = htons ( vxlan - > cfg . port_max ) ,
2012-10-09 20:35:50 +00:00
} ;
2012-10-01 12:32:35 +00:00
2016-02-16 21:58:58 +01:00
if ( nla_put_u32 ( skb , IFLA_VXLAN_ID , be32_to_cpu ( dst - > remote_vni ) ) )
2012-10-01 12:32:35 +00:00
goto nla_put_failure ;
2013-08-31 13:44:33 +08:00
if ( ! vxlan_addr_any ( & dst - > remote_ip ) ) {
if ( dst - > remote_ip . sa . sa_family = = AF_INET ) {
2015-03-29 16:59:25 +02:00
if ( nla_put_in_addr ( skb , IFLA_VXLAN_GROUP ,
dst - > remote_ip . sin . sin_addr . s_addr ) )
2013-08-31 13:44:33 +08:00
goto nla_put_failure ;
# if IS_ENABLED(CONFIG_IPV6)
} else {
2015-03-29 16:59:25 +02:00
if ( nla_put_in6_addr ( skb , IFLA_VXLAN_GROUP6 ,
& dst - > remote_ip . sin6 . sin6_addr ) )
2013-08-31 13:44:33 +08:00
goto nla_put_failure ;
# endif
}
}
2012-10-01 12:32:35 +00:00
2013-04-16 02:50:52 +00:00
if ( dst - > remote_ifindex & & nla_put_u32 ( skb , IFLA_VXLAN_LINK , dst - > remote_ifindex ) )
2012-10-01 12:32:35 +00:00
goto nla_put_failure ;
2015-07-21 10:44:02 +02:00
if ( ! vxlan_addr_any ( & vxlan - > cfg . saddr ) ) {
if ( vxlan - > cfg . saddr . sa . sa_family = = AF_INET ) {
2015-03-29 16:59:25 +02:00
if ( nla_put_in_addr ( skb , IFLA_VXLAN_LOCAL ,
2015-07-21 10:44:02 +02:00
vxlan - > cfg . saddr . sin . sin_addr . s_addr ) )
2013-08-31 13:44:33 +08:00
goto nla_put_failure ;
# if IS_ENABLED(CONFIG_IPV6)
} else {
2015-03-29 16:59:25 +02:00
if ( nla_put_in6_addr ( skb , IFLA_VXLAN_LOCAL6 ,
2015-07-21 10:44:02 +02:00
& vxlan - > cfg . saddr . sin6 . sin6_addr ) )
2013-08-31 13:44:33 +08:00
goto nla_put_failure ;
# endif
}
}
2012-10-01 12:32:35 +00:00
2015-07-21 10:44:02 +02:00
if ( nla_put_u8 ( skb , IFLA_VXLAN_TTL , vxlan - > cfg . ttl ) | |
nla_put_u8 ( skb , IFLA_VXLAN_TOS , vxlan - > cfg . tos ) | |
2016-03-09 03:00:03 +01:00
nla_put_be32 ( skb , IFLA_VXLAN_LABEL , vxlan - > cfg . label ) | |
2012-11-20 02:50:14 +00:00
nla_put_u8 ( skb , IFLA_VXLAN_LEARNING ,
! ! ( vxlan - > flags & VXLAN_F_LEARN ) ) | |
nla_put_u8 ( skb , IFLA_VXLAN_PROXY ,
! ! ( vxlan - > flags & VXLAN_F_PROXY ) ) | |
nla_put_u8 ( skb , IFLA_VXLAN_RSC , ! ! ( vxlan - > flags & VXLAN_F_RSC ) ) | |
nla_put_u8 ( skb , IFLA_VXLAN_L2MISS ,
! ! ( vxlan - > flags & VXLAN_F_L2MISS ) ) | |
nla_put_u8 ( skb , IFLA_VXLAN_L3MISS ,
! ! ( vxlan - > flags & VXLAN_F_L3MISS ) ) | |
2015-08-04 22:51:07 -07:00
nla_put_u8 ( skb , IFLA_VXLAN_COLLECT_METADATA ,
! ! ( vxlan - > flags & VXLAN_F_COLLECT_METADATA ) ) | |
2015-07-21 10:44:02 +02:00
nla_put_u32 ( skb , IFLA_VXLAN_AGEING , vxlan - > cfg . age_interval ) | |
nla_put_u32 ( skb , IFLA_VXLAN_LIMIT , vxlan - > cfg . addrmax ) | |
nla_put_be16 ( skb , IFLA_VXLAN_PORT , vxlan - > cfg . dst_port ) | |
2014-06-04 17:20:29 -07:00
nla_put_u8 ( skb , IFLA_VXLAN_UDP_CSUM ,
2016-02-19 11:26:31 -08:00
! ( vxlan - > flags & VXLAN_F_UDP_ZERO_CSUM_TX ) ) | |
2014-06-04 17:20:29 -07:00
nla_put_u8 ( skb , IFLA_VXLAN_UDP_ZERO_CSUM6_TX ,
! ! ( vxlan - > flags & VXLAN_F_UDP_ZERO_CSUM6_TX ) ) | |
nla_put_u8 ( skb , IFLA_VXLAN_UDP_ZERO_CSUM6_RX ,
2015-01-12 17:00:38 -08:00
! ! ( vxlan - > flags & VXLAN_F_UDP_ZERO_CSUM6_RX ) ) | |
nla_put_u8 ( skb , IFLA_VXLAN_REMCSUM_TX ,
! ! ( vxlan - > flags & VXLAN_F_REMCSUM_TX ) ) | |
nla_put_u8 ( skb , IFLA_VXLAN_REMCSUM_RX ,
! ! ( vxlan - > flags & VXLAN_F_REMCSUM_RX ) ) )
2012-10-01 12:32:35 +00:00
goto nla_put_failure ;
2012-10-09 20:35:50 +00:00
if ( nla_put ( skb , IFLA_VXLAN_PORT_RANGE , sizeof ( ports ) , & ports ) )
goto nla_put_failure ;
vxlan: Group Policy extension
Implements supports for the Group Policy VXLAN extension [0] to provide
a lightweight and simple security label mechanism across network peers
based on VXLAN. The security context and associated metadata is mapped
to/from skb->mark. This allows further mapping to a SELinux context
using SECMARK, to implement ACLs directly with nftables, iptables, OVS,
tc, etc.
The group membership is defined by the lower 16 bits of skb->mark, the
upper 16 bits are used for flags.
SELinux allows to manage label to secure local resources. However,
distributed applications require ACLs to implemented across hosts. This
is typically achieved by matching on L2-L4 fields to identify the
original sending host and process on the receiver. On top of that,
netlabel and specifically CIPSO [1] allow to map security contexts to
universal labels. However, netlabel and CIPSO are relatively complex.
This patch provides a lightweight alternative for overlay network
environments with a trusted underlay. No additional control protocol
is required.
Host 1: Host 2:
Group A Group B Group B Group A
+-----+ +-------------+ +-------+ +-----+
| lxc | | SELinux CTX | | httpd | | VM |
+--+--+ +--+----------+ +---+---+ +--+--+
\---+---/ \----+---/
| |
+---+---+ +---+---+
| vxlan | | vxlan |
+---+---+ +---+---+
+------------------------------+
Backwards compatibility:
A VXLAN-GBP socket can receive standard VXLAN frames and will assign
the default group 0x0000 to such frames. A Linux VXLAN socket will
drop VXLAN-GBP frames. The extension is therefore disabled by default
and needs to be specifically enabled:
ip link add [...] type vxlan [...] gbp
In a mixed environment with VXLAN and VXLAN-GBP sockets, the GBP socket
must run on a separate port number.
Examples:
iptables:
host1# iptables -I OUTPUT -m owner --uid-owner 101 -j MARK --set-mark 0x200
host2# iptables -I INPUT -m mark --mark 0x200 -j DROP
OVS:
# ovs-ofctl add-flow br0 'in_port=1,actions=load:0x200->NXM_NX_TUN_GBP_ID[],NORMAL'
# ovs-ofctl add-flow br0 'in_port=2,tun_gbp_id=0x200,actions=drop'
[0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy
[1] http://lwn.net/Articles/204905/
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-15 03:53:55 +01:00
if ( vxlan - > flags & VXLAN_F_GBP & &
nla_put_flag ( skb , IFLA_VXLAN_GBP ) )
goto nla_put_failure ;
vxlan: implement GPE
Implement VXLAN-GPE. Only COLLECT_METADATA is supported for now (it is
possible to support static configuration, too, if there is demand for it).
The GPE header parsing has to be moved before iptunnel_pull_header, as we
need to know the protocol.
v2: Removed what was called "L2 mode" in v1 of the patchset. Only "L3 mode"
(now called "raw mode") is added by this patch. This mode does not allow
Ethernet header to be encapsulated in VXLAN-GPE when using ip route to
specify the encapsulation, IP header is encapsulated instead. The patch
does support Ethernet to be encapsulated, though, using ETH_P_TEB in
skb->protocol. This will be utilized by other COLLECT_METADATA users
(openvswitch in particular).
If there is ever demand for Ethernet encapsulation with VXLAN-GPE using
ip route, it's easy to add a new flag switching the interface to
"Ethernet mode" (called "L2 mode" in v1 of this patchset). For now,
leave this out, it seems we don't need it.
Disallowed more flag combinations, especially RCO with GPE.
Added comment explaining that GBP and GPE cannot be set together.
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-05 14:47:13 +02:00
if ( vxlan - > flags & VXLAN_F_GPE & &
nla_put_flag ( skb , IFLA_VXLAN_GPE ) )
goto nla_put_failure ;
2015-02-10 16:30:32 -08:00
if ( vxlan - > flags & VXLAN_F_REMCSUM_NOPARTIAL & &
nla_put_flag ( skb , IFLA_VXLAN_REMCSUM_NOPARTIAL ) )
goto nla_put_failure ;
2012-10-01 12:32:35 +00:00
return 0 ;
nla_put_failure :
return - EMSGSIZE ;
}
2015-01-15 15:11:17 +01:00
static struct net * vxlan_get_link_net ( const struct net_device * dev )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
return vxlan - > net ;
}
2012-10-01 12:32:35 +00:00
/* rtnetlink link operations for this driver: ties the link kind name,
 * the attribute policy and private-data size to the setup, validate,
 * newlink, changelink, dellink, get_size, fill_info and get_link_net
 * handlers defined in this file.
 */
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
. kind = " vxlan " ,
. maxtype = IFLA_VXLAN_MAX ,
. policy = vxlan_policy ,
. priv_size = sizeof ( struct vxlan_dev ) ,
. setup = vxlan_setup ,
. validate = vxlan_validate ,
. newlink = vxlan_newlink ,
2017-02-20 08:29:19 -08:00
. changelink = vxlan_changelink ,
2012-10-01 12:32:35 +00:00
. dellink = vxlan_dellink ,
. get_size = vxlan_get_size ,
. fill_info = vxlan_fill_info ,
2015-01-15 15:11:17 +01:00
. get_link_net = vxlan_get_link_net ,
2012-10-01 12:32:35 +00:00
} ;
2016-06-13 10:31:05 +02:00
/* Create and configure a vxlan device from kernel code (no netlink
 * request): allocate the link, apply @conf, then finish link configuration.
 *
 * Returns the new device, or an ERR_PTR() carrying the negative error of
 * the step that failed.  A device that was already registered when
 * rtnl_configure_link() fails is torn down through vxlan_dellink().
 */
struct net_device *vxlan_dev_create(struct net *net, const char *name,
				    u8 name_assign_type,
				    struct vxlan_config *conf)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	LIST_HEAD(list_kill);
	int err;

	/* No generic link attributes are supplied by in-kernel callers. */
	memset(tb, 0, sizeof(tb));

	dev = rtnl_create_link(net, name, name_assign_type,
			       &vxlan_link_ops, tb);
	if (IS_ERR(dev))
		return dev;

	err = __vxlan_dev_create(net, dev, conf);
	if (err < 0) {
		free_netdev(dev);
		return ERR_PTR(err);
	}

	err = rtnl_configure_link(dev, NULL);
	if (err >= 0)
		return dev;

	vxlan_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(vxlan_dev_create);
2014-01-13 18:41:19 +01:00
static void vxlan_handle_lowerdev_unregister ( struct vxlan_net * vn ,
struct net_device * dev )
{
struct vxlan_dev * vxlan , * next ;
LIST_HEAD ( list_kill ) ;
list_for_each_entry_safe ( vxlan , next , & vn - > vxlan_list , next ) {
struct vxlan_rdst * dst = & vxlan - > default_dst ;
/* In case we created vxlan device with carrier
* and we loose the carrier due to module unload
* we also need to remove vxlan device . In other
* cases , it ' s not necessary and remote_ifindex
* is 0 here , so no matches .
*/
if ( dst - > remote_ifindex = = dev - > ifindex )
vxlan_dellink ( vxlan - > dev , & list_kill ) ;
}
unregister_netdevice_many ( & list_kill ) ;
}
2016-04-18 21:19:47 +02:00
static int vxlan_netdevice_event ( struct notifier_block * unused ,
unsigned long event , void * ptr )
2014-01-13 18:41:19 +01:00
{
struct net_device * dev = netdev_notifier_info_to_dev ( ptr ) ;
net: vxlan: convert to act as a pernet subsystem
As per suggestion from Eric W. Biederman, vxlan should be using
{un,}register_pernet_subsys() instead of {un,}register_pernet_device()
to ensure the vxlan_net structure is initialized before and cleaned
up after all network devices in a given network namespace i.e. when
dealing with network notifiers. This is similarly handeled already in
commit 91e2ff3528ac ("net: Teach vlans to cleanup as a pernet subsystem")
and, thus, improves upon fd27e0d44a89 ("net: vxlan: do not use vxlan_net
before checking event type"). Just as in 91e2ff3528ac, we do not need
to explicitly handle deletion of vxlan devices as network namespace
exit calls dellink on all remaining virtual devices, and
rtnl_link_unregister() calls dellink on all outstanding devices in that
network namespace, so we can entirely drop the pernet exit operation
as well. Moreover, on vxlan module exit, rcu_barrier() is called by
netns since commit 3a765edadb28 ("netns: Add an explicit rcu_barrier
to unregister_pernet_{device|subsys}"), so this may be omitted. Tested
with various scenarios and works well on my side.
Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-22 21:07:53 +01:00
struct vxlan_net * vn = net_generic ( dev_net ( dev ) , vxlan_net_id ) ;
2014-01-13 18:41:19 +01:00
net: vxlan: convert to act as a pernet subsystem
As per suggestion from Eric W. Biederman, vxlan should be using
{un,}register_pernet_subsys() instead of {un,}register_pernet_device()
to ensure the vxlan_net structure is initialized before and cleaned
up after all network devices in a given network namespace i.e. when
dealing with network notifiers. This is similarly handeled already in
commit 91e2ff3528ac ("net: Teach vlans to cleanup as a pernet subsystem")
and, thus, improves upon fd27e0d44a89 ("net: vxlan: do not use vxlan_net
before checking event type"). Just as in 91e2ff3528ac, we do not need
to explicitly handle deletion of vxlan devices as network namespace
exit calls dellink on all remaining virtual devices, and
rtnl_link_unregister() calls dellink on all outstanding devices in that
network namespace, so we can entirely drop the pernet exit operation
as well. Moreover, on vxlan module exit, rcu_barrier() is called by
netns since commit 3a765edadb28 ("netns: Add an explicit rcu_barrier
to unregister_pernet_{device|subsys}"), so this may be omitted. Tested
with various scenarios and works well on my side.
Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-22 21:07:53 +01:00
if ( event = = NETDEV_UNREGISTER )
2014-01-13 18:41:19 +01:00
vxlan_handle_lowerdev_unregister ( vn , dev ) ;
2016-06-16 12:21:00 -07:00
else if ( event = = NETDEV_UDP_TUNNEL_PUSH_INFO )
2016-04-18 21:19:47 +02:00
vxlan_push_rx_ports ( dev ) ;
2014-01-13 18:41:19 +01:00
return NOTIFY_DONE ;
}
/* Notifier block that routes netdevice events to vxlan_netdevice_event(). */
static struct notifier_block vxlan_notifier_block __read_mostly = {
	.notifier_call = vxlan_netdevice_event,
};
2012-10-01 12:32:35 +00:00
static __net_init int vxlan_init_net ( struct net * net )
{
struct vxlan_net * vn = net_generic ( net , vxlan_net_id ) ;
2013-05-27 22:35:52 +00:00
unsigned int h ;
2012-10-01 12:32:35 +00:00
2013-05-16 11:35:20 +00:00
INIT_LIST_HEAD ( & vn - > vxlan_list ) ;
2013-06-17 14:16:11 -07:00
spin_lock_init ( & vn - > sock_lock ) ;
2012-10-01 12:32:35 +00:00
2013-05-16 11:35:20 +00:00
for ( h = 0 ; h < PORT_HASH_SIZE ; + + h )
INIT_HLIST_HEAD ( & vn - > sock_list [ h ] ) ;
2012-10-01 12:32:35 +00:00
return 0 ;
}
2014-04-24 10:02:49 +02:00
/* Per-namespace exit: under the RTNL lock, queue every vxlan device that
 * lives in @net, plus every device on this namespace's vxlan list that
 * resides in another namespace, and unregister them in one batch.
 */
static void __net_exit vxlan_exit_net(struct net *net)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan, *vxlan_tmp;
	struct net_device *dev, *dev_tmp;
	LIST_HEAD(kill_list);

	rtnl_lock();

	for_each_netdev_safe(net, dev, dev_tmp) {
		if (dev->rtnl_link_ops == &vxlan_link_ops)
			unregister_netdevice_queue(dev, &kill_list);
	}

	list_for_each_entry_safe(vxlan, vxlan_tmp, &vn->vxlan_list, next) {
		/* If vxlan->dev is in the same netns, it has already been
		 * added to the list by the previous loop.
		 */
		if (net_eq(dev_net(vxlan->dev), net))
			continue;

		gro_cells_destroy(&vxlan->gro_cells);
		unregister_netdevice_queue(vxlan->dev, &kill_list);
	}

	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}
2012-10-01 12:32:35 +00:00
static struct pernet_operations vxlan_net_ops = {
. init = vxlan_init_net ,
2014-04-24 10:02:49 +02:00
. exit = vxlan_exit_net ,
2012-10-01 12:32:35 +00:00
. id = & vxlan_net_id ,
. size = sizeof ( struct vxlan_net ) ,
} ;
/* Module init: seed the hash salt, then register the pernet subsystem, the
 * netdevice notifier and the rtnl link ops, in that order.  On failure the
 * steps already completed are undone in reverse order.
 *
 * Returns 0 on success or the negative error of the failing registration.
 */
static int __init vxlan_init_module(void)
{
	int rc;

	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));

	rc = register_pernet_subsys(&vxlan_net_ops);
	if (rc)
		return rc;

	rc = register_netdevice_notifier(&vxlan_notifier_block);
	if (rc)
		goto err_unreg_pernet;

	rc = rtnl_link_register(&vxlan_link_ops);
	if (rc)
		goto err_unreg_notifier;

	return 0;

err_unreg_notifier:
	unregister_netdevice_notifier(&vxlan_notifier_block);
err_unreg_pernet:
	unregister_pernet_subsys(&vxlan_net_ops);
	return rc;
}
late_initcall(vxlan_init_module);
2012-10-01 12:32:35 +00:00
/*
 * vxlan_cleanup_module - module exit handler.
 *
 * Unregisters, in this order, the rtnl link ops, the netdevice notifier
 * and the pernet subsystem ops. This is the same order in which the error
 * path of vxlan_init_module unwinds (out3: notifier, out2: pernet subsys),
 * extended by the rtnl link ops, which init registers last.
 *
 * Takes no arguments and returns nothing.
 */
static void __exit vxlan_cleanup_module ( void )
{
2013-06-17 14:16:09 -07:00
/*
 * Link ops go first: per the change note embedded below,
 * rtnl_link_unregister() calls dellink on all outstanding devices, so no
 * explicit per-device teardown is done here.
 */
rtnl_link_unregister ( & vxlan_link_ops ) ;
2014-01-13 18:41:19 +01:00
/* Stop receiving netdevice events before the per-netns ops are dropped. */
unregister_netdevice_notifier ( & vxlan_notifier_block ) ;
net: vxlan: convert to act as a pernet subsystem
As per suggestion from Eric W. Biederman, vxlan should be using
{un,}register_pernet_subsys() instead of {un,}register_pernet_device()
to ensure the vxlan_net structure is initialized before and cleaned
up after all network devices in a given network namespace i.e. when
dealing with network notifiers. This is similarly handeled already in
commit 91e2ff3528ac ("net: Teach vlans to cleanup as a pernet subsystem")
and, thus, improves upon fd27e0d44a89 ("net: vxlan: do not use vxlan_net
before checking event type"). Just as in 91e2ff3528ac, we do not need
to explicitly handle deletion of vxlan devices as network namespace
exit calls dellink on all remaining virtual devices, and
rtnl_link_unregister() calls dellink on all outstanding devices in that
network namespace, so we can entirely drop the pernet exit operation
as well. Moreover, on vxlan module exit, rcu_barrier() is called by
netns since commit 3a765edadb28 ("netns: Add an explicit rcu_barrier
to unregister_pernet_{device|subsys}"), so this may be omitted. Tested
with various scenarios and works well on my side.
Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-22 21:07:53 +01:00
/* Pernet subsystem ops are released last, after the devices and the notifier. */
unregister_pernet_subsys ( & vxlan_net_ops ) ;
/*
 * No explicit rcu_barrier() is needed here: according to the change note
 * above it is called by netns as part of unregister_pernet_subsys().
 */
2012-10-01 12:32:35 +00:00
}
/* Register vxlan_cleanup_module as this module's exit routine. */
module_exit ( vxlan_cleanup_module ) ;
/* Module metadata: license, version (VXLAN_VERSION, defined near the top of the file), author and description. */
MODULE_LICENSE ( " GPL " ) ;
MODULE_VERSION ( VXLAN_VERSION ) ;
2013-04-27 11:31:52 +00:00
MODULE_AUTHOR ( " Stephen Hemminger <stephen@networkplumber.org> " ) ;
2014-01-17 11:00:33 -08:00
MODULE_DESCRIPTION ( " Driver for VXLAN encapsulated traffic " ) ;
2012-10-01 12:32:35 +00:00
/* NOTE(review): presumably lets the module be auto-loaded when an rtnl link of kind "vxlan" is requested -- confirm against the MODULE_ALIAS_RTNL_LINK definition. */
MODULE_ALIAS_RTNL_LINK ( " vxlan " ) ;