2012-10-01 12:32:35 +00:00
/*
2012-11-13 13:29:15 +00:00
* VXLAN : Virtual eXtensible Local Area Network
2012-10-01 12:32:35 +00:00
*
2013-04-27 11:31:52 +00:00
* Copyright ( c ) 2012 - 2013 Vyatta Inc .
2012-10-01 12:32:35 +00:00
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation .
*/
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
# include <linux/kernel.h>
# include <linux/types.h>
# include <linux/module.h>
# include <linux/errno.h>
# include <linux/slab.h>
# include <linux/skbuff.h>
# include <linux/rculist.h>
# include <linux/netdevice.h>
# include <linux/in.h>
# include <linux/ip.h>
# include <linux/udp.h>
# include <linux/igmp.h>
# include <linux/etherdevice.h>
# include <linux/if_ether.h>
2013-08-19 11:23:29 -07:00
# include <linux/if_vlan.h>
2012-10-01 12:32:35 +00:00
# include <linux/hash.h>
2013-01-29 23:43:07 +00:00
# include <linux/ethtool.h>
2012-11-20 02:50:14 +00:00
# include <net/arp.h>
# include <net/ndisc.h>
2012-10-01 12:32:35 +00:00
# include <net/ip.h>
2013-03-25 14:49:35 +00:00
# include <net/ip_tunnels.h>
2012-10-01 12:32:35 +00:00
# include <net/icmp.h>
# include <net/udp.h>
# include <net/rtnetlink.h>
# include <net/route.h>
# include <net/dsfield.h>
# include <net/inet_ecn.h>
# include <net/net_namespace.h>
# include <net/netns/generic.h>
2013-08-19 11:23:07 -07:00
# include <net/vxlan.h>
2014-01-20 13:59:21 +02:00
# include <net/protocol.h>
2013-08-31 13:44:33 +08:00
# if IS_ENABLED(CONFIG_IPV6)
# include <net/ipv6.h>
# include <net/addrconf.h>
# include <net/ip6_tunnel.h>
2013-09-02 10:06:52 +08:00
# include <net/ip6_checksum.h>
2013-08-31 13:44:33 +08:00
# endif
2012-10-01 12:32:35 +00:00
#define VXLAN_VERSION	"0.1"

/* Hash table sizes are powers of two derived from their bit widths. */
#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE	(1 << PORT_HASH_BITS)
#define VNI_HASH_BITS	10
#define VNI_HASH_SIZE	(1 << VNI_HASH_BITS)
#define FDB_HASH_BITS	8
#define FDB_HASH_SIZE	(1 << FDB_HASH_BITS)

#define FDB_AGE_DEFAULT	 300		/* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */

/* Number of distinct VNI values (24 bits) and the matching mask. */
#define VXLAN_N_VID	(1u << 24)
#define VXLAN_VID_MASK	(VXLAN_N_VID - 1)

/* Combined size of the UDP header and the VXLAN header. */
#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))

#define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
/* VXLAN protocol header; both words are stored big-endian. */
struct vxlanhdr {
	__be32 vx_flags;	/* flag word, see VXLAN_FLAGS for the required value */
	__be32 vx_vni;		/* word carrying the VXLAN network identifier */
};
2013-04-27 11:31:53 +00:00
/* UDP port for VXLAN traffic.
* The IANA assigned port is 4789 , but the Linux default is 8472
2013-06-17 14:16:41 -07:00
* for compatibility with early adopters .
2013-04-27 11:31:53 +00:00
*/
2013-06-17 14:16:12 -07:00
static unsigned short vxlan_port __read_mostly = 8472 ;
module_param_named ( udp_port , vxlan_port , ushort , 0444 ) ;
2012-10-01 12:32:35 +00:00
MODULE_PARM_DESC ( udp_port , " Destination UDP port " ) ;
static bool log_ecn_error = true ;
module_param ( log_ecn_error , bool , 0644 ) ;
MODULE_PARM_DESC ( log_ecn_error , " Log packets received with corrupted ECN " ) ;
2013-06-20 00:26:31 -07:00
static int vxlan_net_id ;
2013-05-16 11:35:20 +00:00
2013-06-25 16:01:51 +03:00
static const u8 all_zeros_mac [ ETH_ALEN ] ;
2013-05-16 11:35:20 +00:00
/* per-network namespace private data for this module */
struct vxlan_net {
struct list_head vxlan_list ;
struct hlist_head sock_list [ PORT_HASH_SIZE ] ;
2013-06-17 14:16:11 -07:00
spinlock_t sock_lock ;
2013-05-16 11:35:20 +00:00
} ;
2013-08-31 13:44:33 +08:00
/* Tunnel endpoint address, either IPv4 or IPv6.
 * sa.sa_family tells which of the sin / sin6 views is valid.
 */
union vxlan_addr {
	struct sockaddr_in  sin;	/* IPv4 view */
	struct sockaddr_in6 sin6;	/* IPv6 view */
	struct sockaddr	    sa;		/* family-agnostic view */
};
2013-03-15 04:35:51 +00:00
struct vxlan_rdst {
2013-08-31 13:44:33 +08:00
union vxlan_addr remote_ip ;
2013-03-15 04:35:51 +00:00
__be16 remote_port ;
u32 remote_vni ;
u32 remote_ifindex ;
2013-06-17 14:16:12 -07:00
struct list_head list ;
2013-06-25 16:01:54 +03:00
struct rcu_head rcu ;
2013-03-15 04:35:51 +00:00
} ;
2012-10-01 12:32:35 +00:00
/* Forwarding table entry */
struct vxlan_fdb {
struct hlist_node hlist ; /* linked list of entries */
struct rcu_head rcu ;
unsigned long updated ; /* jiffies */
unsigned long used ;
2013-06-17 14:16:12 -07:00
struct list_head remotes ;
2012-10-01 12:32:35 +00:00
u16 state ; /* see ndm_state */
2013-04-19 00:36:26 +00:00
u8 flags ; /* see ndm_flags */
2012-10-01 12:32:35 +00:00
u8 eth_addr [ ETH_ALEN ] ;
} ;
/* Pseudo network device */
struct vxlan_dev {
2013-05-16 11:35:20 +00:00
struct hlist_node hlist ; /* vni hash table */
struct list_head next ; /* vxlan's per namespace list */
struct vxlan_sock * vn_sock ; /* listening socket */
2012-10-01 12:32:35 +00:00
struct net_device * dev ;
2013-04-16 02:50:52 +00:00
struct vxlan_rdst default_dst ; /* default destination */
2013-08-31 13:44:33 +08:00
union vxlan_addr saddr ; /* source address */
2013-04-27 11:31:57 +00:00
__be16 dst_port ;
2012-10-09 20:35:50 +00:00
__u16 port_min ; /* source port range */
__u16 port_max ;
2012-10-01 12:32:35 +00:00
__u8 tos ; /* TOS override */
__u8 ttl ;
2012-11-20 02:50:14 +00:00
u32 flags ; /* VXLAN_F_* below */
2012-10-01 12:32:35 +00:00
2013-06-17 14:16:11 -07:00
struct work_struct sock_work ;
2013-07-18 08:40:15 -07:00
struct work_struct igmp_join ;
struct work_struct igmp_leave ;
2013-06-17 14:16:11 -07:00
2012-10-01 12:32:35 +00:00
unsigned long age_interval ;
struct timer_list age_timer ;
spinlock_t hash_lock ;
unsigned int addrcnt ;
unsigned int addrmax ;
struct hlist_head fdb_head [ FDB_HASH_SIZE ] ;
} ;
2012-11-20 02:50:14 +00:00
/* Bit values for vxlan_dev.flags. */
#define VXLAN_F_LEARN	0x01
#define VXLAN_F_PROXY	0x02
#define VXLAN_F_RSC	0x04
#define VXLAN_F_L2MISS	0x08
#define VXLAN_F_L3MISS	0x10
#define VXLAN_F_IPV6	0x20	/* internal flag */
2012-11-20 02:50:14 +00:00
2012-10-01 12:32:35 +00:00
/* salt for hash table */
static u32 vxlan_salt __read_mostly;

/* Workqueue owned by this module. */
static struct workqueue_struct *vxlan_wq;

/* Forward declaration; the handler is defined further down the file. */
static void vxlan_sock_work(struct work_struct *work);
2013-08-31 13:44:33 +08:00
# if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal ( const union vxlan_addr * a , const union vxlan_addr * b )
{
if ( a - > sa . sa_family ! = b - > sa . sa_family )
return false ;
if ( a - > sa . sa_family = = AF_INET6 )
return ipv6_addr_equal ( & a - > sin6 . sin6_addr , & b - > sin6 . sin6_addr ) ;
else
return a - > sin . sin_addr . s_addr = = b - > sin . sin_addr . s_addr ;
}
static inline bool vxlan_addr_any ( const union vxlan_addr * ipa )
{
if ( ipa - > sa . sa_family = = AF_INET6 )
return ipv6_addr_any ( & ipa - > sin6 . sin6_addr ) ;
else
return ipa - > sin . sin_addr . s_addr = = htonl ( INADDR_ANY ) ;
}
static inline bool vxlan_addr_multicast ( const union vxlan_addr * ipa )
{
if ( ipa - > sa . sa_family = = AF_INET6 )
return ipv6_addr_is_multicast ( & ipa - > sin6 . sin6_addr ) ;
else
return IN_MULTICAST ( ntohl ( ipa - > sin . sin_addr . s_addr ) ) ;
}
static int vxlan_nla_get_addr ( union vxlan_addr * ip , struct nlattr * nla )
{
if ( nla_len ( nla ) > = sizeof ( struct in6_addr ) ) {
nla_memcpy ( & ip - > sin6 . sin6_addr , nla , sizeof ( struct in6_addr ) ) ;
ip - > sa . sa_family = AF_INET6 ;
return 0 ;
} else if ( nla_len ( nla ) > = sizeof ( __be32 ) ) {
ip - > sin . sin_addr . s_addr = nla_get_be32 ( nla ) ;
ip - > sa . sa_family = AF_INET ;
return 0 ;
} else {
return - EAFNOSUPPORT ;
}
}
static int vxlan_nla_put_addr ( struct sk_buff * skb , int attr ,
const union vxlan_addr * ip )
{
if ( ip - > sa . sa_family = = AF_INET6 )
return nla_put ( skb , attr , sizeof ( struct in6_addr ) , & ip - > sin6 . sin6_addr ) ;
else
return nla_put_be32 ( skb , attr , ip - > sin . sin_addr . s_addr ) ;
}
# else /* !CONFIG_IPV6 */
static inline
bool vxlan_addr_equal ( const union vxlan_addr * a , const union vxlan_addr * b )
{
return a - > sin . sin_addr . s_addr = = b - > sin . sin_addr . s_addr ;
}
static inline bool vxlan_addr_any ( const union vxlan_addr * ipa )
{
return ipa - > sin . sin_addr . s_addr = = htonl ( INADDR_ANY ) ;
}
static inline bool vxlan_addr_multicast ( const union vxlan_addr * ipa )
{
return IN_MULTICAST ( ntohl ( ipa - > sin . sin_addr . s_addr ) ) ;
}
static int vxlan_nla_get_addr ( union vxlan_addr * ip , struct nlattr * nla )
{
if ( nla_len ( nla ) > = sizeof ( struct in6_addr ) ) {
return - EAFNOSUPPORT ;
} else if ( nla_len ( nla ) > = sizeof ( __be32 ) ) {
ip - > sin . sin_addr . s_addr = nla_get_be32 ( nla ) ;
ip - > sa . sa_family = AF_INET ;
return 0 ;
} else {
return - EAFNOSUPPORT ;
}
}
static int vxlan_nla_put_addr ( struct sk_buff * skb , int attr ,
const union vxlan_addr * ip )
{
return nla_put_be32 ( skb , attr , ip - > sin . sin_addr . s_addr ) ;
}
# endif
2013-05-16 11:35:20 +00:00
/* Hash chain of socket @vs that holds the devices for VNI @id. */
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
{
	const u32 bucket = hash_32(id, VNI_HASH_BITS);

	return &vs->vni_list[bucket];
}
/* Hash chain of namespace @net that holds the sockets for UDP @port
 * (given in network byte order).
 */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	const u32 bucket = hash_32(ntohs(port), PORT_HASH_BITS);

	return &vn->sock_list[bucket];
}
2013-06-17 14:16:12 -07:00
/* First remote destination for a forwarding entry.
* Guaranteed to be non - NULL because remotes are never deleted .
*/
2013-08-04 17:17:39 -07:00
static inline struct vxlan_rdst * first_remote_rcu ( struct vxlan_fdb * fdb )
2013-06-17 14:16:12 -07:00
{
2013-08-04 17:17:39 -07:00
return list_entry_rcu ( fdb - > remotes . next , struct vxlan_rdst , list ) ;
}
static inline struct vxlan_rdst * first_remote_rtnl ( struct vxlan_fdb * fdb )
{
return list_first_entry ( & fdb - > remotes , struct vxlan_rdst , list ) ;
2013-06-17 14:16:12 -07:00
}
2013-05-16 11:35:20 +00:00
/* Find VXLAN socket based on network namespace and UDP port */
2013-08-19 11:22:48 -07:00
static struct vxlan_sock * vxlan_find_sock ( struct net * net , __be16 port )
2013-05-16 11:35:20 +00:00
{
struct vxlan_sock * vs ;
hlist_for_each_entry_rcu ( vs , vs_head ( net , port ) , hlist ) {
if ( inet_sk ( vs - > sock - > sk ) - > inet_sport = = port )
return vs ;
}
return NULL ;
2012-10-01 12:32:35 +00:00
}
2013-08-19 11:23:02 -07:00
/* Find the device on socket @vs whose default destination uses VNI @id.
 * Returns NULL when there is none.
 */
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id)
{
	struct hlist_head *bucket = vni_head(vs, id);
	struct vxlan_dev *vxlan;

	hlist_for_each_entry_rcu(vxlan, bucket, hlist) {
		if (vxlan->default_dst.remote_vni != id)
			continue;
		return vxlan;
	}

	return NULL;
}
2013-08-19 11:23:02 -07:00
/* Look up the device for VNI @id on UDP @port within namespace @net.
 * Returns NULL when either the socket or the VNI is unknown.
 */
static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port)
{
	struct vxlan_sock *vs = vxlan_find_sock(net, port);

	return vs ? vxlan_vs_find_vni(vs, id) : NULL;
}
2012-10-01 12:32:35 +00:00
/* Fill in a neighbour message in @skb describing @fdb and its remote @rdst.
 *
 * For RTM_GETNEIGH the family is AF_INET and the link-layer / IP
 * attributes are emitted only when they are non-zero; every other type is
 * reported as AF_BRIDGE with both attributes present. Port, VNI and
 * ifindex are emitted only when they differ from the device defaults
 * (or, for the ifindex, when non-zero).
 *
 * Returns the value of nlmsg_end() on success, or -EMSGSIZE when the
 * header or any attribute does not fit; the partial message is cancelled.
 */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	unsigned long now = jiffies;
	struct nda_cacheinfo ci;
	struct nlmsghdr *nlh;
	struct ndmsg *ndm;
	bool send_ip = true;
	bool send_eth = true;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (!nlh)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	if (type == RTM_GETNEIGH) {
		ndm->ndm_family = AF_INET;
		send_ip = !vxlan_addr_any(&rdst->remote_ip);
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
	} else {
		ndm->ndm_family = AF_BRIDGE;
	}

	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	ndm->ndm_type = NDA_DST;

	if (send_eth &&
	    nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;

	if (send_ip &&
	    vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
		goto nla_put_failure;

	/* Only report values that differ from the device defaults. */
	if (rdst->remote_port &&
	    rdst->remote_port != vxlan->dst_port &&
	    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
		goto nla_put_failure;

	if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
	    nla_put_u32(skb, NDA_VNI, rdst->remote_vni))
		goto nla_put_failure;

	if (rdst->remote_ifindex &&
	    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
		goto nla_put_failure;

	/* Ages are reported relative to the jiffies value sampled on entry. */
	ci.ndm_confirmed = 0;
	ci.ndm_refcnt = 0;
	ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated);

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static inline size_t vxlan_nlmsg_size ( void )
{
return NLMSG_ALIGN ( sizeof ( struct ndmsg ) )
+ nla_total_size ( ETH_ALEN ) /* NDA_LLADDR */
2013-08-31 13:44:33 +08:00
+ nla_total_size ( sizeof ( struct in6_addr ) ) /* NDA_DST */
2013-04-27 11:31:54 +00:00
+ nla_total_size ( sizeof ( __be16 ) ) /* NDA_PORT */
2013-03-15 04:35:51 +00:00
+ nla_total_size ( sizeof ( __be32 ) ) /* NDA_VNI */
+ nla_total_size ( sizeof ( __u32 ) ) /* NDA_IFINDEX */
2012-10-01 12:32:35 +00:00
+ nla_total_size ( sizeof ( struct nda_cacheinfo ) ) ;
}
2014-04-22 15:01:30 +02:00
static void vxlan_fdb_notify ( struct vxlan_dev * vxlan , struct vxlan_fdb * fdb ,
struct vxlan_rdst * rd , int type )
2012-10-01 12:32:35 +00:00
{
struct net * net = dev_net ( vxlan - > dev ) ;
struct sk_buff * skb ;
int err = - ENOBUFS ;
skb = nlmsg_new ( vxlan_nlmsg_size ( ) , GFP_ATOMIC ) ;
if ( skb = = NULL )
goto errout ;
2014-04-22 15:01:30 +02:00
err = vxlan_fdb_info ( skb , vxlan , fdb , 0 , 0 , type , 0 , rd ) ;
2012-10-01 12:32:35 +00:00
if ( err < 0 ) {
/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
WARN_ON ( err = = - EMSGSIZE ) ;
kfree_skb ( skb ) ;
goto errout ;
}
rtnl_notify ( skb , net , 0 , RTNLGRP_NEIGH , NULL , GFP_ATOMIC ) ;
return ;
errout :
if ( err < 0 )
rtnl_set_sk_err ( net , RTNLGRP_NEIGH , err ) ;
}
2013-08-31 13:44:33 +08:00
static void vxlan_ip_miss ( struct net_device * dev , union vxlan_addr * ipa )
2012-11-20 02:50:14 +00:00
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2013-06-17 14:16:40 -07:00
struct vxlan_fdb f = {
. state = NUD_STALE ,
} ;
struct vxlan_rdst remote = {
2013-08-31 13:44:33 +08:00
. remote_ip = * ipa , /* goes to NDA_DST */
2013-06-17 14:16:40 -07:00
. remote_vni = VXLAN_N_VID ,
} ;
2013-06-17 14:16:12 -07:00
2014-04-22 15:01:30 +02:00
vxlan_fdb_notify ( vxlan , & f , & remote , RTM_GETNEIGH ) ;
2012-11-20 02:50:14 +00:00
}
static void vxlan_fdb_miss ( struct vxlan_dev * vxlan , const u8 eth_addr [ ETH_ALEN ] )
{
2013-06-17 14:16:40 -07:00
struct vxlan_fdb f = {
. state = NUD_STALE ,
} ;
2014-04-22 15:01:30 +02:00
struct vxlan_rdst remote = { } ;
2012-11-20 02:50:14 +00:00
memcpy ( f . eth_addr , eth_addr , ETH_ALEN ) ;
2014-04-22 15:01:30 +02:00
vxlan_fdb_notify ( vxlan , & f , & remote , RTM_GETNEIGH ) ;
2012-11-20 02:50:14 +00:00
}
2012-10-01 12:32:35 +00:00
/* Hash Ethernet address */
static u32 eth_hash ( const unsigned char * addr )
{
u64 value = get_unaligned ( ( u64 * ) addr ) ;
/* only want 6 bytes */
# ifdef __BIG_ENDIAN
value > > = 16 ;
2012-10-09 20:35:47 +00:00
# else
value < < = 16 ;
2012-10-01 12:32:35 +00:00
# endif
return hash_64 ( value , FDB_HASH_BITS ) ;
}
/* Hash chain to use given mac address */
static inline struct hlist_head * vxlan_fdb_head ( struct vxlan_dev * vxlan ,
const u8 * mac )
{
return & vxlan - > fdb_head [ eth_hash ( mac ) ] ;
}
/* Look up Ethernet address in forwarding table */
2013-05-17 06:39:07 +00:00
static struct vxlan_fdb * __vxlan_find_mac ( struct vxlan_dev * vxlan ,
2012-10-01 12:32:35 +00:00
const u8 * mac )
{
struct hlist_head * head = vxlan_fdb_head ( vxlan , mac ) ;
struct vxlan_fdb * f ;
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-27 17:06:00 -08:00
hlist_for_each_entry_rcu ( f , head , hlist ) {
drivers/net: Convert uses of compare_ether_addr to ether_addr_equal
Use the new bool function ether_addr_equal to add
some clarity and reduce the likelihood for misuse
of compare_ether_addr for sorting.
Done via cocci script: (and a little typing)
$ cat compare_ether_addr.cocci
@@
expression a,b;
@@
- !compare_ether_addr(a, b)
+ ether_addr_equal(a, b)
@@
expression a,b;
@@
- compare_ether_addr(a, b)
+ !ether_addr_equal(a, b)
@@
expression a,b;
@@
- !ether_addr_equal(a, b) == 0
+ ether_addr_equal(a, b)
@@
expression a,b;
@@
- !ether_addr_equal(a, b) != 0
+ !ether_addr_equal(a, b)
@@
expression a,b;
@@
- ether_addr_equal(a, b) == 0
+ !ether_addr_equal(a, b)
@@
expression a,b;
@@
- ether_addr_equal(a, b) != 0
+ ether_addr_equal(a, b)
@@
expression a,b;
@@
- !!ether_addr_equal(a, b)
+ ether_addr_equal(a, b)
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-09-01 11:51:23 -07:00
if ( ether_addr_equal ( mac , f - > eth_addr ) )
2012-10-01 12:32:35 +00:00
return f ;
}
return NULL ;
}
2013-05-17 06:39:07 +00:00
static struct vxlan_fdb * vxlan_find_mac ( struct vxlan_dev * vxlan ,
const u8 * mac )
{
struct vxlan_fdb * f ;
f = __vxlan_find_mac ( vxlan , mac ) ;
if ( f )
f - > used = jiffies ;
return f ;
}
2013-06-25 16:01:52 +03:00
/* Find the remote of @f matching address, port, VNI and ifindex exactly.
 * Returns NULL when no remote matches.
 * caller should hold vxlan->hash_lock
 */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
					      union vxlan_addr *ip, __be16 port,
					      __u32 vni, __u32 ifindex)
{
	struct vxlan_rdst *rd;

	list_for_each_entry(rd, &f->remotes, list) {
		if (!vxlan_addr_equal(&rd->remote_ip, ip))
			continue;
		if (rd->remote_port != port)
			continue;
		if (rd->remote_vni != vni)
			continue;
		if (rd->remote_ifindex != ifindex)
			continue;
		return rd;
	}

	return NULL;
}
2013-07-19 17:20:07 +02:00
/* Replace destination of unicast mac.
 * Overwrites the first remote of @f with the given destination.
 * Returns 1 when the remote was changed, 0 when an identical remote
 * already exists or the entry has no remote at all.
 */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
			     union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex)
{
	struct vxlan_rdst *first;

	if (vxlan_fdb_find_rdst(f, ip, port, vni, ifindex))
		return 0;

	first = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!first)
		return 0;

	first->remote_ifindex = ifindex;
	first->remote_vni = vni;
	first->remote_port = port;
	first->remote_ip = *ip;
	return 1;
}
2013-06-25 16:01:52 +03:00
/* Add/update destinations for multicast.
 * Appends a new remote to @f unless an identical one is already present.
 * Returns 1 and stores the new remote in *@rdp when one was added,
 * 0 when it already existed (*@rdp untouched), or -ENOBUFS when the
 * allocation failed.
 */
static int vxlan_fdb_append(struct vxlan_fdb *f,
			    union vxlan_addr *ip, __be16 port, __u32 vni,
			    __u32 ifindex, struct vxlan_rdst **rdp)
{
	struct vxlan_rdst *new_rd;

	if (vxlan_fdb_find_rdst(f, ip, port, vni, ifindex))
		return 0;

	new_rd = kmalloc(sizeof(*new_rd), GFP_ATOMIC);
	if (!new_rd)
		return -ENOBUFS;

	new_rd->remote_ip = *ip;
	new_rd->remote_port = port;
	new_rd->remote_vni = vni;
	new_rd->remote_ifindex = ifindex;

	/* Publish only after the remote is fully initialised. */
	list_add_tail_rcu(&new_rd->list, &f->remotes);

	*rdp = new_rd;
	return 1;
}
2014-01-20 13:59:21 +02:00
static struct sk_buff * * vxlan_gro_receive ( struct sk_buff * * head , struct sk_buff * skb )
{
struct sk_buff * p , * * pp = NULL ;
struct vxlanhdr * vh , * vh2 ;
struct ethhdr * eh , * eh2 ;
unsigned int hlen , off_vx , off_eth ;
const struct packet_offload * ptype ;
__be16 type ;
int flush = 1 ;
off_vx = skb_gro_offset ( skb ) ;
hlen = off_vx + sizeof ( * vh ) ;
vh = skb_gro_header_fast ( skb , off_vx ) ;
if ( skb_gro_header_hard ( skb , hlen ) ) {
vh = skb_gro_header_slow ( skb , hlen , off_vx ) ;
if ( unlikely ( ! vh ) )
goto out ;
}
skb_gro_pull ( skb , sizeof ( struct vxlanhdr ) ) ; /* pull vxlan header */
off_eth = skb_gro_offset ( skb ) ;
hlen = off_eth + sizeof ( * eh ) ;
eh = skb_gro_header_fast ( skb , off_eth ) ;
if ( skb_gro_header_hard ( skb , hlen ) ) {
eh = skb_gro_header_slow ( skb , hlen , off_eth ) ;
if ( unlikely ( ! eh ) )
goto out ;
}
flush = 0 ;
for ( p = * head ; p ; p = p - > next ) {
if ( ! NAPI_GRO_CB ( p ) - > same_flow )
continue ;
vh2 = ( struct vxlanhdr * ) ( p - > data + off_vx ) ;
eh2 = ( struct ethhdr * ) ( p - > data + off_eth ) ;
if ( vh - > vx_vni ! = vh2 - > vx_vni | | compare_ether_header ( eh , eh2 ) ) {
NAPI_GRO_CB ( p ) - > same_flow = 0 ;
continue ;
}
}
type = eh - > h_proto ;
rcu_read_lock ( ) ;
ptype = gro_find_receive_by_type ( type ) ;
if ( ptype = = NULL ) {
flush = 1 ;
goto out_unlock ;
}
skb_gro_pull ( skb , sizeof ( * eh ) ) ; /* pull inner eth header */
pp = ptype - > callbacks . gro_receive ( head , skb ) ;
out_unlock :
rcu_read_unlock ( ) ;
out :
NAPI_GRO_CB ( skb ) - > flush | = flush ;
return pp ;
}
static int vxlan_gro_complete ( struct sk_buff * skb , int nhoff )
{
struct ethhdr * eh ;
struct packet_offload * ptype ;
__be16 type ;
int vxlan_len = sizeof ( struct vxlanhdr ) + sizeof ( struct ethhdr ) ;
int err = - ENOSYS ;
eh = ( struct ethhdr * ) ( skb - > data + nhoff + sizeof ( struct vxlanhdr ) ) ;
type = eh - > h_proto ;
rcu_read_lock ( ) ;
ptype = gro_find_complete_by_type ( type ) ;
if ( ptype ! = NULL )
err = ptype - > callbacks . gro_complete ( skb , nhoff + vxlan_len ) ;
rcu_read_unlock ( ) ;
return err ;
}
2013-09-04 02:13:38 -07:00
/* Notify netdevs that UDP port started listening */
2014-01-20 13:59:21 +02:00
static void vxlan_notify_add_rx_port ( struct vxlan_sock * vs )
2013-09-04 02:13:38 -07:00
{
struct net_device * dev ;
2014-01-20 13:59:21 +02:00
struct sock * sk = vs - > sock - > sk ;
2013-09-04 02:13:38 -07:00
struct net * net = sock_net ( sk ) ;
sa_family_t sa_family = sk - > sk_family ;
2013-09-13 07:34:13 -07:00
__be16 port = inet_sk ( sk ) - > inet_sport ;
2014-01-20 13:59:21 +02:00
int err ;
if ( sa_family = = AF_INET ) {
err = udp_add_offload ( & vs - > udp_offloads ) ;
if ( err )
pr_warn ( " vxlan: udp_add_offload failed with status %d \n " , err ) ;
}
2013-09-04 02:13:38 -07:00
rcu_read_lock ( ) ;
for_each_netdev_rcu ( net , dev ) {
if ( dev - > netdev_ops - > ndo_add_vxlan_port )
dev - > netdev_ops - > ndo_add_vxlan_port ( dev , sa_family ,
port ) ;
}
rcu_read_unlock ( ) ;
}
/* Notify netdevs that UDP port is no more listening */
2014-01-20 13:59:21 +02:00
static void vxlan_notify_del_rx_port ( struct vxlan_sock * vs )
2013-09-04 02:13:38 -07:00
{
struct net_device * dev ;
2014-01-20 13:59:21 +02:00
struct sock * sk = vs - > sock - > sk ;
2013-09-04 02:13:38 -07:00
struct net * net = sock_net ( sk ) ;
sa_family_t sa_family = sk - > sk_family ;
2013-09-13 07:34:13 -07:00
__be16 port = inet_sk ( sk ) - > inet_sport ;
2013-09-04 02:13:38 -07:00
rcu_read_lock ( ) ;
for_each_netdev_rcu ( net , dev ) {
if ( dev - > netdev_ops - > ndo_del_vxlan_port )
dev - > netdev_ops - > ndo_del_vxlan_port ( dev , sa_family ,
port ) ;
}
rcu_read_unlock ( ) ;
2014-01-20 13:59:21 +02:00
if ( sa_family = = AF_INET )
udp_del_offload ( & vs - > udp_offloads ) ;
2013-09-04 02:13:38 -07:00
}
2012-10-01 12:32:35 +00:00
/* Add new entry to forwarding table -- assumes lock held */
static int vxlan_fdb_create ( struct vxlan_dev * vxlan ,
2013-08-31 13:44:33 +08:00
const u8 * mac , union vxlan_addr * ip ,
2013-03-15 04:35:51 +00:00
__u16 state , __u16 flags ,
2013-04-27 11:31:54 +00:00
__be16 port , __u32 vni , __u32 ifindex ,
2013-04-19 00:36:26 +00:00
__u8 ndm_flags )
2012-10-01 12:32:35 +00:00
{
2014-04-22 15:01:30 +02:00
struct vxlan_rdst * rd = NULL ;
2012-10-01 12:32:35 +00:00
struct vxlan_fdb * f ;
int notify = 0 ;
2013-05-17 06:39:07 +00:00
f = __vxlan_find_mac ( vxlan , mac ) ;
2012-10-01 12:32:35 +00:00
if ( f ) {
if ( flags & NLM_F_EXCL ) {
netdev_dbg ( vxlan - > dev ,
" lost race to create %pM \n " , mac ) ;
return - EEXIST ;
}
if ( f - > state ! = state ) {
f - > state = state ;
f - > updated = jiffies ;
notify = 1 ;
}
2013-04-19 00:36:26 +00:00
if ( f - > flags ! = ndm_flags ) {
f - > flags = ndm_flags ;
f - > updated = jiffies ;
notify = 1 ;
}
2013-07-19 17:20:07 +02:00
if ( ( flags & NLM_F_REPLACE ) ) {
/* Only change unicasts */
if ( ! ( is_multicast_ether_addr ( f - > eth_addr ) | |
is_zero_ether_addr ( f - > eth_addr ) ) ) {
int rc = vxlan_fdb_replace ( f , ip , port , vni ,
ifindex ) ;
if ( rc < 0 )
return rc ;
notify | = rc ;
} else
return - EOPNOTSUPP ;
}
2013-03-15 04:35:51 +00:00
if ( ( flags & NLM_F_APPEND ) & &
2013-06-25 16:01:56 +03:00
( is_multicast_ether_addr ( f - > eth_addr ) | |
is_zero_ether_addr ( f - > eth_addr ) ) ) {
2014-04-22 15:01:30 +02:00
int rc = vxlan_fdb_append ( f , ip , port , vni , ifindex ,
& rd ) ;
2013-03-15 04:35:51 +00:00
if ( rc < 0 )
return rc ;
notify | = rc ;
}
2012-10-01 12:32:35 +00:00
} else {
if ( ! ( flags & NLM_F_CREATE ) )
return - ENOENT ;
if ( vxlan - > addrmax & & vxlan - > addrcnt > = vxlan - > addrmax )
return - ENOSPC ;
2013-07-19 17:20:07 +02:00
/* Disallow replace to add a multicast entry */
if ( ( flags & NLM_F_REPLACE ) & &
( is_multicast_ether_addr ( mac ) | | is_zero_ether_addr ( mac ) ) )
return - EOPNOTSUPP ;
2013-08-31 13:44:33 +08:00
netdev_dbg ( vxlan - > dev , " add %pM -> %pIS \n " , mac , ip ) ;
2012-10-01 12:32:35 +00:00
f = kmalloc ( sizeof ( * f ) , GFP_ATOMIC ) ;
if ( ! f )
return - ENOMEM ;
notify = 1 ;
f - > state = state ;
2013-04-19 00:36:26 +00:00
f - > flags = ndm_flags ;
2012-10-01 12:32:35 +00:00
f - > updated = f - > used = jiffies ;
2013-06-17 14:16:12 -07:00
INIT_LIST_HEAD ( & f - > remotes ) ;
2012-10-01 12:32:35 +00:00
memcpy ( f - > eth_addr , mac , ETH_ALEN ) ;
2014-04-22 15:01:30 +02:00
vxlan_fdb_append ( f , ip , port , vni , ifindex , & rd ) ;
2013-06-17 14:16:12 -07:00
2012-10-01 12:32:35 +00:00
+ + vxlan - > addrcnt ;
hlist_add_head_rcu ( & f - > hlist ,
vxlan_fdb_head ( vxlan , mac ) ) ;
}
2014-04-22 15:01:30 +02:00
if ( notify ) {
if ( rd = = NULL )
rd = first_remote_rtnl ( f ) ;
vxlan_fdb_notify ( vxlan , f , rd , RTM_NEWNEIGH ) ;
}
2012-10-01 12:32:35 +00:00
return 0 ;
}
2013-04-11 19:00:35 +00:00
static void vxlan_fdb_free ( struct rcu_head * head )
2013-03-15 04:35:51 +00:00
{
struct vxlan_fdb * f = container_of ( head , struct vxlan_fdb , rcu ) ;
2013-06-17 14:16:12 -07:00
struct vxlan_rdst * rd , * nd ;
2013-03-15 04:35:51 +00:00
2013-06-17 14:16:12 -07:00
list_for_each_entry_safe ( rd , nd , & f - > remotes , list )
2013-03-15 04:35:51 +00:00
kfree ( rd ) ;
kfree ( f ) ;
}
2012-10-01 12:32:35 +00:00
static void vxlan_fdb_destroy ( struct vxlan_dev * vxlan , struct vxlan_fdb * f )
{
netdev_dbg ( vxlan - > dev ,
" delete %pM \n " , f - > eth_addr ) ;
- - vxlan - > addrcnt ;
2014-04-22 15:01:30 +02:00
vxlan_fdb_notify ( vxlan , f , first_remote_rtnl ( f ) , RTM_DELNEIGH ) ;
2012-10-01 12:32:35 +00:00
hlist_del_rcu ( & f - > hlist ) ;
2013-03-15 04:35:51 +00:00
call_rcu ( & f - > rcu , vxlan_fdb_free ) ;
2012-10-01 12:32:35 +00:00
}
2013-06-25 16:01:53 +03:00
/* Decode the destination attributes of a neighbour request.
 *
 * Each output falls back to a default when its attribute is absent:
 *  @ip      - NDA_DST, else the wildcard address of the family used by the
 *             device's default destination
 *  @port    - NDA_PORT, else vxlan->dst_port
 *  @vni     - NDA_VNI, else the default destination's VNI
 *  @ifindex - NDA_IFINDEX (must name a device in the namespace), else 0
 *
 * Returns 0 on success, -EINVAL for an attribute of the wrong length,
 * -EADDRNOTAVAIL for an unknown ifindex, or the error from
 * vxlan_nla_get_addr(). Outputs parsed before a failure stay written.
 */
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
			   union vxlan_addr *ip, __be16 *port, u32 *vni, u32 *ifindex)
{
	struct net *net = dev_net(vxlan->dev);
	int err;

	if (!tb[NDA_DST]) {
		union vxlan_addr *remote = &vxlan->default_dst.remote_ip;

		if (remote->sa.sa_family == AF_INET) {
			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
			ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			ip->sin6.sin6_addr = in6addr_any;
			ip->sa.sa_family = AF_INET6;
#endif
		}
	} else {
		err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
		if (err)
			return err;
	}

	if (!tb[NDA_PORT]) {
		*port = vxlan->dst_port;
	} else {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
			return -EINVAL;
		*port = nla_get_be16(tb[NDA_PORT]);
	}

	if (!tb[NDA_VNI]) {
		*vni = vxlan->default_dst.remote_vni;
	} else {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
			return -EINVAL;
		*vni = nla_get_u32(tb[NDA_VNI]);
	}

	if (!tb[NDA_IFINDEX]) {
		*ifindex = 0;
		return 0;
	}

	if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
		return -EINVAL;
	*ifindex = nla_get_u32(tb[NDA_IFINDEX]);

	/* The index only has to resolve; the device itself is not used. */
	if (!__dev_get_by_index(net, *ifindex))
		return -EADDRNOTAVAIL;

	return 0;
}
/* Add static entry (via netlink)
 *
 * Returns -EINVAL when the requested state is neither NUD_PERMANENT nor
 * NUD_REACHABLE or when NDA_DST is missing, -EAFNOSUPPORT when the
 * destination's address family differs from the device's default remote,
 * any error from vxlan_fdb_parse(), and otherwise the result of
 * vxlan_fdb_create(), which is called with hash_lock held.
 */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr, u16 flags)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	union vxlan_addr dst;
	u32 dst_vni, dst_ifindex;
	__be16 dst_port;
	int rc;

	if ((ndm->ndm_state & (NUD_PERMANENT | NUD_REACHABLE)) == 0) {
		pr_info(" RTM_NEWNEIGH with invalid state %#x \n ",
			ndm->ndm_state);
		return -EINVAL;
	}

	if (!tb[NDA_DST])
		return -EINVAL;

	rc = vxlan_fdb_parse(tb, vxlan, &dst, &dst_port, &dst_vni,
			     &dst_ifindex);
	if (rc)
		return rc;

	if (dst.sa.sa_family != vxlan->default_dst.remote_ip.sa.sa_family)
		return -EAFNOSUPPORT;

	spin_lock_bh(&vxlan->hash_lock);
	rc = vxlan_fdb_create(vxlan, addr, &dst, ndm->ndm_state, flags,
			      dst_port, dst_vni, dst_ifindex,
			      ndm->ndm_flags);
	spin_unlock_bh(&vxlan->hash_lock);

	return rc;
}
/* Delete entry (via netlink) */
2013-02-13 12:00:18 +00:00
static int vxlan_fdb_delete ( struct ndmsg * ndm , struct nlattr * tb [ ] ,
struct net_device * dev ,
2012-10-01 12:32:35 +00:00
const unsigned char * addr )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct vxlan_fdb * f ;
2013-06-25 16:01:54 +03:00
struct vxlan_rdst * rd = NULL ;
2013-08-31 13:44:33 +08:00
union vxlan_addr ip ;
2013-06-25 16:01:54 +03:00
__be16 port ;
u32 vni , ifindex ;
int err ;
err = vxlan_fdb_parse ( tb , vxlan , & ip , & port , & vni , & ifindex ) ;
if ( err )
return err ;
err = - ENOENT ;
2012-10-01 12:32:35 +00:00
spin_lock_bh ( & vxlan - > hash_lock ) ;
f = vxlan_find_mac ( vxlan , addr ) ;
2013-06-25 16:01:54 +03:00
if ( ! f )
goto out ;
2013-08-31 13:44:33 +08:00
if ( ! vxlan_addr_any ( & ip ) ) {
rd = vxlan_fdb_find_rdst ( f , & ip , port , vni , ifindex ) ;
2013-06-25 16:01:54 +03:00
if ( ! rd )
goto out ;
}
err = 0 ;
/* remove a destination if it's not the only one on the list,
* otherwise destroy the fdb entry
*/
if ( rd & & ! list_is_singular ( & f - > remotes ) ) {
list_del_rcu ( & rd - > list ) ;
2014-04-22 15:01:30 +02:00
vxlan_fdb_notify ( vxlan , f , rd , RTM_DELNEIGH ) ;
2013-08-17 07:32:09 +08:00
kfree_rcu ( rd , rcu ) ;
2013-06-25 16:01:54 +03:00
goto out ;
2012-10-01 12:32:35 +00:00
}
2013-06-25 16:01:54 +03:00
vxlan_fdb_destroy ( vxlan , f ) ;
out :
2012-10-01 12:32:35 +00:00
spin_unlock_bh ( & vxlan - > hash_lock ) ;
return err ;
}
/* Dump forwarding table */
static int vxlan_fdb_dump ( struct sk_buff * skb , struct netlink_callback * cb ,
struct net_device * dev , int idx )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
unsigned int h ;
for ( h = 0 ; h < FDB_HASH_SIZE ; + + h ) {
struct vxlan_fdb * f ;
int err ;
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-27 17:06:00 -08:00
hlist_for_each_entry_rcu ( f , & vxlan - > fdb_head [ h ] , hlist ) {
2013-03-15 04:35:51 +00:00
struct vxlan_rdst * rd ;
2013-06-17 14:16:12 -07:00
if ( idx < cb - > args [ 0 ] )
goto skip ;
list_for_each_entry_rcu ( rd , & f - > remotes , list ) {
2013-03-15 04:35:51 +00:00
err = vxlan_fdb_info ( skb , vxlan , f ,
NETLINK_CB ( cb - > skb ) . portid ,
cb - > nlh - > nlmsg_seq ,
RTM_NEWNEIGH ,
NLM_F_MULTI , rd ) ;
if ( err < 0 )
2013-06-17 14:16:12 -07:00
goto out ;
2013-03-15 04:35:51 +00:00
}
2013-06-17 14:16:12 -07:00
skip :
+ + idx ;
2012-10-01 12:32:35 +00:00
}
}
2013-06-17 14:16:12 -07:00
out :
2012-10-01 12:32:35 +00:00
return idx ;
}
/* Watch incoming packets to learn mapping between Ethernet address
* and Tunnel endpoint .
2013-06-17 12:09:58 -07:00
* Return true if packet is bogus and should be droppped .
2012-10-01 12:32:35 +00:00
*/
2013-06-17 12:09:58 -07:00
static bool vxlan_snoop ( struct net_device * dev ,
2013-08-31 13:44:33 +08:00
union vxlan_addr * src_ip , const u8 * src_mac )
2012-10-01 12:32:35 +00:00
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct vxlan_fdb * f ;
f = vxlan_find_mac ( vxlan , src_mac ) ;
if ( likely ( f ) ) {
2013-08-04 17:17:39 -07:00
struct vxlan_rdst * rdst = first_remote_rcu ( f ) ;
2013-06-17 14:16:12 -07:00
2013-08-31 13:44:33 +08:00
if ( likely ( vxlan_addr_equal ( & rdst - > remote_ip , src_ip ) ) )
2013-06-17 12:09:58 -07:00
return false ;
/* Don't migrate static entries, drop packets */
2013-06-18 14:27:01 -07:00
if ( f - > state & NUD_NOARP )
2013-06-17 12:09:58 -07:00
return true ;
2012-10-01 12:32:35 +00:00
if ( net_ratelimit ( ) )
netdev_info ( dev ,
2013-08-31 13:44:33 +08:00
" %pM migrated from %pIS to %pIS \n " ,
2013-06-17 14:16:12 -07:00
src_mac , & rdst - > remote_ip , & src_ip ) ;
2012-10-01 12:32:35 +00:00
2013-08-31 13:44:33 +08:00
rdst - > remote_ip = * src_ip ;
2012-10-01 12:32:35 +00:00
f - > updated = jiffies ;
2014-04-22 15:01:30 +02:00
vxlan_fdb_notify ( vxlan , f , rdst , RTM_NEWNEIGH ) ;
2012-10-01 12:32:35 +00:00
} else {
/* learned new entry */
spin_lock ( & vxlan - > hash_lock ) ;
2013-06-17 12:09:57 -07:00
/* close off race between vxlan_flush and incoming packets */
if ( netif_running ( dev ) )
vxlan_fdb_create ( vxlan , src_mac , src_ip ,
NUD_REACHABLE ,
NLM_F_EXCL | NLM_F_CREATE ,
vxlan - > dst_port ,
vxlan - > default_dst . remote_vni ,
0 , NTF_SELF ) ;
2012-10-01 12:32:35 +00:00
spin_unlock ( & vxlan - > hash_lock ) ;
}
2013-06-17 12:09:58 -07:00
return false ;
2012-10-01 12:32:35 +00:00
}
/* See if multicast group is already in use by other ID */
2013-12-10 16:37:33 +08:00
static bool vxlan_group_used ( struct vxlan_net * vn , struct vxlan_dev * dev )
2012-10-01 12:32:35 +00:00
{
2013-05-16 11:35:20 +00:00
struct vxlan_dev * vxlan ;
2012-10-01 12:32:35 +00:00
2013-12-10 16:37:33 +08:00
/* The vxlan_sock is only used by dev, leaving group has
* no effect on other vxlan devices .
*/
if ( atomic_read ( & dev - > vn_sock - > refcnt ) = = 1 )
return false ;
2013-05-16 11:35:20 +00:00
list_for_each_entry ( vxlan , & vn - > vxlan_list , next ) {
2013-12-10 16:37:33 +08:00
if ( ! netif_running ( vxlan - > dev ) | | vxlan = = dev )
2013-05-16 11:35:20 +00:00
continue ;
2012-10-01 12:32:35 +00:00
2013-12-10 16:37:33 +08:00
if ( vxlan - > vn_sock ! = dev - > vn_sock )
continue ;
if ( ! vxlan_addr_equal ( & vxlan - > default_dst . remote_ip ,
& dev - > default_dst . remote_ip ) )
continue ;
if ( vxlan - > default_dst . remote_ifindex ! =
dev - > default_dst . remote_ifindex )
continue ;
return true ;
2013-05-16 11:35:20 +00:00
}
2012-10-01 12:32:35 +00:00
return false ;
}
2013-06-17 14:16:10 -07:00
static void vxlan_sock_hold ( struct vxlan_sock * vs )
2012-10-01 12:32:35 +00:00
{
2013-06-17 14:16:10 -07:00
atomic_inc ( & vs - > refcnt ) ;
}
2012-10-01 12:32:35 +00:00
2013-08-19 11:23:07 -07:00
/* Drop one reference on a vxlan socket.
 * When the reference count reaches zero the socket is removed from its
 * hash list, its sk_user_data pointer is cleared,
 * vxlan_notify_del_rx_port() is called, and the socket's del_work item is
 * queued on vxlan_wq.
 */
void vxlan_sock_release ( struct vxlan_sock * vs )
2013-06-17 14:16:10 -07:00
{
2013-09-04 02:13:38 -07:00
struct sock * sk = vs - > sock - > sk ;
struct net * net = sock_net ( sk ) ;
struct vxlan_net * vn = net_generic ( net , vxlan_net_id ) ;
2013-08-19 11:23:07 -07:00
2013-06-17 14:16:10 -07:00
/* Not the last reference: nothing more to do. */
if ( ! atomic_dec_and_test ( & vs - > refcnt ) )
return ;
2012-10-01 12:32:35 +00:00
2013-06-17 14:16:11 -07:00
/* Unhash the socket and detach it from the sk while holding sock_lock. */
spin_lock ( & vn - > sock_lock ) ;
2013-06-17 14:16:10 -07:00
hlist_del_rcu ( & vs - > hlist ) ;
2013-09-24 10:25:40 -07:00
rcu_assign_sk_user_data ( vs - > sock - > sk , NULL ) ;
2014-01-20 13:59:21 +02:00
vxlan_notify_del_rx_port ( vs ) ;
2013-06-17 14:16:11 -07:00
spin_unlock ( & vn - > sock_lock ) ;
2013-06-17 14:16:10 -07:00
/* Whatever del_work does runs later, from the vxlan_wq workqueue. */
queue_work ( vxlan_wq , & vs - > del_work ) ;
2012-10-01 12:32:35 +00:00
}
2013-08-19 11:23:07 -07:00
EXPORT_SYMBOL_GPL ( vxlan_sock_release ) ;
2012-10-01 12:32:35 +00:00
2013-07-18 08:40:15 -07:00
/* Callback to update multicast group membership when first VNI on
* multicast asddress is brought up
* Done as workqueue because ip_mc_join_group acquires RTNL .
2013-06-17 14:16:10 -07:00
*/
2013-07-18 08:40:15 -07:00
static void vxlan_igmp_join ( struct work_struct * work )
2012-10-01 12:32:35 +00:00
{
2013-07-18 08:40:15 -07:00
struct vxlan_dev * vxlan = container_of ( work , struct vxlan_dev , igmp_join ) ;
2013-06-17 14:16:10 -07:00
struct vxlan_sock * vs = vxlan - > vn_sock ;
struct sock * sk = vs - > sock - > sk ;
2013-08-31 13:44:33 +08:00
union vxlan_addr * ip = & vxlan - > default_dst . remote_ip ;
int ifindex = vxlan - > default_dst . remote_ifindex ;
2012-10-01 12:32:35 +00:00
lock_sock ( sk ) ;
2013-08-31 13:44:33 +08:00
if ( ip - > sa . sa_family = = AF_INET ) {
struct ip_mreqn mreq = {
. imr_multiaddr . s_addr = ip - > sin . sin_addr . s_addr ,
. imr_ifindex = ifindex ,
} ;
ip_mc_join_group ( sk , & mreq ) ;
# if IS_ENABLED(CONFIG_IPV6)
} else {
ipv6_stub - > ipv6_sock_mc_join ( sk , ifindex ,
& ip - > sin6 . sin6_addr ) ;
# endif
}
2013-07-18 08:40:15 -07:00
release_sock ( sk ) ;
2013-08-19 11:23:07 -07:00
vxlan_sock_release ( vs ) ;
2013-07-18 08:40:15 -07:00
dev_put ( vxlan - > dev ) ;
}
/* Inverse of vxlan_igmp_join when last VNI is brought down */
static void vxlan_igmp_leave ( struct work_struct * work )
{
struct vxlan_dev * vxlan = container_of ( work , struct vxlan_dev , igmp_leave ) ;
struct vxlan_sock * vs = vxlan - > vn_sock ;
struct sock * sk = vs - > sock - > sk ;
2013-08-31 13:44:33 +08:00
union vxlan_addr * ip = & vxlan - > default_dst . remote_ip ;
int ifindex = vxlan - > default_dst . remote_ifindex ;
2013-07-18 08:40:15 -07:00
lock_sock ( sk ) ;
2013-08-31 13:44:33 +08:00
if ( ip - > sa . sa_family = = AF_INET ) {
struct ip_mreqn mreq = {
. imr_multiaddr . s_addr = ip - > sin . sin_addr . s_addr ,
. imr_ifindex = ifindex ,
} ;
ip_mc_leave_group ( sk , & mreq ) ;
# if IS_ENABLED(CONFIG_IPV6)
} else {
ipv6_stub - > ipv6_sock_mc_drop ( sk , ifindex ,
& ip - > sin6 . sin6_addr ) ;
# endif
}
2012-10-01 12:32:35 +00:00
release_sock ( sk ) ;
2013-08-19 11:23:07 -07:00
vxlan_sock_release ( vs ) ;
2013-06-17 14:16:10 -07:00
dev_put ( vxlan - > dev ) ;
2012-10-01 12:32:35 +00:00
}
/* Callback from net/ipv4/udp.c to receive packets */
static int vxlan_udp_encap_recv ( struct sock * sk , struct sk_buff * skb )
{
2013-08-19 11:23:02 -07:00
struct vxlan_sock * vs ;
2012-10-01 12:32:35 +00:00
struct vxlanhdr * vxh ;
/* Need Vxlan and inner Ethernet header to be present */
2013-08-19 11:22:54 -07:00
if ( ! pskb_may_pull ( skb , VXLAN_HLEN ) )
2012-10-01 12:32:35 +00:00
goto error ;
2013-08-19 11:22:54 -07:00
/* Return packets with reserved bits set */
vxh = ( struct vxlanhdr * ) ( udp_hdr ( skb ) + 1 ) ;
2012-10-01 12:32:35 +00:00
if ( vxh - > vx_flags ! = htonl ( VXLAN_FLAGS ) | |
( vxh - > vx_vni & htonl ( 0xff ) ) ) {
netdev_dbg ( skb - > dev , " invalid vxlan flags=%#x vni=%#x \n " ,
ntohl ( vxh - > vx_flags ) , ntohl ( vxh - > vx_vni ) ) ;
goto error ;
}
2013-08-19 11:23:02 -07:00
if ( iptunnel_pull_header ( skb , VXLAN_HLEN , htons ( ETH_P_TEB ) ) )
goto drop ;
2013-09-24 10:25:40 -07:00
vs = rcu_dereference_sk_user_data ( sk ) ;
2013-08-19 11:23:02 -07:00
if ( ! vs )
2012-10-01 12:32:35 +00:00
goto drop ;
2014-01-23 11:28:13 +02:00
/* If the NIC driver gave us an encapsulated packet
* with the encapsulation mark , the device checksummed it
* for us . Otherwise force the upper layers to verify it .
*/
if ( ( skb - > ip_summed ! = CHECKSUM_UNNECESSARY & & skb - > ip_summed ! = CHECKSUM_PARTIAL ) | |
! skb - > encapsulation )
skb - > ip_summed = CHECKSUM_NONE ;
skb - > encapsulation = 0 ;
2013-08-19 11:23:02 -07:00
vs - > rcv ( vs , skb , vxh - > vx_vni ) ;
return 0 ;
drop :
/* Consume bad packet */
kfree_skb ( skb ) ;
return 0 ;
error :
/* Return non vxlan pkt */
return 1 ;
}
/* Deliver a decapsulated frame to the vxlan device owning its VNI.
 *
 * @vs:     vxlan socket the packet was received on
 * @skb:    packet, positioned at the inner Ethernet header
 * @vx_vni: VNI word of the VXLAN header in network byte order; the VNI
 *          is the value above its low 8 bits
 *
 * The skb is always consumed: it is either handed to netif_rx() or
 * freed. Frames are dropped when no device matches the VNI, when the
 * inner source MAC is the device's own address, when vxlan_snoop()
 * rejects the source, or when ECN decapsulation returns a value above 1.
 */
static void vxlan_rcv(struct vxlan_sock *vs,
		      struct sk_buff *skb, __be32 vx_vni)
{
	struct pcpu_sw_netstats *cpu_stats;
	union vxlan_addr *dflt_remote;
	struct ipv6hdr *outer6 = NULL;
	struct iphdr *outer4 = NULL;
	struct vxlan_dev *vxlan;
	union vxlan_addr src;
	int ecn_rc = 0;

	/* Is this VNI defined? */
	vxlan = vxlan_vs_find_vni(vs, ntohl(vx_vni) >> 8);
	if (!vxlan)
		goto drop;

	dflt_remote = &vxlan->default_dst.remote_ip;
	skb_reset_mac_header(skb);
	skb->protocol = eth_type_trans(skb, vxlan->dev);

	/* Ignore packet loops (and multicast echo) */
	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
		goto drop;

	/* Record the source address from the IP header the skb's network
	 * header currently points at; this is read before
	 * skb_reset_network_header() is called below.
	 */
	if (dflt_remote->sa.sa_family == AF_INET) {
		outer4 = ip_hdr(skb);
		src.sin.sin_addr.s_addr = outer4->saddr;
		src.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		outer6 = ipv6_hdr(skb);
		src.sin6.sin6_addr = outer6->saddr;
		src.sa.sa_family = AF_INET6;
#endif
	}

	if (vxlan->flags & VXLAN_F_LEARN) {
		if (vxlan_snoop(skb->dev, &src, eth_hdr(skb)->h_source))
			goto drop;
	}

	skb_reset_network_header(skb);

	/* at most one of outer6/outer4 is set */
	if (outer6)
		ecn_rc = IP6_ECN_decapsulate(outer6, skb);
	else if (outer4)
		ecn_rc = IP_ECN_decapsulate(outer4, skb);

	if (unlikely(ecn_rc)) {
		if (log_ecn_error) {
			if (outer6)
				net_info_ratelimited(" non-ECT from %pI6 \n ",
						     &outer6->saddr);
			if (outer4)
				net_info_ratelimited(" non-ECT from %pI4 with TOS=%#x \n ",
						     &outer4->saddr,
						     outer4->tos);
		}
		if (ecn_rc > 1) {
			vxlan->dev->stats.rx_frame_errors++;
			vxlan->dev->stats.rx_errors++;
			goto drop;
		}
	}

	cpu_stats = this_cpu_ptr(vxlan->dev->tstats);
	u64_stats_update_begin(&cpu_stats->syncp);
	cpu_stats->rx_packets++;
	cpu_stats->rx_bytes += skb->len;
	u64_stats_update_end(&cpu_stats->syncp);

	netif_rx(skb);
	return;

drop:
	/* Consume bad packet */
	kfree_skb(skb);
}
2012-11-20 02:50:14 +00:00
static int arp_reduce ( struct net_device * dev , struct sk_buff * skb )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct arphdr * parp ;
u8 * arpptr , * sha ;
__be32 sip , tip ;
struct neighbour * n ;
if ( dev - > flags & IFF_NOARP )
goto out ;
if ( ! pskb_may_pull ( skb , arp_hdr_len ( dev ) ) ) {
dev - > stats . tx_dropped + + ;
goto out ;
}
parp = arp_hdr ( skb ) ;
if ( ( parp - > ar_hrd ! = htons ( ARPHRD_ETHER ) & &
parp - > ar_hrd ! = htons ( ARPHRD_IEEE802 ) ) | |
parp - > ar_pro ! = htons ( ETH_P_IP ) | |
parp - > ar_op ! = htons ( ARPOP_REQUEST ) | |
parp - > ar_hln ! = dev - > addr_len | |
parp - > ar_pln ! = 4 )
goto out ;
arpptr = ( u8 * ) parp + sizeof ( struct arphdr ) ;
sha = arpptr ;
arpptr + = dev - > addr_len ; /* sha */
memcpy ( & sip , arpptr , sizeof ( sip ) ) ;
arpptr + = sizeof ( sip ) ;
arpptr + = dev - > addr_len ; /* tha */
memcpy ( & tip , arpptr , sizeof ( tip ) ) ;
if ( ipv4_is_loopback ( tip ) | |
ipv4_is_multicast ( tip ) )
goto out ;
n = neigh_lookup ( & arp_tbl , & tip , dev ) ;
if ( n ) {
struct vxlan_fdb * f ;
struct sk_buff * reply ;
if ( ! ( n - > nud_state & NUD_CONNECTED ) ) {
neigh_release ( n ) ;
goto out ;
}
f = vxlan_find_mac ( vxlan , n - > ha ) ;
2013-08-31 13:44:33 +08:00
if ( f & & vxlan_addr_any ( & ( first_remote_rcu ( f ) - > remote_ip ) ) ) {
2012-11-20 02:50:14 +00:00
/* bridge-local neighbor */
neigh_release ( n ) ;
goto out ;
}
reply = arp_create ( ARPOP_REPLY , ETH_P_ARP , sip , dev , tip , sha ,
n - > ha , sha ) ;
neigh_release ( n ) ;
2014-03-18 12:32:29 -04:00
if ( reply = = NULL )
goto out ;
2012-11-20 02:50:14 +00:00
skb_reset_mac_header ( reply ) ;
__skb_pull ( reply , skb_network_offset ( reply ) ) ;
reply - > ip_summed = CHECKSUM_UNNECESSARY ;
reply - > pkt_type = PACKET_HOST ;
if ( netif_rx_ni ( reply ) = = NET_RX_DROP )
dev - > stats . rx_dropped + + ;
2013-08-31 13:44:33 +08:00
} else if ( vxlan - > flags & VXLAN_F_L3MISS ) {
union vxlan_addr ipa = {
. sin . sin_addr . s_addr = tip ,
. sa . sa_family = AF_INET ,
} ;
vxlan_ip_miss ( dev , & ipa ) ;
}
2012-11-20 02:50:14 +00:00
out :
consume_skb ( skb ) ;
return NETDEV_TX_OK ;
}
2013-08-31 13:44:36 +08:00
# if IS_ENABLED(CONFIG_IPV6)
vxlan: fix nonfunctional neigh_reduce()
The VXLAN neigh_reduce() code is completely non-functional since
check-in. Specific errors:
1) The original code drops all packets with a multicast destination address,
even though neighbor solicitations are sent to the solicited-node
address, a multicast address. The code after this check was never run.
2) The neighbor table lookup used the IPv6 header destination, which is the
solicited node address, rather than the target address from the
neighbor solicitation. So neighbor lookups would always fail if it
got this far. Also for L3MISSes.
3) The code calls ndisc_send_na(), which does a send on the tunnel device.
The context for neigh_reduce() is the transmit path, vxlan_xmit(),
where the host or a bridge-attached neighbor is trying to transmit
a neighbor solicitation. To respond to it, the tunnel endpoint needs
to do a *receive* of the appropriate neighbor advertisement. Doing a
send, would only try to send the advertisement, encapsulated, to the
remote destinations in the fdb -- hosts that definitely did not do the
corresponding solicitation.
4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the
isrouter flag in the advertisement. This has nothing to do with whether
or not the target is a router, and generally won't be set since the
tunnel endpoint is bridging, not routing, traffic.
The patch below creates a proxy neighbor advertisement to respond to
neighbor solicitions as intended, providing proper IPv6 support for neighbor
reduction.
Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 10:39:58 -04:00
/* Build a neighbour advertisement answering a neighbour solicitation.
 *
 * @request:  the neighbour solicitation being answered
 * @n:        neighbour entry for the solicited target; supplies the
 *            advertised link-layer address and the IPv6 source address
 * @isrouter: value for the advertisement's router flag
 *
 * The Ethernet destination is the solicitation's source link-layer
 * address option when one is present, otherwise the request's Ethernet
 * source address.
 *
 * Returns the new skb, or NULL when the request has no device, when the
 * allocation fails, or when the solicitation contains an option whose
 * length field is zero.
 */
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
				       struct neighbour *n, bool isrouter)
{
	struct net_device *dev = request->dev;
	struct sk_buff *reply;
	struct nd_msg *ns, *na;
	struct ipv6hdr *pip6;
	u8 *daddr;
	int na_olen = 8; /* opt hdr + ETH_ALEN for target */
	int ns_olen;
	int i, len;

	if (dev == NULL)
		return NULL;

	len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
		sizeof(*na) + na_olen + dev->needed_tailroom;
	reply = alloc_skb(len, GFP_ATOMIC);
	if (reply == NULL)
		return NULL;

	reply->protocol = htons(ETH_P_IPV6);
	reply->dev = dev;
	skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
	skb_push(reply, sizeof(struct ethhdr));
	skb_set_mac_header(reply, 0);

	ns = (struct nd_msg *)skb_transport_header(request);

	daddr = eth_hdr(request)->h_source;
	ns_olen = request->len - skb_transport_offset(request) - sizeof(*ns);
	for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) {
		/* A zero length field would never advance i; treat the
		 * solicitation as malformed instead of looping forever.
		 */
		if (!ns->opt[i + 1]) {
			kfree_skb(reply);
			return NULL;
		}
		if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
			daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
			break;
		}
	}

	/* Ethernet header */
	ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
	ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
	eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);

	skb_pull(reply, sizeof(struct ethhdr));
	skb_set_network_header(reply, 0);
	skb_put(reply, sizeof(struct ipv6hdr));

	/* IPv6 header */
	pip6 = ipv6_hdr(reply);
	memset(pip6, 0, sizeof(struct ipv6hdr));
	pip6->version = 6;
	pip6->priority = ipv6_hdr(request)->priority;
	pip6->nexthdr = IPPROTO_ICMPV6;
	pip6->hop_limit = 255;
	pip6->daddr = ipv6_hdr(request)->saddr;
	pip6->saddr = *(struct in6_addr *)n->primary_key;

	skb_pull(reply, sizeof(struct ipv6hdr));
	skb_set_transport_header(reply, 0);

	na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen);

	/* Neighbor Advertisement */
	memset(na, 0, sizeof(*na) + na_olen);
	na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
	na->icmph.icmp6_router = isrouter;
	na->icmph.icmp6_override = 1;
	na->icmph.icmp6_solicited = 1;
	na->target = ns->target;
	ether_addr_copy(&na->opt[2], n->ha);
	na->opt[0] = ND_OPT_TARGET_LL_ADDR;
	na->opt[1] = na_olen >> 3;	/* option length in units of 8 bytes */

	na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
		&pip6->daddr, sizeof(*na) + na_olen, IPPROTO_ICMPV6,
		csum_partial(na, sizeof(*na) + na_olen, 0));

	pip6->payload_len = htons(sizeof(*na) + na_olen);

	/* Leave skb->data at the IPv6 header */
	skb_push(reply, sizeof(struct ipv6hdr));

	reply->ip_summed = CHECKSUM_UNNECESSARY;

	return reply;
}
2013-08-31 13:44:36 +08:00
static int neigh_reduce ( struct net_device * dev , struct sk_buff * skb )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
vxlan: fix nonfunctional neigh_reduce()
The VXLAN neigh_reduce() code is completely non-functional since
check-in. Specific errors:
1) The original code drops all packets with a multicast destination address,
even though neighbor solicitations are sent to the solicited-node
address, a multicast address. The code after this check was never run.
2) The neighbor table lookup used the IPv6 header destination, which is the
solicited node address, rather than the target address from the
neighbor solicitation. So neighbor lookups would always fail if it
got this far. Also for L3MISSes.
3) The code calls ndisc_send_na(), which does a send on the tunnel device.
The context for neigh_reduce() is the transmit path, vxlan_xmit(),
where the host or a bridge-attached neighbor is trying to transmit
a neighbor solicitation. To respond to it, the tunnel endpoint needs
to do a *receive* of the appropriate neighbor advertisement. Doing a
send, would only try to send the advertisement, encapsulated, to the
remote destinations in the fdb -- hosts that definitely did not do the
corresponding solicitation.
4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the
isrouter flag in the advertisement. This has nothing to do with whether
or not the target is a router, and generally won't be set since the
tunnel endpoint is bridging, not routing, traffic.
The patch below creates a proxy neighbor advertisement to respond to
neighbor solicitations as intended, providing proper IPv6 support for neighbor
reduction.
Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 10:39:58 -04:00
struct nd_msg * msg ;
2013-08-31 13:44:36 +08:00
const struct ipv6hdr * iphdr ;
const struct in6_addr * saddr , * daddr ;
vxlan: fix nonfunctional neigh_reduce()
The VXLAN neigh_reduce() code is completely non-functional since
check-in. Specific errors:
1) The original code drops all packets with a multicast destination address,
even though neighbor solicitations are sent to the solicited-node
address, a multicast address. The code after this check was never run.
2) The neighbor table lookup used the IPv6 header destination, which is the
solicited node address, rather than the target address from the
neighbor solicitation. So neighbor lookups would always fail if it
got this far. Also for L3MISSes.
3) The code calls ndisc_send_na(), which does a send on the tunnel device.
The context for neigh_reduce() is the transmit path, vxlan_xmit(),
where the host or a bridge-attached neighbor is trying to transmit
a neighbor solicitation. To respond to it, the tunnel endpoint needs
to do a *receive* of the appropriate neighbor advertisement. Doing a
send, would only try to send the advertisement, encapsulated, to the
remote destinations in the fdb -- hosts that definitely did not do the
corresponding solicitation.
4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the
isrouter flag in the advertisement. This has nothing to do with whether
or not the target is a router, and generally won't be set since the
tunnel endpoint is bridging, not routing, traffic.
The patch below creates a proxy neighbor advertisement to respond to
neighbor solicitations as intended, providing proper IPv6 support for neighbor
reduction.
Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 10:39:58 -04:00
struct neighbour * n ;
struct inet6_dev * in6_dev ;
2013-08-31 13:44:36 +08:00
in6_dev = __in6_dev_get ( dev ) ;
if ( ! in6_dev )
goto out ;
if ( ! pskb_may_pull ( skb , skb - > len ) )
goto out ;
iphdr = ipv6_hdr ( skb ) ;
saddr = & iphdr - > saddr ;
daddr = & iphdr - > daddr ;
msg = ( struct nd_msg * ) skb_transport_header ( skb ) ;
if ( msg - > icmph . icmp6_code ! = 0 | |
msg - > icmph . icmp6_type ! = NDISC_NEIGHBOUR_SOLICITATION )
goto out ;
vxlan: fix nonfunctional neigh_reduce()
The VXLAN neigh_reduce() code is completely non-functional since
check-in. Specific errors:
1) The original code drops all packets with a multicast destination address,
even though neighbor solicitations are sent to the solicited-node
address, a multicast address. The code after this check was never run.
2) The neighbor table lookup used the IPv6 header destination, which is the
solicited node address, rather than the target address from the
neighbor solicitation. So neighbor lookups would always fail if it
got this far. Also for L3MISSes.
3) The code calls ndisc_send_na(), which does a send on the tunnel device.
The context for neigh_reduce() is the transmit path, vxlan_xmit(),
where the host or a bridge-attached neighbor is trying to transmit
a neighbor solicitation. To respond to it, the tunnel endpoint needs
to do a *receive* of the appropriate neighbor advertisement. Doing a
send, would only try to send the advertisement, encapsulated, to the
remote destinations in the fdb -- hosts that definitely did not do the
corresponding solicitation.
4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the
isrouter flag in the advertisement. This has nothing to do with whether
or not the target is a router, and generally won't be set since the
tunnel endpoint is bridging, not routing, traffic.
The patch below creates a proxy neighbor advertisement to respond to
neighbor solicitations as intended, providing proper IPv6 support for neighbor
reduction.
Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 10:39:58 -04:00
if ( ipv6_addr_loopback ( daddr ) | |
ipv6_addr_is_multicast ( & msg - > target ) )
goto out ;
n = neigh_lookup ( ipv6_stub - > nd_tbl , & msg - > target , dev ) ;
2013-08-31 13:44:36 +08:00
if ( n ) {
struct vxlan_fdb * f ;
vxlan: fix nonfunctional neigh_reduce()
The VXLAN neigh_reduce() code is completely non-functional since
check-in. Specific errors:
1) The original code drops all packets with a multicast destination address,
even though neighbor solicitations are sent to the solicited-node
address, a multicast address. The code after this check was never run.
2) The neighbor table lookup used the IPv6 header destination, which is the
solicited node address, rather than the target address from the
neighbor solicitation. So neighbor lookups would always fail if it
got this far. Also for L3MISSes.
3) The code calls ndisc_send_na(), which does a send on the tunnel device.
The context for neigh_reduce() is the transmit path, vxlan_xmit(),
where the host or a bridge-attached neighbor is trying to transmit
a neighbor solicitation. To respond to it, the tunnel endpoint needs
to do a *receive* of the appropriate neighbor advertisement. Doing a
send, would only try to send the advertisement, encapsulated, to the
remote destinations in the fdb -- hosts that definitely did not do the
corresponding solicitation.
4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the
isrouter flag in the advertisement. This has nothing to do with whether
or not the target is a router, and generally won't be set since the
tunnel endpoint is bridging, not routing, traffic.
The patch below creates a proxy neighbor advertisement to respond to
neighbor solicitations as intended, providing proper IPv6 support for neighbor
reduction.
Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 10:39:58 -04:00
struct sk_buff * reply ;
2013-08-31 13:44:36 +08:00
if ( ! ( n - > nud_state & NUD_CONNECTED ) ) {
neigh_release ( n ) ;
goto out ;
}
f = vxlan_find_mac ( vxlan , n - > ha ) ;
if ( f & & vxlan_addr_any ( & ( first_remote_rcu ( f ) - > remote_ip ) ) ) {
/* bridge-local neighbor */
neigh_release ( n ) ;
goto out ;
}
vxlan: fix nonfunctional neigh_reduce()
The VXLAN neigh_reduce() code is completely non-functional since
check-in. Specific errors:
1) The original code drops all packets with a multicast destination address,
even though neighbor solicitations are sent to the solicited-node
address, a multicast address. The code after this check was never run.
2) The neighbor table lookup used the IPv6 header destination, which is the
solicited node address, rather than the target address from the
neighbor solicitation. So neighbor lookups would always fail if it
got this far. Also for L3MISSes.
3) The code calls ndisc_send_na(), which does a send on the tunnel device.
The context for neigh_reduce() is the transmit path, vxlan_xmit(),
where the host or a bridge-attached neighbor is trying to transmit
a neighbor solicitation. To respond to it, the tunnel endpoint needs
to do a *receive* of the appropriate neighbor advertisement. Doing a
send, would only try to send the advertisement, encapsulated, to the
remote destinations in the fdb -- hosts that definitely did not do the
corresponding solicitation.
4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the
isrouter flag in the advertisement. This has nothing to do with whether
or not the target is a router, and generally won't be set since the
tunnel endpoint is bridging, not routing, traffic.
The patch below creates a proxy neighbor advertisement to respond to
neighbor solicitations as intended, providing proper IPv6 support for neighbor
reduction.
Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 10:39:58 -04:00
reply = vxlan_na_create ( skb , n ,
! ! ( f ? f - > flags & NTF_ROUTER : 0 ) ) ;
2013-08-31 13:44:36 +08:00
neigh_release ( n ) ;
vxlan: fix nonfunctional neigh_reduce()
The VXLAN neigh_reduce() code is completely non-functional since
check-in. Specific errors:
1) The original code drops all packets with a multicast destination address,
even though neighbor solicitations are sent to the solicited-node
address, a multicast address. The code after this check was never run.
2) The neighbor table lookup used the IPv6 header destination, which is the
solicited node address, rather than the target address from the
neighbor solicitation. So neighbor lookups would always fail if it
got this far. Also for L3MISSes.
3) The code calls ndisc_send_na(), which does a send on the tunnel device.
The context for neigh_reduce() is the transmit path, vxlan_xmit(),
where the host or a bridge-attached neighbor is trying to transmit
a neighbor solicitation. To respond to it, the tunnel endpoint needs
to do a *receive* of the appropriate neighbor advertisement. Doing a
send, would only try to send the advertisement, encapsulated, to the
remote destinations in the fdb -- hosts that definitely did not do the
corresponding solicitation.
4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the
isrouter flag in the advertisement. This has nothing to do with whether
or not the target is a router, and generally won't be set since the
tunnel endpoint is bridging, not routing, traffic.
The patch below creates a proxy neighbor advertisement to respond to
neighbor solicitations as intended, providing proper IPv6 support for neighbor
reduction.
Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 10:39:58 -04:00
if ( reply = = NULL )
goto out ;
if ( netif_rx_ni ( reply ) = = NET_RX_DROP )
dev - > stats . rx_dropped + + ;
2013-08-31 13:44:36 +08:00
} else if ( vxlan - > flags & VXLAN_F_L3MISS ) {
vxlan: fix nonfunctional neigh_reduce()
The VXLAN neigh_reduce() code is completely non-functional since
check-in. Specific errors:
1) The original code drops all packets with a multicast destination address,
even though neighbor solicitations are sent to the solicited-node
address, a multicast address. The code after this check was never run.
2) The neighbor table lookup used the IPv6 header destination, which is the
solicited node address, rather than the target address from the
neighbor solicitation. So neighbor lookups would always fail if it
got this far. Also for L3MISSes.
3) The code calls ndisc_send_na(), which does a send on the tunnel device.
The context for neigh_reduce() is the transmit path, vxlan_xmit(),
where the host or a bridge-attached neighbor is trying to transmit
a neighbor solicitation. To respond to it, the tunnel endpoint needs
to do a *receive* of the appropriate neighbor advertisement. Doing a
send, would only try to send the advertisement, encapsulated, to the
remote destinations in the fdb -- hosts that definitely did not do the
corresponding solicitation.
4) The code uses the tunnel endpoint IPv6 forwarding flag to determine the
isrouter flag in the advertisement. This has nothing to do with whether
or not the target is a router, and generally won't be set since the
tunnel endpoint is bridging, not routing, traffic.
The patch below creates a proxy neighbor advertisement to respond to
neighbor solicitations as intended, providing proper IPv6 support for neighbor
reduction.
Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-24 10:39:58 -04:00
union vxlan_addr ipa = {
. sin6 . sin6_addr = msg - > target ,
. sa . sa_family = AF_INET6 ,
} ;
2013-08-31 13:44:36 +08:00
vxlan_ip_miss ( dev , & ipa ) ;
}
out :
consume_skb ( skb ) ;
return NETDEV_TX_OK ;
}
# endif
2012-11-20 02:50:14 +00:00
static bool route_shortcircuit ( struct net_device * dev , struct sk_buff * skb )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct neighbour * n ;
if ( is_multicast_ether_addr ( eth_hdr ( skb ) - > h_dest ) )
return false ;
n = NULL ;
switch ( ntohs ( eth_hdr ( skb ) - > h_proto ) ) {
case ETH_P_IP :
2013-08-31 13:44:34 +08:00
{
struct iphdr * pip ;
2012-11-20 02:50:14 +00:00
if ( ! pskb_may_pull ( skb , sizeof ( struct iphdr ) ) )
return false ;
pip = ip_hdr ( skb ) ;
n = neigh_lookup ( & arp_tbl , & pip - > daddr , dev ) ;
2013-08-31 13:44:33 +08:00
if ( ! n & & ( vxlan - > flags & VXLAN_F_L3MISS ) ) {
union vxlan_addr ipa = {
. sin . sin_addr . s_addr = pip - > daddr ,
. sa . sa_family = AF_INET ,
} ;
vxlan_ip_miss ( dev , & ipa ) ;
return false ;
}
2012-11-20 02:50:14 +00:00
break ;
2013-08-31 13:44:34 +08:00
}
# if IS_ENABLED(CONFIG_IPV6)
case ETH_P_IPV6 :
{
struct ipv6hdr * pip6 ;
if ( ! pskb_may_pull ( skb , sizeof ( struct ipv6hdr ) ) )
return false ;
pip6 = ipv6_hdr ( skb ) ;
n = neigh_lookup ( ipv6_stub - > nd_tbl , & pip6 - > daddr , dev ) ;
if ( ! n & & ( vxlan - > flags & VXLAN_F_L3MISS ) ) {
union vxlan_addr ipa = {
. sin6 . sin6_addr = pip6 - > daddr ,
. sa . sa_family = AF_INET6 ,
} ;
vxlan_ip_miss ( dev , & ipa ) ;
return false ;
}
break ;
}
# endif
2012-11-20 02:50:14 +00:00
default :
return false ;
}
if ( n ) {
bool diff ;
drivers/net: Convert uses of compare_ether_addr to ether_addr_equal
Use the new bool function ether_addr_equal to add
some clarity and reduce the likelihood for misuse
of compare_ether_addr for sorting.
Done via cocci script: (and a little typing)
$ cat compare_ether_addr.cocci
@@
expression a,b;
@@
- !compare_ether_addr(a, b)
+ ether_addr_equal(a, b)
@@
expression a,b;
@@
- compare_ether_addr(a, b)
+ !ether_addr_equal(a, b)
@@
expression a,b;
@@
- !ether_addr_equal(a, b) == 0
+ ether_addr_equal(a, b)
@@
expression a,b;
@@
- !ether_addr_equal(a, b) != 0
+ !ether_addr_equal(a, b)
@@
expression a,b;
@@
- ether_addr_equal(a, b) == 0
+ !ether_addr_equal(a, b)
@@
expression a,b;
@@
- ether_addr_equal(a, b) != 0
+ ether_addr_equal(a, b)
@@
expression a,b;
@@
- !!ether_addr_equal(a, b)
+ ether_addr_equal(a, b)
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-09-01 11:51:23 -07:00
diff = ! ether_addr_equal ( eth_hdr ( skb ) - > h_dest , n - > ha ) ;
2012-11-20 02:50:14 +00:00
if ( diff ) {
memcpy ( eth_hdr ( skb ) - > h_source , eth_hdr ( skb ) - > h_dest ,
dev - > addr_len ) ;
memcpy ( eth_hdr ( skb ) - > h_dest , n - > ha , dev - > addr_len ) ;
}
neigh_release ( n ) ;
return diff ;
2013-08-31 13:44:33 +08:00
}
2012-11-20 02:50:14 +00:00
return false ;
}
2012-10-09 20:35:50 +00:00
/* Compute source port for outgoing packet
* first choice to use L4 flow hash since it will spread
* better and maybe available from hardware
* secondary choice is to use jhash on the Ethernet header
*/
2013-08-19 11:23:17 -07:00
__be16 vxlan_src_port ( __u16 port_min , __u16 port_max , struct sk_buff * skb )
2012-10-09 20:35:50 +00:00
{
2013-08-19 11:23:17 -07:00
unsigned int range = ( port_max - port_min ) + 1 ;
2012-10-09 20:35:50 +00:00
u32 hash ;
2013-12-15 22:12:06 -08:00
hash = skb_get_hash ( skb ) ;
2012-10-09 20:35:50 +00:00
if ( ! hash )
hash = jhash ( skb - > data , 2 * ETH_ALEN ,
( __force u32 ) skb - > protocol ) ;
2013-08-19 11:23:17 -07:00
return htons ( ( ( ( u64 ) hash * range ) > > 32 ) + port_min ) ;
2012-10-09 20:35:50 +00:00
}
2013-08-19 11:23:17 -07:00
EXPORT_SYMBOL_GPL ( vxlan_src_port ) ;
2012-10-09 20:35:50 +00:00
2013-03-07 13:22:36 +00:00
static int handle_offloads ( struct sk_buff * skb )
{
if ( skb_is_gso ( skb ) ) {
int err = skb_unclone ( skb , GFP_ATOMIC ) ;
if ( unlikely ( err ) )
return err ;
2013-04-28 08:16:01 +00:00
skb_shinfo ( skb ) - > gso_type | = SKB_GSO_UDP_TUNNEL ;
2013-03-07 13:22:36 +00:00
} else if ( skb - > ip_summed ! = CHECKSUM_PARTIAL )
skb - > ip_summed = CHECKSUM_NONE ;
return 0 ;
}
2013-08-31 13:44:33 +08:00
# if IS_ENABLED(CONFIG_IPV6)
2013-09-02 15:34:55 +02:00
/* Encapsulate skb in VXLAN/UDP/IPv6 and hand it to ip6tunnel_xmit().
 *
 * @vs:        VXLAN socket (not used by this function)
 * @dst:       route to the remote endpoint; the reference is consumed
 * @skb:       frame to encapsulate; consumed
 * @dev:       device passed on to ip6tunnel_xmit()
 * @saddr:     outer IPv6 source address
 * @daddr:     outer IPv6 destination address
 * @prio:      value stored in the outer header's priority field
 * @ttl:       outer hop limit
 * @src_port:  outer UDP source port (network byte order)
 * @dst_port:  outer UDP destination port (network byte order)
 * @vni:       value stored in the VXLAN header's vx_vni field
 *
 * Returns 0 on success or a negative errno.  vxlan_xmit_one() does not act
 * on the return value, so every failure path here releases both @skb and
 * @dst itself instead of leaking them.
 */
static int vxlan6_xmit_skb(struct vxlan_sock *vs,
			   struct dst_entry *dst, struct sk_buff *skb,
			   struct net_device *dev, struct in6_addr *saddr,
			   struct in6_addr *daddr, __u8 prio, __u8 ttl,
			   __be16 src_port, __be16 dst_port, __be32 vni)
{
	struct ipv6hdr *ip6h;
	struct vxlanhdr *vxh;
	struct udphdr *uh;
	int min_headroom;
	int err;

	if (!skb->encapsulation) {
		skb_reset_inner_headers(skb);
		skb->encapsulation = 1;
	}

	skb_scrub_packet(skb, false);

	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
			+ VXLAN_HLEN + sizeof(struct ipv6hdr)
			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);

	/* Need space for new headers (invalidates iph ptr) */
	err = skb_cow_head(skb, min_headroom);
	if (unlikely(err)) {
		kfree_skb(skb);
		goto err_release_dst;
	}

	if (vlan_tx_tag_present(skb)) {
		if (WARN_ON(!__vlan_put_tag(skb,
					    skb->vlan_proto,
					    vlan_tx_tag_get(skb)))) {
			/* __vlan_put_tag() frees the skb when it fails, so
			 * only the route is left to release.
			 */
			err = -ENOMEM;
			goto err_release_dst;
		}

		skb->vlan_tci = 0;
	}

	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
	vxh->vx_flags = htonl(VXLAN_FLAGS);
	vxh->vx_vni = vni;

	__skb_push(skb, sizeof(*uh));
	skb_reset_transport_header(skb);
	uh = udp_hdr(skb);

	uh->dest = dst_port;
	uh->source = src_port;

	uh->len = htons(skb->len);
	uh->check = 0;

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	/* from here on the route reference travels with the skb */
	skb_dst_set(skb, dst);

	if (!skb_is_gso(skb) && !(dst->dev->features & NETIF_F_IPV6_CSUM)) {
		/* compute the full UDP checksum here */
		__wsum csum = skb_checksum(skb, 0, skb->len, 0);

		skb->ip_summed = CHECKSUM_UNNECESSARY;
		uh->check = csum_ipv6_magic(saddr, daddr, skb->len,
					    IPPROTO_UDP, csum);
		if (uh->check == 0)
			uh->check = CSUM_MANGLED_0;
	} else {
		/* store the pseudo-header sum and leave the rest to
		 * CHECKSUM_PARTIAL processing
		 */
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct udphdr, check);
		uh->check = ~csum_ipv6_magic(saddr, daddr,
					     skb->len, IPPROTO_UDP, 0);
	}

	__skb_push(skb, sizeof(*ip6h));
	skb_reset_network_header(skb);
	ip6h = ipv6_hdr(skb);
	ip6h->version = 6;
	ip6h->priority = prio;
	ip6h->flow_lbl[0] = 0;
	ip6h->flow_lbl[1] = 0;
	ip6h->flow_lbl[2] = 0;
	ip6h->payload_len = htons(skb->len);
	ip6h->nexthdr = IPPROTO_UDP;
	ip6h->hop_limit = ttl;
	ip6h->daddr = *daddr;
	ip6h->saddr = *saddr;

	err = handle_offloads(skb);
	if (err) {
		/* the route is attached to the skb by now, so freeing the
		 * skb drops that reference as well
		 */
		kfree_skb(skb);
		return err;
	}

	ip6tunnel_xmit(skb, dev);
	return 0;

err_release_dst:
	dst_release(dst);
	return err;
}
# endif
2013-09-02 15:34:55 +02:00
/* Push VXLAN and UDP headers onto skb and pass it to iptunnel_xmit().
 *
 * @vs:        VXLAN socket whose sock is given to iptunnel_xmit()
 * @rt:        route to the remote endpoint
 * @skb:       frame to encapsulate
 * @src/@dst:  outer IPv4 addresses
 * @tos/@ttl/@df: outer IPv4 header fields, forwarded to iptunnel_xmit()
 * @src_port/@dst_port: outer UDP ports (network byte order)
 * @vni:       value stored in the VXLAN header's vx_vni field
 *
 * Returns the result of iptunnel_xmit(), or a negative errno when headroom
 * could not be made, the VLAN tag could not be inserted, or
 * handle_offloads() failed.  Nothing is freed in this function on those
 * error returns.
 */
int vxlan_xmit_skb(struct vxlan_sock *vs,
		   struct rtable *rt, struct sk_buff *skb,
		   __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
		   __be16 src_port, __be16 dst_port, __be32 vni)
{
	struct vxlanhdr *vxlan_hdr;
	struct udphdr *udph;
	int headroom;
	int rc;

	if (!skb->encapsulation) {
		skb_reset_inner_headers(skb);
		skb->encapsulation = 1;
	}

	headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
			+ VXLAN_HLEN + sizeof(struct iphdr);
	if (vlan_tx_tag_present(skb))
		headroom += VLAN_HLEN;

	/* Need space for new headers (invalidates iph ptr) */
	rc = skb_cow_head(skb, headroom);
	if (unlikely(rc))
		return rc;

	if (vlan_tx_tag_present(skb)) {
		if (WARN_ON(!__vlan_put_tag(skb,
					    skb->vlan_proto,
					    vlan_tx_tag_get(skb))))
			return -ENOMEM;

		skb->vlan_tci = 0;
	}

	/* VXLAN header first ... */
	vxlan_hdr = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxlan_hdr));
	vxlan_hdr->vx_flags = htonl(VXLAN_FLAGS);
	vxlan_hdr->vx_vni = vni;

	/* ... then the UDP header in front of it */
	__skb_push(skb, sizeof(*udph));
	skb_reset_transport_header(skb);
	udph = udp_hdr(skb);

	udph->dest = dst_port;
	udph->source = src_port;
	udph->len = htons(skb->len);
	udph->check = 0;

	rc = handle_offloads(skb);
	if (rc)
		return rc;

	return iptunnel_xmit(vs->sock->sk, rt, skb, src, dst, IPPROTO_UDP,
			     tos, ttl, df, false);
}
EXPORT_SYMBOL_GPL ( vxlan_xmit_skb ) ;
2013-04-02 12:31:52 +00:00
/* Bypass encapsulation if the destination is local */
static void vxlan_encap_bypass ( struct sk_buff * skb , struct vxlan_dev * src_vxlan ,
struct vxlan_dev * dst_vxlan )
{
2014-01-04 13:57:59 +08:00
struct pcpu_sw_netstats * tx_stats , * rx_stats ;
2013-08-31 13:44:33 +08:00
union vxlan_addr loopback ;
union vxlan_addr * remote_ip = & dst_vxlan - > default_dst . remote_ip ;
2013-04-02 12:31:52 +00:00
2014-01-04 13:57:59 +08:00
tx_stats = this_cpu_ptr ( src_vxlan - > dev - > tstats ) ;
rx_stats = this_cpu_ptr ( dst_vxlan - > dev - > tstats ) ;
2013-04-02 12:31:52 +00:00
skb - > pkt_type = PACKET_HOST ;
skb - > encapsulation = 0 ;
skb - > dev = dst_vxlan - > dev ;
__skb_pull ( skb , skb_network_offset ( skb ) ) ;
2013-08-31 13:44:33 +08:00
if ( remote_ip - > sa . sa_family = = AF_INET ) {
loopback . sin . sin_addr . s_addr = htonl ( INADDR_LOOPBACK ) ;
loopback . sa . sa_family = AF_INET ;
# if IS_ENABLED(CONFIG_IPV6)
} else {
loopback . sin6 . sin6_addr = in6addr_loopback ;
loopback . sa . sa_family = AF_INET6 ;
# endif
}
2013-04-02 12:31:52 +00:00
if ( dst_vxlan - > flags & VXLAN_F_LEARN )
2013-08-31 13:44:33 +08:00
vxlan_snoop ( skb - > dev , & loopback , eth_hdr ( skb ) - > h_source ) ;
2013-04-02 12:31:52 +00:00
u64_stats_update_begin ( & tx_stats - > syncp ) ;
tx_stats - > tx_packets + + ;
tx_stats - > tx_bytes + = skb - > len ;
u64_stats_update_end ( & tx_stats - > syncp ) ;
if ( netif_rx ( skb ) = = NET_RX_SUCCESS ) {
u64_stats_update_begin ( & rx_stats - > syncp ) ;
rx_stats - > rx_packets + + ;
rx_stats - > rx_bytes + = skb - > len ;
u64_stats_update_end ( & rx_stats - > syncp ) ;
} else {
skb - > dev - > stats . rx_dropped + + ;
}
}
2013-06-17 14:16:11 -07:00
/* Transmit one frame to a single remote destination.
 *
 * dev is the VXLAN device the frame was sent on, rdst supplies the remote
 * address, port, VNI and output ifindex, and did_rsc tells whether the
 * caller's route short-circuit rewrote the frame.
 *
 * A remote with a wildcard address is only accepted after a short-circuit,
 * in which case the frame is looped back into this same device; otherwise
 * the frame is dropped.  For real remotes a route is looked up; a route
 * that is local leads to vxlan_encap_bypass() on the VXLAN device found for
 * (vni, dst_port), anything else is encapsulated through vxlan_xmit_skb()
 * or vxlan6_xmit_skb().
 *
 * The labels at the bottom count the failure and free the skb.
 *
 * NOTE(review): in the IPv6 branch the result of vxlan6_xmit_skb() is
 * stored in err but never tested, so no cleanup or error accounting happens
 * here if it fails -- confirm that function cleans up after itself.
 */
static void vxlan_xmit_one ( struct sk_buff * skb , struct net_device * dev ,
struct vxlan_rdst * rdst , bool did_rsc )
2012-10-01 12:32:35 +00:00
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2013-08-31 13:44:33 +08:00
struct rtable * rt = NULL ;
2012-10-01 12:32:35 +00:00
const struct iphdr * old_iph ;
struct flowi4 fl4 ;
2013-08-31 13:44:33 +08:00
union vxlan_addr * dst ;
__be16 src_port = 0 , dst_port ;
2013-06-17 14:16:41 -07:00
u32 vni ;
2012-10-01 12:32:35 +00:00
__be16 df = 0 ;
__u8 tos , ttl ;
2013-06-17 17:49:56 -07:00
int err ;
2012-10-01 12:32:35 +00:00
2013-04-27 11:31:57 +00:00
/* a port set on the remote entry overrides the device-wide port */
dst_port = rdst - > remote_port ? rdst - > remote_port : vxlan - > dst_port ;
2013-03-15 04:35:51 +00:00
vni = rdst - > remote_vni ;
2013-08-31 13:44:33 +08:00
dst = & rdst - > remote_ip ;
2012-11-20 02:50:14 +00:00
2013-08-31 13:44:33 +08:00
if ( vxlan_addr_any ( dst ) ) {
2012-11-20 02:50:14 +00:00
if ( did_rsc ) {
/* short-circuited back to local bridge */
2013-04-02 12:31:52 +00:00
vxlan_encap_bypass ( skb , vxlan , vxlan ) ;
2013-06-17 14:16:11 -07:00
return ;
2012-11-20 02:50:14 +00:00
}
2012-10-09 20:35:46 +00:00
goto drop ;
2012-11-20 02:50:14 +00:00
}
2012-10-09 20:35:46 +00:00
2012-10-01 12:32:35 +00:00
old_iph = ip_hdr ( skb ) ;
/* with no TTL configured, multicast remotes get a TTL of 1 */
ttl = vxlan - > ttl ;
2013-08-31 13:44:33 +08:00
if ( ! ttl & & vxlan_addr_multicast ( dst ) )
2012-10-01 12:32:35 +00:00
ttl = 1 ;
/* a configured tos of 1 means: take the value from ip_tunnel_get_dsfield() */
tos = vxlan - > tos ;
if ( tos = = 1 )
2013-03-25 14:49:53 +00:00
tos = ip_tunnel_get_dsfield ( old_iph , skb ) ;
2012-10-01 12:32:35 +00:00
2013-08-19 11:23:17 -07:00
src_port = vxlan_src_port ( vxlan - > port_min , vxlan - > port_max , skb ) ;
2012-10-01 12:32:35 +00:00
2013-08-31 13:44:33 +08:00
if ( dst - > sa . sa_family = = AF_INET ) {
memset ( & fl4 , 0 , sizeof ( fl4 ) ) ;
fl4 . flowi4_oif = rdst - > remote_ifindex ;
fl4 . flowi4_tos = RT_TOS ( tos ) ;
fl4 . daddr = dst - > sin . sin_addr . s_addr ;
fl4 . saddr = vxlan - > saddr . sin . sin_addr . s_addr ;
rt = ip_route_output_key ( dev_net ( dev ) , & fl4 ) ;
if ( IS_ERR ( rt ) ) {
netdev_dbg ( dev , " no route to %pI4 \n " ,
& dst - > sin . sin_addr . s_addr ) ;
dev - > stats . tx_carrier_errors + + ;
goto tx_error ;
}
2012-10-01 12:32:35 +00:00
2013-08-31 13:44:33 +08:00
/* the route must not lead back out through this very device */
if ( rt - > dst . dev = = dev ) {
netdev_dbg ( dev , " circular route to %pI4 \n " ,
& dst - > sin . sin_addr . s_addr ) ;
dev - > stats . collisions + + ;
2013-12-09 10:33:53 +08:00
goto rt_tx_error ;
2013-08-31 13:44:33 +08:00
}
/* Bypass encapsulation if the destination is local */
if ( rt - > rt_flags & RTCF_LOCAL & &
! ( rt - > rt_flags & ( RTCF_BROADCAST | RTCF_MULTICAST ) ) ) {
struct vxlan_dev * dst_vxlan ;
ip_rt_put ( rt ) ;
dst_vxlan = vxlan_find_vni ( dev_net ( dev ) , vni , dst_port ) ;
if ( ! dst_vxlan )
goto tx_error ;
vxlan_encap_bypass ( skb , vxlan , dst_vxlan ) ;
return ;
}
tos = ip_tunnel_ecn_encap ( tos , old_iph , skb ) ;
/* fall back to the route's hop limit when no TTL was chosen above */
ttl = ttl ? : ip4_dst_hoplimit ( & rt - > dst ) ;
2012-10-01 12:32:35 +00:00
2013-09-02 15:34:55 +02:00
err = vxlan_xmit_skb ( vxlan - > vn_sock , rt , skb ,
2013-08-31 13:44:33 +08:00
fl4 . saddr , dst - > sin . sin_addr . s_addr ,
tos , ttl , df , src_port , dst_port ,
htonl ( vni < < 8 ) ) ;
2013-04-02 12:31:52 +00:00
2013-08-31 13:44:33 +08:00
if ( err < 0 )
goto rt_tx_error ;
iptunnel_xmit_stats ( err , & dev - > stats , dev - > tstats ) ;
# if IS_ENABLED(CONFIG_IPV6)
} else {
struct sock * sk = vxlan - > vn_sock - > sock - > sk ;
struct dst_entry * ndst ;
struct flowi6 fl6 ;
u32 flags ;
memset ( & fl6 , 0 , sizeof ( fl6 ) ) ;
fl6 . flowi6_oif = rdst - > remote_ifindex ;
fl6 . daddr = dst - > sin6 . sin6_addr ;
fl6 . saddr = vxlan - > saddr . sin6 . sin6_addr ;
2013-09-02 10:06:51 +08:00
fl6 . flowi6_proto = IPPROTO_UDP ;
2013-08-31 13:44:33 +08:00
if ( ipv6_stub - > ipv6_dst_lookup ( sk , & ndst , & fl6 ) ) {
netdev_dbg ( dev , " no route to %pI6 \n " ,
& dst - > sin6 . sin6_addr ) ;
dev - > stats . tx_carrier_errors + + ;
2013-04-02 12:31:52 +00:00
goto tx_error ;
2013-08-31 13:44:33 +08:00
}
2012-10-01 12:32:35 +00:00
2013-08-31 13:44:33 +08:00
if ( ndst - > dev = = dev ) {
netdev_dbg ( dev , " circular route to %pI6 \n " ,
& dst - > sin6 . sin6_addr ) ;
dst_release ( ndst ) ;
dev - > stats . collisions + + ;
goto tx_error ;
}
2013-06-17 17:49:56 -07:00
2013-08-31 13:44:33 +08:00
/* Bypass encapsulation if the destination is local */
/* NOTE(review): flags holds rt6i_flags and is tested against RTF_LOCAL,
 * but the second test uses RTCF_* constants -- confirm this is intended.
 */
flags = ( ( struct rt6_info * ) ndst ) - > rt6i_flags ;
if ( flags & RTF_LOCAL & &
! ( flags & ( RTCF_BROADCAST | RTCF_MULTICAST ) ) ) {
struct vxlan_dev * dst_vxlan ;
dst_release ( ndst ) ;
dst_vxlan = vxlan_find_vni ( dev_net ( dev ) , vni , dst_port ) ;
if ( ! dst_vxlan )
goto tx_error ;
vxlan_encap_bypass ( skb , vxlan , dst_vxlan ) ;
return ;
}
2013-08-19 11:23:17 -07:00
2013-08-31 13:44:33 +08:00
ttl = ttl ? : ip6_dst_hoplimit ( ndst ) ;
2013-09-02 15:34:55 +02:00
err = vxlan6_xmit_skb ( vxlan - > vn_sock , ndst , skb ,
2013-08-31 13:44:33 +08:00
dev , & fl6 . saddr , & fl6 . daddr , 0 , ttl ,
src_port , dst_port , htonl ( vni < < 8 ) ) ;
# endif
}
2013-06-17 17:49:56 -07:00
2013-06-17 14:16:11 -07:00
return ;
2012-10-01 12:32:35 +00:00
/* drop: counted in tx_dropped, then the skb is freed */
drop :
dev - > stats . tx_dropped + + ;
goto tx_free ;
2013-08-19 11:23:17 -07:00
/* rt_tx_error: releases the IPv4 route, then falls through to tx_error */
rt_tx_error :
ip_rt_put ( rt ) ;
2012-10-01 12:32:35 +00:00
/* tx_error: counted in tx_errors, then the skb is freed */
tx_error :
dev - > stats . tx_errors + + ;
tx_free :
dev_kfree_skb ( skb ) ;
}
2013-03-15 04:35:51 +00:00
/* Transmit local packets over VXLAN (the device's ndo_start_xmit handler).
 *
 * With VXLAN_F_PROXY set, ARP frames and IPv6 neighbour solicitations are
 * handed to arp_reduce() / neigh_reduce() and go no further.  Otherwise the
 * destination MAC is looked up in the forwarding table, optionally after a
 * route short-circuit, and the frame is sent to every remote of the
 * matching entry through vxlan_xmit_one().
 *
 * Outer UDP destination is the VXLAN assigned port.
 * Source port is based on a hash of the flow.
 * The outer IPv4 header takes its ECN bits from ip_tunnel_ecn_encap() in
 * vxlan_xmit_one().
 * NOTE(review): the earlier wording of this comment also said DF is
 * inherited from the inner header; vxlan_xmit_one() passes df = 0 --
 * confirm which is intended.
 *
 * Always returns NETDEV_TX_OK on the paths visible here; the reduce helpers
 * supply their own return value.
 */
static netdev_tx_t vxlan_xmit ( struct sk_buff * skb , struct net_device * dev )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct ethhdr * eth ;
bool did_rsc = false ;
2014-01-06 09:54:31 -08:00
struct vxlan_rdst * rdst , * fdst = NULL ;
2013-03-15 04:35:51 +00:00
struct vxlan_fdb * f ;
skb_reset_mac_header ( skb ) ;
eth = eth_hdr ( skb ) ;
2013-08-31 13:44:36 +08:00
/* proxy mode: ARP and neighbour solicitations are handled by the reduce helpers */
if ( ( vxlan - > flags & VXLAN_F_PROXY ) ) {
if ( ntohs ( eth - > h_proto ) = = ETH_P_ARP )
return arp_reduce ( dev , skb ) ;
# if IS_ENABLED(CONFIG_IPV6)
else if ( ntohs ( eth - > h_proto ) = = ETH_P_IPV6 & &
skb - > len > = sizeof ( struct ipv6hdr ) + sizeof ( struct nd_msg ) & &
ipv6_hdr ( skb ) - > nexthdr = = IPPROTO_ICMPV6 ) {
struct nd_msg * msg ;
msg = ( struct nd_msg * ) skb_transport_header ( skb ) ;
if ( msg - > icmph . icmp6_code = = 0 & &
msg - > icmph . icmp6_type = = NDISC_NEIGHBOUR_SOLICITATION )
return neigh_reduce ( dev , skb ) ;
}
# endif
}
2013-03-15 04:35:51 +00:00
f = vxlan_find_mac ( vxlan , eth - > h_dest ) ;
2013-04-19 00:36:26 +00:00
did_rsc = false ;
/* an entry flagged NTF_ROUTER, with VXLAN_F_RSC set, makes IPv4/IPv6 frames
 * candidates for route short-circuit; after a rewrite the (new) destination
 * MAC is looked up again.
 * NOTE(review): eth was read before route_shortcircuit(), which calls
 * pskb_may_pull() -- confirm the pointer is still valid afterwards.
 */
if ( f & & ( f - > flags & NTF_ROUTER ) & & ( vxlan - > flags & VXLAN_F_RSC ) & &
2013-08-31 13:44:34 +08:00
( ntohs ( eth - > h_proto ) = = ETH_P_IP | |
ntohs ( eth - > h_proto ) = = ETH_P_IPV6 ) ) {
2013-04-19 00:36:26 +00:00
did_rsc = route_shortcircuit ( dev , skb ) ;
if ( did_rsc )
f = vxlan_find_mac ( vxlan , eth - > h_dest ) ;
}
2013-03-15 04:35:51 +00:00
/* no entry for this MAC: fall back to the all-zeros entry; without one the
 * frame is dropped, after an L2 miss notification when VXLAN_F_L2MISS is
 * set and the destination is not multicast
 */
if ( f = = NULL ) {
2013-06-25 16:01:51 +03:00
f = vxlan_find_mac ( vxlan , all_zeros_mac ) ;
if ( f = = NULL ) {
if ( ( vxlan - > flags & VXLAN_F_L2MISS ) & &
! is_multicast_ether_addr ( eth - > h_dest ) )
vxlan_fdb_miss ( vxlan , eth - > h_dest ) ;
dev - > stats . tx_dropped + + ;
2014-01-06 09:54:31 -08:00
kfree_skb ( skb ) ;
2013-06-25 16:01:51 +03:00
return NETDEV_TX_OK ;
}
}
2013-03-15 04:35:51 +00:00
2013-06-25 16:01:51 +03:00
/* the first remote is only remembered here and gets the original skb after
 * the loop; every further remote is sent a clone (skipped if cloning fails)
 */
list_for_each_entry_rcu ( rdst , & f - > remotes , list ) {
struct sk_buff * skb1 ;
2013-03-15 04:35:51 +00:00
2014-01-06 09:54:31 -08:00
if ( ! fdst ) {
fdst = rdst ;
continue ;
}
2013-06-25 16:01:51 +03:00
skb1 = skb_clone ( skb , GFP_ATOMIC ) ;
if ( skb1 )
vxlan_xmit_one ( skb1 , dev , rdst , did_rsc ) ;
2013-03-15 04:35:51 +00:00
}
2014-01-06 09:54:31 -08:00
/* an entry without any remote leaves nobody to take the skb, so free it */
if ( fdst )
vxlan_xmit_one ( skb , dev , fdst , did_rsc ) ;
else
kfree_skb ( skb ) ;
2013-06-17 14:16:11 -07:00
return NETDEV_TX_OK ;
2013-03-15 04:35:51 +00:00
}
2012-10-01 12:32:35 +00:00
/* Walk the forwarding table and purge stale entries */
static void vxlan_cleanup ( unsigned long arg )
{
struct vxlan_dev * vxlan = ( struct vxlan_dev * ) arg ;
unsigned long next_timer = jiffies + FDB_AGE_INTERVAL ;
unsigned int h ;
if ( ! netif_running ( vxlan - > dev ) )
return ;
spin_lock_bh ( & vxlan - > hash_lock ) ;
for ( h = 0 ; h < FDB_HASH_SIZE ; + + h ) {
struct hlist_node * p , * n ;
hlist_for_each_safe ( p , n , & vxlan - > fdb_head [ h ] ) {
struct vxlan_fdb * f
= container_of ( p , struct vxlan_fdb , hlist ) ;
unsigned long timeout ;
2012-10-26 06:24:34 +00:00
if ( f - > state & NUD_PERMANENT )
2012-10-01 12:32:35 +00:00
continue ;
timeout = f - > used + vxlan - > age_interval * HZ ;
if ( time_before_eq ( timeout , jiffies ) ) {
netdev_dbg ( vxlan - > dev ,
" garbage collect %pM \n " ,
f - > eth_addr ) ;
f - > state = NUD_STALE ;
vxlan_fdb_destroy ( vxlan , f ) ;
} else if ( time_before ( timeout , next_timer ) )
next_timer = timeout ;
}
}
spin_unlock_bh ( & vxlan - > hash_lock ) ;
mod_timer ( & vxlan - > age_timer , next_timer ) ;
}
2013-08-19 11:22:48 -07:00
static void vxlan_vs_add_dev ( struct vxlan_sock * vs , struct vxlan_dev * vxlan )
{
__u32 vni = vxlan - > default_dst . remote_vni ;
vxlan - > vn_sock = vs ;
hlist_add_head_rcu ( & vxlan - > hlist , vni_head ( vs , vni ) ) ;
}
2012-10-01 12:32:35 +00:00
/* Setup stats when device is created */
static int vxlan_init ( struct net_device * dev )
{
2013-06-17 14:16:11 -07:00
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct vxlan_net * vn = net_generic ( dev_net ( dev ) , vxlan_net_id ) ;
struct vxlan_sock * vs ;
2014-02-13 11:46:28 -08:00
dev - > tstats = netdev_alloc_pcpu_stats ( struct pcpu_sw_netstats ) ;
2013-03-25 14:49:46 +00:00
if ( ! dev - > tstats )
2012-10-01 12:32:35 +00:00
return - ENOMEM ;
2013-06-17 14:16:11 -07:00
spin_lock ( & vn - > sock_lock ) ;
2013-08-19 11:22:48 -07:00
vs = vxlan_find_sock ( dev_net ( dev ) , vxlan - > dst_port ) ;
2013-06-17 14:16:11 -07:00
if ( vs ) {
/* If we have a socket with same port already, reuse it */
atomic_inc ( & vs - > refcnt ) ;
2013-08-19 11:22:48 -07:00
vxlan_vs_add_dev ( vs , vxlan ) ;
2013-06-17 14:16:11 -07:00
} else {
/* otherwise make new socket outside of RTNL */
dev_hold ( dev ) ;
queue_work ( vxlan_wq , & vxlan - > sock_work ) ;
}
spin_unlock ( & vn - > sock_lock ) ;
2012-10-01 12:32:35 +00:00
return 0 ;
}
2013-06-25 17:06:01 -07:00
static void vxlan_fdb_delete_default ( struct vxlan_dev * vxlan )
2013-06-25 16:01:51 +03:00
{
struct vxlan_fdb * f ;
spin_lock_bh ( & vxlan - > hash_lock ) ;
f = __vxlan_find_mac ( vxlan , all_zeros_mac ) ;
if ( f )
vxlan_fdb_destroy ( vxlan , f ) ;
spin_unlock_bh ( & vxlan - > hash_lock ) ;
}
2013-06-17 14:16:11 -07:00
static void vxlan_uninit ( struct net_device * dev )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct vxlan_sock * vs = vxlan - > vn_sock ;
2013-06-25 17:06:01 -07:00
vxlan_fdb_delete_default ( vxlan ) ;
2013-06-25 16:01:51 +03:00
2013-06-17 14:16:11 -07:00
if ( vs )
2013-08-19 11:23:07 -07:00
vxlan_sock_release ( vs ) ;
2013-06-17 14:16:11 -07:00
free_percpu ( dev - > tstats ) ;
}
2012-10-01 12:32:35 +00:00
/* Start ageing timer and join group when device is brought up */
static int vxlan_open ( struct net_device * dev )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2013-06-17 14:16:11 -07:00
struct vxlan_sock * vs = vxlan - > vn_sock ;
/* socket hasn't been created */
if ( ! vs )
return - ENOTCONN ;
2012-10-01 12:32:35 +00:00
2013-12-10 16:37:32 +08:00
if ( vxlan_addr_multicast ( & vxlan - > default_dst . remote_ip ) ) {
2013-06-17 14:16:11 -07:00
vxlan_sock_hold ( vs ) ;
2013-06-17 14:16:10 -07:00
dev_hold ( dev ) ;
2013-07-18 08:40:15 -07:00
queue_work ( vxlan_wq , & vxlan - > igmp_join ) ;
2012-10-01 12:32:35 +00:00
}
if ( vxlan - > age_interval )
mod_timer ( & vxlan - > age_timer , jiffies + FDB_AGE_INTERVAL ) ;
return 0 ;
}
/* Purge the forwarding table */
static void vxlan_flush ( struct vxlan_dev * vxlan )
{
2013-05-27 22:35:52 +00:00
unsigned int h ;
2012-10-01 12:32:35 +00:00
spin_lock_bh ( & vxlan - > hash_lock ) ;
for ( h = 0 ; h < FDB_HASH_SIZE ; + + h ) {
struct hlist_node * p , * n ;
hlist_for_each_safe ( p , n , & vxlan - > fdb_head [ h ] ) {
struct vxlan_fdb * f
= container_of ( p , struct vxlan_fdb , hlist ) ;
2013-06-25 16:01:51 +03:00
/* the all_zeros_mac entry is deleted at vxlan_uninit */
if ( ! is_zero_ether_addr ( f - > eth_addr ) )
vxlan_fdb_destroy ( vxlan , f ) ;
2012-10-01 12:32:35 +00:00
}
}
spin_unlock_bh ( & vxlan - > hash_lock ) ;
}
/* Cleanup timer and forwarding table on shutdown */
static int vxlan_stop ( struct net_device * dev )
{
2013-07-18 08:40:15 -07:00
struct vxlan_net * vn = net_generic ( dev_net ( dev ) , vxlan_net_id ) ;
2012-10-01 12:32:35 +00:00
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2013-06-17 14:16:11 -07:00
struct vxlan_sock * vs = vxlan - > vn_sock ;
2012-10-01 12:32:35 +00:00
2013-08-31 13:44:33 +08:00
if ( vs & & vxlan_addr_multicast ( & vxlan - > default_dst . remote_ip ) & &
2013-12-10 16:37:33 +08:00
! vxlan_group_used ( vn , vxlan ) ) {
2013-06-17 14:16:11 -07:00
vxlan_sock_hold ( vs ) ;
2013-06-17 14:16:10 -07:00
dev_hold ( dev ) ;
2013-07-18 08:40:15 -07:00
queue_work ( vxlan_wq , & vxlan - > igmp_leave ) ;
2013-06-17 14:16:10 -07:00
}
2012-10-01 12:32:35 +00:00
del_timer_sync ( & vxlan - > age_timer ) ;
vxlan_flush ( vxlan ) ;
return 0 ;
}
/* Stub installed as the device's ndo_set_rx_mode handler; the body is
 * intentionally empty because nothing needs to be done.
 */
static void vxlan_set_multicast_list ( struct net_device * dev )
{
}
2013-12-18 00:21:08 +01:00
static int vxlan_change_mtu ( struct net_device * dev , int new_mtu )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
struct vxlan_rdst * dst = & vxlan - > default_dst ;
struct net_device * lowerdev ;
int max_mtu ;
lowerdev = __dev_get_by_index ( dev_net ( dev ) , dst - > remote_ifindex ) ;
if ( lowerdev = = NULL )
return eth_change_mtu ( dev , new_mtu ) ;
if ( dst - > remote_ip . sa . sa_family = = AF_INET6 )
max_mtu = lowerdev - > mtu - VXLAN6_HEADROOM ;
else
max_mtu = lowerdev - > mtu - VXLAN_HEADROOM ;
if ( new_mtu < 68 | | new_mtu > max_mtu )
return - EINVAL ;
dev - > mtu = new_mtu ;
return 0 ;
}
2012-10-01 12:32:35 +00:00
static const struct net_device_ops vxlan_netdev_ops = {
. ndo_init = vxlan_init ,
2013-06-17 14:16:11 -07:00
. ndo_uninit = vxlan_uninit ,
2012-10-01 12:32:35 +00:00
. ndo_open = vxlan_open ,
. ndo_stop = vxlan_stop ,
. ndo_start_xmit = vxlan_xmit ,
2013-03-25 14:49:46 +00:00
. ndo_get_stats64 = ip_tunnel_get_stats64 ,
2012-10-01 12:32:35 +00:00
. ndo_set_rx_mode = vxlan_set_multicast_list ,
2013-12-18 00:21:08 +01:00
. ndo_change_mtu = vxlan_change_mtu ,
2012-10-01 12:32:35 +00:00
. ndo_validate_addr = eth_validate_addr ,
. ndo_set_mac_address = eth_mac_addr ,
. ndo_fdb_add = vxlan_fdb_add ,
. ndo_fdb_del = vxlan_fdb_delete ,
. ndo_fdb_dump = vxlan_fdb_dump ,
} ;
/*
 * Device type attached to every vxlan netdev via SET_NETDEV_DEVTYPE() in
 * vxlan_setup(); tells udev that this is a virtual tunnel endpoint.
 */
static struct device_type vxlan_type = {
	.name = " vxlan ",
};
2013-09-04 02:13:38 -07:00
/* Calls the ndo_add_vxlan_port of the caller in order to
2013-09-13 07:34:13 -07:00
* supply the listening VXLAN udp ports . Callers are expected
* to implement the ndo_add_vxlan_port .
2013-09-04 02:13:38 -07:00
*/
void vxlan_get_rx_port ( struct net_device * dev )
{
struct vxlan_sock * vs ;
struct net * net = dev_net ( dev ) ;
struct vxlan_net * vn = net_generic ( net , vxlan_net_id ) ;
sa_family_t sa_family ;
2013-09-13 07:34:13 -07:00
__be16 port ;
unsigned int i ;
2013-09-04 02:13:38 -07:00
spin_lock ( & vn - > sock_lock ) ;
for ( i = 0 ; i < PORT_HASH_SIZE ; + + i ) {
2013-09-13 07:34:13 -07:00
hlist_for_each_entry_rcu ( vs , & vn - > sock_list [ i ] , hlist ) {
port = inet_sk ( vs - > sock - > sk ) - > inet_sport ;
2013-09-04 02:13:38 -07:00
sa_family = vs - > sock - > sk - > sk_family ;
dev - > netdev_ops - > ndo_add_vxlan_port ( dev , sa_family ,
port ) ;
}
}
spin_unlock ( & vn - > sock_lock ) ;
}
EXPORT_SYMBOL_GPL ( vxlan_get_rx_port ) ;
2012-10-01 12:32:35 +00:00
/* Initialize the device structure. */
static void vxlan_setup ( struct net_device * dev )
{
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2013-05-27 22:35:52 +00:00
unsigned int h ;
2012-10-09 20:35:50 +00:00
int low , high ;
2012-10-01 12:32:35 +00:00
eth_hw_addr_random ( dev ) ;
ether_setup ( dev ) ;
2013-08-31 13:44:33 +08:00
if ( vxlan - > default_dst . remote_ip . sa . sa_family = = AF_INET6 )
dev - > hard_header_len = ETH_HLEN + VXLAN6_HEADROOM ;
else
dev - > hard_header_len = ETH_HLEN + VXLAN_HEADROOM ;
2012-10-01 12:32:35 +00:00
dev - > netdev_ops = & vxlan_netdev_ops ;
2013-06-17 14:16:11 -07:00
dev - > destructor = free_netdev ;
2012-10-01 12:32:35 +00:00
SET_NETDEV_DEVTYPE ( dev , & vxlan_type ) ;
dev - > tx_queue_len = 0 ;
dev - > features | = NETIF_F_LLTX ;
dev - > features | = NETIF_F_NETNS_LOCAL ;
2012-12-07 14:14:16 +00:00
dev - > features | = NETIF_F_SG | NETIF_F_HW_CSUM ;
2012-12-07 14:14:18 +00:00
dev - > features | = NETIF_F_RXCSUM ;
2013-03-07 13:22:36 +00:00
dev - > features | = NETIF_F_GSO_SOFTWARE ;
2012-12-07 14:14:18 +00:00
2013-08-19 11:23:29 -07:00
dev - > vlan_features = dev - > features ;
dev - > features | = NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX ;
2012-12-07 14:14:18 +00:00
dev - > hw_features | = NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM ;
2013-03-07 13:22:36 +00:00
dev - > hw_features | = NETIF_F_GSO_SOFTWARE ;
2013-08-19 11:23:29 -07:00
dev - > hw_features | = NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX ;
2012-10-01 12:32:35 +00:00
dev - > priv_flags & = ~ IFF_XMIT_DST_RELEASE ;
2012-12-31 12:00:21 +00:00
dev - > priv_flags | = IFF_LIVE_ADDR_CHANGE ;
2012-10-01 12:32:35 +00:00
2013-05-16 11:35:20 +00:00
INIT_LIST_HEAD ( & vxlan - > next ) ;
2012-10-01 12:32:35 +00:00
spin_lock_init ( & vxlan - > hash_lock ) ;
2013-07-18 08:40:15 -07:00
INIT_WORK ( & vxlan - > igmp_join , vxlan_igmp_join ) ;
INIT_WORK ( & vxlan - > igmp_leave , vxlan_igmp_leave ) ;
2013-06-17 14:16:11 -07:00
INIT_WORK ( & vxlan - > sock_work , vxlan_sock_work ) ;
2012-10-01 12:32:35 +00:00
init_timer_deferrable ( & vxlan - > age_timer ) ;
vxlan - > age_timer . function = vxlan_cleanup ;
vxlan - > age_timer . data = ( unsigned long ) vxlan ;
2013-09-28 14:10:59 -07:00
inet_get_local_port_range ( dev_net ( dev ) , & low , & high ) ;
2012-10-09 20:35:50 +00:00
vxlan - > port_min = low ;
vxlan - > port_max = high ;
2013-04-27 11:31:57 +00:00
vxlan - > dst_port = htons ( vxlan_port ) ;
2012-10-09 20:35:50 +00:00
2012-10-01 12:32:35 +00:00
vxlan - > dev = dev ;
for ( h = 0 ; h < FDB_HASH_SIZE ; + + h )
INIT_HLIST_HEAD ( & vxlan - > fdb_head [ h ] ) ;
}
static const struct nla_policy vxlan_policy [ IFLA_VXLAN_MAX + 1 ] = {
[ IFLA_VXLAN_ID ] = { . type = NLA_U32 } ,
2013-04-27 11:31:55 +00:00
[ IFLA_VXLAN_GROUP ] = { . len = FIELD_SIZEOF ( struct iphdr , daddr ) } ,
2013-08-31 13:44:33 +08:00
[ IFLA_VXLAN_GROUP6 ] = { . len = sizeof ( struct in6_addr ) } ,
2012-10-01 12:32:35 +00:00
[ IFLA_VXLAN_LINK ] = { . type = NLA_U32 } ,
[ IFLA_VXLAN_LOCAL ] = { . len = FIELD_SIZEOF ( struct iphdr , saddr ) } ,
2013-08-31 13:44:33 +08:00
[ IFLA_VXLAN_LOCAL6 ] = { . len = sizeof ( struct in6_addr ) } ,
2012-10-01 12:32:35 +00:00
[ IFLA_VXLAN_TOS ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_TTL ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_LEARNING ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_AGEING ] = { . type = NLA_U32 } ,
[ IFLA_VXLAN_LIMIT ] = { . type = NLA_U32 } ,
2012-10-09 20:35:50 +00:00
[ IFLA_VXLAN_PORT_RANGE ] = { . len = sizeof ( struct ifla_vxlan_port_range ) } ,
2012-11-20 02:50:14 +00:00
[ IFLA_VXLAN_PROXY ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_RSC ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_L2MISS ] = { . type = NLA_U8 } ,
[ IFLA_VXLAN_L3MISS ] = { . type = NLA_U8 } ,
2013-04-27 11:31:57 +00:00
[ IFLA_VXLAN_PORT ] = { . type = NLA_U16 } ,
2012-10-01 12:32:35 +00:00
} ;
static int vxlan_validate ( struct nlattr * tb [ ] , struct nlattr * data [ ] )
{
if ( tb [ IFLA_ADDRESS ] ) {
if ( nla_len ( tb [ IFLA_ADDRESS ] ) ! = ETH_ALEN ) {
pr_debug ( " invalid link address (not ethernet) \n " ) ;
return - EINVAL ;
}
if ( ! is_valid_ether_addr ( nla_data ( tb [ IFLA_ADDRESS ] ) ) ) {
pr_debug ( " invalid all zero ethernet address \n " ) ;
return - EADDRNOTAVAIL ;
}
}
if ( ! data )
return - EINVAL ;
if ( data [ IFLA_VXLAN_ID ] ) {
__u32 id = nla_get_u32 ( data [ IFLA_VXLAN_ID ] ) ;
if ( id > = VXLAN_VID_MASK )
return - ERANGE ;
}
2012-10-09 20:35:50 +00:00
if ( data [ IFLA_VXLAN_PORT_RANGE ] ) {
const struct ifla_vxlan_port_range * p
= nla_data ( data [ IFLA_VXLAN_PORT_RANGE ] ) ;
if ( ntohs ( p - > high ) < ntohs ( p - > low ) ) {
pr_debug ( " port range %u .. %u not valid \n " ,
ntohs ( p - > low ) , ntohs ( p - > high ) ) ;
return - EINVAL ;
}
}
2012-10-01 12:32:35 +00:00
return 0 ;
}
2013-01-29 23:43:07 +00:00
static void vxlan_get_drvinfo ( struct net_device * netdev ,
struct ethtool_drvinfo * drvinfo )
{
strlcpy ( drvinfo - > version , VXLAN_VERSION , sizeof ( drvinfo - > version ) ) ;
strlcpy ( drvinfo - > driver , " vxlan " , sizeof ( drvinfo - > driver ) ) ;
}
/* ethtool operations; attached to the device in vxlan_newlink(). */
static const struct ethtool_ops vxlan_ethtool_ops = {
	.get_link	= ethtool_op_get_link,
	.get_drvinfo	= vxlan_get_drvinfo,
};
2013-05-16 11:35:20 +00:00
static void vxlan_del_work ( struct work_struct * work )
{
struct vxlan_sock * vs = container_of ( work , struct vxlan_sock , del_work ) ;
sk_release_kernel ( vs - > sock - > sk ) ;
kfree_rcu ( vs , rcu ) ;
}
2013-08-31 13:44:33 +08:00
# if IS_ENABLED(CONFIG_IPV6)
/* Create UDP socket for encapsulation receive. AF_INET6 socket
* could be used for both IPv4 and IPv6 communications , but
* users may set bindv6only = 1.
*/
2013-10-28 14:01:48 +08:00
static struct socket * create_v6_sock ( struct net * net , __be16 port )
2013-05-16 11:35:20 +00:00
{
struct sock * sk ;
2013-08-31 13:44:33 +08:00
struct socket * sock ;
struct sockaddr_in6 vxlan_addr = {
. sin6_family = AF_INET6 ,
. sin6_port = port ,
} ;
int rc , val = 1 ;
rc = sock_create_kern ( AF_INET6 , SOCK_DGRAM , IPPROTO_UDP , & sock ) ;
if ( rc < 0 ) {
pr_debug ( " UDPv6 socket create failed \n " ) ;
2013-10-28 14:01:48 +08:00
return ERR_PTR ( rc ) ;
2013-08-31 13:44:33 +08:00
}
/* Put in proper namespace */
sk = sock - > sk ;
sk_change_net ( sk , net ) ;
kernel_setsockopt ( sock , SOL_IPV6 , IPV6_V6ONLY ,
( char * ) & val , sizeof ( val ) ) ;
rc = kernel_bind ( sock , ( struct sockaddr * ) & vxlan_addr ,
sizeof ( struct sockaddr_in6 ) ) ;
if ( rc < 0 ) {
pr_debug ( " bind for UDPv6 socket %pI6:%u (%d) \n " ,
& vxlan_addr . sin6_addr , ntohs ( vxlan_addr . sin6_port ) , rc ) ;
sk_release_kernel ( sk ) ;
2013-10-28 14:01:48 +08:00
return ERR_PTR ( rc ) ;
2013-08-31 13:44:33 +08:00
}
/* At this point, IPv6 module should have been loaded in
* sock_create_kern ( ) .
*/
BUG_ON ( ! ipv6_stub ) ;
/* Disable multicast loopback */
inet_sk ( sk ) - > mc_loop = 0 ;
2013-10-28 14:01:48 +08:00
return sock ;
2013-08-31 13:44:33 +08:00
}
# else
2013-10-28 14:01:48 +08:00
static struct socket * create_v6_sock ( struct net * net , __be16 port )
2013-08-31 13:44:33 +08:00
{
2013-10-28 14:01:48 +08:00
return ERR_PTR ( - EPFNOSUPPORT ) ;
2013-08-31 13:44:33 +08:00
}
# endif
2013-10-28 14:01:48 +08:00
/*
 * Create the UDP (IPv4) socket used for encapsulation receive and bind it
 * to INADDR_ANY and the given port (network byte order) in namespace net.
 * Multicast loopback is disabled on the socket.
 *
 * Returns the socket, or an ERR_PTR() carrying the error reported by
 * sock_create_kern() or kernel_bind().
 */
static struct socket *create_v4_sock(struct net *net, __be16 port)
{
	struct sockaddr_in bind_addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
		.sin_port = port,
	};
	struct socket *sock;
	struct sock *sk;
	int err;

	err = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
	if (err < 0) {
		pr_debug(" UDP socket create failed \n ");
		return ERR_PTR(err);
	}

	/* Put in proper namespace */
	sk = sock->sk;
	sk_change_net(sk, net);

	err = kernel_bind(sock, (struct sockaddr *)&bind_addr,
			  sizeof(bind_addr));
	if (err < 0) {
		pr_debug(" bind for UDP socket %pI4:%u (%d) \n ",
			 &bind_addr.sin_addr, ntohs(bind_addr.sin_port), err);
		sk_release_kernel(sk);
		return ERR_PTR(err);
	}

	/* Disable multicast loopback */
	inet_sk(sk)->mc_loop = 0;

	return sock;
}
/*
 * Allocate a vxlan_sock and create its listening UDP socket.
 *
 * net:  namespace the socket lives in
 * port: UDP port in network byte order
 * rcv:  receive callback stored in the vxlan_sock
 * data: opaque pointer stored alongside rcv
 * ipv6: create an AF_INET6 socket instead of an AF_INET one
 *
 * The new vxlan_sock starts with a reference count of 1, is linked into
 * the per-namespace socket hash under vn->sock_lock and announced through
 * vxlan_notify_add_rx_port().  Finally the UDP socket is switched to
 * encapsulation mode with vxlan_udp_encap_recv() as its handler.
 *
 * Returns the vxlan_sock, ERR_PTR(-ENOMEM) on allocation failure, or the
 * error returned by the socket creation helper.
 */
static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
					      vxlan_rcv_t *rcv, void *data,
					      bool ipv6)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	struct socket *sock;
	struct sock *sk;
	unsigned int bucket;

	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
	if (!vs)
		return ERR_PTR(-ENOMEM);

	for (bucket = 0; bucket < VNI_HASH_SIZE; ++bucket)
		INIT_HLIST_HEAD(&vs->vni_list[bucket]);

	INIT_WORK(&vs->del_work, vxlan_del_work);

	sock = ipv6 ? create_v6_sock(net, port) : create_v4_sock(net, port);
	if (IS_ERR(sock)) {
		kfree(vs);
		return ERR_CAST(sock);
	}

	vs->sock = sock;
	sk = sock->sk;
	atomic_set(&vs->refcnt, 1);
	vs->rcv = rcv;
	vs->data = data;
	rcu_assign_sk_user_data(sk, vs);

	/* Initialize the vxlan udp offloads structure */
	vs->udp_offloads.port = port;
	vs->udp_offloads.callbacks.gro_receive = vxlan_gro_receive;
	vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete;

	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
	vxlan_notify_add_rx_port(vs);
	spin_unlock(&vn->sock_lock);

	/* Mark socket as an encapsulation socket. */
	udp_sk(sk)->encap_type = 1;
	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6)
		ipv6_stub->udpv6_encap_enable();
	else
#endif
		udp_encap_enable();

	return vs;
}
2013-08-19 11:23:07 -07:00
/*
 * Get a vxlan_sock for the given namespace and port.
 *
 * A new socket is created first.  If that succeeds, or if it fails and
 * no_share is set, the result of the creation attempt is returned as is.
 * Otherwise an already existing socket on the same port is looked up
 * under vn->sock_lock:
 *   - found with the same rcv callback: its refcount is bumped and it is
 *     returned;
 *   - found with a different rcv callback: ERR_PTR(-EBUSY);
 *   - not found: ERR_PTR(-EINVAL).
 */
struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
				  vxlan_rcv_t *rcv, void *data,
				  bool no_share, bool ipv6)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;

	vs = vxlan_socket_create(net, port, rcv, data, ipv6);
	/* Done on success; also done on failure when sharing is not allowed. */
	if (!IS_ERR(vs) || no_share)
		return vs;

	spin_lock(&vn->sock_lock);
	vs = vxlan_find_sock(net, port);
	if (!vs)
		vs = ERR_PTR(-EINVAL);
	else if (vs->rcv == rcv)
		atomic_inc(&vs->refcnt);
	else
		vs = ERR_PTR(-EBUSY);
	spin_unlock(&vn->sock_lock);

	return vs;
}
EXPORT_SYMBOL_GPL(vxlan_sock_add);
2013-05-16 11:35:20 +00:00
2013-06-17 14:16:11 -07:00
/* Scheduled at device creation to bind to a socket */
static void vxlan_sock_work ( struct work_struct * work )
{
2013-08-19 11:22:48 -07:00
struct vxlan_dev * vxlan = container_of ( work , struct vxlan_dev , sock_work ) ;
struct net * net = dev_net ( vxlan - > dev ) ;
2013-06-17 14:16:11 -07:00
struct vxlan_net * vn = net_generic ( net , vxlan_net_id ) ;
2013-08-19 11:22:48 -07:00
__be16 port = vxlan - > dst_port ;
struct vxlan_sock * nvs ;
2013-06-17 14:16:11 -07:00
2013-08-31 13:44:33 +08:00
nvs = vxlan_sock_add ( net , port , vxlan_rcv , NULL , false , vxlan - > flags & VXLAN_F_IPV6 ) ;
2013-06-17 14:16:11 -07:00
spin_lock ( & vn - > sock_lock ) ;
2013-08-19 11:22:48 -07:00
if ( ! IS_ERR ( nvs ) )
vxlan_vs_add_dev ( nvs , vxlan ) ;
spin_unlock ( & vn - > sock_lock ) ;
dev_put ( vxlan - > dev ) ;
2013-06-17 14:16:11 -07:00
}
2012-10-01 12:32:35 +00:00
static int vxlan_newlink ( struct net * net , struct net_device * dev ,
struct nlattr * tb [ ] , struct nlattr * data [ ] )
{
2013-05-16 11:35:20 +00:00
struct vxlan_net * vn = net_generic ( net , vxlan_net_id ) ;
2012-10-01 12:32:35 +00:00
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2013-04-16 02:50:52 +00:00
struct vxlan_rdst * dst = & vxlan - > default_dst ;
2012-10-01 12:32:35 +00:00
__u32 vni ;
int err ;
2013-08-31 13:44:33 +08:00
bool use_ipv6 = false ;
2012-10-01 12:32:35 +00:00
if ( ! data [ IFLA_VXLAN_ID ] )
return - EINVAL ;
vni = nla_get_u32 ( data [ IFLA_VXLAN_ID ] ) ;
2013-04-16 02:50:52 +00:00
dst - > remote_vni = vni ;
2012-10-01 12:32:35 +00:00
2014-04-01 09:23:01 +03:00
/* Unless IPv6 is explicitly requested, assume IPv4 */
dst - > remote_ip . sa . sa_family = AF_INET ;
2013-08-31 13:44:33 +08:00
if ( data [ IFLA_VXLAN_GROUP ] ) {
dst - > remote_ip . sin . sin_addr . s_addr = nla_get_be32 ( data [ IFLA_VXLAN_GROUP ] ) ;
} else if ( data [ IFLA_VXLAN_GROUP6 ] ) {
if ( ! IS_ENABLED ( CONFIG_IPV6 ) )
return - EPFNOSUPPORT ;
nla_memcpy ( & dst - > remote_ip . sin6 . sin6_addr , data [ IFLA_VXLAN_GROUP6 ] ,
sizeof ( struct in6_addr ) ) ;
dst - > remote_ip . sa . sa_family = AF_INET6 ;
use_ipv6 = true ;
}
2012-10-01 12:32:35 +00:00
2013-08-31 13:44:33 +08:00
if ( data [ IFLA_VXLAN_LOCAL ] ) {
vxlan - > saddr . sin . sin_addr . s_addr = nla_get_be32 ( data [ IFLA_VXLAN_LOCAL ] ) ;
vxlan - > saddr . sa . sa_family = AF_INET ;
} else if ( data [ IFLA_VXLAN_LOCAL6 ] ) {
if ( ! IS_ENABLED ( CONFIG_IPV6 ) )
return - EPFNOSUPPORT ;
/* TODO: respect scope id */
nla_memcpy ( & vxlan - > saddr . sin6 . sin6_addr , data [ IFLA_VXLAN_LOCAL6 ] ,
sizeof ( struct in6_addr ) ) ;
vxlan - > saddr . sa . sa_family = AF_INET6 ;
use_ipv6 = true ;
}
2012-10-01 12:32:35 +00:00
2012-10-09 20:35:53 +00:00
if ( data [ IFLA_VXLAN_LINK ] & &
2013-04-16 02:50:52 +00:00
( dst - > remote_ifindex = nla_get_u32 ( data [ IFLA_VXLAN_LINK ] ) ) ) {
2012-10-09 20:35:53 +00:00
struct net_device * lowerdev
2013-04-16 02:50:52 +00:00
= __dev_get_by_index ( net , dst - > remote_ifindex ) ;
2012-10-09 20:35:53 +00:00
if ( ! lowerdev ) {
2013-04-16 02:50:52 +00:00
pr_info ( " ifindex %d does not exist \n " , dst - > remote_ifindex ) ;
2012-10-09 20:35:53 +00:00
return - ENODEV ;
}
2012-10-01 12:32:35 +00:00
2013-08-31 13:44:33 +08:00
# if IS_ENABLED(CONFIG_IPV6)
if ( use_ipv6 ) {
struct inet6_dev * idev = __in6_dev_get ( lowerdev ) ;
if ( idev & & idev - > cnf . disable_ipv6 ) {
pr_info ( " IPv6 is disabled via sysctl \n " ) ;
return - EPERM ;
}
vxlan - > flags | = VXLAN_F_IPV6 ;
}
# endif
2012-10-09 20:35:53 +00:00
if ( ! tb [ IFLA_MTU ] )
2013-08-31 13:44:33 +08:00
dev - > mtu = lowerdev - > mtu - ( use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM ) ;
2012-11-13 13:10:59 +00:00
/* update header length based on lower device */
dev - > hard_header_len = lowerdev - > hard_header_len +
2013-08-31 13:44:33 +08:00
( use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM ) ;
2014-01-03 10:18:58 +08:00
} else if ( use_ipv6 )
vxlan - > flags | = VXLAN_F_IPV6 ;
2012-10-01 12:32:35 +00:00
if ( data [ IFLA_VXLAN_TOS ] )
vxlan - > tos = nla_get_u8 ( data [ IFLA_VXLAN_TOS ] ) ;
2012-10-30 10:27:16 +00:00
if ( data [ IFLA_VXLAN_TTL ] )
vxlan - > ttl = nla_get_u8 ( data [ IFLA_VXLAN_TTL ] ) ;
2012-10-01 12:32:35 +00:00
if ( ! data [ IFLA_VXLAN_LEARNING ] | | nla_get_u8 ( data [ IFLA_VXLAN_LEARNING ] ) )
2012-11-20 02:50:14 +00:00
vxlan - > flags | = VXLAN_F_LEARN ;
2012-10-01 12:32:35 +00:00
if ( data [ IFLA_VXLAN_AGEING ] )
vxlan - > age_interval = nla_get_u32 ( data [ IFLA_VXLAN_AGEING ] ) ;
else
vxlan - > age_interval = FDB_AGE_DEFAULT ;
2012-11-20 02:50:14 +00:00
if ( data [ IFLA_VXLAN_PROXY ] & & nla_get_u8 ( data [ IFLA_VXLAN_PROXY ] ) )
vxlan - > flags | = VXLAN_F_PROXY ;
if ( data [ IFLA_VXLAN_RSC ] & & nla_get_u8 ( data [ IFLA_VXLAN_RSC ] ) )
vxlan - > flags | = VXLAN_F_RSC ;
if ( data [ IFLA_VXLAN_L2MISS ] & & nla_get_u8 ( data [ IFLA_VXLAN_L2MISS ] ) )
vxlan - > flags | = VXLAN_F_L2MISS ;
if ( data [ IFLA_VXLAN_L3MISS ] & & nla_get_u8 ( data [ IFLA_VXLAN_L3MISS ] ) )
vxlan - > flags | = VXLAN_F_L3MISS ;
2012-10-01 12:32:35 +00:00
if ( data [ IFLA_VXLAN_LIMIT ] )
vxlan - > addrmax = nla_get_u32 ( data [ IFLA_VXLAN_LIMIT ] ) ;
2012-10-09 20:35:50 +00:00
if ( data [ IFLA_VXLAN_PORT_RANGE ] ) {
const struct ifla_vxlan_port_range * p
= nla_data ( data [ IFLA_VXLAN_PORT_RANGE ] ) ;
vxlan - > port_min = ntohs ( p - > low ) ;
vxlan - > port_max = ntohs ( p - > high ) ;
}
2013-04-27 11:31:57 +00:00
if ( data [ IFLA_VXLAN_PORT ] )
vxlan - > dst_port = nla_get_be16 ( data [ IFLA_VXLAN_PORT ] ) ;
2013-05-16 11:35:20 +00:00
if ( vxlan_find_vni ( net , vni , vxlan - > dst_port ) ) {
pr_info ( " duplicate VNI %u \n " , vni ) ;
return - EEXIST ;
}
2013-01-29 23:43:07 +00:00
SET_ETHTOOL_OPS ( dev , & vxlan_ethtool_ops ) ;
2013-09-17 12:12:40 -07:00
/* create an fdb entry for a valid default destination */
if ( ! vxlan_addr_any ( & vxlan - > default_dst . remote_ip ) ) {
err = vxlan_fdb_create ( vxlan , all_zeros_mac ,
& vxlan - > default_dst . remote_ip ,
NUD_REACHABLE | NUD_PERMANENT ,
NLM_F_EXCL | NLM_F_CREATE ,
vxlan - > dst_port ,
vxlan - > default_dst . remote_vni ,
vxlan - > default_dst . remote_ifindex ,
NTF_SELF ) ;
if ( err )
return err ;
}
2012-10-01 12:32:35 +00:00
2013-06-25 16:01:51 +03:00
err = register_netdevice ( dev ) ;
if ( err ) {
2013-06-25 17:06:01 -07:00
vxlan_fdb_delete_default ( vxlan ) ;
2013-06-25 16:01:51 +03:00
return err ;
}
2013-05-16 11:35:20 +00:00
list_add ( & vxlan - > next , & vn - > vxlan_list ) ;
return 0 ;
2012-10-01 12:32:35 +00:00
}
static void vxlan_dellink ( struct net_device * dev , struct list_head * head )
{
2013-07-13 10:18:18 -07:00
struct vxlan_net * vn = net_generic ( dev_net ( dev ) , vxlan_net_id ) ;
2012-10-01 12:32:35 +00:00
struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2013-07-13 10:18:18 -07:00
spin_lock ( & vn - > sock_lock ) ;
2013-08-19 11:22:48 -07:00
if ( ! hlist_unhashed ( & vxlan - > hlist ) )
hlist_del_rcu ( & vxlan - > hlist ) ;
2013-07-13 10:18:18 -07:00
spin_unlock ( & vn - > sock_lock ) ;
2013-05-16 11:35:20 +00:00
list_del ( & vxlan - > next ) ;
2012-10-01 12:32:35 +00:00
unregister_netdevice_queue ( dev , head ) ;
}
static size_t vxlan_get_size ( const struct net_device * dev )
{
return nla_total_size ( sizeof ( __u32 ) ) + /* IFLA_VXLAN_ID */
2013-08-31 13:44:33 +08:00
nla_total_size ( sizeof ( struct in6_addr ) ) + /* IFLA_VXLAN_GROUP{6} */
2012-10-01 12:32:35 +00:00
nla_total_size ( sizeof ( __u32 ) ) + /* IFLA_VXLAN_LINK */
2013-08-31 13:44:33 +08:00
nla_total_size ( sizeof ( struct in6_addr ) ) + /* IFLA_VXLAN_LOCAL{6} */
2012-10-01 12:32:35 +00:00
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_TTL */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_TOS */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_LEARNING */
2012-11-20 02:50:14 +00:00
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_PROXY */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_RSC */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_L2MISS */
nla_total_size ( sizeof ( __u8 ) ) + /* IFLA_VXLAN_L3MISS */
2012-10-01 12:32:35 +00:00
nla_total_size ( sizeof ( __u32 ) ) + /* IFLA_VXLAN_AGEING */
nla_total_size ( sizeof ( __u32 ) ) + /* IFLA_VXLAN_LIMIT */
2012-10-09 20:35:50 +00:00
nla_total_size ( sizeof ( struct ifla_vxlan_port_range ) ) +
2013-04-27 11:31:57 +00:00
nla_total_size ( sizeof ( __be16 ) ) + /* IFLA_VXLAN_PORT */
2012-10-01 12:32:35 +00:00
0 ;
}
static int vxlan_fill_info ( struct sk_buff * skb , const struct net_device * dev )
{
const struct vxlan_dev * vxlan = netdev_priv ( dev ) ;
2013-04-16 02:50:52 +00:00
const struct vxlan_rdst * dst = & vxlan - > default_dst ;
2012-10-09 20:35:50 +00:00
struct ifla_vxlan_port_range ports = {
. low = htons ( vxlan - > port_min ) ,
. high = htons ( vxlan - > port_max ) ,
} ;
2012-10-01 12:32:35 +00:00
2013-04-16 02:50:52 +00:00
if ( nla_put_u32 ( skb , IFLA_VXLAN_ID , dst - > remote_vni ) )
2012-10-01 12:32:35 +00:00
goto nla_put_failure ;
2013-08-31 13:44:33 +08:00
if ( ! vxlan_addr_any ( & dst - > remote_ip ) ) {
if ( dst - > remote_ip . sa . sa_family = = AF_INET ) {
if ( nla_put_be32 ( skb , IFLA_VXLAN_GROUP ,
dst - > remote_ip . sin . sin_addr . s_addr ) )
goto nla_put_failure ;
# if IS_ENABLED(CONFIG_IPV6)
} else {
if ( nla_put ( skb , IFLA_VXLAN_GROUP6 , sizeof ( struct in6_addr ) ,
& dst - > remote_ip . sin6 . sin6_addr ) )
goto nla_put_failure ;
# endif
}
}
2012-10-01 12:32:35 +00:00
2013-04-16 02:50:52 +00:00
if ( dst - > remote_ifindex & & nla_put_u32 ( skb , IFLA_VXLAN_LINK , dst - > remote_ifindex ) )
2012-10-01 12:32:35 +00:00
goto nla_put_failure ;
2013-08-31 13:44:33 +08:00
if ( ! vxlan_addr_any ( & vxlan - > saddr ) ) {
if ( vxlan - > saddr . sa . sa_family = = AF_INET ) {
if ( nla_put_be32 ( skb , IFLA_VXLAN_LOCAL ,
vxlan - > saddr . sin . sin_addr . s_addr ) )
goto nla_put_failure ;
# if IS_ENABLED(CONFIG_IPV6)
} else {
if ( nla_put ( skb , IFLA_VXLAN_LOCAL6 , sizeof ( struct in6_addr ) ,
& vxlan - > saddr . sin6 . sin6_addr ) )
goto nla_put_failure ;
# endif
}
}
2012-10-01 12:32:35 +00:00
if ( nla_put_u8 ( skb , IFLA_VXLAN_TTL , vxlan - > ttl ) | |
nla_put_u8 ( skb , IFLA_VXLAN_TOS , vxlan - > tos ) | |
2012-11-20 02:50:14 +00:00
nla_put_u8 ( skb , IFLA_VXLAN_LEARNING ,
! ! ( vxlan - > flags & VXLAN_F_LEARN ) ) | |
nla_put_u8 ( skb , IFLA_VXLAN_PROXY ,
! ! ( vxlan - > flags & VXLAN_F_PROXY ) ) | |
nla_put_u8 ( skb , IFLA_VXLAN_RSC , ! ! ( vxlan - > flags & VXLAN_F_RSC ) ) | |
nla_put_u8 ( skb , IFLA_VXLAN_L2MISS ,
! ! ( vxlan - > flags & VXLAN_F_L2MISS ) ) | |
nla_put_u8 ( skb , IFLA_VXLAN_L3MISS ,
! ! ( vxlan - > flags & VXLAN_F_L3MISS ) ) | |
2012-10-01 12:32:35 +00:00
nla_put_u32 ( skb , IFLA_VXLAN_AGEING , vxlan - > age_interval ) | |
2013-04-27 11:31:57 +00:00
nla_put_u32 ( skb , IFLA_VXLAN_LIMIT , vxlan - > addrmax ) | |
nla_put_be16 ( skb , IFLA_VXLAN_PORT , vxlan - > dst_port ) )
2012-10-01 12:32:35 +00:00
goto nla_put_failure ;
2012-10-09 20:35:50 +00:00
if ( nla_put ( skb , IFLA_VXLAN_PORT_RANGE , sizeof ( ports ) , & ports ) )
goto nla_put_failure ;
2012-10-01 12:32:35 +00:00
return 0 ;
nla_put_failure :
return - EMSGSIZE ;
}
/*
 * rtnetlink link operations for the vxlan link kind; registered in
 * vxlan_init_module() and unregistered in vxlan_cleanup_module().
 */
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
	.kind		= " vxlan ",
	.priv_size	= sizeof(struct vxlan_dev),

	/* netlink attribute description */
	.maxtype	= IFLA_VXLAN_MAX,
	.policy		= vxlan_policy,

	/* link lifecycle */
	.setup		= vxlan_setup,
	.validate	= vxlan_validate,
	.newlink	= vxlan_newlink,
	.dellink	= vxlan_dellink,

	/* configuration dump */
	.get_size	= vxlan_get_size,
	.fill_info	= vxlan_fill_info,
};
2014-01-13 18:41:19 +01:00
static void vxlan_handle_lowerdev_unregister ( struct vxlan_net * vn ,
struct net_device * dev )
{
struct vxlan_dev * vxlan , * next ;
LIST_HEAD ( list_kill ) ;
list_for_each_entry_safe ( vxlan , next , & vn - > vxlan_list , next ) {
struct vxlan_rdst * dst = & vxlan - > default_dst ;
/* In case we created vxlan device with carrier
* and we loose the carrier due to module unload
* we also need to remove vxlan device . In other
* cases , it ' s not necessary and remote_ifindex
* is 0 here , so no matches .
*/
if ( dst - > remote_ifindex = = dev - > ifindex )
vxlan_dellink ( vxlan - > dev , & list_kill ) ;
}
unregister_netdevice_many ( & list_kill ) ;
}
static int vxlan_lowerdev_event ( struct notifier_block * unused ,
unsigned long event , void * ptr )
{
struct net_device * dev = netdev_notifier_info_to_dev ( ptr ) ;
net: vxlan: convert to act as a pernet subsystem
As per suggestion from Eric W. Biederman, vxlan should be using
{un,}register_pernet_subsys() instead of {un,}register_pernet_device()
to ensure the vxlan_net structure is initialized before and cleaned
up after all network devices in a given network namespace i.e. when
dealing with network notifiers. This is similarly handeled already in
commit 91e2ff3528ac ("net: Teach vlans to cleanup as a pernet subsystem")
and, thus, improves upon fd27e0d44a89 ("net: vxlan: do not use vxlan_net
before checking event type"). Just as in 91e2ff3528ac, we do not need
to explicitly handle deletion of vxlan devices as network namespace
exit calls dellink on all remaining virtual devices, and
rtnl_link_unregister() calls dellink on all outstanding devices in that
network namespace, so we can entirely drop the pernet exit operation
as well. Moreover, on vxlan module exit, rcu_barrier() is called by
netns since commit 3a765edadb28 ("netns: Add an explicit rcu_barrier
to unregister_pernet_{device|subsys}"), so this may be omitted. Tested
with various scenarios and works well on my side.
Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-22 21:07:53 +01:00
struct vxlan_net * vn = net_generic ( dev_net ( dev ) , vxlan_net_id ) ;
2014-01-13 18:41:19 +01:00
net: vxlan: convert to act as a pernet subsystem
As per suggestion from Eric W. Biederman, vxlan should be using
{un,}register_pernet_subsys() instead of {un,}register_pernet_device()
to ensure the vxlan_net structure is initialized before and cleaned
up after all network devices in a given network namespace i.e. when
dealing with network notifiers. This is similarly handeled already in
commit 91e2ff3528ac ("net: Teach vlans to cleanup as a pernet subsystem")
and, thus, improves upon fd27e0d44a89 ("net: vxlan: do not use vxlan_net
before checking event type"). Just as in 91e2ff3528ac, we do not need
to explicitly handle deletion of vxlan devices as network namespace
exit calls dellink on all remaining virtual devices, and
rtnl_link_unregister() calls dellink on all outstanding devices in that
network namespace, so we can entirely drop the pernet exit operation
as well. Moreover, on vxlan module exit, rcu_barrier() is called by
netns since commit 3a765edadb28 ("netns: Add an explicit rcu_barrier
to unregister_pernet_{device|subsys}"), so this may be omitted. Tested
with various scenarios and works well on my side.
Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-22 21:07:53 +01:00
if ( event = = NETDEV_UNREGISTER )
2014-01-13 18:41:19 +01:00
vxlan_handle_lowerdev_unregister ( vn , dev ) ;
return NOTIFY_DONE ;
}
/* Netdevice notifier; registered in vxlan_init_module(). */
static struct notifier_block vxlan_notifier_block __read_mostly = {
	.notifier_call	= vxlan_lowerdev_event,
};
2012-10-01 12:32:35 +00:00
static __net_init int vxlan_init_net ( struct net * net )
{
struct vxlan_net * vn = net_generic ( net , vxlan_net_id ) ;
2013-05-27 22:35:52 +00:00
unsigned int h ;
2012-10-01 12:32:35 +00:00
2013-05-16 11:35:20 +00:00
INIT_LIST_HEAD ( & vn - > vxlan_list ) ;
2013-06-17 14:16:11 -07:00
spin_lock_init ( & vn - > sock_lock ) ;
2012-10-01 12:32:35 +00:00
2013-05-16 11:35:20 +00:00
for ( h = 0 ; h < PORT_HASH_SIZE ; + + h )
INIT_HLIST_HEAD ( & vn - > sock_list [ h ] ) ;
2012-10-01 12:32:35 +00:00
return 0 ;
}
static struct pernet_operations vxlan_net_ops = {
. init = vxlan_init_net ,
. id = & vxlan_net_id ,
. size = sizeof ( struct vxlan_net ) ,
} ;
/*
 * Module init: create the driver workqueue, seed vxlan_salt with random
 * bytes, then register - in this order - the pernet subsystem, the
 * netdevice notifier and the rtnetlink link ops.  On failure everything
 * already set up is torn down in reverse order.
 *
 * Returns 0 on success, -ENOMEM if the workqueue cannot be allocated, or
 * the error code of the failing registration.
 */
static int __init vxlan_init_module(void)
{
	int rc;

	vxlan_wq = alloc_workqueue(" vxlan ", 0, 0);
	if (!vxlan_wq)
		return -ENOMEM;

	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));

	rc = register_pernet_subsys(&vxlan_net_ops);
	if (rc)
		goto err_destroy_wq;

	rc = register_netdevice_notifier(&vxlan_notifier_block);
	if (rc)
		goto err_unreg_pernet;

	rc = rtnl_link_register(&vxlan_link_ops);
	if (rc)
		goto err_unreg_notifier;

	return 0;

err_unreg_notifier:
	unregister_netdevice_notifier(&vxlan_notifier_block);
err_unreg_pernet:
	unregister_pernet_subsys(&vxlan_net_ops);
err_destroy_wq:
	destroy_workqueue(vxlan_wq);
	return rc;
}
late_initcall(vxlan_init_module);
2012-10-01 12:32:35 +00:00
/*
 * vxlan_cleanup_module - module exit handler (registered via module_exit).
 *
 * Takes no arguments and returns nothing.  Tears down, in this order:
 *   1. the rtnl link ops (vxlan_link_ops),
 *   2. the netdevice notifier (vxlan_notifier_block),
 *   3. the vxlan_wq workqueue,
 *   4. the per-network-namespace ops (vxlan_net_ops).
 *
 * No explicit per-device teardown is done here; the change note embedded
 * in the body below attributes that to rtnl_link_unregister() calling
 * dellink on all outstanding devices.
 *
 * NOTE(review): the unwind labels in vxlan_init_module release the
 * notifier, then the pernet ops, then the workqueue.  Here the workqueue
 * is destroyed before the pernet ops are unregistered -- confirm that
 * this difference in ordering is intentional.
 */
static void __exit vxlan_cleanup_module ( void )
{
2013-06-17 14:16:09 -07:00
/* Drop the link type first. */
rtnl_link_unregister ( & vxlan_link_ops ) ;
2014-01-13 18:41:19 +01:00
/* Stop receiving netdevice events. */
unregister_netdevice_notifier ( & vxlan_notifier_block ) ;
2013-06-17 14:16:09 -07:00
/* Release the driver's workqueue. */
destroy_workqueue ( vxlan_wq ) ;
net: vxlan: convert to act as a pernet subsystem
As per suggestion from Eric W. Biederman, vxlan should be using
{un,}register_pernet_subsys() instead of {un,}register_pernet_device()
to ensure the vxlan_net structure is initialized before and cleaned
up after all network devices in a given network namespace i.e. when
dealing with network notifiers. This is similarly handeled already in
commit 91e2ff3528ac ("net: Teach vlans to cleanup as a pernet subsystem")
and, thus, improves upon fd27e0d44a89 ("net: vxlan: do not use vxlan_net
before checking event type"). Just as in 91e2ff3528ac, we do not need
to explicitly handle deletion of vxlan devices as network namespace
exit calls dellink on all remaining virtual devices, and
rtnl_link_unregister() calls dellink on all outstanding devices in that
network namespace, so we can entirely drop the pernet exit operation
as well. Moreover, on vxlan module exit, rcu_barrier() is called by
netns since commit 3a765edadb28 ("netns: Add an explicit rcu_barrier
to unregister_pernet_{device|subsys}"), so this may be omitted. Tested
with various scenarios and works well on my side.
Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-22 21:07:53 +01:00
/* Per-namespace state goes last; no explicit rcu_barrier() follows. */
unregister_pernet_subsys ( & vxlan_net_ops ) ;
/* rcu_barrier() is called by netns */
2012-10-01 12:32:35 +00:00
}
/*
 * Module exit hook and metadata: vxlan_cleanup_module is the exit
 * routine, the licence is GPL, and the reported version is VXLAN_VERSION
 * (defined near the top of this file).
 */
module_exit ( vxlan_cleanup_module ) ;
MODULE_LICENSE ( " GPL " ) ;
MODULE_VERSION ( VXLAN_VERSION ) ;
2013-04-27 11:31:52 +00:00
MODULE_AUTHOR ( " Stephen Hemminger <stephen@networkplumber.org> " ) ;
2014-01-17 11:00:33 -08:00
MODULE_DESCRIPTION ( " Driver for VXLAN encapsulated traffic " ) ;
2012-10-01 12:32:35 +00:00
/*
 * rtnl link alias for the "vxlan" kind.
 * NOTE(review): presumably used to auto-load this module when a link of
 * that kind is requested -- confirm against the macro's definition.
 */
MODULE_ALIAS_RTNL_LINK ( " vxlan " ) ;