2019-05-27 09:55:01 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2013-08-19 10:07:34 +04:00
/*
* IPv6 virtual tunneling interface
*
* Copyright ( C ) 2013 secunet Security Networks AG
*
* Author :
* Steffen Klassert < steffen . klassert @ secunet . com >
*
* Based on :
* net / ipv6 / ip6_tunnel . c
*/
# include <linux/module.h>
# include <linux/capability.h>
# include <linux/errno.h>
# include <linux/types.h>
# include <linux/sockios.h>
# include <linux/icmp.h>
# include <linux/if.h>
# include <linux/in.h>
# include <linux/ip.h>
# include <linux/net.h>
# include <linux/in6.h>
# include <linux/netdevice.h>
# include <linux/if_arp.h>
# include <linux/icmpv6.h>
# include <linux/init.h>
# include <linux/route.h>
# include <linux/rtnetlink.h>
# include <linux/netfilter_ipv6.h>
# include <linux/slab.h>
# include <linux/hash.h>
# include <linux/uaccess.h>
# include <linux/atomic.h>
# include <net/icmp.h>
# include <net/ip.h>
# include <net/ip_tunnels.h>
# include <net/ipv6.h>
# include <net/ip6_route.h>
# include <net/addrconf.h>
# include <net/ip6_tunnel.h>
# include <net/xfrm.h>
# include <net/net_namespace.h>
# include <net/netns/generic.h>
2017-01-26 06:59:18 +03:00
# include <linux/etherdevice.h>
2013-08-19 10:07:34 +04:00
2016-08-10 12:03:35 +03:00
# define IP6_VTI_HASH_SIZE_SHIFT 5
# define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT)
2013-08-19 10:07:34 +04:00
static u32 HASH ( const struct in6_addr * addr1 , const struct in6_addr * addr2 )
{
u32 hash = ipv6_addr_hash ( addr1 ) ^ ipv6_addr_hash ( addr2 ) ;
2016-08-10 12:03:35 +03:00
return hash_32 ( hash , IP6_VTI_HASH_SIZE_SHIFT ) ;
2013-08-19 10:07:34 +04:00
}
static int vti6_dev_init ( struct net_device * dev ) ;
static void vti6_dev_setup ( struct net_device * dev ) ;
static struct rtnl_link_ops vti6_link_ops __read_mostly ;
netns: make struct pernet_operations::id unsigned int
Make struct pernet_operations::id unsigned.
There are 2 reasons to do so:
1)
This field is really an index into an zero based array and
thus is unsigned entity. Using negative value is out-of-bound
access by definition.
2)
On x86_64 unsigned 32-bit data which are mixed with pointers
via array indexing or offsets added or subtracted to pointers
are preffered to signed 32-bit data.
"int" being used as an array index needs to be sign-extended
to 64-bit before being used.
void f(long *p, int i)
{
g(p[i]);
}
roughly translates to
movsx rsi, esi
mov rdi, [rsi+...]
call g
MOVSX is 3 byte instruction which isn't necessary if the variable is
unsigned because x86_64 is zero extending by default.
Now, there is net_generic() function which, you guessed it right, uses
"int" as an array index:
static inline void *net_generic(const struct net *net, int id)
{
...
ptr = ng->ptr[id - 1];
...
}
And this function is used a lot, so those sign extensions add up.
Patch snipes ~1730 bytes on allyesconfig kernel (without all junk
messing with code generation):
add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
Unfortunately some functions actually grow bigger.
This is a semmingly random artefact of code generation with register
allocator being used differently. gcc decides that some variable
needs to live in new r8+ registers and every access now requires REX
prefix. Or it is shifted into r12, so [r12+0] addressing mode has to be
used which is longer than [r8]
However, overall balance is in negative direction:
add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
function old new delta
nfsd4_lock 3886 3959 +73
tipc_link_build_proto_msg 1096 1140 +44
mac80211_hwsim_new_radio 2776 2808 +32
tipc_mon_rcv 1032 1058 +26
svcauth_gss_legacy_init 1413 1429 +16
tipc_bcbase_select_primary 379 392 +13
nfsd4_exchange_id 1247 1260 +13
nfsd4_setclientid_confirm 782 793 +11
...
put_client_renew_locked 494 480 -14
ip_set_sockfn_get 730 716 -14
geneve_sock_add 829 813 -16
nfsd4_sequence_done 721 703 -18
nlmclnt_lookup_host 708 686 -22
nfsd4_lockt 1085 1063 -22
nfs_get_client 1077 1050 -27
tcf_bpf_init 1106 1076 -30
nfsd4_encode_fattr 5997 5930 -67
Total: Before=154856051, After=154854321, chg -0.00%
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-17 04:58:21 +03:00
static unsigned int vti6_net_id __read_mostly ;
2013-08-19 10:07:34 +04:00
/* Per-network-namespace state for the vti6 module (looked up via
 * net_generic() with vti6_net_id).
 */
struct vti6_net {
	/* the vti6 tunnel fallback device */
	struct net_device *fb_tnl_dev;
	/* lists for storing tunnels in use */
	struct ip6_tnl __rcu *tnls_r_l[IP6_VTI_HASH_SIZE];	/* hashed by endpoint addresses */
	struct ip6_tnl __rcu *tnls_wc[1];	/* single wildcard-endpoint slot */
	/* tnls[] is indexed by the prio computed in vti6_tnl_bucket();
	 * presumably [0] -> tnls_wc and [1] -> tnls_r_l, wired up during
	 * netns init (not in view here) — TODO confirm.
	 */
	struct ip6_tnl __rcu **tnls[2];
};
# define for_each_vti6_tunnel_rcu(start) \
for ( t = rcu_dereference ( start ) ; t ; t = rcu_dereference ( t - > next ) )
/**
 * vti6_tnl_lookup - fetch tunnel matching the end-point addresses
 *   @net: network namespace
 *   @remote: the address of the tunnel exit-point
 *   @local: the address of the tunnel entry-point
 *
 * Return:
 *   tunnel matching given end-points if found,
 *   else fallback tunnel if its device is up,
 *   else %NULL
 *
 * Caller must hold the RCU read lock (the list walks use
 * rcu_dereference() via for_each_vti6_tunnel_rcu()).
 **/
static struct ip6_tnl *
vti6_tnl_lookup(struct net *net, const struct in6_addr *remote,
		const struct in6_addr *local)
{
	unsigned int hash = HASH(remote, local);
	struct ip6_tnl *t;
	struct vti6_net *ip6n = net_generic(net, vti6_net_id);
	struct in6_addr any;

	/* Pass 1: exact match on both endpoints. */
	for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
		if (ipv6_addr_equal(local, &t->parms.laddr) &&
		    ipv6_addr_equal(remote, &t->parms.raddr) &&
		    (t->dev->flags & IFF_UP))
			return t;
	}

	/* Pass 2: tunnels bound to @local with an all-zero (wildcard)
	 * remote address.
	 */
	memset(&any, 0, sizeof(any));
	hash = HASH(&any, local);
	for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
		if (ipv6_addr_equal(local, &t->parms.laddr) &&
		    (t->dev->flags & IFF_UP))
			return t;
	}

	/* Pass 3: wildcard local, exact remote. */
	hash = HASH(remote, &any);
	for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
		if (ipv6_addr_equal(remote, &t->parms.raddr) &&
		    (t->dev->flags & IFF_UP))
			return t;
	}

	/* Last resort: the namespace's fallback tunnel, if it is up. */
	t = rcu_dereference(ip6n->tnls_wc[0]);
	if (t && (t->dev->flags & IFF_UP))
		return t;

	return NULL;
}
/**
 * vti6_tnl_bucket - get head of list matching given tunnel parameters
 *   @ip6n: the private data for ip6_vti in the netns
 *   @p: parameters containing tunnel end-points
 *
 * Description:
 *   Returns the hash-chain head a tunnel with the endpoints in @p
 *   belongs to: the single wildcard slot when both endpoints are
 *   unspecified, otherwise the bucket selected by hashing them.
 *
 * Return: head of IPv6 tunnel list
 **/
static struct ip6_tnl __rcu **
vti6_tnl_bucket(struct vti6_net *ip6n, const struct __ip6_tnl_parm *p)
{
	const struct in6_addr *remote = &p->raddr;
	const struct in6_addr *local = &p->laddr;

	/* Both endpoints wildcard -> the dedicated prio-0 slot. */
	if (ipv6_addr_any(remote) && ipv6_addr_any(local))
		return &ip6n->tnls[0][0];

	return &ip6n->tnls[1][HASH(remote, local)];
}
/* Insert tunnel @t at the head of its hash bucket.
 *
 * The new node's ->next is published before the node itself so that
 * concurrent RCU readers walking the chain always see a consistent
 * list.  rtnl_dereference() implies the caller holds RTNL.
 */
static void
vti6_tnl_link(struct vti6_net *ip6n, struct ip6_tnl *t)
{
	struct ip6_tnl __rcu **tp = vti6_tnl_bucket(ip6n, &t->parms);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
/* Remove tunnel @t from its hash bucket.
 *
 * Walks the chain (under RTNL, per rtnl_dereference()) until @t is
 * found and splices it out with a single RCU-safe pointer update;
 * a tunnel not on the list is silently ignored.
 */
static void
vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t)
{
	struct ip6_tnl __rcu **tp;
	struct ip6_tnl *iter;

	for (tp = vti6_tnl_bucket(ip6n, &t->parms);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
/* Device destructor: release the per-cpu tstats (allocated elsewhere,
 * presumably in the device init path — not in view here).
 */
static void vti6_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
}
static int vti6_tnl_create2 ( struct net_device * dev )
{
struct ip6_tnl * t = netdev_priv ( dev ) ;
struct net * net = dev_net ( dev ) ;
struct vti6_net * ip6n = net_generic ( net , vti6_net_id ) ;
int err ;
2017-01-06 13:27:59 +03:00
dev - > rtnl_link_ops = & vti6_link_ops ;
2013-08-19 10:07:34 +04:00
err = register_netdevice ( dev ) ;
if ( err < 0 )
goto out ;
strcpy ( t - > parms . name , dev - > name ) ;
vti6_tnl_link ( ip6n , t ) ;
return 0 ;
out :
return err ;
}
static struct ip6_tnl * vti6_tnl_create ( struct net * net , struct __ip6_tnl_parm * p )
{
struct net_device * dev ;
struct ip6_tnl * t ;
char name [ IFNAMSIZ ] ;
int err ;
2018-04-05 16:39:31 +03:00
if ( p - > name [ 0 ] ) {
if ( ! dev_valid_name ( p - > name ) )
goto failed ;
2022-08-19 00:02:20 +03:00
strscpy ( name , p - > name , IFNAMSIZ ) ;
2018-04-05 16:39:31 +03:00
} else {
2013-08-19 10:07:34 +04:00
sprintf ( name , " ip6_vti%%d " ) ;
2018-04-05 16:39:31 +03:00
}
2013-08-19 10:07:34 +04:00
net: set name_assign_type in alloc_netdev()
Extend alloc_netdev{,_mq{,s}}() to take name_assign_type as argument, and convert
all users to pass NET_NAME_UNKNOWN.
Coccinelle patch:
@@
expression sizeof_priv, name, setup, txqs, rxqs, count;
@@
(
-alloc_netdev_mqs(sizeof_priv, name, setup, txqs, rxqs)
+alloc_netdev_mqs(sizeof_priv, name, NET_NAME_UNKNOWN, setup, txqs, rxqs)
|
-alloc_netdev_mq(sizeof_priv, name, setup, count)
+alloc_netdev_mq(sizeof_priv, name, NET_NAME_UNKNOWN, setup, count)
|
-alloc_netdev(sizeof_priv, name, setup)
+alloc_netdev(sizeof_priv, name, NET_NAME_UNKNOWN, setup)
)
v9: move comments here from the wrong commit
Signed-off-by: Tom Gundersen <teg@jklm.no>
Reviewed-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-14 18:37:24 +04:00
dev = alloc_netdev ( sizeof ( * t ) , name , NET_NAME_UNKNOWN , vti6_dev_setup ) ;
2015-03-29 16:00:04 +03:00
if ( ! dev )
2013-08-19 10:07:34 +04:00
goto failed ;
dev_net_set ( dev , net ) ;
t = netdev_priv ( dev ) ;
t - > parms = * p ;
t - > net = dev_net ( dev ) ;
err = vti6_tnl_create2 ( dev ) ;
if ( err < 0 )
goto failed_free ;
return t ;
failed_free :
net: Fix inconsistent teardown and release of private netdev state.
Network devices can allocate reasources and private memory using
netdev_ops->ndo_init(). However, the release of these resources
can occur in one of two different places.
Either netdev_ops->ndo_uninit() or netdev->destructor().
The decision of which operation frees the resources depends upon
whether it is necessary for all netdev refs to be released before it
is safe to perform the freeing.
netdev_ops->ndo_uninit() presumably can occur right after the
NETDEV_UNREGISTER notifier completes and the unicast and multicast
address lists are flushed.
netdev->destructor(), on the other hand, does not run until the
netdev references all go away.
Further complicating the situation is that netdev->destructor()
almost universally does also a free_netdev().
This creates a problem for the logic in register_netdevice().
Because all callers of register_netdevice() manage the freeing
of the netdev, and invoke free_netdev(dev) if register_netdevice()
fails.
If netdev_ops->ndo_init() succeeds, but something else fails inside
of register_netdevice(), it does call ndo_ops->ndo_uninit(). But
it is not able to invoke netdev->destructor().
This is because netdev->destructor() will do a free_netdev() and
then the caller of register_netdevice() will do the same.
However, this means that the resources that would normally be released
by netdev->destructor() will not be.
Over the years drivers have added local hacks to deal with this, by
invoking their destructor parts by hand when register_netdevice()
fails.
Many drivers do not try to deal with this, and instead we have leaks.
Let's close this hole by formalizing the distinction between what
private things need to be freed up by netdev->destructor() and whether
the driver needs unregister_netdevice() to perform the free_netdev().
netdev->priv_destructor() performs all actions to free up the private
resources that used to be freed by netdev->destructor(), except for
free_netdev().
netdev->needs_free_netdev is a boolean that indicates whether
free_netdev() should be done at the end of unregister_netdevice().
Now, register_netdevice() can sanely release all resources after
ndo_ops->ndo_init() succeeds, by invoking both ndo_ops->ndo_uninit()
and netdev->priv_destructor().
And at the end of unregister_netdevice(), we invoke
netdev->priv_destructor() and optionally call free_netdev().
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-08 19:52:56 +03:00
free_netdev ( dev ) ;
2013-08-19 10:07:34 +04:00
failed :
return NULL ;
}
/**
* vti6_locate - find or create tunnel matching given parameters
* @ net : network namespace
* @ p : tunnel parameters
* @ create : ! = 0 if allowed to create new tunnel if no match found
*
* Description :
* vti6_locate ( ) first tries to locate an existing tunnel
* based on @ parms . If this is unsuccessful , but @ create is set a new
* tunnel device is created and registered for use .
*
* Return :
* matching tunnel or NULL
* */
static struct ip6_tnl * vti6_locate ( struct net * net , struct __ip6_tnl_parm * p ,
int create )
{
const struct in6_addr * remote = & p - > raddr ;
const struct in6_addr * local = & p - > laddr ;
struct ip6_tnl __rcu * * tp ;
struct ip6_tnl * t ;
struct vti6_net * ip6n = net_generic ( net , vti6_net_id ) ;
for ( tp = vti6_tnl_bucket ( ip6n , p ) ;
( t = rtnl_dereference ( * tp ) ) ! = NULL ;
tp = & t - > next ) {
if ( ipv6_addr_equal ( local , & t - > parms . laddr ) & &
2014-09-22 12:07:25 +04:00
ipv6_addr_equal ( remote , & t - > parms . raddr ) ) {
if ( create )
return NULL ;
2013-08-19 10:07:34 +04:00
return t ;
2014-09-22 12:07:25 +04:00
}
2013-08-19 10:07:34 +04:00
}
if ( ! create )
return NULL ;
return vti6_tnl_create ( net , p ) ;
}
/**
 * vti6_dev_uninit - tunnel device uninitializer
 *   @dev: the device to be destroyed
 *
 * Description:
 *   vti6_dev_uninit() removes tunnel from its list
 **/
static void vti6_dev_uninit(struct net_device *dev)
{
	struct ip6_tnl *t = netdev_priv(dev);
	struct vti6_net *ip6n = net_generic(t->net, vti6_net_id);

	/* The fallback device lives in its own single-entry slot; all
	 * other tunnels sit in the hash table.
	 */
	if (dev == ip6n->fb_tnl_dev)
		RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
	else
		vti6_tnl_unlink(ip6n, t);

	/* Drop the tracked device reference held by this tunnel. */
	netdev_put(dev, &t->dev_tracker);
}
2020-04-27 18:59:34 +03:00
/* Receive path for IPsec (ESP/AH/IPcomp) packets addressed to a vti6
 * tunnel.  Looks up the tunnel by the outer addresses, applies input
 * policy and rcv-control checks, then hands the packet to xfrm_input()
 * with the tunnel recorded in the skb control block (consumed later by
 * vti6_rcv_cb()).
 *
 * Returns the xfrm_input() result, -EINVAL when no tunnel matches, or
 * 0 after freeing a discarded packet.
 */
static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
			    int encap_type)
{
	struct ip6_tnl *t;
	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);

	rcu_read_lock();
	t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr);
	if (t) {
		/* Tunnel must carry IPv6 (or be protocol-agnostic). */
		if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) {
			rcu_read_unlock();
			goto discard;
		}

		if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
			rcu_read_unlock();
			goto discard;
		}

		/* Re-read the header pointer: the policy check may have
		 * altered the skb data area — presumably why the reload
		 * was added; confirm against xfrm6_policy_check().
		 */
		ipv6h = ipv6_hdr(skb);
		if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) {
			DEV_STATS_INC(t->dev, rx_dropped);
			rcu_read_unlock();
			goto discard;
		}

		rcu_read_unlock();

		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
		XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
		XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
		return xfrm_input(skb, nexthdr, spi, encap_type);
	}
	rcu_read_unlock();
	return -EINVAL;
discard:
	kfree_skb(skb);
	return 0;
}
2020-04-27 18:59:34 +03:00
static int vti6_rcv ( struct sk_buff * skb )
{
int nexthdr = skb_network_header ( skb ) [ IP6CB ( skb ) - > nhoff ] ;
return vti6_input_proto ( skb , nexthdr , 0 , 0 ) ;
}
2014-03-14 10:28:08 +04:00
/* xfrm receive callback: finalize a decapsulated packet on behalf of
 * the tunnel stashed in the skb by vti6_input_proto().
 *
 * Returns 1 when no tunnel is attached (not ours), 0 on success or on
 * an error already accounted in device stats, -EINVAL/-EPERM on
 * mode/policy failures.
 */
static int vti6_rcv_cb(struct sk_buff *skb, int err)
{
	unsigned short family;
	struct net_device *dev;
	struct xfrm_state *x;
	const struct xfrm_mode *inner_mode;
	struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
	u32 orig_mark = skb->mark;
	int ret;

	if (!t)
		return 1;

	dev = t->dev;

	if (err) {
		DEV_STATS_INC(dev, rx_errors);
		DEV_STATS_INC(dev, rx_dropped);
		return 0;
	}

	x = xfrm_input_state(skb);

	inner_mode = &x->inner_mode;

	/* With a wildcard selector the inner family must be derived from
	 * the decapsulated protocol instead of the state's default mode.
	 */
	if (x->sel.family == AF_UNSPEC) {
		inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
		if (inner_mode == NULL) {
			XFRM_INC_STATS(dev_net(skb->dev),
				       LINUX_MIB_XFRMINSTATEMODEERROR);
			return -EINVAL;
		}
	}

	family = inner_mode->family;

	/* Run the policy check with the tunnel's input key standing in
	 * for the skb mark, then restore the caller's mark.
	 */
	skb->mark = be32_to_cpu(t->parms.i_key);
	ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
	skb->mark = orig_mark;

	if (!ret)
		return -EPERM;

	/* Scrub only when crossing netns boundaries. */
	skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev)));
	skb->dev = dev;
	dev_sw_netstats_rx_add(dev, skb->len);

	return 0;
}
2013-08-19 10:07:34 +04:00
/**
* vti6_addr_conflict - compare packet addresses to tunnel ' s own
* @ t : the outgoing tunnel device
* @ hdr : IPv6 header from the incoming packet
*
* Description :
* Avoid trivial tunneling loop by checking that tunnel exit - point
* doesn ' t match source of incoming packet .
*
* Return :
* 1 if conflict ,
* 0 else
* */
static inline bool
vti6_addr_conflict ( const struct ip6_tnl * t , const struct ipv6hdr * hdr )
{
return ipv6_addr_equal ( & t - > parms . raddr , & hdr - > saddr ) ;
}
2014-03-14 10:28:09 +04:00
static bool vti6_state_check ( const struct xfrm_state * x ,
const struct in6_addr * dst ,
const struct in6_addr * src )
{
xfrm_address_t * daddr = ( xfrm_address_t * ) dst ;
xfrm_address_t * saddr = ( xfrm_address_t * ) src ;
/* if there is no transform then this tunnel is not functional.
* Or if the xfrm is not mode tunnel .
*/
if ( ! x | | x - > props . mode ! = XFRM_MODE_TUNNEL | |
x - > props . family ! = AF_INET6 )
return false ;
if ( ipv6_addr_any ( dst ) )
return xfrm_addr_equal ( saddr , & x - > props . saddr , AF_INET6 ) ;
if ( ! xfrm_state_addr_check ( x , daddr , saddr , AF_INET6 ) )
return false ;
return true ;
}
2013-08-19 10:07:34 +04:00
/**
 * vti6_xmit - send a packet
 *   @skb: the outgoing socket buffer
 *   @dev: the outgoing tunnel device
 *   @fl: the flow informations for the xfrm_lookup
 *
 * Routes the packet when no dst is attached, resolves the xfrm bundle,
 * validates the state against the tunnel endpoints, enforces PMTU and
 * finally transmits via dst_output().  Returns 0 on (attempted)
 * transmission, a negative errno on failure.
 **/
static int
vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
{
	struct ip6_tnl *t = netdev_priv(dev);
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *tdev;
	struct xfrm_state *x;
	int pkt_len = skb->len;	/* remember length before encap for stats */
	int err = -1;
	int mtu;

	if (!dst) {
		/* No cached route: resolve one per address family. */
		switch (skb->protocol) {
		case htons(ETH_P_IP): {
			struct rtable *rt;

			fl->u.ip4.flowi4_oif = dev->ifindex;
			fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
			rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
			if (IS_ERR(rt))
				goto tx_err_link_failure;
			dst = &rt->dst;
			skb_dst_set(skb, dst);
			break;
		}
		case htons(ETH_P_IPV6):
			fl->u.ip6.flowi6_oif = dev->ifindex;
			fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
			dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
			if (dst->error) {
				dst_release(dst);
				dst = NULL;
				goto tx_err_link_failure;
			}
			skb_dst_set(skb, dst);
			break;
		default:
			goto tx_err_link_failure;
		}
	}

	/* xfrm_lookup_route() consumes a reference; take one so the
	 * skb's own dst reference stays valid.
	 */
	dst_hold(dst);
	dst = xfrm_lookup_route(t->net, dst, fl, NULL, 0);
	if (IS_ERR(dst)) {
		err = PTR_ERR(dst);
		dst = NULL;
		goto tx_err_link_failure;
	}

	/* Packets bound for the xfrm hold queue skip the state/MTU
	 * checks.
	 */
	if (dst->flags & DST_XFRM_QUEUE)
		goto xmit;

	x = dst->xfrm;
	if (!vti6_state_check(x, &t->parms.raddr, &t->parms.laddr))
		goto tx_err_link_failure;

	if (!ip6_tnl_xmit_ctl(t, (const struct in6_addr *)&x->props.saddr,
			      (const struct in6_addr *)&x->id.daddr))
		goto tx_err_link_failure;

	tdev = dst->dev;

	/* Routing back out of ourselves would loop forever. */
	if (tdev == dev) {
		DEV_STATS_INC(dev, collisions);
		net_warn_ratelimited("%s: Local routing loop detected!\n",
				     t->parms.name);
		goto tx_err_dst_release;
	}

	mtu = dst_mtu(dst);
	if (skb->len > mtu) {
		skb_dst_update_pmtu_no_confirm(skb, mtu);

		if (skb->protocol == htons(ETH_P_IPV6)) {
			if (mtu < IPV6_MIN_MTU)
				mtu = IPV6_MIN_MTU;

			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		} else {
			/* IPv4 without DF may be fragmented downstream,
			 * so send it anyway.
			 */
			if (!(ip_hdr(skb)->frag_off & htons(IP_DF)))
				goto xmit;
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				      htonl(mtu));
		}

		err = -EMSGSIZE;
		goto tx_err_dst_release;
	}

xmit:
	skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
	skb_dst_set(skb, dst);
	skb->dev = skb_dst(skb)->dev;

	err = dst_output(t->net, skb->sk, skb);
	if (net_xmit_eval(err) == 0)
		err = pkt_len;
	iptunnel_xmit_stats(dev, err);

	return 0;
tx_err_link_failure:
	DEV_STATS_INC(dev, tx_carrier_errors);
	dst_link_failure(skb);
tx_err_dst_release:
	dst_release(dst);
	return err;
}
static netdev_tx_t
vti6_tnl_xmit ( struct sk_buff * skb , struct net_device * dev )
{
struct ip6_tnl * t = netdev_priv ( dev ) ;
2014-03-14 10:28:08 +04:00
struct flowi fl ;
2013-08-19 10:07:34 +04:00
int ret ;
2018-12-31 01:24:36 +03:00
if ( ! pskb_inet_may_pull ( skb ) )
goto tx_err ;
2014-03-14 10:28:08 +04:00
memset ( & fl , 0 , sizeof ( fl ) ) ;
2013-08-19 10:07:34 +04:00
switch ( skb - > protocol ) {
case htons ( ETH_P_IPV6 ) :
2014-03-14 10:28:08 +04:00
if ( ( t - > parms . proto ! = IPPROTO_IPV6 & & t - > parms . proto ! = 0 ) | |
2018-12-31 01:24:36 +03:00
vti6_addr_conflict ( t , ipv6_hdr ( skb ) ) )
2014-03-14 10:28:08 +04:00
goto tx_err ;
memset ( IP6CB ( skb ) , 0 , sizeof ( * IP6CB ( skb ) ) ) ;
2023-10-04 19:09:51 +03:00
xfrm_decode_session ( dev_net ( dev ) , skb , & fl , AF_INET6 ) ;
2014-03-14 10:28:08 +04:00
break ;
case htons ( ETH_P_IP ) :
memset ( IPCB ( skb ) , 0 , sizeof ( * IPCB ( skb ) ) ) ;
2023-10-04 19:09:51 +03:00
xfrm_decode_session ( dev_net ( dev ) , skb , & fl , AF_INET ) ;
2013-08-19 10:07:34 +04:00
break ;
default :
goto tx_err ;
}
2015-05-27 17:16:43 +03:00
/* override mark with tunnel output key */
fl . flowi_mark = be32_to_cpu ( t - > parms . o_key ) ;
2014-03-14 10:28:08 +04:00
ret = vti6_xmit ( skb , dev , & fl ) ;
2013-08-19 10:07:34 +04:00
if ( ret < 0 )
goto tx_err ;
return NETDEV_TX_OK ;
tx_err :
2022-11-15 11:53:57 +03:00
DEV_STATS_INC ( dev , tx_errors ) ;
DEV_STATS_INC ( dev , tx_dropped ) ;
2013-08-19 10:07:34 +04:00
kfree_skb ( skb ) ;
return NETDEV_TX_OK ;
}
2014-03-14 10:28:08 +04:00
/* ICMPv6 error handler for IPsec traffic associated with a vti6
 * tunnel.  Extracts the SPI for the inner ESP/AH/IPcomp header, finds
 * the matching xfrm state (keyed by the tunnel output key as mark) and
 * reacts to PKT_TOOBIG / NDISC_REDIRECT by updating PMTU or the route.
 *
 * Returns -1 when no tunnel matches the addresses, 0 otherwise.
 */
static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
		    u8 type, u8 code, int offset, __be32 info)
{
	__be32 spi;
	__u32 mark;
	struct xfrm_state *x;
	struct ip6_tnl *t;
	struct ip_esp_hdr *esph;
	struct ip_auth_hdr *ah;
	struct ip_comp_hdr *ipch;
	struct net *net = dev_net(skb->dev);
	const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
	int protocol = iph->nexthdr;

	/* ICMP errors quote the original packet, so daddr/saddr are
	 * swapped relative to the receive-side lookup.
	 */
	t = vti6_tnl_lookup(dev_net(skb->dev), &iph->daddr, &iph->saddr);
	if (!t)
		return -1;

	mark = be32_to_cpu(t->parms.o_key);

	switch (protocol) {
	case IPPROTO_ESP:
		esph = (struct ip_esp_hdr *)(skb->data + offset);
		spi = esph->spi;
		break;
	case IPPROTO_AH:
		ah = (struct ip_auth_hdr *)(skb->data + offset);
		spi = ah->spi;
		break;
	case IPPROTO_COMP:
		ipch = (struct ip_comp_hdr *)(skb->data + offset);
		spi = htonl(ntohs(ipch->cpi));
		break;
	default:
		return 0;
	}

	/* Only these two ICMPv6 types require action here. */
	if (type != ICMPV6_PKT_TOOBIG &&
	    type != NDISC_REDIRECT)
		return 0;

	x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
			      spi, protocol, AF_INET6);
	if (!x)
		return 0;

	if (type == NDISC_REDIRECT)
		ip6_redirect(skb, net, skb->dev->ifindex, 0,
			     sock_net_uid(net, NULL));
	else
		ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
	xfrm_state_put(x);

	return 0;
}
2018-03-15 19:17:12 +03:00
static void vti6_link_config ( struct ip6_tnl * t , bool keep_mtu )
2013-08-19 10:07:34 +04:00
{
struct net_device * dev = t - > dev ;
struct __ip6_tnl_parm * p = & t - > parms ;
ip6_vti: adjust vti mtu according to mtu of lower device
LTP/udp6_ipsec_vti tests fail when sending large UDP datagrams over
ip6_vti that require fragmentation and the underlying device has an
MTU smaller than 1500 plus some extra space for headers. This happens
because ip6_vti, by default, sets MTU to ETH_DATA_LEN and not updating
it depending on a destination address or link parameter. Further
attempts to send UDP packets may succeed because pmtu gets updated on
ICMPV6_PKT_TOOBIG in vti6_err().
In case the lower device has larger MTU size, e.g. 9000, ip6_vti works
but not using the possible maximum size, output packets have 1500 limit.
The above cases require manual MTU setup after ip6_vti creation. However
ip_vti already updates MTU based on lower device with ip_tunnel_bind_dev().
Here is the example when the lower device MTU is set to 9000:
# ip a sh ltp_ns_veth2
ltp_ns_veth2@if7: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 9000 ...
inet 10.0.0.2/24 scope global ltp_ns_veth2
inet6 fd00::2/64 scope global
# ip li add vti6 type vti6 local fd00::2 remote fd00::1
# ip li show vti6
vti6@NONE: <POINTOPOINT,NOARP> mtu 1500 ...
link/tunnel6 fd00::2 peer fd00::1
After the patch:
# ip li add vti6 type vti6 local fd00::2 remote fd00::1
# ip li show vti6
vti6@NONE: <POINTOPOINT,NOARP> mtu 8832 ...
link/tunnel6 fd00::2 peer fd00::1
Reported-by: Petr Vorel <pvorel@suse.cz>
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-12-19 16:59:21 +03:00
struct net_device * tdev = NULL ;
vti6: Properly adjust vti6 MTU from MTU of lower device
If a lower device is found, we don't need to subtract
LL_MAX_HEADER to calculate our MTU: just use its MTU, the link
layer headers are already taken into account by it.
If the lower device is not found, start from ETH_DATA_LEN
instead, and only in this case subtract a worst-case
LL_MAX_HEADER.
We then need to subtract our additional IPv6 header from the
calculation.
While at it, note that vti6 doesn't have a hardware header, so
it doesn't need to set dev->hard_header_len. And as
vti6_link_config() now always sets the MTU, there's no need to
set a default value in vti6_dev_setup().
This makes the behaviour consistent with IPv4 vti, after
commit a32452366b72 ("vti4: Don't count header length twice."),
which was accidentally reverted by merge commit f895f0cfbb77
("Merge branch 'master' of
git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec").
While commit 53c81e95df17 ("ip6_vti: adjust vti mtu according to
mtu of lower device") improved on the original situation, this
was still not ideal. As reported in that commit message itself,
if we start from an underlying veth MTU of 9000, we end up with
an MTU of 8832, that is, 9000 - LL_MAX_HEADER - sizeof(ipv6hdr).
This should simply be 8880, or 9000 - sizeof(ipv6hdr) instead:
we found the lower device (veth) and we know we don't have any
additional link layer header, so there's no need to subtract an
hypothetical worst-case number.
Fixes: 53c81e95df17 ("ip6_vti: adjust vti mtu according to mtu of lower device")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2018-03-15 19:17:11 +03:00
int mtu ;
2013-08-19 10:07:34 +04:00
2021-10-12 19:06:34 +03:00
__dev_addr_set ( dev , & p - > laddr , sizeof ( struct in6_addr ) ) ;
2013-08-19 10:07:34 +04:00
memcpy ( dev - > broadcast , & p - > raddr , sizeof ( struct in6_addr ) ) ;
p - > flags & = ~ ( IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV |
IP6_TNL_F_CAP_PER_PACKET ) ;
p - > flags | = ip6_tnl_get_cap ( t , & p - > laddr , & p - > raddr ) ;
if ( p - > flags & IP6_TNL_F_CAP_XMIT & & p - > flags & IP6_TNL_F_CAP_RCV )
dev - > flags | = IFF_POINTOPOINT ;
else
dev - > flags & = ~ IFF_POINTOPOINT ;
ip6_vti: adjust vti mtu according to mtu of lower device
LTP/udp6_ipsec_vti tests fail when sending large UDP datagrams over
ip6_vti that require fragmentation and the underlying device has an
MTU smaller than 1500 plus some extra space for headers. This happens
because ip6_vti, by default, sets MTU to ETH_DATA_LEN and not updating
it depending on a destination address or link parameter. Further
attempts to send UDP packets may succeed because pmtu gets updated on
ICMPV6_PKT_TOOBIG in vti6_err().
In case the lower device has larger MTU size, e.g. 9000, ip6_vti works
but not using the possible maximum size, output packets have 1500 limit.
The above cases require manual MTU setup after ip6_vti creation. However
ip_vti already updates MTU based on lower device with ip_tunnel_bind_dev().
Here is the example when the lower device MTU is set to 9000:
# ip a sh ltp_ns_veth2
ltp_ns_veth2@if7: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 9000 ...
inet 10.0.0.2/24 scope global ltp_ns_veth2
inet6 fd00::2/64 scope global
# ip li add vti6 type vti6 local fd00::2 remote fd00::1
# ip li show vti6
vti6@NONE: <POINTOPOINT,NOARP> mtu 1500 ...
link/tunnel6 fd00::2 peer fd00::1
After the patch:
# ip li add vti6 type vti6 local fd00::2 remote fd00::1
# ip li show vti6
vti6@NONE: <POINTOPOINT,NOARP> mtu 8832 ...
link/tunnel6 fd00::2 peer fd00::1
Reported-by: Petr Vorel <pvorel@suse.cz>
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-12-19 16:59:21 +03:00
2018-03-15 19:17:12 +03:00
if ( keep_mtu & & dev - > mtu ) {
dev - > mtu = clamp ( dev - > mtu , dev - > min_mtu , dev - > max_mtu ) ;
return ;
}
ip6_vti: adjust vti mtu according to mtu of lower device
LTP/udp6_ipsec_vti tests fail when sending large UDP datagrams over
ip6_vti that require fragmentation and the underlying device has an
MTU smaller than 1500 plus some extra space for headers. This happens
because ip6_vti, by default, sets MTU to ETH_DATA_LEN and not updating
it depending on a destination address or link parameter. Further
attempts to send UDP packets may succeed because pmtu gets updated on
ICMPV6_PKT_TOOBIG in vti6_err().
In case the lower device has larger MTU size, e.g. 9000, ip6_vti works
but not using the possible maximum size, output packets have 1500 limit.
The above cases require manual MTU setup after ip6_vti creation. However
ip_vti already updates MTU based on lower device with ip_tunnel_bind_dev().
Here is the example when the lower device MTU is set to 9000:
# ip a sh ltp_ns_veth2
ltp_ns_veth2@if7: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 9000 ...
inet 10.0.0.2/24 scope global ltp_ns_veth2
inet6 fd00::2/64 scope global
# ip li add vti6 type vti6 local fd00::2 remote fd00::1
# ip li show vti6
vti6@NONE: <POINTOPOINT,NOARP> mtu 1500 ...
link/tunnel6 fd00::2 peer fd00::1
After the patch:
# ip li add vti6 type vti6 local fd00::2 remote fd00::1
# ip li show vti6
vti6@NONE: <POINTOPOINT,NOARP> mtu 8832 ...
link/tunnel6 fd00::2 peer fd00::1
Reported-by: Petr Vorel <pvorel@suse.cz>
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-12-19 16:59:21 +03:00
if ( p - > flags & IP6_TNL_F_CAP_XMIT ) {
int strict = ( ipv6_addr_type ( & p - > raddr ) &
( IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL ) ) ;
struct rt6_info * rt = rt6_lookup ( t - > net ,
& p - > raddr , & p - > laddr ,
2018-03-02 19:32:17 +03:00
p - > link , NULL , strict ) ;
ip6_vti: adjust vti mtu according to mtu of lower device
LTP/udp6_ipsec_vti tests fail when sending large UDP datagrams over
ip6_vti that require fragmentation and the underlying device has an
MTU smaller than 1500 plus some extra space for headers. This happens
because ip6_vti, by default, sets MTU to ETH_DATA_LEN and not updating
it depending on a destination address or link parameter. Further
attempts to send UDP packets may succeed because pmtu gets updated on
ICMPV6_PKT_TOOBIG in vti6_err().
In case the lower device has larger MTU size, e.g. 9000, ip6_vti works
but not using the possible maximum size, output packets have 1500 limit.
The above cases require manual MTU setup after ip6_vti creation. However
ip_vti already updates MTU based on lower device with ip_tunnel_bind_dev().
Here is the example when the lower device MTU is set to 9000:
# ip a sh ltp_ns_veth2
ltp_ns_veth2@if7: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 9000 ...
inet 10.0.0.2/24 scope global ltp_ns_veth2
inet6 fd00::2/64 scope global
# ip li add vti6 type vti6 local fd00::2 remote fd00::1
# ip li show vti6
vti6@NONE: <POINTOPOINT,NOARP> mtu 1500 ...
link/tunnel6 fd00::2 peer fd00::1
After the patch:
# ip li add vti6 type vti6 local fd00::2 remote fd00::1
# ip li show vti6
vti6@NONE: <POINTOPOINT,NOARP> mtu 8832 ...
link/tunnel6 fd00::2 peer fd00::1
Reported-by: Petr Vorel <pvorel@suse.cz>
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-12-19 16:59:21 +03:00
if ( rt )
tdev = rt - > dst . dev ;
ip6_rt_put ( rt ) ;
}
if ( ! tdev & & p - > link )
tdev = __dev_get_by_index ( t - > net , p - > link ) ;
if ( tdev )
vti6: Properly adjust vti6 MTU from MTU of lower device
If a lower device is found, we don't need to subtract
LL_MAX_HEADER to calculate our MTU: just use its MTU, the link
layer headers are already taken into account by it.
If the lower device is not found, start from ETH_DATA_LEN
instead, and only in this case subtract a worst-case
LL_MAX_HEADER.
We then need to subtract our additional IPv6 header from the
calculation.
While at it, note that vti6 doesn't have a hardware header, so
it doesn't need to set dev->hard_header_len. And as
vti6_link_config() now always sets the MTU, there's no need to
set a default value in vti6_dev_setup().
This makes the behaviour consistent with IPv4 vti, after
commit a32452366b72 ("vti4: Don't count header length twice."),
which was accidentally reverted by merge commit f895f0cfbb77
("Merge branch 'master' of
git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec").
While commit 53c81e95df17 ("ip6_vti: adjust vti mtu according to
mtu of lower device") improved on the original situation, this
was still not ideal. As reported in that commit message itself,
if we start from an underlying veth MTU of 9000, we end up with
an MTU of 8832, that is, 9000 - LL_MAX_HEADER - sizeof(ipv6hdr).
This should simply be 8880, or 9000 - sizeof(ipv6hdr) instead:
we found the lower device (veth) and we know we don't have any
additional link layer header, so there's no need to subtract an
hypothetical worst-case number.
Fixes: 53c81e95df17 ("ip6_vti: adjust vti mtu according to mtu of lower device")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Acked-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
2018-03-15 19:17:11 +03:00
mtu = tdev - > mtu - sizeof ( struct ipv6hdr ) ;
else
mtu = ETH_DATA_LEN - LL_MAX_HEADER - sizeof ( struct ipv6hdr ) ;
2018-04-26 20:39:09 +03:00
dev - > mtu = max_t ( int , mtu , IPV4_MIN_MTU ) ;
2013-08-19 10:07:34 +04:00
}
/**
* vti6_tnl_change - update the tunnel parameters
* @ t : tunnel to be changed
* @ p : tunnel configuration parameters
2018-03-15 19:17:12 +03:00
* @ keep_mtu : MTU was set from userspace , don ' t re - compute it
2013-08-19 10:07:34 +04:00
*
* Description :
* vti6_tnl_change ( ) updates the tunnel parameters
* */
static int
2018-03-15 19:17:12 +03:00
vti6_tnl_change ( struct ip6_tnl * t , const struct __ip6_tnl_parm * p ,
bool keep_mtu )
2013-08-19 10:07:34 +04:00
{
t - > parms . laddr = p - > laddr ;
t - > parms . raddr = p - > raddr ;
t - > parms . link = p - > link ;
t - > parms . i_key = p - > i_key ;
t - > parms . o_key = p - > o_key ;
t - > parms . proto = p - > proto ;
2017-04-19 19:30:53 +03:00
t - > parms . fwmark = p - > fwmark ;
2016-02-12 17:43:54 +03:00
dst_cache_reset ( & t - > dst_cache ) ;
2018-03-15 19:17:12 +03:00
vti6_link_config ( t , keep_mtu ) ;
2013-08-19 10:07:34 +04:00
return 0 ;
}
2018-03-15 19:17:12 +03:00
/* Apply new parameters to @t.  The tunnel is unlinked from its hash
 * bucket first (with a grace period) so that concurrent RCU lookups
 * never see a half-updated entry, then re-linked under the bucket
 * matching the new addresses.
 */
static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p,
		       bool keep_mtu)
{
	struct net *net = dev_net(t->dev);
	struct vti6_net *ip6n = net_generic(net, vti6_net_id);
	int err;

	vti6_tnl_unlink(ip6n, t);
	synchronize_net();
	err = vti6_tnl_change(t, p, keep_mtu);
	vti6_tnl_link(ip6n, t);
	netdev_state_change(t->dev);
	return err;
}
static void
vti6_parm_from_user ( struct __ip6_tnl_parm * p , const struct ip6_tnl_parm2 * u )
{
p - > laddr = u - > laddr ;
p - > raddr = u - > raddr ;
p - > link = u - > link ;
p - > i_key = u - > i_key ;
p - > o_key = u - > o_key ;
p - > proto = u - > proto ;
memcpy ( p - > name , u - > name , sizeof ( u - > name ) ) ;
}
static void
vti6_parm_to_user ( struct ip6_tnl_parm2 * u , const struct __ip6_tnl_parm * p )
{
u - > laddr = p - > laddr ;
u - > raddr = p - > raddr ;
u - > link = p - > link ;
u - > i_key = p - > i_key ;
u - > o_key = p - > o_key ;
2017-02-24 17:20:32 +03:00
if ( u - > i_key )
u - > i_flags | = GRE_KEY ;
if ( u - > o_key )
u - > o_flags | = GRE_KEY ;
2013-08-19 10:07:34 +04:00
u - > proto = p - > proto ;
memcpy ( u - > name , p - > name , sizeof ( u - > name ) ) ;
}
/**
2021-07-27 16:45:06 +03:00
* vti6_siocdevprivate - configure vti6 tunnels from userspace
2013-08-19 10:07:34 +04:00
* @ dev : virtual device associated with tunnel
2021-07-27 16:45:06 +03:00
* @ ifr : unused
* @ data : parameters passed from userspace
2013-08-19 10:07:34 +04:00
* @ cmd : command to be performed
*
* Description :
2021-07-27 16:45:06 +03:00
* vti6_siocdevprivate ( ) is used for managing vti6 tunnels
2013-08-19 10:07:34 +04:00
* from userspace .
*
* The possible commands are the following :
* % SIOCGETTUNNEL : get tunnel parameters for device
* % SIOCADDTUNNEL : add tunnel matching given tunnel parameters
* % SIOCCHGTUNNEL : change tunnel parameters to those given
* % SIOCDELTUNNEL : delete tunnel
*
* The fallback device " ip6_vti0 " , created during module
* initialization , can be used for creating other tunnel devices .
*
* Return :
* 0 on success ,
* % - EFAULT if unable to copy data to or from userspace ,
* % - EPERM if current process hasn ' t % CAP_NET_ADMIN set
* % - EINVAL if passed tunnel parameters are invalid ,
* % - EEXIST if changing a tunnel ' s parameters would cause a conflict
* % - ENODEV if attempting to change or delete a nonexisting device
* */
static int
2021-07-27 16:45:06 +03:00
vti6_siocdevprivate ( struct net_device * dev , struct ifreq * ifr , void __user * data , int cmd )
2013-08-19 10:07:34 +04:00
{
int err = 0 ;
struct ip6_tnl_parm2 p ;
struct __ip6_tnl_parm p1 ;
struct ip6_tnl * t = NULL ;
struct net * net = dev_net ( dev ) ;
struct vti6_net * ip6n = net_generic ( net , vti6_net_id ) ;
2021-12-23 20:33:16 +03:00
memset ( & p1 , 0 , sizeof ( p1 ) ) ;
2013-08-19 10:07:34 +04:00
switch ( cmd ) {
case SIOCGETTUNNEL :
if ( dev = = ip6n - > fb_tnl_dev ) {
2021-07-27 16:45:06 +03:00
if ( copy_from_user ( & p , data , sizeof ( p ) ) ) {
2013-08-19 10:07:34 +04:00
err = - EFAULT ;
break ;
}
vti6_parm_from_user ( & p1 , & p ) ;
t = vti6_locate ( net , & p1 , 0 ) ;
} else {
memset ( & p , 0 , sizeof ( p ) ) ;
}
2015-03-29 16:00:04 +03:00
if ( ! t )
2013-08-19 10:07:34 +04:00
t = netdev_priv ( dev ) ;
vti6_parm_to_user ( & p , & t - > parms ) ;
2021-07-27 16:45:06 +03:00
if ( copy_to_user ( data , & p , sizeof ( p ) ) )
2013-08-19 10:07:34 +04:00
err = - EFAULT ;
break ;
case SIOCADDTUNNEL :
case SIOCCHGTUNNEL :
err = - EPERM ;
if ( ! ns_capable ( net - > user_ns , CAP_NET_ADMIN ) )
break ;
err = - EFAULT ;
2021-07-27 16:45:06 +03:00
if ( copy_from_user ( & p , data , sizeof ( p ) ) )
2013-08-19 10:07:34 +04:00
break ;
err = - EINVAL ;
if ( p . proto ! = IPPROTO_IPV6 & & p . proto ! = 0 )
break ;
vti6_parm_from_user ( & p1 , & p ) ;
t = vti6_locate ( net , & p1 , cmd = = SIOCADDTUNNEL ) ;
if ( dev ! = ip6n - > fb_tnl_dev & & cmd = = SIOCCHGTUNNEL ) {
2015-03-29 16:00:05 +03:00
if ( t ) {
2013-08-19 10:07:34 +04:00
if ( t - > dev ! = dev ) {
err = - EEXIST ;
break ;
}
} else
t = netdev_priv ( dev ) ;
2018-03-15 19:17:12 +03:00
err = vti6_update ( t , & p1 , false ) ;
2013-08-19 10:07:34 +04:00
}
if ( t ) {
err = 0 ;
vti6_parm_to_user ( & p , & t - > parms ) ;
2021-07-27 16:45:06 +03:00
if ( copy_to_user ( data , & p , sizeof ( p ) ) )
2013-08-19 10:07:34 +04:00
err = - EFAULT ;
} else
err = ( cmd = = SIOCADDTUNNEL ? - ENOBUFS : - ENOENT ) ;
break ;
case SIOCDELTUNNEL :
err = - EPERM ;
if ( ! ns_capable ( net - > user_ns , CAP_NET_ADMIN ) )
break ;
if ( dev = = ip6n - > fb_tnl_dev ) {
err = - EFAULT ;
2021-07-27 16:45:06 +03:00
if ( copy_from_user ( & p , data , sizeof ( p ) ) )
2013-08-19 10:07:34 +04:00
break ;
err = - ENOENT ;
vti6_parm_from_user ( & p1 , & p ) ;
t = vti6_locate ( net , & p1 , 0 ) ;
2015-03-29 16:00:04 +03:00
if ( ! t )
2013-08-19 10:07:34 +04:00
break ;
err = - EPERM ;
if ( t - > dev = = ip6n - > fb_tnl_dev )
break ;
dev = t - > dev ;
}
err = 0 ;
unregister_netdevice ( dev ) ;
break ;
default :
err = - EINVAL ;
}
return err ;
}
static const struct net_device_ops vti6_netdev_ops = {
2014-11-03 11:19:28 +03:00
. ndo_init = vti6_dev_init ,
2013-08-19 10:07:34 +04:00
. ndo_uninit = vti6_dev_uninit ,
. ndo_start_xmit = vti6_tnl_xmit ,
2021-07-27 16:45:06 +03:00
. ndo_siocdevprivate = vti6_siocdevprivate ,
2020-11-07 23:53:53 +03:00
. ndo_get_stats64 = dev_get_tstats64 ,
2015-04-02 18:07:01 +03:00
. ndo_get_iflink = ip6_tnl_get_iflink ,
2013-08-19 10:07:34 +04:00
} ;
/**
* vti6_dev_setup - setup virtual tunnel device
* @ dev : virtual device associated with tunnel
*
* Description :
* Initialize function pointers and device parameters
* */
static void vti6_dev_setup ( struct net_device * dev )
{
dev - > netdev_ops = & vti6_netdev_ops ;
2020-06-30 04:06:23 +03:00
dev - > header_ops = & ip_tunnel_header_ops ;
net: Fix inconsistent teardown and release of private netdev state.
Network devices can allocate reasources and private memory using
netdev_ops->ndo_init(). However, the release of these resources
can occur in one of two different places.
Either netdev_ops->ndo_uninit() or netdev->destructor().
The decision of which operation frees the resources depends upon
whether it is necessary for all netdev refs to be released before it
is safe to perform the freeing.
netdev_ops->ndo_uninit() presumably can occur right after the
NETDEV_UNREGISTER notifier completes and the unicast and multicast
address lists are flushed.
netdev->destructor(), on the other hand, does not run until the
netdev references all go away.
Further complicating the situation is that netdev->destructor()
almost universally does also a free_netdev().
This creates a problem for the logic in register_netdevice().
Because all callers of register_netdevice() manage the freeing
of the netdev, and invoke free_netdev(dev) if register_netdevice()
fails.
If netdev_ops->ndo_init() succeeds, but something else fails inside
of register_netdevice(), it does call ndo_ops->ndo_uninit(). But
it is not able to invoke netdev->destructor().
This is because netdev->destructor() will do a free_netdev() and
then the caller of register_netdevice() will do the same.
However, this means that the resources that would normally be released
by netdev->destructor() will not be.
Over the years drivers have added local hacks to deal with this, by
invoking their destructor parts by hand when register_netdevice()
fails.
Many drivers do not try to deal with this, and instead we have leaks.
Let's close this hole by formalizing the distinction between what
private things need to be freed up by netdev->destructor() and whether
the driver needs unregister_netdevice() to perform the free_netdev().
netdev->priv_destructor() performs all actions to free up the private
resources that used to be freed by netdev->destructor(), except for
free_netdev().
netdev->needs_free_netdev is a boolean that indicates whether
free_netdev() should be done at the end of unregister_netdevice().
Now, register_netdevice() can sanely release all resources after
ndo_ops->ndo_init() succeeds, by invoking both ndo_ops->ndo_uninit()
and netdev->priv_destructor().
And at the end of unregister_netdevice(), we invoke
netdev->priv_destructor() and optionally call free_netdev().
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-08 19:52:56 +03:00
dev - > needs_free_netdev = true ;
dev - > priv_destructor = vti6_dev_free ;
2013-08-19 10:07:34 +04:00
dev - > type = ARPHRD_TUNNEL6 ;
2018-04-26 20:39:09 +03:00
dev - > min_mtu = IPV4_MIN_MTU ;
2018-03-15 19:17:13 +03:00
dev - > max_mtu = IP_MAX_MTU - sizeof ( struct ipv6hdr ) ;
2013-08-19 10:07:34 +04:00
dev - > flags | = IFF_NOARP ;
dev - > addr_len = sizeof ( struct in6_addr ) ;
2014-10-06 05:38:35 +04:00
netif_keep_dst ( dev ) ;
2017-01-26 06:59:18 +03:00
/* This perm addr will be used as interface identifier by IPv6 */
dev - > addr_assign_type = NET_ADDR_RANDOM ;
eth_random_addr ( dev - > perm_addr ) ;
2013-08-19 10:07:34 +04:00
}
/**
* vti6_dev_init_gen - general initializer for all tunnel devices
* @ dev : virtual device associated with tunnel
* */
static inline int vti6_dev_init_gen ( struct net_device * dev )
{
struct ip6_tnl * t = netdev_priv ( dev ) ;
t - > dev = dev ;
t - > net = dev_net ( dev ) ;
2014-02-13 23:46:28 +04:00
dev - > tstats = netdev_alloc_pcpu_stats ( struct pcpu_sw_netstats ) ;
2013-08-19 10:07:34 +04:00
if ( ! dev - > tstats )
return - ENOMEM ;
2022-06-08 07:39:55 +03:00
netdev_hold ( dev , & t - > dev_tracker , GFP_KERNEL ) ;
2013-08-19 10:07:34 +04:00
return 0 ;
}
/**
* vti6_dev_init - initializer for all non fallback tunnel devices
* @ dev : virtual device associated with tunnel
* */
static int vti6_dev_init ( struct net_device * dev )
{
struct ip6_tnl * t = netdev_priv ( dev ) ;
int err = vti6_dev_init_gen ( dev ) ;
if ( err )
return err ;
2018-03-15 19:17:12 +03:00
vti6_link_config ( t , true ) ;
2013-08-19 10:07:34 +04:00
return 0 ;
}
/**
 * vti6_fb_tnl_dev_init - initializer for fallback tunnel device
 * @dev: fallback device
 *
 * Return: 0
 **/
static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev)
{
	struct ip6_tnl *t = netdev_priv(dev);
	struct net *net = dev_net(dev);
	struct vti6_net *ip6n = net_generic(net, vti6_net_id);

	t->parms.proto = IPPROTO_IPV6;

	/* Publish the fallback tunnel in the wildcard slot. */
	rcu_assign_pointer(ip6n->tnls_wc[0], t);
	return 0;
}
2017-06-26 00:56:01 +03:00
/* All vti6 netlink attributes are validated by the policy table alone. */
static int vti6_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	return 0;
}
static void vti6_netlink_parms ( struct nlattr * data [ ] ,
struct __ip6_tnl_parm * parms )
{
memset ( parms , 0 , sizeof ( * parms ) ) ;
if ( ! data )
return ;
if ( data [ IFLA_VTI_LINK ] )
parms - > link = nla_get_u32 ( data [ IFLA_VTI_LINK ] ) ;
if ( data [ IFLA_VTI_LOCAL ] )
2015-03-29 17:59:26 +03:00
parms - > laddr = nla_get_in6_addr ( data [ IFLA_VTI_LOCAL ] ) ;
2013-08-19 10:07:34 +04:00
if ( data [ IFLA_VTI_REMOTE ] )
2015-03-29 17:59:26 +03:00
parms - > raddr = nla_get_in6_addr ( data [ IFLA_VTI_REMOTE ] ) ;
2013-08-19 10:07:34 +04:00
if ( data [ IFLA_VTI_IKEY ] )
parms - > i_key = nla_get_be32 ( data [ IFLA_VTI_IKEY ] ) ;
if ( data [ IFLA_VTI_OKEY ] )
parms - > o_key = nla_get_be32 ( data [ IFLA_VTI_OKEY ] ) ;
2017-04-19 19:30:53 +03:00
if ( data [ IFLA_VTI_FWMARK ] )
parms - > fwmark = nla_get_u32 ( data [ IFLA_VTI_FWMARK ] ) ;
2013-08-19 10:07:34 +04:00
}
static int vti6_newlink ( struct net * src_net , struct net_device * dev ,
2017-06-26 00:55:59 +03:00
struct nlattr * tb [ ] , struct nlattr * data [ ] ,
struct netlink_ext_ack * extack )
2013-08-19 10:07:34 +04:00
{
struct net * net = dev_net ( dev ) ;
struct ip6_tnl * nt ;
nt = netdev_priv ( dev ) ;
vti6_netlink_parms ( data , & nt - > parms ) ;
nt - > parms . proto = IPPROTO_IPV6 ;
if ( vti6_locate ( net , & nt - > parms , 0 ) )
return - EEXIST ;
return vti6_tnl_create2 ( dev ) ;
}
ip_tunnel: the lack of vti_link_ops' dellink() cause kernel panic
Now the vti_link_ops do not point the .dellink, for fb tunnel device
(ip_vti0), the net_device will be removed as the default .dellink is
unregister_netdevice_queue,but the tunnel still in the tunnel list,
then if we add a new vti tunnel, in ip_tunnel_find():
hlist_for_each_entry_rcu(t, head, hash_node) {
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr &&
link == t->parms.link &&
==> type == t->dev->type &&
ip_tunnel_key_match(&t->parms, flags, key))
break;
}
the panic will happen, cause dev of ip_tunnel *t is null:
[ 3835.072977] IP: [<ffffffffa04103fd>] ip_tunnel_find+0x9d/0xc0 [ip_tunnel]
[ 3835.073008] PGD b2c21067 PUD b7277067 PMD 0
[ 3835.073008] Oops: 0000 [#1] SMP
.....
[ 3835.073008] Stack:
[ 3835.073008] ffff8800b72d77f0 ffffffffa0411924 ffff8800bb956000 ffff8800b72d78e0
[ 3835.073008] ffff8800b72d78a0 0000000000000000 ffffffffa040d100 ffff8800b72d7858
[ 3835.073008] ffffffffa040b2e3 0000000000000000 0000000000000000 0000000000000000
[ 3835.073008] Call Trace:
[ 3835.073008] [<ffffffffa0411924>] ip_tunnel_newlink+0x64/0x160 [ip_tunnel]
[ 3835.073008] [<ffffffffa040b2e3>] vti_newlink+0x43/0x70 [ip_vti]
[ 3835.073008] [<ffffffff8150d4da>] rtnl_newlink+0x4fa/0x5f0
[ 3835.073008] [<ffffffff812f68bb>] ? nla_strlcpy+0x5b/0x70
[ 3835.073008] [<ffffffff81508fb0>] ? rtnl_link_ops_get+0x40/0x60
[ 3835.073008] [<ffffffff8150d11f>] ? rtnl_newlink+0x13f/0x5f0
[ 3835.073008] [<ffffffff81509cf4>] rtnetlink_rcv_msg+0xa4/0x270
[ 3835.073008] [<ffffffff8126adf5>] ? sock_has_perm+0x75/0x90
[ 3835.073008] [<ffffffff81509c50>] ? rtnetlink_rcv+0x30/0x30
[ 3835.073008] [<ffffffff81529e39>] netlink_rcv_skb+0xa9/0xc0
[ 3835.073008] [<ffffffff81509c48>] rtnetlink_rcv+0x28/0x30
....
modprobe ip_vti
ip link del ip_vti0 type vti
ip link add ip_vti0 type vti
rmmod ip_vti
do that one or more times, kernel will panic.
fix it by assigning ip_tunnel_dellink to vti_link_ops' dellink, in
which we skip the unregister of fb tunnel device. do the same on ip6_vti.
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-11-23 10:04:11 +03:00
static void vti6_dellink ( struct net_device * dev , struct list_head * head )
{
struct net * net = dev_net ( dev ) ;
struct vti6_net * ip6n = net_generic ( net , vti6_net_id ) ;
if ( dev ! = ip6n - > fb_tnl_dev )
unregister_netdevice_queue ( dev , head ) ;
}
2013-08-19 10:07:34 +04:00
static int vti6_changelink ( struct net_device * dev , struct nlattr * tb [ ] ,
2017-06-26 00:56:00 +03:00
struct nlattr * data [ ] ,
struct netlink_ext_ack * extack )
2013-08-19 10:07:34 +04:00
{
struct ip6_tnl * t ;
struct __ip6_tnl_parm p ;
struct net * net = dev_net ( dev ) ;
struct vti6_net * ip6n = net_generic ( net , vti6_net_id ) ;
if ( dev = = ip6n - > fb_tnl_dev )
return - EINVAL ;
vti6_netlink_parms ( data , & p ) ;
t = vti6_locate ( net , & p , 0 ) ;
if ( t ) {
if ( t - > dev ! = dev )
return - EEXIST ;
} else
t = netdev_priv ( dev ) ;
2018-03-15 19:17:12 +03:00
return vti6_update ( t , & p , tb & & tb [ IFLA_MTU ] ) ;
2013-08-19 10:07:34 +04:00
}
/* Worst-case netlink attribute payload for vti6_fill_info(). */
static size_t vti6_get_size(const struct net_device *dev)
{
	return
		/* IFLA_VTI_LINK */
		nla_total_size(4) +
		/* IFLA_VTI_LOCAL */
		nla_total_size(sizeof(struct in6_addr)) +
		/* IFLA_VTI_REMOTE */
		nla_total_size(sizeof(struct in6_addr)) +
		/* IFLA_VTI_IKEY */
		nla_total_size(4) +
		/* IFLA_VTI_OKEY */
		nla_total_size(4) +
		/* IFLA_VTI_FWMARK */
		nla_total_size(4) +
		0;
}
static int vti6_fill_info ( struct sk_buff * skb , const struct net_device * dev )
{
struct ip6_tnl * tunnel = netdev_priv ( dev ) ;
struct __ip6_tnl_parm * parm = & tunnel - > parms ;
if ( nla_put_u32 ( skb , IFLA_VTI_LINK , parm - > link ) | |
2015-03-29 17:59:25 +03:00
nla_put_in6_addr ( skb , IFLA_VTI_LOCAL , & parm - > laddr ) | |
nla_put_in6_addr ( skb , IFLA_VTI_REMOTE , & parm - > raddr ) | |
2013-08-19 10:07:34 +04:00
nla_put_be32 ( skb , IFLA_VTI_IKEY , parm - > i_key ) | |
2017-04-19 19:30:53 +03:00
nla_put_be32 ( skb , IFLA_VTI_OKEY , parm - > o_key ) | |
nla_put_u32 ( skb , IFLA_VTI_FWMARK , parm - > fwmark ) )
2013-08-19 10:07:34 +04:00
goto nla_put_failure ;
return 0 ;
nla_put_failure :
return - EMSGSIZE ;
}
static const struct nla_policy vti6_policy [ IFLA_VTI_MAX + 1 ] = {
[ IFLA_VTI_LINK ] = { . type = NLA_U32 } ,
[ IFLA_VTI_LOCAL ] = { . len = sizeof ( struct in6_addr ) } ,
[ IFLA_VTI_REMOTE ] = { . len = sizeof ( struct in6_addr ) } ,
[ IFLA_VTI_IKEY ] = { . type = NLA_U32 } ,
[ IFLA_VTI_OKEY ] = { . type = NLA_U32 } ,
2017-04-19 19:30:53 +03:00
[ IFLA_VTI_FWMARK ] = { . type = NLA_U32 } ,
2013-08-19 10:07:34 +04:00
} ;
/* rtnetlink glue for "ip link ... type vti6". */
static struct rtnl_link_ops vti6_link_ops __read_mostly = {
	.kind		= "vti6",
	.maxtype	= IFLA_VTI_MAX,
	.policy		= vti6_policy,
	.priv_size	= sizeof(struct ip6_tnl),
	.setup		= vti6_dev_setup,
	.validate	= vti6_validate,
	.newlink	= vti6_newlink,
	.dellink	= vti6_dellink,
	.changelink	= vti6_changelink,
	.get_size	= vti6_get_size,
	.fill_info	= vti6_fill_info,
	.get_link_net	= ip6_tnl_get_link_net,
};
2017-09-20 02:27:08 +03:00
/* Queue every tunnel of a namespace (hashed entries plus the wildcard
 * fallback slot) for batched unregistration.  Caller holds RTNL.
 */
static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n,
					    struct list_head *list)
{
	int h;
	struct ip6_tnl *t;

	for (h = 0; h < IP6_VTI_HASH_SIZE; h++) {
		t = rtnl_dereference(ip6n->tnls_r_l[h]);
		while (t) {
			unregister_netdevice_queue(t->dev, list);
			t = rtnl_dereference(t->next);
		}
	}

	t = rtnl_dereference(ip6n->tnls_wc[0]);
	if (t)
		unregister_netdevice_queue(t->dev, list);
}
/* Per-namespace init: set up the hash table pointers and, unless the
 * namespace was created without fallback tunnels, register the
 * "ip6_vti0" fallback device.
 */
static int __net_init vti6_init_net(struct net *net)
{
	struct vti6_net *ip6n = net_generic(net, vti6_net_id);
	struct ip6_tnl *t = NULL;
	int err;

	ip6n->tnls[0] = ip6n->tnls_wc;
	ip6n->tnls[1] = ip6n->tnls_r_l;

	if (!net_has_fallback_tunnels(net))
		return 0;

	err = -ENOMEM;
	ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0",
					NET_NAME_UNKNOWN, vti6_dev_setup);
	if (!ip6n->fb_tnl_dev)
		goto err_alloc_dev;

	dev_net_set(ip6n->fb_tnl_dev, net);
	ip6n->fb_tnl_dev->rtnl_link_ops = &vti6_link_ops;

	err = vti6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
	if (err < 0)
		goto err_register;

	err = register_netdev(ip6n->fb_tnl_dev);
	if (err < 0)
		goto err_register;

	t = netdev_priv(ip6n->fb_tnl_dev);

	strcpy(t->parms.name, ip6n->fb_tnl_dev->name);
	return 0;

err_register:
	/* register_netdev() failure paths already ran ndo_uninit and the
	 * priv destructor; only the netdev itself remains to be freed.
	 */
	free_netdev(ip6n->fb_tnl_dev);
err_alloc_dev:
	return err;
}
2017-09-20 02:27:08 +03:00
/* Per-namespace teardown, batched: collect every tunnel of each dying
 * namespace on one list and unregister them in a single RTNL section.
 */
static void __net_exit vti6_exit_batch_net(struct list_head *net_list)
{
	struct vti6_net *ip6n;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		ip6n = net_generic(net, vti6_net_id);
		vti6_destroy_tunnels(ip6n, &list);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
/* Per-network-namespace lifecycle: .init creates the namespace's state
 * (vti6_init_net), .exit_batch tears down all tunnels for a batch of
 * dying namespaces, and .id/.size reserve the per-netns vti6_net slot
 * looked up via net_generic().
 */
static struct pernet_operations vti6_net_ops = {
	.init = vti6_init_net,
	.exit_batch = vti6_exit_batch_net,
	.id   = &vti6_net_id,
	.size = sizeof(struct vti6_net),
};
2014-03-14 10:28:08 +04:00
/* xfrm input hooks for ESP-over-IPv6 packets destined to a vti device.
 * NOTE(review): .priority = 100 — presumably to be tried ahead of the
 * generic xfrm6 ESP handler; confirm against xfrm6_protocol_register()
 * ordering.
 */
static struct xfrm6_protocol vti_esp6_protocol __read_mostly = {
	.handler	=	vti6_rcv,
	.input_handler	=	vti6_input_proto,
	.cb_handler	=	vti6_rcv_cb,
	.err_handler	=	vti6_err,
	.priority	=	100,
};
/* xfrm input hooks for AH-over-IPv6 packets destined to a vti device.
 * Same handler set and priority as the ESP variant above.
 */
static struct xfrm6_protocol vti_ah6_protocol __read_mostly = {
	.handler	=	vti6_rcv,
	.input_handler	=	vti6_input_proto,
	.cb_handler	=	vti6_rcv_cb,
	.err_handler	=	vti6_err,
	.priority	=	100,
};
/* xfrm input hooks for IPComp-over-IPv6 packets destined to a vti
 * device. Same handler set and priority as the ESP variant above.
 */
static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = {
	.handler	=	vti6_rcv,
	.input_handler	=	vti6_input_proto,
	.cb_handler	=	vti6_rcv_cb,
	.err_handler	=	vti6_err,
	.priority	=	100,
};
2020-07-17 10:03:14 +03:00
# if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
2020-07-06 15:01:34 +03:00
static int vti6_rcv_tunnel ( struct sk_buff * skb )
{
const xfrm_address_t * saddr ;
__be32 spi ;
saddr = ( const xfrm_address_t * ) & ipv6_hdr ( skb ) - > saddr ;
spi = xfrm6_tunnel_spi_lookup ( dev_net ( skb - > dev ) , saddr ) ;
return vti6_input_proto ( skb , IPPROTO_IPV6 , spi , 0 ) ;
}
/* Tunnel-mode hooks for IPv6-in-IPv6 (AF_INET6 family) packets on a
 * vti device.
 * NOTE(review): .priority = 0 — ordering relative to other
 * xfrm6_tunnel handlers; verify against xfrm6_tunnel_register().
 */
static struct xfrm6_tunnel vti_ipv6_handler __read_mostly = {
	.handler	=	vti6_rcv_tunnel,
	.cb_handler	=	vti6_rcv_cb,
	.err_handler	=	vti6_err,
	.priority	=	0,
};
2020-07-13 10:42:37 +03:00
/* Tunnel-mode hooks for the AF_INET family registration; shares the
 * same receive/callback/error handlers as vti_ipv6_handler above.
 */
static struct xfrm6_tunnel vti_ip6ip_handler __read_mostly = {
	.handler	=	vti6_rcv_tunnel,
	.cb_handler	=	vti6_rcv_cb,
	.err_handler	=	vti6_err,
	.priority	=	0,
};
2020-07-06 15:01:34 +03:00
# endif
2013-08-19 10:07:34 +04:00
/**
 * vti6_tunnel_init - register protocol and reserve needed resources
 *
 * Registers, in order: the pernet device ops, the ESP/AH/IPComp xfrm
 * protocol handlers, (optionally) the IPv6/IPv4 tunnel handlers, and
 * the rtnl link ops.  On any failure everything already registered is
 * rolled back in reverse order.
 *
 * Return: 0 on success, a negative errno from the step that failed.
 */
static int __init vti6_tunnel_init(void)
{
	const char *msg;
	int err;

	msg = "tunnel device";
	err = register_pernet_device(&vti6_net_ops);
	if (err < 0)
		goto pernet_dev_failed;

	msg = "tunnel protocols";
	err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP);
	if (err < 0)
		goto xfrm_proto_esp_failed;
	err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH);
	if (err < 0)
		goto xfrm_proto_ah_failed;
	err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP);
	if (err < 0)
		goto xfrm_proto_comp_failed;
#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
	msg = "ipv6 tunnel";
	err = xfrm6_tunnel_register(&vti_ipv6_handler, AF_INET6);
	if (err < 0)
		goto vti_tunnel_ipv6_failed;
	err = xfrm6_tunnel_register(&vti_ip6ip_handler, AF_INET);
	if (err < 0)
		goto vti_tunnel_ip6ip_failed;
#endif

	msg = "netlink interface";
	err = rtnl_link_register(&vti6_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	return 0;

rtnl_link_failed:
#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
	/* Fix: do not assign the deregister results to err here — doing
	 * so clobbered the errno from the registration that actually
	 * failed, and the caller would see the wrong (possibly zero)
	 * status.
	 */
	xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET);
vti_tunnel_ip6ip_failed:
	xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6);
vti_tunnel_ipv6_failed:
#endif
	xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP);
xfrm_proto_comp_failed:
	xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
xfrm_proto_ah_failed:
	xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
xfrm_proto_esp_failed:
	unregister_pernet_device(&vti6_net_ops);
pernet_dev_failed:
	pr_err("vti6 init: failed to register %s\n", msg);
	return err;
}
/**
 * vti6_tunnel_cleanup - free resources and unregister protocol
 *
 * Module exit: undo everything vti6_tunnel_init() registered, in the
 * reverse order of registration (link ops, optional tunnel handlers,
 * xfrm protocol handlers, pernet device ops).
 */
static void __exit vti6_tunnel_cleanup(void)
{
	rtnl_link_unregister(&vti6_link_ops);
#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
	xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET);
	xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6);
#endif
	xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP);
	xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
	xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
	unregister_pernet_device(&vti6_net_ops);
}
/* Module entry/exit points plus aliases so userspace tooling can
 * autoload this module when a "vti6" rtnl link type or the "ip6_vti0"
 * device is requested.
 */
module_init(vti6_tunnel_init);
module_exit(vti6_tunnel_cleanup);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("vti6");
MODULE_ALIAS_NETDEV("ip6_vti0");
MODULE_AUTHOR("Steffen Klassert");
MODULE_DESCRIPTION("IPv6 virtual tunnel interface");