2019-05-27 08:55:01 +02:00
// SPDX-License-Identifier: GPL-2.0-or-later
2012-08-10 00:51:50 +00:00
/*
* GRE over IPv6 protocol decoder .
*
* Authors : Dmitry Kozlov ( xeb @ mail . ru )
*/
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
# include <linux/capability.h>
# include <linux/module.h>
# include <linux/types.h>
# include <linux/kernel.h>
# include <linux/slab.h>
# include <linux/uaccess.h>
# include <linux/skbuff.h>
# include <linux/netdevice.h>
# include <linux/in.h>
# include <linux/tcp.h>
# include <linux/udp.h>
# include <linux/if_arp.h>
# include <linux/init.h>
# include <linux/in6.h>
# include <linux/inetdevice.h>
# include <linux/igmp.h>
# include <linux/netfilter_ipv4.h>
# include <linux/etherdevice.h>
# include <linux/if_ether.h>
# include <linux/hash.h>
# include <linux/if_tunnel.h>
# include <linux/ip6_tunnel.h>
# include <net/sock.h>
# include <net/ip.h>
2013-03-25 14:49:35 +00:00
# include <net/ip_tunnels.h>
2012-08-10 00:51:50 +00:00
# include <net/icmp.h>
# include <net/protocol.h>
# include <net/addrconf.h>
# include <net/arp.h>
# include <net/checksum.h>
# include <net/dsfield.h>
# include <net/inet_ecn.h>
# include <net/xfrm.h>
# include <net/net_namespace.h>
# include <net/netns/generic.h>
# include <net/rtnetlink.h>
# include <net/ipv6.h>
# include <net/ip6_fib.h>
# include <net/ip6_route.h>
# include <net/ip6_tunnel.h>
2016-04-29 17:12:17 -07:00
# include <net/gre.h>
2017-11-30 11:51:29 -08:00
# include <net/erspan.h>
2017-12-01 15:26:08 -08:00
# include <net/dst_metadata.h>
2012-08-10 00:51:50 +00:00
2012-09-25 11:02:48 +00:00
/*
 * Module parameter, exposed with mode 0644 and enabled by default.
 * Its value is handed to ip6_tnl_rcv() as the last argument on receive.
 */
static bool log_ecn_error = true ;
module_param ( log_ecn_error , bool , 0644 ) ;
MODULE_PARM_DESC ( log_ecn_error , " Log packets received with corrupted ECN " ) ;
2016-08-10 11:03:35 +02:00
/* log2 of the number of buckets in each tunnel hash table. */
# define IP6_GRE_HASH_SIZE_SHIFT 5
/* Buckets per table; a power of two, so HASH_KEY() can mask with (size - 1). */
# define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT)
2012-08-10 00:51:50 +00:00
netns: make struct pernet_operations::id unsigned int
Make struct pernet_operations::id unsigned.
There are 2 reasons to do so:
1)
This field is really an index into a zero-based array and
thus is an unsigned entity. Using a negative value is an out-of-bounds
access by definition.
2)
On x86_64, unsigned 32-bit data that is mixed with pointers
via array indexing or offsets added to or subtracted from pointers
is preferred to signed 32-bit data.
"int" being used as an array index needs to be sign-extended
to 64-bit before being used.
void f(long *p, int i)
{
g(p[i]);
}
roughly translates to
movsx rsi, esi
mov rdi, [rsi+...]
call g
MOVSX is 3 byte instruction which isn't necessary if the variable is
unsigned because x86_64 is zero extending by default.
Now, there is net_generic() function which, you guessed it right, uses
"int" as an array index:
static inline void *net_generic(const struct net *net, int id)
{
...
ptr = ng->ptr[id - 1];
...
}
And this function is used a lot, so those sign extensions add up.
Patch snipes ~1730 bytes on allyesconfig kernel (without all junk
messing with code generation):
add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
Unfortunately some functions actually grow bigger.
This is a seemingly random artefact of code generation with the register
allocator being used differently. gcc decides that some variable
needs to live in new r8+ registers and every access now requires REX
prefix. Or it is shifted into r12, so [r12+0] addressing mode has to be
used which is longer than [r8]
However, overall balance is in negative direction:
add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
function old new delta
nfsd4_lock 3886 3959 +73
tipc_link_build_proto_msg 1096 1140 +44
mac80211_hwsim_new_radio 2776 2808 +32
tipc_mon_rcv 1032 1058 +26
svcauth_gss_legacy_init 1413 1429 +16
tipc_bcbase_select_primary 379 392 +13
nfsd4_exchange_id 1247 1260 +13
nfsd4_setclientid_confirm 782 793 +11
...
put_client_renew_locked 494 480 -14
ip_set_sockfn_get 730 716 -14
geneve_sock_add 829 813 -16
nfsd4_sequence_done 721 703 -18
nlmclnt_lookup_host 708 686 -22
nfsd4_lockt 1085 1063 -22
nfs_get_client 1077 1050 -27
tcf_bpf_init 1106 1076 -30
nfsd4_encode_fattr 5997 5930 -67
Total: Before=154856051, After=154854321, chg -0.00%
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-17 04:58:21 +03:00
/* Id passed to net_generic() to fetch this module's struct ip6gre_net for a namespace. */
static unsigned int ip6gre_net_id __read_mostly ;
2012-08-10 00:51:50 +00:00
struct ip6gre_net {
2016-08-10 11:03:35 +02:00
struct ip6_tnl __rcu * tunnels [ 4 ] [ IP6_GRE_HASH_SIZE ] ;
2012-08-10 00:51:50 +00:00
2017-12-01 15:26:08 -08:00
struct ip6_tnl __rcu * collect_md_tun ;
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
struct ip6_tnl __rcu * collect_md_tun_erspan ;
2012-08-10 00:51:50 +00:00
struct net_device * fb_tunnel_dev ;
} ;
/*
 * rtnl_link_ops objects for the link types handled by this file, declared
 * ahead of use (ip6gre_link_ops is assigned to dev->rtnl_link_ops in
 * ip6gre_tunnel_locate()).
 */
static struct rtnl_link_ops ip6gre_link_ops __read_mostly ;
2014-04-22 10:15:24 +02:00
static struct rtnl_link_ops ip6gre_tap_ops __read_mostly ;
2017-11-30 11:51:29 -08:00
static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly ;
2012-08-10 00:51:50 +00:00
/* Prototypes for functions that are referenced before their definitions. */
static int ip6gre_tunnel_init ( struct net_device * dev ) ;
static void ip6gre_tunnel_setup ( struct net_device * dev ) ;
static void ip6gre_tunnel_link ( struct ip6gre_net * ign , struct ip6_tnl * t ) ;
static void ip6gre_tnl_link_config ( struct ip6_tnl * t , int set_mtu ) ;
2018-05-17 16:36:51 +02:00
static void ip6erspan_tnl_link_config ( struct ip6_tnl * t , int set_mtu ) ;
2012-08-10 00:51:50 +00:00
/* Tunnel hash table */
/*
4 hash tables :
3 : ( remote , local )
2 : ( remote , * )
1 : ( * , local )
0 : ( * , * )
We require an exact key match, i.e. if a key is present in the packet
it will match only a tunnel with the same key; if it is not present,
it will match only a keyless tunnel.
All keyless packets that do not match a configured keyless tunnel
will match the fallback tunnel.
*/
2016-08-10 11:03:35 +02:00
/*
 * Fold a GRE key into a bucket index: XOR the key with itself shifted right
 * by four bits, then mask with IP6_GRE_HASH_SIZE - 1.
 *
 * The argument is parenthesised at each use so that an expression argument
 * (for example "a | b") is cast and shifted as a whole.  Note that the
 * argument is still evaluated twice, so it must be free of side effects.
 */
#define HASH_KEY(key) \
	((((__force u32)(key)) ^ (((__force u32)(key)) >> 4)) & (IP6_GRE_HASH_SIZE - 1))
2012-08-10 00:51:50 +00:00
static u32 HASH_ADDR ( const struct in6_addr * addr )
{
u32 hash = ipv6_addr_hash ( addr ) ;
2016-08-10 11:03:35 +02:00
return hash_32 ( hash , IP6_GRE_HASH_SIZE_SHIFT ) ;
2012-08-10 00:51:50 +00:00
}
/*
 * Shorthands for the four rows of ip6gre_net.tunnels, used as
 * ign->tunnels_xxx[bucket]:
 *   r_l = (remote, local), r = (remote, *), l = (*, local), wc = (*, *).
 */
# define tunnels_r_l tunnels[3]
# define tunnels_r tunnels[2]
# define tunnels_l tunnels[1]
# define tunnels_wc tunnels[0]
/* Given src, dst and key, find appropriate for input tunnel. */
static struct ip6_tnl * ip6gre_tunnel_lookup ( struct net_device * dev ,
const struct in6_addr * remote , const struct in6_addr * local ,
__be32 key , __be16 gre_proto )
{
struct net * net = dev_net ( dev ) ;
int link = dev - > ifindex ;
unsigned int h0 = HASH_ADDR ( remote ) ;
unsigned int h1 = HASH_KEY ( key ) ;
struct ip6_tnl * t , * cand = NULL ;
struct ip6gre_net * ign = net_generic ( net , ip6gre_net_id ) ;
2017-11-30 11:51:29 -08:00
int dev_type = ( gre_proto = = htons ( ETH_P_TEB ) | |
2018-03-09 07:34:40 -08:00
gre_proto = = htons ( ETH_P_ERSPAN ) | |
gre_proto = = htons ( ETH_P_ERSPAN2 ) ) ?
2012-08-10 00:51:50 +00:00
ARPHRD_ETHER : ARPHRD_IP6GRE ;
int score , cand_score = 4 ;
2020-06-16 16:04:00 +00:00
struct net_device * ndev ;
2012-08-10 00:51:50 +00:00
2012-11-11 21:52:34 +00:00
for_each_ip_tunnel_rcu ( t , ign - > tunnels_r_l [ h0 ^ h1 ] ) {
2012-08-10 00:51:50 +00:00
if ( ! ipv6_addr_equal ( local , & t - > parms . laddr ) | |
! ipv6_addr_equal ( remote , & t - > parms . raddr ) | |
key ! = t - > parms . i_key | |
! ( t - > dev - > flags & IFF_UP ) )
continue ;
if ( t - > dev - > type ! = ARPHRD_IP6GRE & &
t - > dev - > type ! = dev_type )
continue ;
score = 0 ;
if ( t - > parms . link ! = link )
score | = 1 ;
if ( t - > dev - > type ! = dev_type )
score | = 2 ;
if ( score = = 0 )
return t ;
if ( score < cand_score ) {
cand = t ;
cand_score = score ;
}
}
2012-11-11 21:52:34 +00:00
for_each_ip_tunnel_rcu ( t , ign - > tunnels_r [ h0 ^ h1 ] ) {
2012-08-10 00:51:50 +00:00
if ( ! ipv6_addr_equal ( remote , & t - > parms . raddr ) | |
key ! = t - > parms . i_key | |
! ( t - > dev - > flags & IFF_UP ) )
continue ;
if ( t - > dev - > type ! = ARPHRD_IP6GRE & &
t - > dev - > type ! = dev_type )
continue ;
score = 0 ;
if ( t - > parms . link ! = link )
score | = 1 ;
if ( t - > dev - > type ! = dev_type )
score | = 2 ;
if ( score = = 0 )
return t ;
if ( score < cand_score ) {
cand = t ;
cand_score = score ;
}
}
2012-11-11 21:52:34 +00:00
for_each_ip_tunnel_rcu ( t , ign - > tunnels_l [ h1 ] ) {
2012-08-10 00:51:50 +00:00
if ( ( ! ipv6_addr_equal ( local , & t - > parms . laddr ) & &
( ! ipv6_addr_equal ( local , & t - > parms . raddr ) | |
! ipv6_addr_is_multicast ( local ) ) ) | |
key ! = t - > parms . i_key | |
! ( t - > dev - > flags & IFF_UP ) )
continue ;
if ( t - > dev - > type ! = ARPHRD_IP6GRE & &
t - > dev - > type ! = dev_type )
continue ;
score = 0 ;
if ( t - > parms . link ! = link )
score | = 1 ;
if ( t - > dev - > type ! = dev_type )
score | = 2 ;
if ( score = = 0 )
return t ;
if ( score < cand_score ) {
cand = t ;
cand_score = score ;
}
}
2012-11-11 21:52:34 +00:00
for_each_ip_tunnel_rcu ( t , ign - > tunnels_wc [ h1 ] ) {
2012-08-10 00:51:50 +00:00
if ( t - > parms . i_key ! = key | |
! ( t - > dev - > flags & IFF_UP ) )
continue ;
if ( t - > dev - > type ! = ARPHRD_IP6GRE & &
t - > dev - > type ! = dev_type )
continue ;
score = 0 ;
if ( t - > parms . link ! = link )
score | = 1 ;
if ( t - > dev - > type ! = dev_type )
score | = 2 ;
if ( score = = 0 )
return t ;
if ( score < cand_score ) {
cand = t ;
cand_score = score ;
}
}
2015-03-29 14:00:05 +01:00
if ( cand )
2012-08-10 00:51:50 +00:00
return cand ;
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
if ( gre_proto = = htons ( ETH_P_ERSPAN ) | |
gre_proto = = htons ( ETH_P_ERSPAN2 ) )
t = rcu_dereference ( ign - > collect_md_tun_erspan ) ;
else
t = rcu_dereference ( ign - > collect_md_tun ) ;
2017-12-01 15:26:08 -08:00
if ( t & & t - > dev - > flags & IFF_UP )
return t ;
2020-06-16 16:04:00 +00:00
ndev = READ_ONCE ( ign - > fb_tunnel_dev ) ;
if ( ndev & & ndev - > flags & IFF_UP )
return netdev_priv ( ndev ) ;
2012-08-10 00:51:50 +00:00
return NULL ;
}
/*
 * Return the address of the hash-chain head that tunnel parameters @p
 * belong to.
 *
 * The table is chosen by which endpoints are set: bit 0 for a non-any
 * local address, bit 1 for a remote address that is neither any nor
 * multicast.  The bucket is HASH_KEY(i_key), XORed with HASH_ADDR(remote)
 * only when the remote address takes part in table selection.
 */
static struct ip6_tnl __rcu **__ip6gre_bucket(struct ip6gre_net *ign,
		const struct __ip6_tnl_parm *p)
{
	unsigned int bucket = HASH_KEY(p->i_key);
	int table = 0;

	if (!ipv6_addr_any(&p->laddr))
		table |= 1;

	if (!(ipv6_addr_any(&p->raddr) || ipv6_addr_is_multicast(&p->raddr))) {
		table |= 2;
		bucket ^= HASH_ADDR(&p->raddr);
	}

	return &ign->tunnels[table][bucket];
}
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails because the 2nd command tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate functions {ip6gre,ip6erspan}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
static void ip6gre_tunnel_link_md ( struct ip6gre_net * ign , struct ip6_tnl * t )
{
if ( t - > parms . collect_md )
rcu_assign_pointer ( ign - > collect_md_tun , t ) ;
}
static void ip6erspan_tunnel_link_md ( struct ip6gre_net * ign , struct ip6_tnl * t )
{
if ( t - > parms . collect_md )
rcu_assign_pointer ( ign - > collect_md_tun_erspan , t ) ;
}
static void ip6gre_tunnel_unlink_md ( struct ip6gre_net * ign , struct ip6_tnl * t )
{
if ( t - > parms . collect_md )
rcu_assign_pointer ( ign - > collect_md_tun , NULL ) ;
}
static void ip6erspan_tunnel_unlink_md ( struct ip6gre_net * ign ,
struct ip6_tnl * t )
{
if ( t - > parms . collect_md )
rcu_assign_pointer ( ign - > collect_md_tun_erspan , NULL ) ;
}
2012-08-10 00:51:50 +00:00
/* Hash-chain head for an existing tunnel, derived from its own parameters. */
static inline struct ip6_tnl __rcu **ip6gre_bucket(struct ip6gre_net *ign,
		const struct ip6_tnl *t)
{
	const struct __ip6_tnl_parm *parms = &t->parms;

	return __ip6gre_bucket(ign, parms);
}
static void ip6gre_tunnel_link ( struct ip6gre_net * ign , struct ip6_tnl * t )
{
struct ip6_tnl __rcu * * tp = ip6gre_bucket ( ign , t ) ;
rcu_assign_pointer ( t - > next , rtnl_dereference ( * tp ) ) ;
rcu_assign_pointer ( * tp , t ) ;
}
static void ip6gre_tunnel_unlink ( struct ip6gre_net * ign , struct ip6_tnl * t )
{
struct ip6_tnl __rcu * * tp ;
struct ip6_tnl * iter ;
for ( tp = ip6gre_bucket ( ign , t ) ;
( iter = rtnl_dereference ( * tp ) ) ! = NULL ;
tp = & iter - > next ) {
if ( t = = iter ) {
rcu_assign_pointer ( * tp , t - > next ) ;
break ;
}
}
}
static struct ip6_tnl * ip6gre_tunnel_find ( struct net * net ,
const struct __ip6_tnl_parm * parms ,
int type )
{
const struct in6_addr * remote = & parms - > raddr ;
const struct in6_addr * local = & parms - > laddr ;
__be32 key = parms - > i_key ;
int link = parms - > link ;
struct ip6_tnl * t ;
struct ip6_tnl __rcu * * tp ;
struct ip6gre_net * ign = net_generic ( net , ip6gre_net_id ) ;
for ( tp = __ip6gre_bucket ( ign , parms ) ;
( t = rtnl_dereference ( * tp ) ) ! = NULL ;
tp = & t - > next )
if ( ipv6_addr_equal ( local , & t - > parms . laddr ) & &
ipv6_addr_equal ( remote , & t - > parms . raddr ) & &
key = = t - > parms . i_key & &
link = = t - > parms . link & &
type = = t - > dev - > type )
break ;
return t ;
}
static struct ip6_tnl * ip6gre_tunnel_locate ( struct net * net ,
const struct __ip6_tnl_parm * parms , int create )
{
struct ip6_tnl * t , * nt ;
struct net_device * dev ;
char name [ IFNAMSIZ ] ;
struct ip6gre_net * ign = net_generic ( net , ip6gre_net_id ) ;
t = ip6gre_tunnel_find ( net , parms , ARPHRD_IP6GRE ) ;
2014-09-22 10:07:26 +02:00
if ( t & & create )
return NULL ;
2012-08-10 00:51:50 +00:00
if ( t | | ! create )
return t ;
2018-04-05 06:39:29 -07:00
if ( parms - > name [ 0 ] ) {
if ( ! dev_valid_name ( parms - > name ) )
return NULL ;
2022-08-18 23:02:20 +02:00
strscpy ( name , parms - > name , IFNAMSIZ ) ;
2018-04-05 06:39:29 -07:00
} else {
2012-08-10 00:51:50 +00:00
strcpy ( name , " ip6gre%d " ) ;
2018-04-05 06:39:29 -07:00
}
net: set name_assign_type in alloc_netdev()
Extend alloc_netdev{,_mq{,s}}() to take name_assign_type as argument, and convert
all users to pass NET_NAME_UNKNOWN.
Coccinelle patch:
@@
expression sizeof_priv, name, setup, txqs, rxqs, count;
@@
(
-alloc_netdev_mqs(sizeof_priv, name, setup, txqs, rxqs)
+alloc_netdev_mqs(sizeof_priv, name, NET_NAME_UNKNOWN, setup, txqs, rxqs)
|
-alloc_netdev_mq(sizeof_priv, name, setup, count)
+alloc_netdev_mq(sizeof_priv, name, NET_NAME_UNKNOWN, setup, count)
|
-alloc_netdev(sizeof_priv, name, setup)
+alloc_netdev(sizeof_priv, name, NET_NAME_UNKNOWN, setup)
)
v9: move comments here from the wrong commit
Signed-off-by: Tom Gundersen <teg@jklm.no>
Reviewed-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-14 16:37:24 +02:00
dev = alloc_netdev ( sizeof ( * t ) , name , NET_NAME_UNKNOWN ,
ip6gre_tunnel_setup ) ;
2012-08-10 00:51:50 +00:00
if ( ! dev )
return NULL ;
dev_net_set ( dev , net ) ;
nt = netdev_priv ( dev ) ;
nt - > parms = * parms ;
dev - > rtnl_link_ops = & ip6gre_link_ops ;
nt - > dev = dev ;
2013-08-13 17:51:12 +02:00
nt - > net = dev_net ( dev ) ;
2012-08-10 00:51:50 +00:00
if ( register_netdevice ( dev ) < 0 )
goto failed_free ;
ip6_gre: init dev->mtu and dev->hard_header_len correctly
Commit b05229f44228 ("gre6: Cleanup GREv6 transmit path,
call common GRE functions") moved dev->mtu initialization
from ip6gre_tunnel_setup() to ip6gre_tunnel_init(), as a
result, the previously set values, before ndo_init(), are
reset in the following cases:
* rtnl_create_link() can update dev->mtu from IFLA_MTU
parameter.
* ip6gre_tnl_link_config() is invoked before ndo_init() in
netlink and ioctl setup, so ndo_init() can reset MTU
adjustments with the lower device MTU as well, dev->mtu
and dev->hard_header_len.
Not applicable for ip6gretap because it has one more call
to ip6gre_tnl_link_config(tunnel, 1) in ip6gre_tap_init().
Fix the first case by updating dev->mtu with 'tb[IFLA_MTU]'
parameter if a user sets it manually on a device creation,
and fix the second one by moving ip6gre_tnl_link_config()
call after register_netdevice().
Fixes: b05229f44228 ("gre6: Cleanup GREv6 transmit path, call common GRE functions")
Fixes: db2ec95d1ba4 ("ip6_gre: Fix MTU setting")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-01-18 20:51:12 +03:00
ip6gre_tnl_link_config ( nt , 1 ) ;
2012-08-10 00:51:50 +00:00
ip6gre_tunnel_link ( ign , nt ) ;
return nt ;
failed_free :
free_netdev ( dev ) ;
return NULL ;
}
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails because the 2nd command tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate functions {ip6gre,ip6erspan}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
static void ip6erspan_tunnel_uninit ( struct net_device * dev )
{
struct ip6_tnl * t = netdev_priv ( dev ) ;
struct ip6gre_net * ign = net_generic ( t - > net , ip6gre_net_id ) ;
ip6erspan_tunnel_unlink_md ( ign , t ) ;
ip6gre_tunnel_unlink ( ign , t ) ;
dst_cache_reset ( & t - > dst_cache ) ;
2022-06-07 21:39:55 -07:00
netdev_put ( dev , & t - > dev_tracker ) ;
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
}
2012-08-10 00:51:50 +00:00
static void ip6gre_tunnel_uninit ( struct net_device * dev )
{
2014-04-22 10:15:24 +02:00
struct ip6_tnl * t = netdev_priv ( dev ) ;
struct ip6gre_net * ign = net_generic ( t - > net , ip6gre_net_id ) ;
2012-08-10 00:51:50 +00:00
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
ip6gre_tunnel_unlink_md ( ign , t ) ;
2014-04-22 10:15:24 +02:00
ip6gre_tunnel_unlink ( ign , t ) ;
2020-06-16 16:04:00 +00:00
if ( ign - > fb_tunnel_dev = = dev )
WRITE_ONCE ( ign - > fb_tunnel_dev , NULL ) ;
2016-02-12 15:43:54 +01:00
dst_cache_reset ( & t - > dst_cache ) ;
2022-06-07 21:39:55 -07:00
netdev_put ( dev , & t - > dev_tracker ) ;
2012-08-10 00:51:50 +00:00
}
2018-11-08 12:19:21 +01:00
/*
 * ICMPv6 error handler for GRE-over-IPv6.
 *
 * Parses the GRE header found at @offset in @skb, then looks up the tunnel
 * using the addresses at skb->data: daddr as the remote and saddr as the
 * local endpoint.
 *
 * Returns -EINVAL if the GRE header cannot be parsed, -ENOENT if no tunnel
 * matches, and 0 otherwise.
 *
 * The tunnel's error counter and timestamp are updated only for:
 *   - ICMPV6_DEST_UNREACH with a code other than ICMPV6_PORT_UNREACH,
 *   - ICMPV6_TIME_EXCEED with code ICMPV6_EXC_HOPLIMIT,
 *   - ICMPv6 types not listed in the switch.
 * All other cases return right after their specific handling.
 */
static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
		      u8 type, u8 code, int offset, __be32 info)
{
	struct net *net = dev_net(skb->dev);
	const struct ipv6hdr *ipv6h;
	struct tnl_ptk_info tpi;
	struct ip6_tnl *t;

	if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6),
			     offset) < 0)
		return -EINVAL;

	ipv6h = (const struct ipv6hdr *)skb->data;
	t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
				 tpi.key, tpi.proto);
	if (!t)
		return -ENOENT;

	switch (type) {
	case ICMPV6_DEST_UNREACH:
		net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
				    t->parms.name);
		if (code != ICMPV6_PORT_UNREACH)
			break;
		return 0;
	case ICMPV6_TIME_EXCEED:
		if (code == ICMPV6_EXC_HOPLIMIT) {
			net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
					    t->parms.name);
			break;
		}
		return 0;
	case ICMPV6_PARAMPROB: {
		struct ipv6_tlv_tnl_enc_lim *tel;
		__u32 teli;

		teli = 0;
		if (code == ICMPV6_HDR_FIELD)
			teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data);

		/*
		 * Only treat this as an encapsulation-limit problem when a
		 * limit option was found and @info equals its offset plus 2.
		 */
		if (teli && teli == be32_to_cpu(info) - 2) {
			tel = (struct ipv6_tlv_tnl_enc_lim *)&skb->data[teli];
			if (tel->encap_limit == 0) {
				net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n",
						    t->parms.name);
			}
		} else {
			net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
					    t->parms.name);
		}
		return 0;
	}
	case ICMPV6_PKT_TOOBIG:
		ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
		return 0;
	case NDISC_REDIRECT:
		ip6_redirect(skb, net, skb->dev->ifindex, 0,
			     sock_net_uid(net, NULL));
		return 0;
	default:
		/* Unlisted types are only counted below. */
		break;
	}

	/* Extend the current error burst if still inside the window, else start a new one. */
	if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;

	return 0;
}
2016-04-29 17:12:17 -07:00
static int ip6gre_rcv ( struct sk_buff * skb , const struct tnl_ptk_info * tpi )
2012-08-10 00:51:50 +00:00
{
const struct ipv6hdr * ipv6h ;
struct ip6_tnl * tunnel ;
ipv6h = ipv6_hdr ( skb ) ;
tunnel = ip6gre_tunnel_lookup ( skb - > dev ,
2016-04-29 17:12:17 -07:00
& ipv6h - > saddr , & ipv6h - > daddr , tpi - > key ,
tpi - > proto ) ;
2012-08-10 00:51:50 +00:00
if ( tunnel ) {
2017-12-01 15:26:08 -08:00
if ( tunnel - > parms . collect_md ) {
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
IP_TUNNEL_DECLARE_FLAGS ( flags ) ;
2017-12-01 15:26:08 -08:00
struct metadata_dst * tun_dst ;
__be64 tun_id ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
ip_tunnel_flags_copy ( flags , tpi - > flags ) ;
2017-12-01 15:26:08 -08:00
tun_id = key32_to_tunnel_id ( tpi - > key ) ;
tun_dst = ipv6_tun_rx_dst ( skb , flags , tun_id , 0 ) ;
if ( ! tun_dst )
return PACKET_REJECT ;
ip6_tnl_rcv ( tunnel , skb , tpi , tun_dst , log_ecn_error ) ;
} else {
ip6_tnl_rcv ( tunnel , skb , tpi , NULL , log_ecn_error ) ;
}
2012-08-10 00:51:50 +00:00
2016-04-29 17:12:17 -07:00
return PACKET_RCVD ;
}
2012-09-25 11:02:48 +00:00
2016-04-29 17:12:17 -07:00
return PACKET_REJECT ;
}
2012-09-25 11:02:48 +00:00
2019-01-16 19:38:05 +01:00
/*
 * Receive path for ERSPAN frames arriving over an ip6gre tunnel.
 *
 * @skb:         received packet; skb->data is read as the ERSPAN base header
 * @tpi:         GRE header information supplied by the caller (key, proto,
 *               flags); the KEY flag bit is set on it in collect_md mode
 * @gre_hdr_len: GRE header length, used to locate the ERSPAN metadata
 *               relative to the end of the network header
 *
 * Returns PACKET_RCVD once the skb has been passed to ip6_tnl_rcv() (its
 * return value is not inspected here). Returns PACKET_REJECT when the base
 * header cannot be pulled, ip6gre_tunnel_lookup() finds no tunnel, the
 * version-dependent ERSPAN header cannot be pulled, or ipv6_tun_rx_dst()
 * returns NULL.
 */
static int ip6erspan_rcv ( struct sk_buff * skb ,
2019-04-06 17:16:53 +02:00
struct tnl_ptk_info * tpi ,
int gre_hdr_len )
2017-11-30 11:51:29 -08:00
{
2017-12-13 16:38:55 -08:00
struct erspan_base_hdr * ershdr ;
2017-11-30 11:51:29 -08:00
const struct ipv6hdr * ipv6h ;
2018-02-05 13:35:34 -08:00
struct erspan_md2 * md2 ;
2017-11-30 11:51:29 -08:00
struct ip6_tnl * tunnel ;
2017-12-13 16:38:55 -08:00
u8 ver ;
2017-11-30 11:51:29 -08:00
2024-03-28 11:22:48 +00:00
/* The base header must be available before its version field is read. */
if ( unlikely ( ! pskb_may_pull ( skb , sizeof ( * ershdr ) ) ) )
return PACKET_REJECT ;
2017-12-20 09:53:19 +08:00
ipv6h = ipv6_hdr ( skb ) ;
ershdr = ( struct erspan_base_hdr * ) skb - > data ;
2018-01-25 13:20:09 -08:00
ver = ershdr - > ver ;
2017-11-30 11:51:29 -08:00
tunnel = ip6gre_tunnel_lookup ( skb - > dev ,
& ipv6h - > saddr , & ipv6h - > daddr , tpi - > key ,
tpi - > proto ) ;
if ( tunnel ) {
2017-12-13 16:38:55 -08:00
/* Full ERSPAN header length is derived from the version. */
int len = erspan_hdr_len ( ver ) ;
if ( unlikely ( ! pskb_may_pull ( skb , len ) ) )
2017-12-15 14:27:43 -08:00
return PACKET_REJECT ;
2017-12-13 16:38:55 -08:00
if ( __iptunnel_pull_header ( skb , len ,
2017-11-30 11:51:29 -08:00
htons ( ETH_P_TEB ) ,
false , false ) < 0 )
return PACKET_REJECT ;
2017-12-05 15:15:44 -08:00
if ( tunnel - > parms . collect_md ) {
2019-04-06 17:16:53 +02:00
struct erspan_metadata * pkt_md , * md ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
IP_TUNNEL_DECLARE_FLAGS ( flags ) ;
2017-12-05 15:15:44 -08:00
struct metadata_dst * tun_dst ;
struct ip_tunnel_info * info ;
2019-04-06 17:16:53 +02:00
unsigned char * gh ;
2017-12-05 15:15:44 -08:00
__be64 tun_id ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
/* Set the KEY bit on tpi->flags, then copy those flags for the metadata dst. */
__set_bit ( IP_TUNNEL_KEY_BIT , tpi - > flags ) ;
ip_tunnel_flags_copy ( flags , tpi - > flags ) ;
2017-12-05 15:15:44 -08:00
tun_id = key32_to_tunnel_id ( tpi - > key ) ;
tun_dst = ipv6_tun_rx_dst ( skb , flags , tun_id ,
sizeof ( * md ) ) ;
if ( ! tun_dst )
return PACKET_REJECT ;
2019-04-06 17:16:53 +02:00
/* skb can be uncloned in __iptunnel_pull_header, so
* old pkt_md is no longer valid and we need to reset
* it
*/
gh = skb_network_header ( skb ) +
skb_network_header_len ( skb ) ;
pkt_md = ( struct erspan_metadata * ) ( gh + gre_hdr_len +
sizeof ( * ershdr ) ) ;
2017-12-05 15:15:44 -08:00
info = & tun_dst - > u . tun_info ;
md = ip_tunnel_info_opts ( info ) ;
2017-12-13 16:38:57 -08:00
md - > version = ver ;
2018-02-05 13:35:34 -08:00
md2 = & md - > u . md2 ;
/* Copy ERSPAN_V1_MDSIZE bytes for version 1, ERSPAN_V2_MDSIZE otherwise. */
memcpy ( md2 , pkt_md , ver = = 1 ? ERSPAN_V1_MDSIZE :
ERSPAN_V2_MDSIZE ) ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
__set_bit ( IP_TUNNEL_ERSPAN_OPT_BIT ,
info - > key . tun_flags ) ;
2017-12-05 15:15:44 -08:00
info - > options_len = sizeof ( * md ) ;
ip6_tnl_rcv ( tunnel , skb , tpi , tun_dst , log_ecn_error ) ;
} else {
ip6_tnl_rcv ( tunnel , skb , tpi , NULL , log_ecn_error ) ;
}
2017-11-30 11:51:29 -08:00
return PACKET_RCVD ;
}
return PACKET_REJECT ;
}
2016-04-29 17:12:17 -07:00
/*
 * Receive handler for GRE packets.
 *
 * Parses the GRE header with gre_parse_header(), pulls it off the skb and
 * dispatches on the inner protocol: ETH_P_ERSPAN and ETH_P_ERSPAN2 go to
 * ip6erspan_rcv(), everything else to ip6gre_rcv(). When the selected
 * handler does not return PACKET_RCVD, an ICMPv6 destination-unreachable /
 * port-unreachable message is sent and the skb is freed. A header parse or
 * pull failure frees the skb without sending ICMPv6.
 *
 * csum_err is filled in by gre_parse_header() but not read afterwards in
 * this function.
 *
 * Always returns 0.
 */
static int gre_rcv ( struct sk_buff * skb )
{
struct tnl_ptk_info tpi ;
bool csum_err = false ;
int hdr_len ;
2012-09-25 11:02:48 +00:00
2016-06-15 06:24:00 -07:00
hdr_len = gre_parse_header ( skb , & tpi , & csum_err , htons ( ETH_P_IPV6 ) , 0 ) ;
2016-05-03 15:00:21 +02:00
if ( hdr_len < 0 )
2016-04-29 17:12:17 -07:00
goto drop ;
2012-08-10 00:51:50 +00:00
2016-04-29 17:12:17 -07:00
if ( iptunnel_pull_header ( skb , hdr_len , tpi . proto , false ) )
goto drop ;
2012-08-10 00:51:50 +00:00
2017-12-13 16:38:57 -08:00
/* ERSPAN (both protocol ids) has its own receive routine. */
if ( unlikely ( tpi . proto = = htons ( ETH_P_ERSPAN ) | |
tpi . proto = = htons ( ETH_P_ERSPAN2 ) ) ) {
2019-04-06 17:16:53 +02:00
if ( ip6erspan_rcv ( skb , & tpi , hdr_len ) = = PACKET_RCVD )
2017-11-30 11:51:29 -08:00
return 0 ;
2017-12-20 10:21:47 +08:00
goto out ;
2017-11-30 11:51:29 -08:00
}
2016-04-29 17:12:17 -07:00
if ( ip6gre_rcv ( skb , & tpi ) = = PACKET_RCVD )
2012-08-10 00:51:50 +00:00
return 0 ;
2017-12-20 10:21:47 +08:00
/* Handler rejected the packet: report it, then fall through to drop. */
out :
2016-04-29 17:12:17 -07:00
icmpv6_send ( skb , ICMPV6_DEST_UNREACH , ICMPV6_PORT_UNREACH , 0 ) ;
2012-08-10 00:51:50 +00:00
drop :
kfree_skb ( skb ) ;
return 0 ;
}
2016-04-29 17:12:21 -07:00
/*
 * Thin wrapper around iptunnel_handle_offloads(): passes SKB_GSO_GRE_CSUM
 * as the GSO type when @csum is true and SKB_GSO_GRE otherwise, and returns
 * the helper's result unchanged.
 */
static int gre_handle_offloads ( struct sk_buff * skb , bool csum )
2012-08-10 00:51:50 +00:00
{
2016-04-29 17:12:21 -07:00
return iptunnel_handle_offloads ( skb ,
csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE ) ;
2012-08-10 00:51:50 +00:00
}
2017-11-30 11:51:28 -08:00
/*
 * Prepare the outer flow for an IPv4 packet sent through the ip6gre
 * tunnel @dev.
 *
 * @skb:         packet being transmitted; its IPv4 header supplies the DS
 *               field when IP6_TNL_F_USE_ORIG_TCLASS is set
 * @dev:         tunnel device whose private data holds the parameters
 * @fl6:         flow to fill in; starts as a copy of the tunnel's stored flow
 * @dsfield:     output, DS field for the outer header
 * @encap_limit: output, written only when IP6_TNL_F_IGN_ENCAP_LIMIT is clear
 */
static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb,
				     struct net_device *dev,
				     struct flowi6 *fl6, __u8 *dsfield,
				     int *encap_limit)
{
	struct ip6_tnl *tnl = netdev_priv(dev);
	const struct iphdr *inner = ip_hdr(skb);

	if (!(tnl->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
		*encap_limit = tnl->parms.encap_limit;

	/* Start from the tunnel's stored flow, then override per-packet fields. */
	memcpy(fl6, &tnl->fl.u.ip6, sizeof(*fl6));

	*dsfield = (tnl->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) ?
		   ipv4_get_dsfield(inner) :
		   ip6_tclass(tnl->parms.flowinfo);

	fl6->flowi6_mark = (tnl->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) ?
			   skb->mark :
			   tnl->parms.fwmark;

	fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
}
/*
 * Prepare the outer flow for an IPv6 packet sent through the ip6gre
 * tunnel @dev.
 *
 * When ip6_tnl_parse_tlv_enc_lim() returns a non-zero offset, the
 * ipv6_tlv_tnl_enc_lim found at that offset in the network header is
 * consulted: a limit of 0 sends an ICMPv6 parameter-problem message and
 * returns -1, otherwise *encap_limit becomes that limit minus one. With a
 * zero offset, the tunnel's configured limit is used unless
 * IP6_TNL_F_IGN_ENCAP_LIMIT is set.
 *
 * *fl6 is then initialised from the tunnel's stored flow, and *dsfield,
 * the flow label, mark and uid are filled in according to the tunnel
 * flags.
 *
 * Returns 0 on success, -1 when the encapsulation limit is exhausted.
 */
static int prepare_ip6gre_xmit_ipv6 ( struct sk_buff * skb ,
struct net_device * dev ,
struct flowi6 * fl6 , __u8 * dsfield ,
int * encap_limit )
{
2019-07-24 20:00:42 +08:00
struct ipv6hdr * ipv6h ;
2017-11-30 11:51:28 -08:00
struct ip6_tnl * t = netdev_priv ( dev ) ;
__u16 offset ;
offset = ip6_tnl_parse_tlv_enc_lim ( skb , skb_network_header ( skb ) ) ;
/* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
2019-07-24 20:00:42 +08:00
ipv6h = ipv6_hdr ( skb ) ;
2017-11-30 11:51:28 -08:00
if ( offset > 0 ) {
struct ipv6_tlv_tnl_enc_lim * tel ;
tel = ( struct ipv6_tlv_tnl_enc_lim * ) & skb_network_header ( skb ) [ offset ] ;
if ( tel - > encap_limit = = 0 ) {
2021-02-27 01:40:19 +01:00
icmpv6_ndo_send ( skb , ICMPV6_PARAMPROB ,
ICMPV6_HDR_FIELD , offset + 2 ) ;
2017-11-30 11:51:28 -08:00
return - 1 ;
}
* encap_limit = tel - > encap_limit - 1 ;
} else if ( ! ( t - > parms . flags & IP6_TNL_F_IGN_ENCAP_LIMIT ) ) {
* encap_limit = t - > parms . encap_limit ;
}
memcpy ( fl6 , & t - > fl . u . ip6 , sizeof ( * fl6 ) ) ;
if ( t - > parms . flags & IP6_TNL_F_USE_ORIG_TCLASS )
* dsfield = ipv6_get_dsfield ( ipv6h ) ;
else
* dsfield = ip6_tclass ( t - > parms . flowinfo ) ;
/* The inner flow label is OR-ed into the label copied from the tunnel flow. */
if ( t - > parms . flags & IP6_TNL_F_USE_ORIG_FLOWLABEL )
fl6 - > flowlabel | = ip6_flowlabel ( ipv6h ) ;
if ( t - > parms . flags & IP6_TNL_F_USE_ORIG_FWMARK )
fl6 - > flowi6_mark = skb - > mark ;
else
fl6 - > flowi6_mark = t - > parms . fwmark ;
fl6 - > flowi6_uid = sock_net_uid ( dev_net ( dev ) , NULL ) ;
return 0 ;
}
2022-07-11 11:17:20 +02:00
/*
 * Prepare the outer flow for a packet that is handled as neither IPv4 nor
 * IPv6 on the ip6gre tunnel @dev.
 *
 * No inner header is consulted: with IP6_TNL_F_USE_ORIG_TCLASS set the DS
 * field is 0, otherwise it is taken from the tunnel's flowinfo.
 *
 * @skb:         packet being transmitted (only its mark is read)
 * @dev:         tunnel device whose private data holds the parameters
 * @fl6:         flow to fill in; starts as a copy of the tunnel's stored flow
 * @dsfield:     output, DS field for the outer header
 * @encap_limit: output, written only when IP6_TNL_F_IGN_ENCAP_LIMIT is clear
 *
 * Always returns 0.
 */
static int prepare_ip6gre_xmit_other(struct sk_buff *skb,
				     struct net_device *dev,
				     struct flowi6 *fl6, __u8 *dsfield,
				     int *encap_limit)
{
	struct ip6_tnl *tnl = netdev_priv(dev);

	if (!(tnl->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
		*encap_limit = tnl->parms.encap_limit;

	/* Start from the tunnel's stored flow, then override per-packet fields. */
	memcpy(fl6, &tnl->fl.u.ip6, sizeof(*fl6));

	*dsfield = (tnl->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) ?
		   0 :
		   ip6_tclass(tnl->parms.flowinfo);

	fl6->flowi6_mark = (tnl->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) ?
			   skb->mark :
			   tnl->parms.fwmark;

	fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);

	return 0;
}
2020-09-27 16:08:21 +02:00
static struct ip_tunnel_info * skb_tunnel_info_txcheck ( struct sk_buff * skb )
{
struct ip_tunnel_info * tun_info ;
tun_info = skb_tunnel_info ( skb ) ;
if ( unlikely ( ! tun_info | | ! ( tun_info - > mode & IP_TUNNEL_INFO_TX ) ) )
return ERR_PTR ( - EINVAL ) ;
return tun_info ;
}
2016-04-29 17:12:21 -07:00
/*
 * Build the GRE header on @skb and pass the packet to ip6_tnl_xmit().
 *
 * @skb:         packet to encapsulate
 * @dev:         ip6gre tunnel device
 * @dsfield:     DS field for the outer header (replaced by the tunnel key's
 *               tos in collect_md mode)
 * @fl6:         outer flow; daddr is set here, and the whole flow is rebuilt
 *               from the tunnel key in collect_md mode
 * @encap_limit: passed through to ip6_tnl_xmit()
 * @pmtu:        passed through to ip6_tnl_xmit()
 * @proto:       GRE protocol to use unless the device is ARPHRD_ETHER, in
 *               which case ETH_P_TEB is used
 *
 * In collect_md mode the GRE flags are the tunnel key's flags restricted to
 * CSUM, KEY and SEQ, and the GRE key comes from the tunnel id. Otherwise the
 * tunnel's configured o_flags, o_key and tun_hlen are used. When the SEQ
 * flag is set, the tunnel's o_seqno counter is incremented and its previous
 * value is put in the header.
 *
 * Returns -EINVAL when collect_md tunnel info is missing, lacks
 * IP_TUNNEL_INFO_TX or is not AF_INET6; -ENOMEM when skb_cow_head() fails;
 * otherwise the result of ip6_tnl_xmit().
 */
static netdev_tx_t __gre6_xmit ( struct sk_buff * skb ,
struct net_device * dev , __u8 dsfield ,
struct flowi6 * fl6 , int encap_limit ,
__u32 * pmtu , __be16 proto )
2012-08-10 00:51:50 +00:00
{
struct ip6_tnl * tunnel = netdev_priv ( dev ) ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
IP_TUNNEL_DECLARE_FLAGS ( flags ) ;
2017-10-26 19:27:17 +08:00
__be16 protocol ;
2012-08-10 00:51:50 +00:00
if ( dev - > type = = ARPHRD_ETHER )
IPCB ( skb ) - > flags = 0 ;
2016-04-29 17:12:21 -07:00
/* With header_ops on an ARPHRD_IP6GRE device, the destination is read from the IPv6 header at skb->data. */
if ( dev - > header_ops & & dev - > type = = ARPHRD_IP6GRE )
fl6 - > daddr = ( ( struct ipv6hdr * ) skb - > data ) - > daddr ;
else
2012-08-10 00:51:50 +00:00
fl6 - > daddr = tunnel - > parms . raddr ;
2016-04-29 17:12:21 -07:00
/* Push GRE header. */
2017-10-26 19:27:17 +08:00
protocol = ( dev - > type = = ARPHRD_ETHER ) ? htons ( ETH_P_TEB ) : proto ;
2017-12-01 15:26:08 -08:00
if ( tunnel - > parms . collect_md ) {
struct ip_tunnel_info * tun_info ;
const struct ip_tunnel_key * key ;
2022-04-14 13:34:26 -07:00
int tun_hlen ;
2017-12-01 15:26:08 -08:00
2020-09-27 16:08:21 +02:00
tun_info = skb_tunnel_info_txcheck ( skb ) ;
if ( IS_ERR ( tun_info ) | |
unlikely ( ip_tunnel_info_af ( tun_info ) ! = AF_INET6 ) )
2017-12-01 15:26:08 -08:00
return - EINVAL ;
key = & tun_info - > key ;
/* Rebuild the flow from scratch using the per-packet tunnel key. */
memset ( fl6 , 0 , sizeof ( * fl6 ) ) ;
fl6 - > flowi6_proto = IPPROTO_GRE ;
fl6 - > daddr = key - > u . ipv6 . dst ;
fl6 - > flowlabel = key - > label ;
fl6 - > flowi6_uid = sock_net_uid ( dev_net ( dev ) , NULL ) ;
2021-11-19 18:20:16 +01:00
fl6 - > fl6_gre_key = tunnel_id_to_key32 ( key - > tun_id ) ;
2017-12-01 15:26:08 -08:00
dsfield = key - > tos ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
/* Build a CSUM|KEY|SEQ mask and AND it with the key's flags, so only those three survive. */
ip_tunnel_flags_zero ( flags ) ;
__set_bit ( IP_TUNNEL_CSUM_BIT , flags ) ;
__set_bit ( IP_TUNNEL_KEY_BIT , flags ) ;
__set_bit ( IP_TUNNEL_SEQ_BIT , flags ) ;
ip_tunnel_flags_and ( flags , flags , key - > tun_flags ) ;
2022-04-14 13:34:26 -07:00
tun_hlen = gre_calc_hlen ( flags ) ;
2017-12-01 15:26:08 -08:00
2022-04-14 13:35:40 -07:00
/* A non-zero dev->needed_headroom is used as-is; otherwise the computed header lengths are. */
if ( skb_cow_head ( skb , dev - > needed_headroom ? : tun_hlen + tunnel - > encap_hlen ) )
return - ENOMEM ;
2022-04-14 13:34:26 -07:00
gre_build_header ( skb , tun_hlen ,
2017-12-01 15:26:08 -08:00
flags , protocol ,
2018-03-01 13:49:57 -08:00
tunnel_id_to_key32 ( tun_info - > key . tun_id ) ,
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
test_bit ( IP_TUNNEL_SEQ_BIT , flags ) ?
htonl ( atomic_fetch_inc ( & tunnel - > o_seqno ) ) :
0 ) ;
2017-12-01 15:26:08 -08:00
} else {
2022-04-14 13:35:40 -07:00
/* A non-zero dev->needed_headroom is used as-is; otherwise tunnel->hlen is. */
if ( skb_cow_head ( skb , dev - > needed_headroom ? : tunnel - > hlen ) )
return - ENOMEM ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
ip_tunnel_flags_copy ( flags , tunnel - > parms . o_flags ) ;
2022-04-21 15:08:38 -07:00
gre_build_header ( skb , tunnel - > tun_hlen , flags ,
2017-12-01 15:26:08 -08:00
protocol , tunnel - > parms . o_key ,
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
test_bit ( IP_TUNNEL_SEQ_BIT , flags ) ?
htonl ( atomic_fetch_inc ( & tunnel - > o_seqno ) ) :
0 ) ;
2017-12-01 15:26:08 -08:00
}
2012-08-10 00:51:50 +00:00
2016-04-29 17:12:21 -07:00
return ip6_tnl_xmit ( skb , dev , dsfield , fl6 , encap_limit , pmtu ,
NEXTHDR_GRE ) ;
2012-08-10 00:51:50 +00:00
}
static inline int ip6gre_xmit_ipv4 ( struct sk_buff * skb , struct net_device * dev )
{
struct ip6_tnl * t = netdev_priv ( dev ) ;
int encap_limit = - 1 ;
struct flowi6 fl6 ;
2017-12-01 15:26:08 -08:00
__u8 dsfield = 0 ;
2012-08-10 00:51:50 +00:00
__u32 mtu ;
int err ;
2016-02-22 12:58:05 +13:00
memset ( & ( IPCB ( skb ) - > opt ) , 0 , sizeof ( IPCB ( skb ) - > opt ) ) ;
2017-12-01 15:26:08 -08:00
if ( ! t - > parms . collect_md )
prepare_ip6gre_xmit_ipv4 ( skb , dev , & fl6 ,
& dsfield , & encap_limit ) ;
2016-11-04 02:23:43 +09:00
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
err = gre_handle_offloads ( skb , test_bit ( IP_TUNNEL_CSUM_BIT ,
t - > parms . o_flags ) ) ;
2016-04-29 17:12:21 -07:00
if ( err )
return - 1 ;
err = __gre6_xmit ( skb , dev , dsfield , & fl6 , encap_limit , & mtu ,
skb - > protocol ) ;
2012-08-10 00:51:50 +00:00
if ( err ! = 0 ) {
/* XXX: send ICMP error even if DF is not set. */
if ( err = = - EMSGSIZE )
2021-02-27 01:40:19 +01:00
icmp_ndo_send ( skb , ICMP_DEST_UNREACH , ICMP_FRAG_NEEDED ,
htonl ( mtu ) ) ;
2012-08-10 00:51:50 +00:00
return - 1 ;
}
return 0 ;
}
static inline int ip6gre_xmit_ipv6 ( struct sk_buff * skb , struct net_device * dev )
{
struct ip6_tnl * t = netdev_priv ( dev ) ;
struct ipv6hdr * ipv6h = ipv6_hdr ( skb ) ;
int encap_limit = - 1 ;
struct flowi6 fl6 ;
2017-12-01 15:26:08 -08:00
__u8 dsfield = 0 ;
2012-08-10 00:51:50 +00:00
__u32 mtu ;
int err ;
if ( ipv6_addr_equal ( & t - > parms . raddr , & ipv6h - > saddr ) )
return - 1 ;
2017-12-01 15:26:08 -08:00
if ( ! t - > parms . collect_md & &
prepare_ip6gre_xmit_ipv6 ( skb , dev , & fl6 , & dsfield , & encap_limit ) )
2017-11-30 11:51:28 -08:00
return - 1 ;
2016-11-04 02:23:43 +09:00
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
if ( gre_handle_offloads ( skb , test_bit ( IP_TUNNEL_CSUM_BIT ,
t - > parms . o_flags ) ) )
2016-04-29 17:12:21 -07:00
return - 1 ;
err = __gre6_xmit ( skb , dev , dsfield , & fl6 , encap_limit ,
& mtu , skb - > protocol ) ;
2012-08-10 00:51:50 +00:00
if ( err ! = 0 ) {
if ( err = = - EMSGSIZE )
2021-02-27 01:40:19 +01:00
icmpv6_ndo_send ( skb , ICMPV6_PKT_TOOBIG , 0 , mtu ) ;
2012-08-10 00:51:50 +00:00
return - 1 ;
}
return 0 ;
}
static int ip6gre_xmit_other ( struct sk_buff * skb , struct net_device * dev )
{
struct ip6_tnl * t = netdev_priv ( dev ) ;
int encap_limit = - 1 ;
struct flowi6 fl6 ;
2022-07-11 11:17:20 +02:00
__u8 dsfield = 0 ;
2012-08-10 00:51:50 +00:00
__u32 mtu ;
int err ;
2022-07-11 11:17:20 +02:00
if ( ! t - > parms . collect_md & &
prepare_ip6gre_xmit_other ( skb , dev , & fl6 , & dsfield , & encap_limit ) )
return - 1 ;
2012-08-10 00:51:50 +00:00
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
err = gre_handle_offloads ( skb , test_bit ( IP_TUNNEL_CSUM_BIT ,
t - > parms . o_flags ) ) ;
2016-04-29 17:12:21 -07:00
if ( err )
return err ;
2022-07-11 11:17:20 +02:00
err = __gre6_xmit ( skb , dev , dsfield , & fl6 , encap_limit , & mtu , skb - > protocol ) ;
2012-08-10 00:51:50 +00:00
return err ;
}
static netdev_tx_t ip6gre_tunnel_xmit ( struct sk_buff * skb ,
struct net_device * dev )
{
struct ip6_tnl * t = netdev_priv ( dev ) ;
2022-07-11 11:17:21 +02:00
__be16 payload_protocol ;
2012-08-10 00:51:50 +00:00
int ret ;
2018-12-30 17:24:36 -05:00
if ( ! pskb_inet_may_pull ( skb ) )
goto tx_err ;
2014-11-05 08:02:48 +01:00
if ( ! ip6_tnl_xmit_ctl ( t , & t - > parms . laddr , & t - > parms . raddr ) )
2013-02-06 03:24:02 +00:00
goto tx_err ;
2012-08-10 00:51:50 +00:00
2022-07-11 11:17:21 +02:00
payload_protocol = skb_protocol ( skb , true ) ;
switch ( payload_protocol ) {
2012-08-10 00:51:50 +00:00
case htons ( ETH_P_IP ) :
ret = ip6gre_xmit_ipv4 ( skb , dev ) ;
break ;
case htons ( ETH_P_IPV6 ) :
ret = ip6gre_xmit_ipv6 ( skb , dev ) ;
break ;
default :
ret = ip6gre_xmit_other ( skb , dev ) ;
break ;
}
if ( ret < 0 )
goto tx_err ;
return NETDEV_TX_OK ;
tx_err :
2020-09-27 16:08:21 +02:00
if ( ! t - > parms . collect_md | | ! IS_ERR ( skb_tunnel_info_txcheck ( skb ) ) )
2022-11-15 08:53:57 +00:00
DEV_STATS_INC ( dev , tx_errors ) ;
DEV_STATS_INC ( dev , tx_dropped ) ;
2012-08-10 00:51:50 +00:00
kfree_skb ( skb ) ;
return NETDEV_TX_OK ;
}
2017-11-30 11:51:29 -08:00
static netdev_tx_t ip6erspan_tunnel_xmit ( struct sk_buff * skb ,
struct net_device * dev )
{
2020-09-27 16:08:21 +02:00
struct ip_tunnel_info * tun_info = NULL ;
2017-11-30 11:51:29 -08:00
struct ip6_tnl * t = netdev_priv ( dev ) ;
struct dst_entry * dst = skb_dst ( skb ) ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
IP_TUNNEL_DECLARE_FLAGS ( flags ) = { } ;
2017-11-30 11:51:29 -08:00
bool truncate = false ;
int encap_limit = - 1 ;
__u8 dsfield = false ;
struct flowi6 fl6 ;
int err = - EINVAL ;
2019-01-14 18:10:06 +08:00
__be16 proto ;
2017-11-30 11:51:29 -08:00
__u32 mtu ;
2018-04-27 14:16:32 -07:00
int nhoff ;
2017-11-30 11:51:29 -08:00
2018-12-30 17:24:36 -05:00
if ( ! pskb_inet_may_pull ( skb ) )
goto tx_err ;
2017-11-30 11:51:29 -08:00
if ( ! ip6_tnl_xmit_ctl ( t , & t - > parms . laddr , & t - > parms . raddr ) )
goto tx_err ;
if ( gre_handle_offloads ( skb , false ) )
goto tx_err ;
if ( skb - > len > dev - > mtu + dev - > hard_header_len ) {
2023-07-17 22:45:19 +08:00
if ( pskb_trim ( skb , dev - > mtu + dev - > hard_header_len ) )
goto tx_err ;
2017-11-30 11:51:29 -08:00
truncate = true ;
}
2023-03-20 16:34:27 +00:00
nhoff = skb_network_offset ( skb ) ;
2018-04-27 14:16:32 -07:00
if ( skb - > protocol = = htons ( ETH_P_IP ) & &
( ntohs ( ip_hdr ( skb ) - > tot_len ) > skb - > len - nhoff ) )
truncate = true ;
2022-06-20 01:35:06 -07:00
if ( skb - > protocol = = htons ( ETH_P_IPV6 ) ) {
int thoff ;
if ( skb_transport_header_was_set ( skb ) )
2023-03-20 16:34:27 +00:00
thoff = skb_transport_offset ( skb ) ;
2022-06-20 01:35:06 -07:00
else
thoff = nhoff + sizeof ( struct ipv6hdr ) ;
if ( ntohs ( ipv6_hdr ( skb ) - > payload_len ) > skb - > len - thoff )
truncate = true ;
}
2018-05-11 05:49:47 -07:00
2018-05-17 16:36:15 +02:00
if ( skb_cow_head ( skb , dev - > needed_headroom ? : t - > hlen ) )
2018-03-09 07:34:42 -08:00
goto tx_err ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
__clear_bit ( IP_TUNNEL_KEY_BIT , t - > parms . o_flags ) ;
2017-11-30 11:51:29 -08:00
IPCB ( skb ) - > flags = 0 ;
2017-12-05 15:15:44 -08:00
/* For collect_md mode, derive fl6 from the tunnel key,
* for native mode , call prepare_ip6gre_xmit_ { ipv4 , ipv6 } .
*/
if ( t - > parms . collect_md ) {
const struct ip_tunnel_key * key ;
struct erspan_metadata * md ;
2018-01-25 13:20:09 -08:00
__be32 tun_id ;
2017-12-05 15:15:44 -08:00
2020-09-27 16:08:21 +02:00
tun_info = skb_tunnel_info_txcheck ( skb ) ;
if ( IS_ERR ( tun_info ) | |
unlikely ( ip_tunnel_info_af ( tun_info ) ! = AF_INET6 ) )
2019-09-13 17:45:47 +08:00
goto tx_err ;
2017-12-05 15:15:44 -08:00
key = & tun_info - > key ;
memset ( & fl6 , 0 , sizeof ( fl6 ) ) ;
fl6 . flowi6_proto = IPPROTO_GRE ;
fl6 . daddr = key - > u . ipv6 . dst ;
fl6 . flowlabel = key - > label ;
fl6 . flowi6_uid = sock_net_uid ( dev_net ( dev ) , NULL ) ;
2021-11-19 18:20:16 +01:00
fl6 . fl6_gre_key = tunnel_id_to_key32 ( key - > tun_id ) ;
2017-12-05 15:15:44 -08:00
dsfield = key - > tos ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
if ( ! test_bit ( IP_TUNNEL_ERSPAN_OPT_BIT ,
tun_info - > key . tun_flags ) )
2018-06-26 21:39:36 -07:00
goto tx_err ;
2019-10-28 23:19:35 +08:00
if ( tun_info - > options_len < sizeof ( * md ) )
2017-12-05 15:15:44 -08:00
goto tx_err ;
2019-10-28 23:19:35 +08:00
md = ip_tunnel_info_opts ( tun_info ) ;
2017-12-05 15:15:44 -08:00
2018-01-25 13:20:09 -08:00
tun_id = tunnel_id_to_key32 ( key - > tun_id ) ;
2017-12-13 16:38:57 -08:00
if ( md - > version = = 1 ) {
erspan_build_header ( skb ,
2018-01-25 13:20:09 -08:00
ntohl ( tun_id ) ,
2017-12-13 16:38:57 -08:00
ntohl ( md - > u . index ) , truncate ,
false ) ;
2023-05-11 19:22:11 -04:00
proto = htons ( ETH_P_ERSPAN ) ;
2017-12-13 16:38:57 -08:00
} else if ( md - > version = = 2 ) {
erspan_build_header_v2 ( skb ,
2018-01-25 13:20:09 -08:00
ntohl ( tun_id ) ,
md - > u . md2 . dir ,
get_hwid ( & md - > u . md2 ) ,
truncate , false ) ;
2023-05-11 19:22:11 -04:00
proto = htons ( ETH_P_ERSPAN2 ) ;
2018-03-09 07:34:41 -08:00
} else {
goto tx_err ;
2017-12-13 16:38:57 -08:00
}
2017-12-05 15:15:44 -08:00
} else {
switch ( skb - > protocol ) {
case htons ( ETH_P_IP ) :
memset ( & ( IPCB ( skb ) - > opt ) , 0 , sizeof ( IPCB ( skb ) - > opt ) ) ;
prepare_ip6gre_xmit_ipv4 ( skb , dev , & fl6 ,
& dsfield , & encap_limit ) ;
break ;
case htons ( ETH_P_IPV6 ) :
2018-12-30 17:24:36 -05:00
if ( ipv6_addr_equal ( & t - > parms . raddr , & ipv6_hdr ( skb ) - > saddr ) )
2017-12-05 15:15:44 -08:00
goto tx_err ;
if ( prepare_ip6gre_xmit_ipv6 ( skb , dev , & fl6 ,
& dsfield , & encap_limit ) )
goto tx_err ;
break ;
default :
memcpy ( & fl6 , & t - > fl . u . ip6 , sizeof ( fl6 ) ) ;
break ;
}
2023-05-11 19:22:11 -04:00
if ( t - > parms . erspan_ver = = 1 ) {
2018-01-25 13:20:09 -08:00
erspan_build_header ( skb , ntohl ( t - > parms . o_key ) ,
2017-12-13 16:38:57 -08:00
t - > parms . index ,
truncate , false ) ;
2023-05-11 19:22:11 -04:00
proto = htons ( ETH_P_ERSPAN ) ;
} else if ( t - > parms . erspan_ver = = 2 ) {
2018-01-25 13:20:09 -08:00
erspan_build_header_v2 ( skb , ntohl ( t - > parms . o_key ) ,
2017-12-13 16:38:57 -08:00
t - > parms . dir ,
t - > parms . hwid ,
truncate , false ) ;
2023-05-11 19:22:11 -04:00
proto = htons ( ETH_P_ERSPAN2 ) ;
} else {
2018-05-16 17:24:32 -07:00
goto tx_err ;
2023-05-11 19:22:11 -04:00
}
2018-05-16 17:24:32 -07:00
2017-12-05 15:15:44 -08:00
fl6 . daddr = t - > parms . raddr ;
}
2017-11-30 11:51:29 -08:00
/* Push GRE header. */
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
__set_bit ( IP_TUNNEL_SEQ_BIT , flags ) ;
gre_build_header ( skb , 8 , flags , proto , 0 ,
htonl ( atomic_fetch_inc ( & t - > o_seqno ) ) ) ;
2017-11-30 11:51:29 -08:00
/* TooBig packet may have updated dst->dev's mtu */
2017-12-05 15:15:44 -08:00
if ( ! t - > parms . collect_md & & dst & & dst_mtu ( dst ) > dst - > dev - > mtu )
2019-12-22 10:51:10 +08:00
dst - > ops - > update_pmtu ( dst , NULL , skb , dst - > dev - > mtu , false ) ;
2017-11-30 11:51:29 -08:00
err = ip6_tnl_xmit ( skb , dev , dsfield , & fl6 , encap_limit , & mtu ,
NEXTHDR_GRE ) ;
if ( err ! = 0 ) {
/* XXX: send ICMP error even if DF is not set. */
if ( err = = - EMSGSIZE ) {
if ( skb - > protocol = = htons ( ETH_P_IP ) )
2021-02-27 01:40:19 +01:00
icmp_ndo_send ( skb , ICMP_DEST_UNREACH ,
ICMP_FRAG_NEEDED , htonl ( mtu ) ) ;
2017-11-30 11:51:29 -08:00
else
2021-02-27 01:40:19 +01:00
icmpv6_ndo_send ( skb , ICMPV6_PKT_TOOBIG , 0 , mtu ) ;
2017-11-30 11:51:29 -08:00
}
goto tx_err ;
}
return NETDEV_TX_OK ;
tx_err :
2020-09-27 16:08:21 +02:00
if ( ! IS_ERR ( tun_info ) )
2022-11-15 08:53:57 +00:00
DEV_STATS_INC ( dev , tx_errors ) ;
DEV_STATS_INC ( dev , tx_dropped ) ;
2017-11-30 11:51:29 -08:00
kfree_skb ( skb ) ;
return NETDEV_TX_OK ;
}
2018-05-17 16:36:27 +02:00
static void ip6gre_tnl_link_config_common ( struct ip6_tnl * t )
2012-08-10 00:51:50 +00:00
{
struct net_device * dev = t - > dev ;
struct __ip6_tnl_parm * p = & t - > parms ;
struct flowi6 * fl6 = & t - > fl . u . ip6 ;
if ( dev - > type ! = ARPHRD_ETHER ) {
2021-10-12 09:06:34 -07:00
__dev_addr_set ( dev , & p - > laddr , sizeof ( struct in6_addr ) ) ;
2012-08-10 00:51:50 +00:00
memcpy ( dev - > broadcast , & p - > raddr , sizeof ( struct in6_addr ) ) ;
}
/* Set up flowi template */
fl6 - > saddr = p - > laddr ;
fl6 - > daddr = p - > raddr ;
fl6 - > flowi6_oif = p - > link ;
fl6 - > flowlabel = 0 ;
2016-05-21 18:17:35 +08:00
fl6 - > flowi6_proto = IPPROTO_GRE ;
2021-11-19 18:20:16 +01:00
fl6 - > fl6_gre_key = t - > parms . o_key ;
2012-08-10 00:51:50 +00:00
if ( ! ( p - > flags & IP6_TNL_F_USE_ORIG_TCLASS ) )
fl6 - > flowlabel | = IPV6_TCLASS_MASK & p - > flowinfo ;
if ( ! ( p - > flags & IP6_TNL_F_USE_ORIG_FLOWLABEL ) )
fl6 - > flowlabel | = IPV6_FLOWLABEL_MASK & p - > flowinfo ;
p - > flags & = ~ ( IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV | IP6_TNL_F_CAP_PER_PACKET ) ;
p - > flags | = ip6_tnl_get_cap ( t , & p - > laddr , & p - > raddr ) ;
if ( p - > flags & IP6_TNL_F_CAP_XMIT & &
p - > flags & IP6_TNL_F_CAP_RCV & & dev - > type ! = ARPHRD_ETHER )
dev - > flags | = IFF_POINTOPOINT ;
else
dev - > flags & = ~ IFF_POINTOPOINT ;
2018-05-17 16:36:27 +02:00
}
2012-08-10 00:51:50 +00:00
2018-05-17 16:36:27 +02:00
static void ip6gre_tnl_link_config_route ( struct ip6_tnl * t , int set_mtu ,
int t_hlen )
{
const struct __ip6_tnl_parm * p = & t - > parms ;
struct net_device * dev = t - > dev ;
2012-08-10 00:51:50 +00:00
if ( p - > flags & IP6_TNL_F_CAP_XMIT ) {
int strict = ( ipv6_addr_type ( & p - > raddr ) &
( IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL ) ) ;
2014-04-22 10:15:24 +02:00
struct rt6_info * rt = rt6_lookup ( t - > net ,
2012-08-10 00:51:50 +00:00
& p - > raddr , & p - > laddr ,
2018-03-02 08:32:17 -08:00
p - > link , NULL , strict ) ;
2012-08-10 00:51:50 +00:00
2015-03-29 14:00:04 +01:00
if ( ! rt )
2012-08-10 00:51:50 +00:00
return ;
if ( rt - > dst . dev ) {
2020-11-30 17:19:11 +01:00
unsigned short dst_len = rt - > dst . dev - > hard_header_len +
t_hlen ;
if ( t - > dev - > header_ops )
dev - > hard_header_len = dst_len ;
else
dev - > needed_headroom = dst_len ;
2012-08-10 00:51:50 +00:00
if ( set_mtu ) {
2022-10-23 19:01:24 -07:00
int mtu = rt - > dst . dev - > mtu - t_hlen ;
2012-08-10 00:51:50 +00:00
if ( ! ( t - > parms . flags & IP6_TNL_F_IGN_ENCAP_LIMIT ) )
2022-10-23 19:01:24 -07:00
mtu - = 8 ;
2016-04-14 15:33:45 -04:00
if ( dev - > type = = ARPHRD_ETHER )
2022-10-23 19:01:24 -07:00
mtu - = ETH_HLEN ;
2012-08-10 00:51:50 +00:00
2022-10-23 19:01:24 -07:00
if ( mtu < IPV6_MIN_MTU )
mtu = IPV6_MIN_MTU ;
WRITE_ONCE ( dev - > mtu , mtu ) ;
2012-08-10 00:51:50 +00:00
}
}
2012-10-29 00:13:19 +00:00
ip6_rt_put ( rt ) ;
2012-08-10 00:51:50 +00:00
}
}
2018-05-17 16:36:27 +02:00
static int ip6gre_calc_hlen ( struct ip6_tnl * tunnel )
{
int t_hlen ;
tunnel - > tun_hlen = gre_calc_hlen ( tunnel - > parms . o_flags ) ;
tunnel - > hlen = tunnel - > tun_hlen + tunnel - > encap_hlen ;
t_hlen = tunnel - > hlen + sizeof ( struct ipv6hdr ) ;
2020-11-30 17:19:11 +01:00
if ( tunnel - > dev - > header_ops )
tunnel - > dev - > hard_header_len = LL_MAX_HEADER + t_hlen ;
else
tunnel - > dev - > needed_headroom = LL_MAX_HEADER + t_hlen ;
2018-05-17 16:36:27 +02:00
return t_hlen ;
}
/*
 * Full link (re)configuration: the route-independent settings first,
 * then the header lengths, then the route-derived header room and MTU.
 */
static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
{
	int t_hlen;

	ip6gre_tnl_link_config_common(t);
	t_hlen = ip6gre_calc_hlen(t);
	ip6gre_tnl_link_config_route(t, set_mtu, t_hlen);
}
2018-05-17 16:36:33 +02:00
static void ip6gre_tnl_copy_tnl_parm ( struct ip6_tnl * t ,
const struct __ip6_tnl_parm * p )
2012-08-10 00:51:50 +00:00
{
t - > parms . laddr = p - > laddr ;
t - > parms . raddr = p - > raddr ;
t - > parms . flags = p - > flags ;
t - > parms . hop_limit = p - > hop_limit ;
t - > parms . encap_limit = p - > encap_limit ;
t - > parms . flowinfo = p - > flowinfo ;
t - > parms . link = p - > link ;
t - > parms . proto = p - > proto ;
t - > parms . i_key = p - > i_key ;
t - > parms . o_key = p - > o_key ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
ip_tunnel_flags_copy ( t - > parms . i_flags , p - > i_flags ) ;
ip_tunnel_flags_copy ( t - > parms . o_flags , p - > o_flags ) ;
2017-04-19 12:30:53 -04:00
t - > parms . fwmark = p - > fwmark ;
2019-01-10 11:17:42 +08:00
t - > parms . erspan_ver = p - > erspan_ver ;
t - > parms . index = p - > index ;
t - > parms . dir = p - > dir ;
t - > parms . hwid = p - > hwid ;
2016-02-12 15:43:54 +01:00
dst_cache_reset ( & t - > dst_cache ) ;
2018-05-17 16:36:33 +02:00
}
/*
 * Replace the tunnel's parameters with @p and redo the link
 * configuration; @set_mtu is passed through to the link configuration.
 *
 * Return: always 0.
 */
static int ip6gre_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p,
			     int set_mtu)
{
	ip6gre_tnl_copy_tnl_parm(t, p);
	ip6gre_tnl_link_config(t, set_mtu);

	return 0;
}
static void ip6gre_tnl_parm_from_user ( struct __ip6_tnl_parm * p ,
const struct ip6_tnl_parm2 * u )
{
p - > laddr = u - > laddr ;
p - > raddr = u - > raddr ;
p - > flags = u - > flags ;
p - > hop_limit = u - > hop_limit ;
p - > encap_limit = u - > encap_limit ;
p - > flowinfo = u - > flowinfo ;
p - > link = u - > link ;
p - > i_key = u - > i_key ;
p - > o_key = u - > o_key ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
gre_flags_to_tnl_flags ( p - > i_flags , u - > i_flags ) ;
gre_flags_to_tnl_flags ( p - > o_flags , u - > o_flags ) ;
2012-08-10 00:51:50 +00:00
memcpy ( p - > name , u - > name , sizeof ( u - > name ) ) ;
}
static void ip6gre_tnl_parm_to_user ( struct ip6_tnl_parm2 * u ,
const struct __ip6_tnl_parm * p )
{
u - > proto = IPPROTO_GRE ;
u - > laddr = p - > laddr ;
u - > raddr = p - > raddr ;
u - > flags = p - > flags ;
u - > hop_limit = p - > hop_limit ;
u - > encap_limit = p - > encap_limit ;
u - > flowinfo = p - > flowinfo ;
u - > link = p - > link ;
u - > i_key = p - > i_key ;
u - > o_key = p - > o_key ;
2016-05-09 17:12:09 -07:00
u - > i_flags = gre_tnl_flags_to_gre_flags ( p - > i_flags ) ;
u - > o_flags = gre_tnl_flags_to_gre_flags ( p - > o_flags ) ;
2012-08-10 00:51:50 +00:00
memcpy ( u - > name , p - > name , sizeof ( u - > name ) ) ;
}
2021-07-27 15:45:06 +02:00
/*
 * Private ioctl handler for ip6gre devices.
 *
 * @dev:  device the ioctl was issued on
 * @ifr:  interface request (not used here)
 * @data: userspace pointer to a struct ip6_tnl_parm2
 * @cmd:  SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL or SIOCDELTUNNEL
 *
 * When issued on the fallback device, GET and DEL operate on the tunnel
 * matching the parameters read from @data rather than on @dev itself.
 *
 * Return: 0 on success or a negative errno:
 *   -EPERM   caller lacks CAP_NET_ADMIN in the netns' user namespace, or
 *            tried to delete the fallback tunnel
 *   -EFAULT  copying to or from @data failed
 *   -EINVAL  unknown @cmd, or GRE_VERSION/GRE_ROUTING set in the flags
 *   -EEXIST  SIOCCHGTUNNEL parameters match a different device
 *   -ENOBUFS SIOCADDTUNNEL could not obtain a tunnel
 *   -ENOENT  no tunnel matches the parameters
 */
static int ip6gre_tunnel_siocdevprivate(struct net_device *dev,
					struct ifreq *ifr, void __user *data,
					int cmd)
{
	struct ip6_tnl *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
	struct __ip6_tnl_parm kparm;
	struct ip6_tnl_parm2 uparm;

	memset(&kparm, 0, sizeof(kparm));

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&uparm, data, sizeof(uparm)))
				return -EFAULT;
			ip6gre_tnl_parm_from_user(&kparm, &uparm);
			t = ip6gre_tunnel_locate(net, &kparm, 0);
			/* No match: report the fallback device itself. */
			if (!t)
				t = netdev_priv(dev);
		}
		/* Start from zero so no stale bytes reach userspace. */
		memset(&uparm, 0, sizeof(uparm));
		ip6gre_tnl_parm_to_user(&uparm, &t->parms);
		if (copy_to_user(data, &uparm, sizeof(uparm)))
			return -EFAULT;
		return 0;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;

		if (copy_from_user(&uparm, data, sizeof(uparm)))
			return -EFAULT;

		if ((uparm.i_flags | uparm.o_flags) &
		    (GRE_VERSION | GRE_ROUTING))
			return -EINVAL;

		/* A key is only meaningful when GRE_KEY is requested. */
		if (!(uparm.i_flags & GRE_KEY))
			uparm.i_key = 0;
		if (!(uparm.o_flags & GRE_KEY))
			uparm.o_key = 0;

		ip6gre_tnl_parm_from_user(&kparm, &uparm);
		t = ip6gre_tunnel_locate(net, &kparm, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev)
					return -EEXIST;
			} else {
				/*
				 * No tunnel uses the new parameters yet:
				 * re-key this device's tunnel. It is
				 * unlinked while its parameters change.
				 */
				t = netdev_priv(dev);
				ip6gre_tunnel_unlink(ign, t);
				synchronize_net();
				ip6gre_tnl_change(t, &kparm, 1);
				ip6gre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (!t)
			return cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT;

		/* Hand the resulting parameters back to the caller. */
		memset(&uparm, 0, sizeof(uparm));
		ip6gre_tnl_parm_to_user(&uparm, &t->parms);
		if (copy_to_user(data, &uparm, sizeof(uparm)))
			return -EFAULT;
		return 0;

	case SIOCDELTUNNEL:
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;

		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&uparm, data, sizeof(uparm)))
				return -EFAULT;
			ip6gre_tnl_parm_from_user(&kparm, &uparm);
			t = ip6gre_tunnel_locate(net, &kparm, 0);
			if (!t)
				return -ENOENT;
			/* The fallback tunnel cannot be deleted this way. */
			if (t == netdev_priv(ign->fb_tunnel_dev))
				return -EPERM;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		return 0;

	default:
		return -EINVAL;
	}
}
static int ip6gre_header ( struct sk_buff * skb , struct net_device * dev ,
2017-09-15 12:00:07 +08:00
unsigned short type , const void * daddr ,
const void * saddr , unsigned int len )
2012-08-10 00:51:50 +00:00
{
struct ip6_tnl * t = netdev_priv ( dev ) ;
2017-09-15 12:00:07 +08:00
struct ipv6hdr * ipv6h ;
__be16 * p ;
2012-08-10 00:51:50 +00:00
2017-09-15 12:00:07 +08:00
ipv6h = skb_push ( skb , t - > hlen + sizeof ( * ipv6h ) ) ;
ip6_flow_hdr ( ipv6h , 0 , ip6_make_flowlabel ( dev_net ( dev ) , skb ,
t - > fl . u . ip6 . flowlabel ,
true , & t - > fl . u . ip6 ) ) ;
2012-08-10 00:51:50 +00:00
ipv6h - > hop_limit = t - > parms . hop_limit ;
ipv6h - > nexthdr = NEXTHDR_GRE ;
ipv6h - > saddr = t - > parms . laddr ;
ipv6h - > daddr = t - > parms . raddr ;
2017-09-15 12:00:07 +08:00
p = ( __be16 * ) ( ipv6h + 1 ) ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
p [ 0 ] = ip_tunnel_flags_to_be16 ( t - > parms . o_flags ) ;
2017-09-15 12:00:07 +08:00
p [ 1 ] = htons ( type ) ;
2012-08-10 00:51:50 +00:00
/*
* Set the source hardware address .
*/
if ( saddr )
memcpy ( & ipv6h - > saddr , saddr , sizeof ( struct in6_addr ) ) ;
if ( daddr )
memcpy ( & ipv6h - > daddr , daddr , sizeof ( struct in6_addr ) ) ;
if ( ! ipv6_addr_any ( & ipv6h - > daddr ) )
return t - > hlen ;
return - t - > hlen ;
}
/* Link-layer header operations: only header creation is provided. */
static const struct header_ops ip6gre_header_ops = {
	.create	= ip6gre_header,
};
static const struct net_device_ops ip6gre_netdev_ops = {
. ndo_init = ip6gre_tunnel_init ,
. ndo_uninit = ip6gre_tunnel_uninit ,
. ndo_start_xmit = ip6gre_tunnel_xmit ,
2021-07-27 15:45:06 +02:00
. ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate ,
2016-04-29 17:12:21 -07:00
. ndo_change_mtu = ip6_tnl_change_mtu ,
2015-04-02 17:07:01 +02:00
. ndo_get_iflink = ip6_tnl_get_iflink ,
2012-08-10 00:51:50 +00:00
} ;
static void ip6gre_dev_free ( struct net_device * dev )
{
2015-09-15 14:30:07 -07:00
struct ip6_tnl * t = netdev_priv ( dev ) ;
2018-05-07 10:45:27 +03:00
gro_cells_destroy ( & t - > gro_cells ) ;
2016-02-12 15:43:54 +01:00
dst_cache_destroy ( & t - > dst_cache ) ;
2012-08-10 00:51:50 +00:00
}
static void ip6gre_tunnel_setup ( struct net_device * dev )
{
dev - > netdev_ops = & ip6gre_netdev_ops ;
net: Fix inconsistent teardown and release of private netdev state.
Network devices can allocate reasources and private memory using
netdev_ops->ndo_init(). However, the release of these resources
can occur in one of two different places.
Either netdev_ops->ndo_uninit() or netdev->destructor().
The decision of which operation frees the resources depends upon
whether it is necessary for all netdev refs to be released before it
is safe to perform the freeing.
netdev_ops->ndo_uninit() presumably can occur right after the
NETDEV_UNREGISTER notifier completes and the unicast and multicast
address lists are flushed.
netdev->destructor(), on the other hand, does not run until the
netdev references all go away.
Further complicating the situation is that netdev->destructor()
almost universally does also a free_netdev().
This creates a problem for the logic in register_netdevice().
Because all callers of register_netdevice() manage the freeing
of the netdev, and invoke free_netdev(dev) if register_netdevice()
fails.
If netdev_ops->ndo_init() succeeds, but something else fails inside
of register_netdevice(), it does call ndo_ops->ndo_uninit(). But
it is not able to invoke netdev->destructor().
This is because netdev->destructor() will do a free_netdev() and
then the caller of register_netdevice() will do the same.
However, this means that the resources that would normally be released
by netdev->destructor() will not be.
Over the years drivers have added local hacks to deal with this, by
invoking their destructor parts by hand when register_netdevice()
fails.
Many drivers do not try to deal with this, and instead we have leaks.
Let's close this hole by formalizing the distinction between what
private things need to be freed up by netdev->destructor() and whether
the driver needs unregister_netdevice() to perform the free_netdev().
netdev->priv_destructor() performs all actions to free up the private
resources that used to be freed by netdev->destructor(), except for
free_netdev().
netdev->needs_free_netdev is a boolean that indicates whether
free_netdev() should be done at the end of unregister_netdevice().
Now, register_netdevice() can sanely release all resources after
ndo_ops->ndo_init() succeeds, by invoking both ndo_ops->ndo_uninit()
and netdev->priv_destructor().
And at the end of unregister_netdevice(), we invoke
netdev->priv_destructor() and optionally call free_netdev().
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-08 12:52:56 -04:00
dev - > needs_free_netdev = true ;
dev - > priv_destructor = ip6gre_dev_free ;
2012-08-10 00:51:50 +00:00
2024-04-12 08:19:25 -07:00
dev - > pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS ;
2012-08-10 00:51:50 +00:00
dev - > type = ARPHRD_IP6GRE ;
2016-04-29 17:12:21 -07:00
2012-08-10 00:51:50 +00:00
dev - > flags | = IFF_NOARP ;
dev - > addr_len = sizeof ( struct in6_addr ) ;
2014-10-05 18:38:35 -07:00
netif_keep_dst ( dev ) ;
2017-01-26 16:59:18 +13:00
/* This perm addr will be used as interface identifier by IPv6 */
dev - > addr_assign_type = NET_ADDR_RANDOM ;
eth_random_addr ( dev - > perm_addr ) ;
2012-08-10 00:51:50 +00:00
}
2017-12-20 19:36:03 +03:00
/*
 * Feature bits that ip6gre_tnl_init_features() sets in both
 * dev->features and dev->hw_features.
 */
#define GRE6_FEATURES	(NETIF_F_SG |		\
			 NETIF_F_FRAGLIST |	\
			 NETIF_F_HIGHDMA |	\
			 NETIF_F_HW_CSUM)
static void ip6gre_tnl_init_features ( struct net_device * dev )
{
struct ip6_tnl * nt = netdev_priv ( dev ) ;
2022-04-28 22:25:47 -07:00
dev - > features | = GRE6_FEATURES | NETIF_F_LLTX ;
2017-12-20 19:36:03 +03:00
dev - > hw_features | = GRE6_FEATURES ;
2022-04-28 22:25:47 -07:00
/* TCP offload with GRE SEQ is not supported, nor can we support 2
* levels of outer headers requiring an update .
*/
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
if ( test_bit ( IP_TUNNEL_SEQ_BIT , nt - > parms . o_flags ) )
2022-04-28 22:25:47 -07:00
return ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
if ( test_bit ( IP_TUNNEL_CSUM_BIT , nt - > parms . o_flags ) & &
nt - > encap . type ! = TUNNEL_ENCAP_NONE )
2022-04-28 22:25:47 -07:00
return ;
dev - > features | = NETIF_F_GSO_SOFTWARE ;
dev - > hw_features | = NETIF_F_GSO_SOFTWARE ;
2017-12-20 19:36:03 +03:00
}
2015-09-15 14:30:05 -07:00
static int ip6gre_tunnel_init_common ( struct net_device * dev )
2012-08-10 00:51:50 +00:00
{
struct ip6_tnl * tunnel ;
2015-09-15 14:30:07 -07:00
int ret ;
2016-04-29 17:12:21 -07:00
int t_hlen ;
2012-08-10 00:51:50 +00:00
tunnel = netdev_priv ( dev ) ;
tunnel - > dev = dev ;
2013-08-13 17:51:12 +02:00
tunnel - > net = dev_net ( dev ) ;
2012-08-10 00:51:50 +00:00
strcpy ( tunnel - > parms . name , dev - > name ) ;
2016-02-12 15:43:54 +01:00
ret = dst_cache_init ( & tunnel - > dst_cache , GFP_KERNEL ) ;
2018-05-07 10:45:27 +03:00
if ( ret )
2024-04-12 08:19:25 -07:00
return ret ;
2018-05-07 10:45:27 +03:00
ret = gro_cells_init ( & tunnel - > gro_cells , dev ) ;
if ( ret )
goto cleanup_dst_cache_init ;
2015-09-15 14:30:07 -07:00
2018-05-17 16:36:27 +02:00
t_hlen = ip6gre_calc_hlen ( tunnel ) ;
2016-05-09 17:12:08 -07:00
dev - > mtu = ETH_DATA_LEN - t_hlen ;
2016-05-21 18:17:34 +08:00
if ( dev - > type = = ARPHRD_ETHER )
dev - > mtu - = ETH_HLEN ;
2016-04-29 17:12:21 -07:00
if ( ! ( tunnel - > parms . flags & IP6_TNL_F_IGN_ENCAP_LIMIT ) )
dev - > mtu - = 8 ;
2017-12-01 15:26:08 -08:00
if ( tunnel - > parms . collect_md ) {
netif_keep_dst ( dev ) ;
}
2017-12-20 19:36:03 +03:00
ip6gre_tnl_init_features ( dev ) ;
2017-12-01 15:26:08 -08:00
2022-06-07 21:39:55 -07:00
netdev_hold ( dev , & tunnel - > dev_tracker , GFP_KERNEL ) ;
net: add netdev_lockdep_set_classes() to virtual drivers
Based on a syzbot report, it appears many virtual
drivers do not yet use netdev_lockdep_set_classes(),
triggerring lockdep false positives.
WARNING: possible recursive locking detected
6.8.0-rc4-next-20240212-syzkaller #0 Not tainted
syz-executor.0/19016 is trying to acquire lock:
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
but task is already holding lock:
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
lock(_xmit_ETHER#2);
lock(_xmit_ETHER#2);
*** DEADLOCK ***
May be due to missing lock nesting notation
9 locks held by syz-executor.0/19016:
#0: ffffffff8f385208 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock net/core/rtnetlink.c:79 [inline]
#0: ffffffff8f385208 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x82c/0x1040 net/core/rtnetlink.c:6603
#1: ffffc90000a08c00 ((&in_dev->mr_ifc_timer)){+.-.}-{0:0}, at: call_timer_fn+0xc0/0x600 kernel/time/timer.c:1697
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline]
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline]
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1360 net/ipv4/ip_output.c:228
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: local_bh_disable include/linux/bottom_half.h:20 [inline]
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: rcu_read_lock_bh include/linux/rcupdate.h:802 [inline]
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x2c4/0x3b10 net/core/dev.c:4284
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: spin_trylock include/linux/spinlock.h:361 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: qdisc_run_begin include/net/sch_generic.h:195 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_xmit_skb net/core/dev.c:3771 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x1262/0x3b10 net/core/dev.c:4325
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline]
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline]
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1360 net/ipv4/ip_output.c:228
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: local_bh_disable include/linux/bottom_half.h:20 [inline]
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: rcu_read_lock_bh include/linux/rcupdate.h:802 [inline]
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x2c4/0x3b10 net/core/dev.c:4284
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: spin_trylock include/linux/spinlock.h:361 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: qdisc_run_begin include/net/sch_generic.h:195 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_xmit_skb net/core/dev.c:3771 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x1262/0x3b10 net/core/dev.c:4325
stack backtrace:
CPU: 1 PID: 19016 Comm: syz-executor.0 Not tainted 6.8.0-rc4-next-20240212-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114
check_deadlock kernel/locking/lockdep.c:3062 [inline]
validate_chain+0x15c1/0x58e0 kernel/locking/lockdep.c:3856
__lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
lock_acquire+0x1e4/0x530 kernel/locking/lockdep.c:5754
__raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline]
_raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154
spin_lock include/linux/spinlock.h:351 [inline]
__netif_tx_lock include/linux/netdevice.h:4452 [inline]
sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
__dev_xmit_skb net/core/dev.c:3784 [inline]
__dev_queue_xmit+0x1912/0x3b10 net/core/dev.c:4325
neigh_output include/net/neighbour.h:542 [inline]
ip_finish_output2+0xe66/0x1360 net/ipv4/ip_output.c:235
iptunnel_xmit+0x540/0x9b0 net/ipv4/ip_tunnel_core.c:82
ip_tunnel_xmit+0x20ee/0x2960 net/ipv4/ip_tunnel.c:831
erspan_xmit+0x9de/0x1460 net/ipv4/ip_gre.c:720
__netdev_start_xmit include/linux/netdevice.h:4989 [inline]
netdev_start_xmit include/linux/netdevice.h:5003 [inline]
xmit_one net/core/dev.c:3555 [inline]
dev_hard_start_xmit+0x242/0x770 net/core/dev.c:3571
sch_direct_xmit+0x2b6/0x5f0 net/sched/sch_generic.c:342
__dev_xmit_skb net/core/dev.c:3784 [inline]
__dev_queue_xmit+0x1912/0x3b10 net/core/dev.c:4325
neigh_output include/net/neighbour.h:542 [inline]
ip_finish_output2+0xe66/0x1360 net/ipv4/ip_output.c:235
igmpv3_send_cr net/ipv4/igmp.c:723 [inline]
igmp_ifc_timer_expire+0xb71/0xd90 net/ipv4/igmp.c:813
call_timer_fn+0x17e/0x600 kernel/time/timer.c:1700
expire_timers kernel/time/timer.c:1751 [inline]
__run_timers+0x621/0x830 kernel/time/timer.c:2038
run_timer_softirq+0x67/0xf0 kernel/time/timer.c:2051
__do_softirq+0x2bc/0x943 kernel/softirq.c:554
invoke_softirq kernel/softirq.c:428 [inline]
__irq_exit_rcu+0xf2/0x1c0 kernel/softirq.c:633
irq_exit_rcu+0x9/0x30 kernel/softirq.c:645
instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1076 [inline]
sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1076
</IRQ>
<TASK>
asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702
RIP: 0010:resched_offsets_ok kernel/sched/core.c:10127 [inline]
RIP: 0010:__might_resched+0x16f/0x780 kernel/sched/core.c:10142
Code: 00 4c 89 e8 48 c1 e8 03 48 ba 00 00 00 00 00 fc ff df 48 89 44 24 38 0f b6 04 10 84 c0 0f 85 87 04 00 00 41 8b 45 00 c1 e0 08 <01> d8 44 39 e0 0f 85 d6 00 00 00 44 89 64 24 1c 48 8d bc 24 a0 00
RSP: 0018:ffffc9000ee069e0 EFLAGS: 00000246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8880296a9e00
RDX: dffffc0000000000 RSI: ffff8880296a9e00 RDI: ffffffff8bfe8fa0
RBP: ffffc9000ee06b00 R08: ffffffff82326877 R09: 1ffff11002b5ad1b
R10: dffffc0000000000 R11: ffffed1002b5ad1c R12: 0000000000000000
R13: ffff8880296aa23c R14: 000000000000062a R15: 1ffff92001dc0d44
down_write+0x19/0x50 kernel/locking/rwsem.c:1578
kernfs_activate fs/kernfs/dir.c:1403 [inline]
kernfs_add_one+0x4af/0x8b0 fs/kernfs/dir.c:819
__kernfs_create_file+0x22e/0x2e0 fs/kernfs/file.c:1056
sysfs_add_file_mode_ns+0x24a/0x310 fs/sysfs/file.c:307
create_files fs/sysfs/group.c:64 [inline]
internal_create_group+0x4f4/0xf20 fs/sysfs/group.c:152
internal_create_groups fs/sysfs/group.c:192 [inline]
sysfs_create_groups+0x56/0x120 fs/sysfs/group.c:218
create_dir lib/kobject.c:78 [inline]
kobject_add_internal+0x472/0x8d0 lib/kobject.c:240
kobject_add_varg lib/kobject.c:374 [inline]
kobject_init_and_add+0x124/0x190 lib/kobject.c:457
netdev_queue_add_kobject net/core/net-sysfs.c:1706 [inline]
netdev_queue_update_kobjects+0x1f3/0x480 net/core/net-sysfs.c:1758
register_queue_kobjects net/core/net-sysfs.c:1819 [inline]
netdev_register_kobject+0x265/0x310 net/core/net-sysfs.c:2059
register_netdevice+0x1191/0x19c0 net/core/dev.c:10298
bond_newlink+0x3b/0x90 drivers/net/bonding/bond_netlink.c:576
rtnl_newlink_create net/core/rtnetlink.c:3506 [inline]
__rtnl_newlink net/core/rtnetlink.c:3726 [inline]
rtnl_newlink+0x158f/0x20a0 net/core/rtnetlink.c:3739
rtnetlink_rcv_msg+0x885/0x1040 net/core/rtnetlink.c:6606
netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2543
netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline]
netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1367
netlink_sendmsg+0xa3c/0xd70 net/netlink/af_netlink.c:1908
sock_sendmsg_nosec net/socket.c:730 [inline]
__sock_sendmsg+0x221/0x270 net/socket.c:745
__sys_sendto+0x3a4/0x4f0 net/socket.c:2191
__do_sys_sendto net/socket.c:2203 [inline]
__se_sys_sendto net/socket.c:2199 [inline]
__x64_sys_sendto+0xde/0x100 net/socket.c:2199
do_syscall_64+0xfb/0x240
entry_SYSCALL_64_after_hwframe+0x6d/0x75
RIP: 0033:0x7fc3fa87fa9c
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240212140700.2795436-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-02-12 14:07:00 +00:00
netdev_lockdep_set_classes ( dev ) ;
2015-09-15 14:30:05 -07:00
return 0 ;
2018-05-07 10:45:27 +03:00
cleanup_dst_cache_init :
dst_cache_destroy ( & tunnel - > dst_cache ) ;
return ret ;
2015-09-15 14:30:05 -07:00
}
static int ip6gre_tunnel_init ( struct net_device * dev )
{
struct ip6_tnl * tunnel ;
int ret ;
ret = ip6gre_tunnel_init_common ( dev ) ;
if ( ret )
return ret ;
tunnel = netdev_priv ( dev ) ;
2017-12-01 15:26:08 -08:00
if ( tunnel - > parms . collect_md )
return 0 ;
2021-10-12 09:06:34 -07:00
__dev_addr_set ( dev , & tunnel - > parms . laddr , sizeof ( struct in6_addr ) ) ;
2012-08-10 00:51:50 +00:00
memcpy ( dev - > broadcast , & tunnel - > parms . raddr , sizeof ( struct in6_addr ) ) ;
if ( ipv6_addr_any ( & tunnel - > parms . raddr ) )
dev - > header_ops = & ip6gre_header_ops ;
return 0 ;
}
static void ip6gre_fb_tunnel_init ( struct net_device * dev )
{
struct ip6_tnl * tunnel = netdev_priv ( dev ) ;
tunnel - > dev = dev ;
2013-08-13 17:51:12 +02:00
tunnel - > net = dev_net ( dev ) ;
2012-08-10 00:51:50 +00:00
strcpy ( tunnel - > parms . name , dev - > name ) ;
tunnel - > hlen = sizeof ( struct ipv6hdr ) + 4 ;
}
/* inet6 protocol hooks for GRE: receive and error handlers. */
static struct inet6_protocol ip6gre_protocol __read_mostly = {
	.handler	= gre_rcv,
	.err_handler	= ip6gre_err,
	.flags		= INET6_PROTO_FINAL,
};
2014-04-22 10:15:24 +02:00
static void ip6gre_destroy_tunnels ( struct net * net , struct list_head * head )
2012-08-10 00:51:50 +00:00
{
2014-04-22 10:15:24 +02:00
struct ip6gre_net * ign = net_generic ( net , ip6gre_net_id ) ;
struct net_device * dev , * aux ;
2012-08-10 00:51:50 +00:00
int prio ;
2014-04-22 10:15:24 +02:00
for_each_netdev_safe ( net , dev , aux )
if ( dev - > rtnl_link_ops = = & ip6gre_link_ops | |
2017-11-30 11:51:29 -08:00
dev - > rtnl_link_ops = = & ip6gre_tap_ops | |
dev - > rtnl_link_ops = = & ip6erspan_tap_ops )
2014-04-22 10:15:24 +02:00
unregister_netdevice_queue ( dev , head ) ;
2012-08-10 00:51:50 +00:00
for ( prio = 0 ; prio < 4 ; prio + + ) {
int h ;
2016-08-10 11:03:35 +02:00
for ( h = 0 ; h < IP6_GRE_HASH_SIZE ; h + + ) {
2012-08-10 00:51:50 +00:00
struct ip6_tnl * t ;
t = rtnl_dereference ( ign - > tunnels [ prio ] [ h ] ) ;
2015-03-29 14:00:05 +01:00
while ( t ) {
2014-04-22 10:15:24 +02:00
/* If dev is in the same netns, it has already
* been added to the list by the previous loop .
*/
if ( ! net_eq ( dev_net ( t - > dev ) , net ) )
unregister_netdevice_queue ( t - > dev ,
head ) ;
2012-08-10 00:51:50 +00:00
t = rtnl_dereference ( t - > next ) ;
}
}
}
}
/*
 * Per-netns init: allocate and register the fallback device "ip6gre0"
 * and publish it in the first wildcard tunnel slot.
 *
 * Nothing is created when net_has_fallback_tunnels() reports that this
 * namespace should not get fallback devices. Creating them for every new
 * namespace is costly, and an opt-out sysctl exists for hosts that use
 * many namespaces.
 *
 * Returns 0 on success (including the case where no fallback device is
 * wanted), -ENOMEM when the netdev cannot be allocated, or the error
 * returned by register_netdev().
 */
static int __net_init ip6gre_init_net(struct net *net)
{
	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
	struct net_device *ndev;
	int err;

	if (!net_has_fallback_tunnels(net))
		return 0;

	ndev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
			    NET_NAME_UNKNOWN, ip6gre_tunnel_setup);
	if (!ndev)
		return -ENOMEM;

	ign->fb_tunnel_dev = ndev;
	dev_net_set(ndev, net);

	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	ndev->features |= NETIF_F_NETNS_LOCAL;

	ip6gre_fb_tunnel_init(ndev);
	ndev->rtnl_link_ops = &ip6gre_link_ops;

	err = register_netdev(ndev);
	if (err) {
		free_netdev(ndev);
		return err;
	}

	rcu_assign_pointer(ign->tunnels_wc[0], netdev_priv(ndev));

	return 0;
}
2024-02-06 14:43:06 +00:00
/*
 * Batched netns exit, called with RTNL held: for each namespace on
 * @net_list, queue its ip6gre devices on @dev_to_kill.
 */
static void __net_exit ip6gre_exit_batch_rtnl(struct list_head *net_list,
					      struct list_head *dev_to_kill)
{
	struct net *net;

	ASSERT_RTNL();

	list_for_each_entry(net, net_list, exit_list) {
		ip6gre_destroy_tunnels(net, dev_to_kill);
	}
}
static struct pernet_operations ip6gre_net_ops = {
. init = ip6gre_init_net ,
2024-02-06 14:43:06 +00:00
. exit_batch_rtnl = ip6gre_exit_batch_rtnl ,
2012-08-10 00:51:50 +00:00
. id = & ip6gre_net_id ,
. size = sizeof ( struct ip6gre_net ) ,
} ;
2017-06-25 23:56:01 +02:00
static int ip6gre_tunnel_validate ( struct nlattr * tb [ ] , struct nlattr * data [ ] ,
struct netlink_ext_ack * extack )
2012-08-10 00:51:50 +00:00
{
__be16 flags ;
if ( ! data )
return 0 ;
flags = 0 ;
if ( data [ IFLA_GRE_IFLAGS ] )
flags | = nla_get_be16 ( data [ IFLA_GRE_IFLAGS ] ) ;
if ( data [ IFLA_GRE_OFLAGS ] )
flags | = nla_get_be16 ( data [ IFLA_GRE_OFLAGS ] ) ;
if ( flags & ( GRE_VERSION | GRE_ROUTING ) )
return - EINVAL ;
return 0 ;
}
2017-06-25 23:56:01 +02:00
static int ip6gre_tap_validate ( struct nlattr * tb [ ] , struct nlattr * data [ ] ,
struct netlink_ext_ack * extack )
2012-08-10 00:51:50 +00:00
{
struct in6_addr daddr ;
if ( tb [ IFLA_ADDRESS ] ) {
if ( nla_len ( tb [ IFLA_ADDRESS ] ) ! = ETH_ALEN )
return - EINVAL ;
if ( ! is_valid_ether_addr ( nla_data ( tb [ IFLA_ADDRESS ] ) ) )
return - EADDRNOTAVAIL ;
}
if ( ! data )
goto out ;
if ( data [ IFLA_GRE_REMOTE ] ) {
2015-03-29 16:59:26 +02:00
daddr = nla_get_in6_addr ( data [ IFLA_GRE_REMOTE ] ) ;
2012-08-10 00:51:50 +00:00
if ( ipv6_addr_any ( & daddr ) )
return - EINVAL ;
}
out :
2017-06-25 23:56:01 +02:00
return ip6gre_tunnel_validate ( tb , data , extack ) ;
2012-08-10 00:51:50 +00:00
}
2017-11-30 11:51:29 -08:00
static int ip6erspan_tap_validate ( struct nlattr * tb [ ] , struct nlattr * data [ ] ,
struct netlink_ext_ack * extack )
{
__be16 flags = 0 ;
2017-12-13 16:38:57 -08:00
int ret , ver = 0 ;
2017-11-30 11:51:29 -08:00
if ( ! data )
return 0 ;
ret = ip6gre_tap_validate ( tb , data , extack ) ;
if ( ret )
return ret ;
/* ERSPAN should only have GRE sequence and key flag */
if ( data [ IFLA_GRE_OFLAGS ] )
flags | = nla_get_be16 ( data [ IFLA_GRE_OFLAGS ] ) ;
if ( data [ IFLA_GRE_IFLAGS ] )
flags | = nla_get_be16 ( data [ IFLA_GRE_IFLAGS ] ) ;
if ( ! data [ IFLA_GRE_COLLECT_METADATA ] & &
flags ! = ( GRE_SEQ | GRE_KEY ) )
return - EINVAL ;
/* ERSPAN Session ID only has 10-bit. Since we reuse
* 32 - bit key field as ID , check it ' s range .
*/
if ( data [ IFLA_GRE_IKEY ] & &
( ntohl ( nla_get_be32 ( data [ IFLA_GRE_IKEY ] ) ) & ~ ID_MASK ) )
return - EINVAL ;
if ( data [ IFLA_GRE_OKEY ] & &
( ntohl ( nla_get_be32 ( data [ IFLA_GRE_OKEY ] ) ) & ~ ID_MASK ) )
return - EINVAL ;
2017-12-13 16:38:57 -08:00
if ( data [ IFLA_GRE_ERSPAN_VER ] ) {
ver = nla_get_u8 ( data [ IFLA_GRE_ERSPAN_VER ] ) ;
if ( ver ! = 1 & & ver ! = 2 )
2017-11-30 11:51:29 -08:00
return - EINVAL ;
}
2017-12-13 16:38:57 -08:00
if ( ver = = 1 ) {
if ( data [ IFLA_GRE_ERSPAN_INDEX ] ) {
u32 index = nla_get_u32 ( data [ IFLA_GRE_ERSPAN_INDEX ] ) ;
if ( index & ~ INDEX_MASK )
return - EINVAL ;
}
} else if ( ver = = 2 ) {
if ( data [ IFLA_GRE_ERSPAN_DIR ] ) {
u16 dir = nla_get_u8 ( data [ IFLA_GRE_ERSPAN_DIR ] ) ;
if ( dir & ~ ( DIR_MASK > > DIR_OFFSET ) )
return - EINVAL ;
}
if ( data [ IFLA_GRE_ERSPAN_HWID ] ) {
u16 hwid = nla_get_u16 ( data [ IFLA_GRE_ERSPAN_HWID ] ) ;
if ( hwid & ~ ( HWID_MASK > > HWID_OFFSET ) )
return - EINVAL ;
}
}
2017-11-30 11:51:29 -08:00
return 0 ;
}
2012-08-10 00:51:50 +00:00
2019-02-15 15:10:32 +01:00
static void ip6erspan_set_version ( struct nlattr * data [ ] ,
struct __ip6_tnl_parm * parms )
{
2019-02-20 09:23:03 +01:00
if ( ! data )
return ;
2019-02-15 15:10:32 +01:00
parms - > erspan_ver = 1 ;
if ( data [ IFLA_GRE_ERSPAN_VER ] )
parms - > erspan_ver = nla_get_u8 ( data [ IFLA_GRE_ERSPAN_VER ] ) ;
if ( parms - > erspan_ver = = 1 ) {
if ( data [ IFLA_GRE_ERSPAN_INDEX ] )
parms - > index = nla_get_u32 ( data [ IFLA_GRE_ERSPAN_INDEX ] ) ;
} else if ( parms - > erspan_ver = = 2 ) {
if ( data [ IFLA_GRE_ERSPAN_DIR ] )
parms - > dir = nla_get_u8 ( data [ IFLA_GRE_ERSPAN_DIR ] ) ;
if ( data [ IFLA_GRE_ERSPAN_HWID ] )
parms - > hwid = nla_get_u16 ( data [ IFLA_GRE_ERSPAN_HWID ] ) ;
}
}
2012-08-10 00:51:50 +00:00
static void ip6gre_netlink_parms ( struct nlattr * data [ ] ,
struct __ip6_tnl_parm * parms )
{
memset ( parms , 0 , sizeof ( * parms ) ) ;
if ( ! data )
return ;
if ( data [ IFLA_GRE_LINK ] )
parms - > link = nla_get_u32 ( data [ IFLA_GRE_LINK ] ) ;
if ( data [ IFLA_GRE_IFLAGS ] )
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
gre_flags_to_tnl_flags ( parms - > i_flags ,
nla_get_be16 ( data [ IFLA_GRE_IFLAGS ] ) ) ;
2012-08-10 00:51:50 +00:00
if ( data [ IFLA_GRE_OFLAGS ] )
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
gre_flags_to_tnl_flags ( parms - > o_flags ,
nla_get_be16 ( data [ IFLA_GRE_OFLAGS ] ) ) ;
2012-08-10 00:51:50 +00:00
if ( data [ IFLA_GRE_IKEY ] )
parms - > i_key = nla_get_be32 ( data [ IFLA_GRE_IKEY ] ) ;
if ( data [ IFLA_GRE_OKEY ] )
parms - > o_key = nla_get_be32 ( data [ IFLA_GRE_OKEY ] ) ;
if ( data [ IFLA_GRE_LOCAL ] )
2015-03-29 16:59:26 +02:00
parms - > laddr = nla_get_in6_addr ( data [ IFLA_GRE_LOCAL ] ) ;
2012-08-10 00:51:50 +00:00
if ( data [ IFLA_GRE_REMOTE ] )
2015-03-29 16:59:26 +02:00
parms - > raddr = nla_get_in6_addr ( data [ IFLA_GRE_REMOTE ] ) ;
2012-08-10 00:51:50 +00:00
if ( data [ IFLA_GRE_TTL ] )
parms - > hop_limit = nla_get_u8 ( data [ IFLA_GRE_TTL ] ) ;
if ( data [ IFLA_GRE_ENCAP_LIMIT ] )
parms - > encap_limit = nla_get_u8 ( data [ IFLA_GRE_ENCAP_LIMIT ] ) ;
if ( data [ IFLA_GRE_FLOWINFO ] )
2016-09-24 14:01:04 -04:00
parms - > flowinfo = nla_get_be32 ( data [ IFLA_GRE_FLOWINFO ] ) ;
2012-08-10 00:51:50 +00:00
if ( data [ IFLA_GRE_FLAGS ] )
parms - > flags = nla_get_u32 ( data [ IFLA_GRE_FLAGS ] ) ;
2017-04-19 12:30:53 -04:00
if ( data [ IFLA_GRE_FWMARK ] )
parms - > fwmark = nla_get_u32 ( data [ IFLA_GRE_FWMARK ] ) ;
2017-11-30 11:51:29 -08:00
2017-12-01 15:26:08 -08:00
if ( data [ IFLA_GRE_COLLECT_METADATA ] )
parms - > collect_md = true ;
2012-08-10 00:51:50 +00:00
}
static int ip6gre_tap_init ( struct net_device * dev )
{
2015-09-15 14:30:05 -07:00
int ret ;
2012-08-10 00:51:50 +00:00
2015-09-15 14:30:05 -07:00
ret = ip6gre_tunnel_init_common ( dev ) ;
if ( ret )
return ret ;
2012-08-10 00:51:50 +00:00
2016-06-08 20:15:43 +01:00
dev - > priv_flags | = IFF_LIVE_ADDR_CHANGE ;
2012-08-10 00:51:50 +00:00
return 0 ;
}
static const struct net_device_ops ip6gre_tap_netdev_ops = {
. ndo_init = ip6gre_tap_init ,
. ndo_uninit = ip6gre_tunnel_uninit ,
. ndo_start_xmit = ip6gre_tunnel_xmit ,
. ndo_set_mac_address = eth_mac_addr ,
. ndo_validate_addr = eth_validate_addr ,
2016-04-29 17:12:21 -07:00
. ndo_change_mtu = ip6_tnl_change_mtu ,
2015-04-02 17:07:01 +02:00
. ndo_get_iflink = ip6_tnl_get_iflink ,
2012-08-10 00:51:50 +00:00
} ;
2018-05-17 16:36:51 +02:00
static int ip6erspan_calc_hlen ( struct ip6_tnl * tunnel )
{
int t_hlen ;
tunnel - > tun_hlen = 8 ;
tunnel - > hlen = tunnel - > tun_hlen + tunnel - > encap_hlen +
erspan_hdr_len ( tunnel - > parms . erspan_ver ) ;
t_hlen = tunnel - > hlen + sizeof ( struct ipv6hdr ) ;
net: ipv6_gre: Fix GRO to work on IPv6 over GRE tap
IPv6 GRO over GRE tap is not working while GRO is not set
over the native interface.
gro_list_prepare function updates the same_flow variable
of existing sessions to 1 if their mac headers match the one
of the incoming packet.
same_flow is used to filter out non-matching sessions and keep
potential ones for aggregation.
The number of bytes to compare should be the number of bytes
in the mac headers. In gro_list_prepare this number is set to
be skb->dev->hard_header_len. For GRE interfaces this hard_header_len
should be as it is set in the initialization process (when GRE is
created), it should not be overridden. But currently it is being overridden
by the value that is actually supposed to represent the needed_headroom.
Therefore, the number of bytes compared in order to decide whether the
the mac headers are the same is greater than the length of the headers.
As it's documented in netdevice.h, hard_header_len is the maximum
hardware header length, and needed_headroom is the extra headroom
the hardware may need.
hard_header_len is basically all the bytes received by the physical
till layer 3 header of the packet received by the interface.
For example, if the interface is a GRE tap then the needed_headroom
should be the total length of the following headers:
IP header of the physical, GRE header, mac header of GRE.
It is often used to calculate the MTU of the created interface.
This patch removes the override of the hard_header_len, and
assigns the calculated value to needed_headroom.
This way, the comparison in gro_list_prepare is really of
the mac headers, and if the packets have the same mac headers
the same_flow will be set to 1.
Performance testing: 45% higher bandwidth.
Measuring bandwidth of single-stream IPv4 TCP traffic over IPv6
GRE tap while GRO is not set on the native.
NIC: ConnectX-4LX
Before (GRO not working) : 7.2 Gbits/sec
After (GRO working): 10.5 Gbits/sec
Signed-off-by: Maria Pasechnik <mariap@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-08-08 11:46:30 +03:00
tunnel - > dev - > needed_headroom = LL_MAX_HEADER + t_hlen ;
2018-05-17 16:36:51 +02:00
return t_hlen ;
}
2017-11-30 11:51:29 -08:00
static int ip6erspan_tap_init ( struct net_device * dev )
{
struct ip6_tnl * tunnel ;
int t_hlen ;
int ret ;
tunnel = netdev_priv ( dev ) ;
tunnel - > dev = dev ;
tunnel - > net = dev_net ( dev ) ;
strcpy ( tunnel - > parms . name , dev - > name ) ;
ret = dst_cache_init ( & tunnel - > dst_cache , GFP_KERNEL ) ;
2018-05-07 10:45:27 +03:00
if ( ret )
2024-04-12 08:19:25 -07:00
return ret ;
2018-05-07 10:45:27 +03:00
ret = gro_cells_init ( & tunnel - > gro_cells , dev ) ;
if ( ret )
goto cleanup_dst_cache_init ;
2017-11-30 11:51:29 -08:00
2018-05-17 16:36:51 +02:00
t_hlen = ip6erspan_calc_hlen ( tunnel ) ;
2017-11-30 11:51:29 -08:00
dev - > mtu = ETH_DATA_LEN - t_hlen ;
if ( dev - > type = = ARPHRD_ETHER )
dev - > mtu - = ETH_HLEN ;
if ( ! ( tunnel - > parms . flags & IP6_TNL_F_IGN_ENCAP_LIMIT ) )
dev - > mtu - = 8 ;
dev - > priv_flags | = IFF_LIVE_ADDR_CHANGE ;
2018-05-17 16:36:51 +02:00
ip6erspan_tnl_link_config ( tunnel , 1 ) ;
2017-11-30 11:51:29 -08:00
2022-06-07 21:39:55 -07:00
netdev_hold ( dev , & tunnel - > dev_tracker , GFP_KERNEL ) ;
net: add netdev_lockdep_set_classes() to virtual drivers
Based on a syzbot report, it appears many virtual
drivers do not yet use netdev_lockdep_set_classes(),
triggerring lockdep false positives.
WARNING: possible recursive locking detected
6.8.0-rc4-next-20240212-syzkaller #0 Not tainted
syz-executor.0/19016 is trying to acquire lock:
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
ffff8880162cb298 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
but task is already holding lock:
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
lock(_xmit_ETHER#2);
lock(_xmit_ETHER#2);
*** DEADLOCK ***
May be due to missing lock nesting notation
9 locks held by syz-executor.0/19016:
#0: ffffffff8f385208 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock net/core/rtnetlink.c:79 [inline]
#0: ffffffff8f385208 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x82c/0x1040 net/core/rtnetlink.c:6603
#1: ffffc90000a08c00 ((&in_dev->mr_ifc_timer)){+.-.}-{0:0}, at: call_timer_fn+0xc0/0x600 kernel/time/timer.c:1697
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline]
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline]
#2: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1360 net/ipv4/ip_output.c:228
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: local_bh_disable include/linux/bottom_half.h:20 [inline]
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: rcu_read_lock_bh include/linux/rcupdate.h:802 [inline]
#3: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x2c4/0x3b10 net/core/dev.c:4284
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: spin_trylock include/linux/spinlock.h:361 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: qdisc_run_begin include/net/sch_generic.h:195 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_xmit_skb net/core/dev.c:3771 [inline]
#4: ffff8880416e3258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x1262/0x3b10 net/core/dev.c:4325
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: __netif_tx_lock include/linux/netdevice.h:4452 [inline]
#5: ffff8880223db4d8 (_xmit_ETHER#2){+.-.}-{2:2}, at: sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:298 [inline]
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:750 [inline]
#6: ffffffff8e131520 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1360 net/ipv4/ip_output.c:228
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: local_bh_disable include/linux/bottom_half.h:20 [inline]
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: rcu_read_lock_bh include/linux/rcupdate.h:802 [inline]
#7: ffffffff8e131580 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x2c4/0x3b10 net/core/dev.c:4284
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: spin_trylock include/linux/spinlock.h:361 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: qdisc_run_begin include/net/sch_generic.h:195 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_xmit_skb net/core/dev.c:3771 [inline]
#8: ffff888014d9d258 (dev->qdisc_tx_busylock ?: &qdisc_tx_busylock){+...}-{2:2}, at: __dev_queue_xmit+0x1262/0x3b10 net/core/dev.c:4325
stack backtrace:
CPU: 1 PID: 19016 Comm: syz-executor.0 Not tainted 6.8.0-rc4-next-20240212-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114
check_deadlock kernel/locking/lockdep.c:3062 [inline]
validate_chain+0x15c1/0x58e0 kernel/locking/lockdep.c:3856
__lock_acquire+0x1346/0x1fd0 kernel/locking/lockdep.c:5137
lock_acquire+0x1e4/0x530 kernel/locking/lockdep.c:5754
__raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline]
_raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154
spin_lock include/linux/spinlock.h:351 [inline]
__netif_tx_lock include/linux/netdevice.h:4452 [inline]
sch_direct_xmit+0x1c4/0x5f0 net/sched/sch_generic.c:340
__dev_xmit_skb net/core/dev.c:3784 [inline]
__dev_queue_xmit+0x1912/0x3b10 net/core/dev.c:4325
neigh_output include/net/neighbour.h:542 [inline]
ip_finish_output2+0xe66/0x1360 net/ipv4/ip_output.c:235
iptunnel_xmit+0x540/0x9b0 net/ipv4/ip_tunnel_core.c:82
ip_tunnel_xmit+0x20ee/0x2960 net/ipv4/ip_tunnel.c:831
erspan_xmit+0x9de/0x1460 net/ipv4/ip_gre.c:720
__netdev_start_xmit include/linux/netdevice.h:4989 [inline]
netdev_start_xmit include/linux/netdevice.h:5003 [inline]
xmit_one net/core/dev.c:3555 [inline]
dev_hard_start_xmit+0x242/0x770 net/core/dev.c:3571
sch_direct_xmit+0x2b6/0x5f0 net/sched/sch_generic.c:342
__dev_xmit_skb net/core/dev.c:3784 [inline]
__dev_queue_xmit+0x1912/0x3b10 net/core/dev.c:4325
neigh_output include/net/neighbour.h:542 [inline]
ip_finish_output2+0xe66/0x1360 net/ipv4/ip_output.c:235
igmpv3_send_cr net/ipv4/igmp.c:723 [inline]
igmp_ifc_timer_expire+0xb71/0xd90 net/ipv4/igmp.c:813
call_timer_fn+0x17e/0x600 kernel/time/timer.c:1700
expire_timers kernel/time/timer.c:1751 [inline]
__run_timers+0x621/0x830 kernel/time/timer.c:2038
run_timer_softirq+0x67/0xf0 kernel/time/timer.c:2051
__do_softirq+0x2bc/0x943 kernel/softirq.c:554
invoke_softirq kernel/softirq.c:428 [inline]
__irq_exit_rcu+0xf2/0x1c0 kernel/softirq.c:633
irq_exit_rcu+0x9/0x30 kernel/softirq.c:645
instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1076 [inline]
sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1076
</IRQ>
<TASK>
asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702
RIP: 0010:resched_offsets_ok kernel/sched/core.c:10127 [inline]
RIP: 0010:__might_resched+0x16f/0x780 kernel/sched/core.c:10142
Code: 00 4c 89 e8 48 c1 e8 03 48 ba 00 00 00 00 00 fc ff df 48 89 44 24 38 0f b6 04 10 84 c0 0f 85 87 04 00 00 41 8b 45 00 c1 e0 08 <01> d8 44 39 e0 0f 85 d6 00 00 00 44 89 64 24 1c 48 8d bc 24 a0 00
RSP: 0018:ffffc9000ee069e0 EFLAGS: 00000246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8880296a9e00
RDX: dffffc0000000000 RSI: ffff8880296a9e00 RDI: ffffffff8bfe8fa0
RBP: ffffc9000ee06b00 R08: ffffffff82326877 R09: 1ffff11002b5ad1b
R10: dffffc0000000000 R11: ffffed1002b5ad1c R12: 0000000000000000
R13: ffff8880296aa23c R14: 000000000000062a R15: 1ffff92001dc0d44
down_write+0x19/0x50 kernel/locking/rwsem.c:1578
kernfs_activate fs/kernfs/dir.c:1403 [inline]
kernfs_add_one+0x4af/0x8b0 fs/kernfs/dir.c:819
__kernfs_create_file+0x22e/0x2e0 fs/kernfs/file.c:1056
sysfs_add_file_mode_ns+0x24a/0x310 fs/sysfs/file.c:307
create_files fs/sysfs/group.c:64 [inline]
internal_create_group+0x4f4/0xf20 fs/sysfs/group.c:152
internal_create_groups fs/sysfs/group.c:192 [inline]
sysfs_create_groups+0x56/0x120 fs/sysfs/group.c:218
create_dir lib/kobject.c:78 [inline]
kobject_add_internal+0x472/0x8d0 lib/kobject.c:240
kobject_add_varg lib/kobject.c:374 [inline]
kobject_init_and_add+0x124/0x190 lib/kobject.c:457
netdev_queue_add_kobject net/core/net-sysfs.c:1706 [inline]
netdev_queue_update_kobjects+0x1f3/0x480 net/core/net-sysfs.c:1758
register_queue_kobjects net/core/net-sysfs.c:1819 [inline]
netdev_register_kobject+0x265/0x310 net/core/net-sysfs.c:2059
register_netdevice+0x1191/0x19c0 net/core/dev.c:10298
bond_newlink+0x3b/0x90 drivers/net/bonding/bond_netlink.c:576
rtnl_newlink_create net/core/rtnetlink.c:3506 [inline]
__rtnl_newlink net/core/rtnetlink.c:3726 [inline]
rtnl_newlink+0x158f/0x20a0 net/core/rtnetlink.c:3739
rtnetlink_rcv_msg+0x885/0x1040 net/core/rtnetlink.c:6606
netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2543
netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline]
netlink_unicast+0x7ea/0x980 net/netlink/af_netlink.c:1367
netlink_sendmsg+0xa3c/0xd70 net/netlink/af_netlink.c:1908
sock_sendmsg_nosec net/socket.c:730 [inline]
__sock_sendmsg+0x221/0x270 net/socket.c:745
__sys_sendto+0x3a4/0x4f0 net/socket.c:2191
__do_sys_sendto net/socket.c:2203 [inline]
__se_sys_sendto net/socket.c:2199 [inline]
__x64_sys_sendto+0xde/0x100 net/socket.c:2199
do_syscall_64+0xfb/0x240
entry_SYSCALL_64_after_hwframe+0x6d/0x75
RIP: 0033:0x7fc3fa87fa9c
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240212140700.2795436-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2024-02-12 14:07:00 +00:00
netdev_lockdep_set_classes ( dev ) ;
2017-11-30 11:51:29 -08:00
return 0 ;
2018-05-07 10:45:27 +03:00
cleanup_dst_cache_init :
dst_cache_destroy ( & tunnel - > dst_cache ) ;
return ret ;
2017-11-30 11:51:29 -08:00
}
static const struct net_device_ops ip6erspan_netdev_ops = {
. ndo_init = ip6erspan_tap_init ,
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
. ndo_uninit = ip6erspan_tunnel_uninit ,
2017-11-30 11:51:29 -08:00
. ndo_start_xmit = ip6erspan_tunnel_xmit ,
. ndo_set_mac_address = eth_mac_addr ,
. ndo_validate_addr = eth_validate_addr ,
. ndo_change_mtu = ip6_tnl_change_mtu ,
. ndo_get_iflink = ip6_tnl_get_iflink ,
} ;
2012-08-10 00:51:50 +00:00
static void ip6gre_tap_setup ( struct net_device * dev )
{
ether_setup ( dev ) ;
2017-12-18 14:25:09 +08:00
dev - > max_mtu = 0 ;
2012-08-10 00:51:50 +00:00
dev - > netdev_ops = & ip6gre_tap_netdev_ops ;
net: Fix inconsistent teardown and release of private netdev state.
Network devices can allocate reasources and private memory using
netdev_ops->ndo_init(). However, the release of these resources
can occur in one of two different places.
Either netdev_ops->ndo_uninit() or netdev->destructor().
The decision of which operation frees the resources depends upon
whether it is necessary for all netdev refs to be released before it
is safe to perform the freeing.
netdev_ops->ndo_uninit() presumably can occur right after the
NETDEV_UNREGISTER notifier completes and the unicast and multicast
address lists are flushed.
netdev->destructor(), on the other hand, does not run until the
netdev references all go away.
Further complicating the situation is that netdev->destructor()
almost universally does also a free_netdev().
This creates a problem for the logic in register_netdevice().
Because all callers of register_netdevice() manage the freeing
of the netdev, and invoke free_netdev(dev) if register_netdevice()
fails.
If netdev_ops->ndo_init() succeeds, but something else fails inside
of register_netdevice(), it does call ndo_ops->ndo_uninit(). But
it is not able to invoke netdev->destructor().
This is because netdev->destructor() will do a free_netdev() and
then the caller of register_netdevice() will do the same.
However, this means that the resources that would normally be released
by netdev->destructor() will not be.
Over the years drivers have added local hacks to deal with this, by
invoking their destructor parts by hand when register_netdevice()
fails.
Many drivers do not try to deal with this, and instead we have leaks.
Let's close this hole by formalizing the distinction between what
private things need to be freed up by netdev->destructor() and whether
the driver needs unregister_netdevice() to perform the free_netdev().
netdev->priv_destructor() performs all actions to free up the private
resources that used to be freed by netdev->destructor(), except for
free_netdev().
netdev->needs_free_netdev is a boolean that indicates whether
free_netdev() should be done at the end of unregister_netdevice().
Now, register_netdevice() can sanely release all resources after
ndo_ops->ndo_init() succeeds, by invoking both ndo_ops->ndo_uninit()
and netdev->priv_destructor().
And at the end of unregister_netdevice(), we invoke
netdev->priv_destructor() and optionally call free_netdev().
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-05-08 12:52:56 -04:00
dev - > needs_free_netdev = true ;
dev - > priv_destructor = ip6gre_dev_free ;
2012-08-10 00:51:50 +00:00
2024-04-12 08:19:25 -07:00
dev - > pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS ;
2016-02-17 15:32:53 +01:00
dev - > priv_flags & = ~ IFF_TX_SKB_SHARING ;
2016-06-08 20:15:43 +01:00
dev - > priv_flags | = IFF_LIVE_ADDR_CHANGE ;
2017-09-28 13:23:50 +08:00
netif_keep_dst ( dev ) ;
2012-08-10 00:51:50 +00:00
}
2016-05-18 09:06:19 -07:00
static bool ip6gre_netlink_encap_parms ( struct nlattr * data [ ] ,
struct ip_tunnel_encap * ipencap )
{
bool ret = false ;
memset ( ipencap , 0 , sizeof ( * ipencap ) ) ;
if ( ! data )
return ret ;
if ( data [ IFLA_GRE_ENCAP_TYPE ] ) {
ret = true ;
ipencap - > type = nla_get_u16 ( data [ IFLA_GRE_ENCAP_TYPE ] ) ;
}
if ( data [ IFLA_GRE_ENCAP_FLAGS ] ) {
ret = true ;
ipencap - > flags = nla_get_u16 ( data [ IFLA_GRE_ENCAP_FLAGS ] ) ;
}
if ( data [ IFLA_GRE_ENCAP_SPORT ] ) {
ret = true ;
ipencap - > sport = nla_get_be16 ( data [ IFLA_GRE_ENCAP_SPORT ] ) ;
}
if ( data [ IFLA_GRE_ENCAP_DPORT ] ) {
ret = true ;
ipencap - > dport = nla_get_be16 ( data [ IFLA_GRE_ENCAP_DPORT ] ) ;
}
return ret ;
}
2018-05-17 16:36:39 +02:00
static int ip6gre_newlink_common ( struct net * src_net , struct net_device * dev ,
struct nlattr * tb [ ] , struct nlattr * data [ ] ,
struct netlink_ext_ack * extack )
2012-08-10 00:51:50 +00:00
{
struct ip6_tnl * nt ;
2016-05-18 09:06:19 -07:00
struct ip_tunnel_encap ipencap ;
2012-08-10 00:51:50 +00:00
int err ;
nt = netdev_priv ( dev ) ;
2016-05-18 09:06:19 -07:00
if ( ip6gre_netlink_encap_parms ( data , & ipencap ) ) {
int err = ip6_tnl_encap_setup ( nt , & ipencap ) ;
if ( err < 0 )
return err ;
}
2012-08-10 00:51:50 +00:00
if ( dev - > type = = ARPHRD_ETHER & & ! tb [ IFLA_ADDRESS ] )
eth_hw_addr_random ( dev ) ;
nt - > dev = dev ;
2013-08-13 17:51:12 +02:00
nt - > net = dev_net ( dev ) ;
2012-08-10 00:51:50 +00:00
err = register_netdevice ( dev ) ;
if ( err )
goto out ;
ip6_gre: init dev->mtu and dev->hard_header_len correctly
Commit b05229f44228 ("gre6: Cleanup GREv6 transmit path,
call common GRE functions") moved dev->mtu initialization
from ip6gre_tunnel_setup() to ip6gre_tunnel_init(), as a
result, the previously set values, before ndo_init(), are
reset in the following cases:
* rtnl_create_link() can update dev->mtu from IFLA_MTU
parameter.
* ip6gre_tnl_link_config() is invoked before ndo_init() in
netlink and ioctl setup, so ndo_init() can reset MTU
adjustments with the lower device MTU as well, dev->mtu
and dev->hard_header_len.
Not applicable for ip6gretap because it has one more call
to ip6gre_tnl_link_config(tunnel, 1) in ip6gre_tap_init().
Fix the first case by updating dev->mtu with 'tb[IFLA_MTU]'
parameter if a user sets it manually on a device creation,
and fix the second one by moving ip6gre_tnl_link_config()
call after register_netdevice().
Fixes: b05229f44228 ("gre6: Cleanup GREv6 transmit path, call common GRE functions")
Fixes: db2ec95d1ba4 ("ip6_gre: Fix MTU setting")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-01-18 20:51:12 +03:00
if ( tb [ IFLA_MTU ] )
ip6_tnl_change_mtu ( dev , nla_get_u32 ( tb [ IFLA_MTU ] ) ) ;
2012-08-10 00:51:50 +00:00
out :
return err ;
}
2018-05-17 16:36:39 +02:00
static int ip6gre_newlink ( struct net * src_net , struct net_device * dev ,
struct nlattr * tb [ ] , struct nlattr * data [ ] ,
struct netlink_ext_ack * extack )
{
struct ip6_tnl * nt = netdev_priv ( dev ) ;
struct net * net = dev_net ( dev ) ;
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
struct ip6gre_net * ign ;
int err ;
ip6gre_netlink_parms ( data , & nt - > parms ) ;
ign = net_generic ( net , ip6gre_net_id ) ;
if ( nt - > parms . collect_md ) {
if ( rtnl_dereference ( ign - > collect_md_tun ) )
return - EEXIST ;
} else {
if ( ip6gre_tunnel_find ( net , & nt - > parms , dev - > type ) )
return - EEXIST ;
}
2018-05-17 16:36:39 +02:00
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
err = ip6gre_newlink_common ( src_net , dev , tb , data , extack ) ;
2018-05-17 16:36:39 +02:00
if ( ! err ) {
ip6gre_tnl_link_config ( nt , ! tb [ IFLA_MTU ] ) ;
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
ip6gre_tunnel_link_md ( ign , nt ) ;
2018-05-17 16:36:39 +02:00
ip6gre_tunnel_link ( net_generic ( net , ip6gre_net_id ) , nt ) ;
}
return err ;
}
2018-05-17 16:36:45 +02:00
static struct ip6_tnl *
ip6gre_changelink_common ( struct net_device * dev , struct nlattr * tb [ ] ,
struct nlattr * data [ ] , struct __ip6_tnl_parm * p_p ,
struct netlink_ext_ack * extack )
2012-08-10 00:51:50 +00:00
{
2014-04-22 10:15:24 +02:00
struct ip6_tnl * t , * nt = netdev_priv ( dev ) ;
struct net * net = nt - > net ;
2012-08-10 00:51:50 +00:00
struct ip6gre_net * ign = net_generic ( net , ip6gre_net_id ) ;
2016-05-18 09:06:19 -07:00
struct ip_tunnel_encap ipencap ;
2012-08-10 00:51:50 +00:00
if ( dev = = ign - > fb_tunnel_dev )
2018-05-17 16:36:45 +02:00
return ERR_PTR ( - EINVAL ) ;
2012-08-10 00:51:50 +00:00
2016-05-18 09:06:19 -07:00
if ( ip6gre_netlink_encap_parms ( data , & ipencap ) ) {
int err = ip6_tnl_encap_setup ( nt , & ipencap ) ;
if ( err < 0 )
2018-05-17 16:36:45 +02:00
return ERR_PTR ( err ) ;
2016-05-18 09:06:19 -07:00
}
2018-05-17 16:36:45 +02:00
ip6gre_netlink_parms ( data , p_p ) ;
2012-08-10 00:51:50 +00:00
2018-05-17 16:36:45 +02:00
t = ip6gre_tunnel_locate ( net , p_p , 0 ) ;
2012-08-10 00:51:50 +00:00
if ( t ) {
if ( t - > dev ! = dev )
2018-05-17 16:36:45 +02:00
return ERR_PTR ( - EEXIST ) ;
2012-08-10 00:51:50 +00:00
} else {
t = nt ;
}
2018-05-17 16:36:45 +02:00
return t ;
}
static int ip6gre_changelink ( struct net_device * dev , struct nlattr * tb [ ] ,
struct nlattr * data [ ] ,
struct netlink_ext_ack * extack )
{
2019-01-09 10:57:21 +01:00
struct ip6_tnl * t = netdev_priv ( dev ) ;
struct ip6gre_net * ign = net_generic ( t - > net , ip6gre_net_id ) ;
2018-05-17 16:36:45 +02:00
struct __ip6_tnl_parm p ;
t = ip6gre_changelink_common ( dev , tb , data , & p , extack ) ;
if ( IS_ERR ( t ) )
return PTR_ERR ( t ) ;
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
ip6gre_tunnel_unlink_md ( ign , t ) ;
2015-12-03 17:21:50 +01:00
ip6gre_tunnel_unlink ( ign , t ) ;
ip6gre_tnl_change ( t , & p , ! tb [ IFLA_MTU ] ) ;
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
ip6gre_tunnel_link_md ( ign , t ) ;
2015-12-03 17:21:50 +01:00
ip6gre_tunnel_link ( ign , t ) ;
2012-08-10 00:51:50 +00:00
return 0 ;
}
2014-04-14 17:11:38 +02:00
static void ip6gre_dellink ( struct net_device * dev , struct list_head * head )
{
struct net * net = dev_net ( dev ) ;
struct ip6gre_net * ign = net_generic ( net , ip6gre_net_id ) ;
if ( dev ! = ign - > fb_tunnel_dev )
unregister_netdevice_queue ( dev , head ) ;
}
2012-08-10 00:51:50 +00:00
static size_t ip6gre_get_size ( const struct net_device * dev )
{
return
/* IFLA_GRE_LINK */
nla_total_size ( 4 ) +
/* IFLA_GRE_IFLAGS */
nla_total_size ( 2 ) +
/* IFLA_GRE_OFLAGS */
nla_total_size ( 2 ) +
/* IFLA_GRE_IKEY */
nla_total_size ( 4 ) +
/* IFLA_GRE_OKEY */
nla_total_size ( 4 ) +
/* IFLA_GRE_LOCAL */
2012-11-09 05:34:56 +00:00
nla_total_size ( sizeof ( struct in6_addr ) ) +
2012-08-10 00:51:50 +00:00
/* IFLA_GRE_REMOTE */
2012-11-09 05:34:56 +00:00
nla_total_size ( sizeof ( struct in6_addr ) ) +
2012-08-10 00:51:50 +00:00
/* IFLA_GRE_TTL */
nla_total_size ( 1 ) +
/* IFLA_GRE_ENCAP_LIMIT */
nla_total_size ( 1 ) +
/* IFLA_GRE_FLOWINFO */
nla_total_size ( 4 ) +
/* IFLA_GRE_FLAGS */
nla_total_size ( 4 ) +
2016-05-18 09:06:19 -07:00
/* IFLA_GRE_ENCAP_TYPE */
nla_total_size ( 2 ) +
/* IFLA_GRE_ENCAP_FLAGS */
nla_total_size ( 2 ) +
/* IFLA_GRE_ENCAP_SPORT */
nla_total_size ( 2 ) +
/* IFLA_GRE_ENCAP_DPORT */
nla_total_size ( 2 ) +
2017-12-01 15:26:08 -08:00
/* IFLA_GRE_COLLECT_METADATA */
nla_total_size ( 0 ) +
2017-04-19 12:30:53 -04:00
/* IFLA_GRE_FWMARK */
nla_total_size ( 4 ) +
2017-11-30 11:51:29 -08:00
/* IFLA_GRE_ERSPAN_INDEX */
nla_total_size ( 4 ) +
2012-08-10 00:51:50 +00:00
0 ;
}
static int ip6gre_fill_info ( struct sk_buff * skb , const struct net_device * dev )
{
struct ip6_tnl * t = netdev_priv ( dev ) ;
struct __ip6_tnl_parm * p = & t - > parms ;
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
IP_TUNNEL_DECLARE_FLAGS ( o_flags ) ;
ip_tunnel_flags_copy ( o_flags , p - > o_flags ) ;
2019-01-28 22:23:49 +01:00
2019-02-19 17:42:06 +01:00
if ( p - > erspan_ver = = 1 | | p - > erspan_ver = = 2 ) {
if ( ! p - > collect_md )
ip_tunnel: convert __be16 tunnel flags to bitmaps
Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.
Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.
Most noticeable bloat-o-meter difference (.text):
vmlinux: 307/-1 (306)
gre.ko: 62/0 (62)
ip_gre.ko: 941/-217 (724) [*]
ip_tunnel.ko: 390/-900 (-510) [**]
ip_vti.ko: 138/0 (138)
ip6_gre.ko: 534/-18 (516) [*]
ip6_tunnel.ko: 118/-10 (108)
[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease
The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-27 16:23:53 +01:00
__set_bit ( IP_TUNNEL_KEY_BIT , o_flags ) ;
2019-02-19 17:42:06 +01:00
if ( nla_put_u8 ( skb , IFLA_GRE_ERSPAN_VER , p - > erspan_ver ) )
goto nla_put_failure ;
if ( p - > erspan_ver = = 1 ) {
if ( nla_put_u32 ( skb , IFLA_GRE_ERSPAN_INDEX , p - > index ) )
goto nla_put_failure ;
} else {
if ( nla_put_u8 ( skb , IFLA_GRE_ERSPAN_DIR , p - > dir ) )
goto nla_put_failure ;
if ( nla_put_u16 ( skb , IFLA_GRE_ERSPAN_HWID , p - > hwid ) )
goto nla_put_failure ;
}
}
2012-08-10 00:51:50 +00:00
if ( nla_put_u32 ( skb , IFLA_GRE_LINK , p - > link ) | |
2016-05-09 17:12:09 -07:00
nla_put_be16 ( skb , IFLA_GRE_IFLAGS ,
gre_tnl_flags_to_gre_flags ( p - > i_flags ) ) | |
nla_put_be16 ( skb , IFLA_GRE_OFLAGS ,
2019-01-28 22:23:49 +01:00
gre_tnl_flags_to_gre_flags ( o_flags ) ) | |
2012-08-10 00:51:50 +00:00
nla_put_be32 ( skb , IFLA_GRE_IKEY , p - > i_key ) | |
nla_put_be32 ( skb , IFLA_GRE_OKEY , p - > o_key ) | |
2015-03-29 16:59:25 +02:00
nla_put_in6_addr ( skb , IFLA_GRE_LOCAL , & p - > laddr ) | |
nla_put_in6_addr ( skb , IFLA_GRE_REMOTE , & p - > raddr ) | |
2012-08-10 00:51:50 +00:00
nla_put_u8 ( skb , IFLA_GRE_TTL , p - > hop_limit ) | |
nla_put_u8 ( skb , IFLA_GRE_ENCAP_LIMIT , p - > encap_limit ) | |
nla_put_be32 ( skb , IFLA_GRE_FLOWINFO , p - > flowinfo ) | |
2017-04-19 12:30:53 -04:00
nla_put_u32 ( skb , IFLA_GRE_FLAGS , p - > flags ) | |
2019-02-19 17:42:06 +01:00
nla_put_u32 ( skb , IFLA_GRE_FWMARK , p - > fwmark ) )
2012-08-10 00:51:50 +00:00
goto nla_put_failure ;
2016-05-18 09:06:19 -07:00
if ( nla_put_u16 ( skb , IFLA_GRE_ENCAP_TYPE ,
t - > encap . type ) | |
nla_put_be16 ( skb , IFLA_GRE_ENCAP_SPORT ,
t - > encap . sport ) | |
nla_put_be16 ( skb , IFLA_GRE_ENCAP_DPORT ,
t - > encap . dport ) | |
nla_put_u16 ( skb , IFLA_GRE_ENCAP_FLAGS ,
t - > encap . flags ) )
goto nla_put_failure ;
2017-12-01 15:26:08 -08:00
if ( p - > collect_md ) {
if ( nla_put_flag ( skb , IFLA_GRE_COLLECT_METADATA ) )
goto nla_put_failure ;
}
2012-08-10 00:51:50 +00:00
return 0 ;
nla_put_failure :
return - EMSGSIZE ;
}
static const struct nla_policy ip6gre_policy [ IFLA_GRE_MAX + 1 ] = {
[ IFLA_GRE_LINK ] = { . type = NLA_U32 } ,
[ IFLA_GRE_IFLAGS ] = { . type = NLA_U16 } ,
[ IFLA_GRE_OFLAGS ] = { . type = NLA_U16 } ,
[ IFLA_GRE_IKEY ] = { . type = NLA_U32 } ,
[ IFLA_GRE_OKEY ] = { . type = NLA_U32 } ,
2019-12-09 10:31:43 -08:00
[ IFLA_GRE_LOCAL ] = { . len = sizeof_field ( struct ipv6hdr , saddr ) } ,
[ IFLA_GRE_REMOTE ] = { . len = sizeof_field ( struct ipv6hdr , daddr ) } ,
2012-08-10 00:51:50 +00:00
[ IFLA_GRE_TTL ] = { . type = NLA_U8 } ,
[ IFLA_GRE_ENCAP_LIMIT ] = { . type = NLA_U8 } ,
[ IFLA_GRE_FLOWINFO ] = { . type = NLA_U32 } ,
[ IFLA_GRE_FLAGS ] = { . type = NLA_U32 } ,
2016-05-18 09:06:19 -07:00
[ IFLA_GRE_ENCAP_TYPE ] = { . type = NLA_U16 } ,
[ IFLA_GRE_ENCAP_FLAGS ] = { . type = NLA_U16 } ,
[ IFLA_GRE_ENCAP_SPORT ] = { . type = NLA_U16 } ,
[ IFLA_GRE_ENCAP_DPORT ] = { . type = NLA_U16 } ,
2017-12-01 15:26:08 -08:00
[ IFLA_GRE_COLLECT_METADATA ] = { . type = NLA_FLAG } ,
2017-04-19 12:30:53 -04:00
[ IFLA_GRE_FWMARK ] = { . type = NLA_U32 } ,
2017-11-30 11:51:29 -08:00
[ IFLA_GRE_ERSPAN_INDEX ] = { . type = NLA_U32 } ,
2017-12-13 16:38:57 -08:00
[ IFLA_GRE_ERSPAN_VER ] = { . type = NLA_U8 } ,
[ IFLA_GRE_ERSPAN_DIR ] = { . type = NLA_U8 } ,
[ IFLA_GRE_ERSPAN_HWID ] = { . type = NLA_U16 } ,
2012-08-10 00:51:50 +00:00
} ;
2017-11-30 11:51:29 -08:00
static void ip6erspan_tap_setup ( struct net_device * dev )
{
ether_setup ( dev ) ;
2019-10-08 17:56:03 +08:00
dev - > max_mtu = 0 ;
2017-11-30 11:51:29 -08:00
dev - > netdev_ops = & ip6erspan_netdev_ops ;
dev - > needs_free_netdev = true ;
dev - > priv_destructor = ip6gre_dev_free ;
2024-04-12 08:19:25 -07:00
dev - > pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS ;
2017-11-30 11:51:29 -08:00
dev - > priv_flags & = ~ IFF_TX_SKB_SHARING ;
dev - > priv_flags | = IFF_LIVE_ADDR_CHANGE ;
netif_keep_dst ( dev ) ;
}
2018-05-17 16:36:51 +02:00
static int ip6erspan_newlink ( struct net * src_net , struct net_device * dev ,
struct nlattr * tb [ ] , struct nlattr * data [ ] ,
struct netlink_ext_ack * extack )
{
struct ip6_tnl * nt = netdev_priv ( dev ) ;
struct net * net = dev_net ( dev ) ;
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
struct ip6gre_net * ign ;
int err ;
ip6gre_netlink_parms ( data , & nt - > parms ) ;
2019-02-15 15:10:32 +01:00
ip6erspan_set_version ( data , & nt - > parms ) ;
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
ign = net_generic ( net , ip6gre_net_id ) ;
if ( nt - > parms . collect_md ) {
if ( rtnl_dereference ( ign - > collect_md_tun_erspan ) )
return - EEXIST ;
} else {
if ( ip6gre_tunnel_find ( net , & nt - > parms , dev - > type ) )
return - EEXIST ;
}
2018-05-17 16:36:51 +02:00
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
err = ip6gre_newlink_common ( src_net , dev , tb , data , extack ) ;
2018-05-17 16:36:51 +02:00
if ( ! err ) {
ip6erspan_tnl_link_config ( nt , ! tb [ IFLA_MTU ] ) ;
net: ip6_gre: fix tunnel metadata device sharing.
Currently ip6gre and ip6erspan share single metadata mode device,
using 'collect_md_tun'. Thus, when doing:
ip link add dev ip6gre11 type ip6gretap external
ip link add dev ip6erspan12 type ip6erspan external
RTNETLINK answers: File exists
simply fails due to the 2nd tries to create the same collect_md_tun.
The patch fixes it by adding a separate collect md tunnel device
for the ip6erspan, 'collect_md_tun_erspan'. As a result, a couple
of places need to refactor/split up in order to distinguish ip6gre
and ip6erspan.
First, move the collect_md check at ip6gre_tunnel_{unlink,link} and
create separate function {ip6gre,ip6ersapn}_tunnel_{link_md,unlink_md}.
Then before link/unlink, make sure the link_md/unlink_md is called.
Finally, a separate ndo_uninit is created for ip6erspan. Tested it
using the samples/bpf/test_tunnel_bpf.sh.
Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode")
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-18 19:22:28 -07:00
ip6erspan_tunnel_link_md ( ign , nt ) ;
2018-05-17 16:36:51 +02:00
ip6gre_tunnel_link ( net_generic ( net , ip6gre_net_id ) , nt ) ;
}
return err ;
}
/*
 * Apply the link-level configuration of an ip6erspan tunnel: first the
 * settings common to all ip6gre flavours, then the route-dependent part
 * using the ERSPAN header length.
 *
 * @t:       tunnel to configure
 * @set_mtu: forwarded unchanged to ip6gre_tnl_link_config_route()
 */
static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu)
{
	int t_hlen;

	ip6gre_tnl_link_config_common(t);

	/* Computed only after the common configuration has run. */
	t_hlen = ip6erspan_calc_hlen(t);
	ip6gre_tnl_link_config_route(t, set_mtu, t_hlen);
}
/*
 * Replace the parameters of an existing ip6erspan tunnel with @p and
 * redo its link configuration.
 *
 * @t:       tunnel being modified
 * @p:       new parameter set to copy into @t
 * @set_mtu: forwarded to ip6erspan_tnl_link_config()
 *
 * Always returns 0.
 */
static int ip6erspan_tnl_change(struct ip6_tnl *t,
				const struct __ip6_tnl_parm *p, int set_mtu)
{
	/* The new parameters must be in place before reconfiguring. */
	ip6gre_tnl_copy_tnl_parm(t, p);
	ip6erspan_tnl_link_config(t, set_mtu);

	return 0;
}
/*
 * rtnl_link_ops->changelink callback for "ip6erspan" devices.
 *
 * Validates and parses the request through ip6gre_changelink_common(),
 * then takes the tunnel out of the lookup structures, applies the new
 * parameters and links it back in.
 *
 * Returns 0 on success or the error encoded in the pointer returned by
 * ip6gre_changelink_common().
 */
static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[],
				struct nlattr *data[],
				struct netlink_ext_ack *extack)
{
	struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id);
	struct __ip6_tnl_parm p;
	struct ip6_tnl *t;

	t = ip6gre_changelink_common(dev, tb, data, &p, extack);
	if (IS_ERR(t))
		return PTR_ERR(t);

	ip6erspan_set_version(data, &p);

	/*
	 * ERSPAN devices have their own metadata-mode slot
	 * (collect_md_tun_erspan), so the unlink must use the ERSPAN helper
	 * to mirror ip6erspan_tunnel_link_md() below.  Using the plain
	 * ip6gre helper here would act on the ip6gre/ip6gretap slot instead.
	 */
	ip6erspan_tunnel_unlink_md(ign, t);
	ip6gre_tunnel_unlink(ign, t);

	/* Derive the MTU from the route only if the user gave none. */
	ip6erspan_tnl_change(t, &p, !tb[IFLA_MTU]);

	ip6erspan_tunnel_link_md(ign, t);
	ip6gre_tunnel_link(ign, t);

	return 0;
}
2012-08-10 00:51:50 +00:00
/*
 * rtnetlink link operations for the ip6gre link kind.
 *
 * Shares the attribute policy, private size, get_size/fill_info and
 * get_link_net callbacks with the other two ops tables in this file.
 * It is the only one of the three that installs a .dellink callback.
 */
static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
. kind = " ip6gre " ,
. maxtype = IFLA_GRE_MAX ,
. policy = ip6gre_policy ,
. priv_size = sizeof ( struct ip6_tnl ) ,
. setup = ip6gre_tunnel_setup ,
. validate = ip6gre_tunnel_validate ,
. newlink = ip6gre_newlink ,
. changelink = ip6gre_changelink ,
2014-04-14 17:11:38 +02:00
. dellink = ip6gre_dellink ,
2012-08-10 00:51:50 +00:00
. get_size = ip6gre_get_size ,
. fill_info = ip6gre_fill_info ,
2015-01-15 15:11:17 +01:00
. get_link_net = ip6_tnl_get_link_net ,
2012-08-10 00:51:50 +00:00
} ;
/*
 * rtnetlink link operations for the ip6gretap link kind.
 *
 * Uses the same newlink/changelink handlers as ip6gre_link_ops but its
 * own setup and validate callbacks.  No .dellink callback is set here.
 */
static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = {
. kind = " ip6gretap " ,
. maxtype = IFLA_GRE_MAX ,
. policy = ip6gre_policy ,
. priv_size = sizeof ( struct ip6_tnl ) ,
. setup = ip6gre_tap_setup ,
. validate = ip6gre_tap_validate ,
. newlink = ip6gre_newlink ,
. changelink = ip6gre_changelink ,
. get_size = ip6gre_get_size ,
. fill_info = ip6gre_fill_info ,
2015-01-20 15:15:43 +01:00
. get_link_net = ip6_tnl_get_link_net ,
2012-08-10 00:51:50 +00:00
} ;
2017-11-30 11:51:29 -08:00
/*
 * rtnetlink link operations for the ip6erspan link kind.
 *
 * Has dedicated setup, validate, newlink and changelink callbacks, and
 * reuses the ip6gre policy, get_size, fill_info and get_link_net.
 * No .dellink callback is set here.
 */
static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = {
. kind = " ip6erspan " ,
. maxtype = IFLA_GRE_MAX ,
. policy = ip6gre_policy ,
. priv_size = sizeof ( struct ip6_tnl ) ,
. setup = ip6erspan_tap_setup ,
. validate = ip6erspan_tap_validate ,
2018-05-17 16:36:51 +02:00
. newlink = ip6erspan_newlink ,
. changelink = ip6erspan_changelink ,
2017-11-30 11:51:29 -08:00
. get_size = ip6gre_get_size ,
. fill_info = ip6gre_fill_info ,
. get_link_net = ip6_tnl_get_link_net ,
} ;
2012-08-10 00:51:50 +00:00
/*
 *	And now the module code and kernel interface.
 */
/*
 * Module init: register, in this order, the per-netns device operations,
 * the IPv6 protocol handler for IPPROTO_GRE, and the three rtnl link
 * kinds (ip6gre_link_ops, ip6gre_tap_ops, ip6erspan_tap_ops).
 *
 * If a step fails, the labels after "out" undo the steps that already
 * succeeded, in reverse order of registration, and the error is returned.
 * Only a failure of the very first registration returns directly.
 *
 * Returns the result of the last registration on success, or a negative
 * error code from the step that failed.
 */
static int __init ip6gre_init ( void )
{
int err ;
pr_info ( " GRE over IPv6 tunneling driver \n " ) ;
/* Nothing to undo yet, so a failure here returns immediately. */
err = register_pernet_device ( & ip6gre_net_ops ) ;
if ( err < 0 )
return err ;
err = inet6_add_protocol ( & ip6gre_protocol , IPPROTO_GRE ) ;
if ( err < 0 ) {
pr_info ( " %s: can't add protocol \n " , __func__ ) ;
goto add_proto_failed ;
}
err = rtnl_link_register ( & ip6gre_link_ops ) ;
if ( err < 0 )
goto rtnl_link_failed ;
err = rtnl_link_register ( & ip6gre_tap_ops ) ;
if ( err < 0 )
goto tap_ops_failed ;
2017-11-30 11:51:29 -08:00
err = rtnl_link_register ( & ip6erspan_tap_ops ) ;
if ( err < 0 )
goto erspan_link_failed ;
2012-08-10 00:51:50 +00:00
/* Shared exit: reached on success and, via "goto out", after unwinding. */
out :
return err ;
2017-11-30 11:51:29 -08:00
/* Error unwinding: each label falls through to the ones below it. */
erspan_link_failed :
rtnl_link_unregister ( & ip6gre_tap_ops ) ;
2012-08-10 00:51:50 +00:00
tap_ops_failed :
rtnl_link_unregister ( & ip6gre_link_ops ) ;
rtnl_link_failed :
inet6_del_protocol ( & ip6gre_protocol , IPPROTO_GRE ) ;
add_proto_failed :
unregister_pernet_device ( & ip6gre_net_ops ) ;
goto out ;
}
/*
 * Module exit: tear down everything ip6gre_init() registered.
 *
 * The three rtnl link kinds are unregistered first, then the IPv6
 * protocol handler for IPPROTO_GRE, and finally the per-netns device
 * operations.
 */
static void __exit ip6gre_fini(void)
{
	/* rtnl link kinds */
	rtnl_link_unregister(&ip6gre_tap_ops);
	rtnl_link_unregister(&ip6gre_link_ops);
	rtnl_link_unregister(&ip6erspan_tap_ops);

	/* protocol handler, then per-netns state */
	inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE);
	unregister_pernet_device(&ip6gre_net_ops);
}
/*
 * Module entry/exit hooks and metadata.  One rtnl-link alias is declared
 * per link kind registered by ip6gre_init(), plus a netdev alias.
 */
module_init ( ip6gre_init ) ;
module_exit ( ip6gre_fini ) ;
MODULE_LICENSE ( " GPL " ) ;
2024-02-13 15:54:04 +01:00
MODULE_AUTHOR ( " D. Kozlov <xeb@mail.ru> " ) ;
2012-08-10 00:51:50 +00:00
MODULE_DESCRIPTION ( " GRE over IPv6 tunneling device " ) ;
MODULE_ALIAS_RTNL_LINK ( " ip6gre " ) ;
2014-09-24 11:03:00 +02:00
MODULE_ALIAS_RTNL_LINK ( " ip6gretap " ) ;
2017-12-13 16:38:57 -08:00
MODULE_ALIAS_RTNL_LINK ( " ip6erspan " ) ;
2012-08-10 00:51:50 +00:00
MODULE_ALIAS_NETDEV ( " ip6gre0 " ) ;