2019-05-29 07:12:43 -07:00
// SPDX-License-Identifier: GPL-2.0-only
2011-10-25 19:26:31 -07:00
/*
2014-05-06 16:48:38 -07:00
* Copyright ( c ) 2007 - 2014 Nicira , Inc .
2011-10-25 19:26:31 -07:00
*/
# define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
# include <linux/init.h>
# include <linux/module.h>
# include <linux/if_arp.h>
# include <linux/if_vlan.h>
# include <linux/in.h>
# include <linux/ip.h>
# include <linux/jhash.h>
# include <linux/delay.h>
# include <linux/time.h>
# include <linux/etherdevice.h>
# include <linux/genetlink.h>
# include <linux/kernel.h>
# include <linux/kthread.h>
# include <linux/mutex.h>
# include <linux/percpu.h>
# include <linux/rcupdate.h>
# include <linux/tcp.h>
# include <linux/udp.h>
# include <linux/ethtool.h>
# include <linux/wait.h>
# include <asm/div64.h>
# include <linux/highmem.h>
# include <linux/netfilter_bridge.h>
# include <linux/netfilter_ipv4.h>
# include <linux/inetdevice.h>
# include <linux/list.h>
# include <linux/openvswitch.h>
# include <linux/rculist.h>
# include <linux/dmi.h>
# include <net/genetlink.h>
2012-02-22 19:58:59 -08:00
# include <net/net_namespace.h>
# include <net/netns/generic.h>
2022-02-03 10:44:30 +02:00
# include <net/pkt_cls.h>
2011-10-25 19:26:31 -07:00
# include "datapath.h"
# include "flow.h"
2014-01-21 09:31:04 -08:00
# include "flow_table.h"
2013-10-03 18:16:47 -07:00
# include "flow_netlink.h"
2017-11-10 12:09:42 -08:00
# include "meter.h"
2021-06-22 10:02:33 -04:00
# include "openvswitch_trace.h"
2011-10-25 19:26:31 -07:00
# include "vport-internal_dev.h"
2013-04-29 13:06:41 +00:00
# include "vport-netdev.h"
2011-10-25 19:26:31 -07:00
netns: make struct pernet_operations::id unsigned int
Make struct pernet_operations::id unsigned.
There are 2 reasons to do so:
1)
This field is really an index into a zero-based array and
is thus an unsigned entity. Using a negative value is an out-of-bounds
access by definition.
2)
On x86_64 unsigned 32-bit data which are mixed with pointers
via array indexing or offsets added to or subtracted from pointers
are preferred to signed 32-bit data.
"int" being used as an array index needs to be sign-extended
to 64-bit before being used.
void f(long *p, int i)
{
g(p[i]);
}
roughly translates to
movsx rsi, esi
mov rdi, [rsi+...]
call g
MOVSX is 3 byte instruction which isn't necessary if the variable is
unsigned because x86_64 is zero extending by default.
Now, there is net_generic() function which, you guessed it right, uses
"int" as an array index:
static inline void *net_generic(const struct net *net, int id)
{
...
ptr = ng->ptr[id - 1];
...
}
And this function is used a lot, so those sign extensions add up.
Patch snipes ~1730 bytes on allyesconfig kernel (without all junk
messing with code generation):
add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
Unfortunately some functions actually grow bigger.
This is a seemingly random artefact of code generation, with the register
allocator being used differently. gcc decides that some variable
needs to live in new r8+ registers and every access now requires REX
prefix. Or it is shifted into r12, so [r12+0] addressing mode has to be
used which is longer than [r8]
However, overall balance is in negative direction:
add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
function old new delta
nfsd4_lock 3886 3959 +73
tipc_link_build_proto_msg 1096 1140 +44
mac80211_hwsim_new_radio 2776 2808 +32
tipc_mon_rcv 1032 1058 +26
svcauth_gss_legacy_init 1413 1429 +16
tipc_bcbase_select_primary 379 392 +13
nfsd4_exchange_id 1247 1260 +13
nfsd4_setclientid_confirm 782 793 +11
...
put_client_renew_locked 494 480 -14
ip_set_sockfn_get 730 716 -14
geneve_sock_add 829 813 -16
nfsd4_sequence_done 721 703 -18
nlmclnt_lookup_host 708 686 -22
nfsd4_lockt 1085 1063 -22
nfs_get_client 1077 1050 -27
tcf_bpf_init 1106 1076 -30
nfsd4_encode_fattr 5997 5930 -67
Total: Before=154856051, After=154854321, chg -0.00%
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-17 04:58:21 +03:00
unsigned int ovs_net_id __read_mostly ;
2013-04-15 13:23:03 -07:00
2014-05-06 16:44:50 -07:00
static struct genl_family dp_packet_genl_family ;
static struct genl_family dp_flow_genl_family ;
static struct genl_family dp_datapath_genl_family ;
2015-01-21 16:42:52 -08:00
static const struct nla_policy flow_policy [ ] ;
2014-07-16 11:25:52 -07:00
static const struct genl_multicast_group ovs_dp_flow_multicast_group = {
. name = OVS_FLOW_MCGROUP ,
2014-05-06 16:44:50 -07:00
} ;
2014-07-16 11:25:52 -07:00
static const struct genl_multicast_group ovs_dp_datapath_multicast_group = {
. name = OVS_DATAPATH_MCGROUP ,
2014-05-06 16:44:50 -07:00
} ;
2014-07-16 11:25:52 -07:00
static const struct genl_multicast_group ovs_dp_vport_multicast_group = {
. name = OVS_VPORT_MCGROUP ,
2014-05-06 16:44:50 -07:00
} ;
2014-05-05 13:13:14 -07:00
/* Check if need to build a reply message.
* OVS userspace sets the NLM_F_ECHO flag if it needs the reply . */
2014-09-18 10:31:04 +02:00
static bool ovs_must_notify ( struct genl_family * family , struct genl_info * info ,
unsigned int group )
2014-05-05 13:13:14 -07:00
{
return info - > nlhdr - > nlmsg_flags & NLM_F_ECHO | |
2014-12-22 18:56:36 +01:00
genl_has_listeners ( family , genl_info_net ( info ) , group ) ;
2014-05-05 13:13:14 -07:00
}
2013-11-19 15:19:38 +01:00
static void ovs_notify ( struct genl_family * family ,
2013-11-19 15:19:39 +01:00
struct sk_buff * skb , struct genl_info * info )
2013-03-29 14:46:50 +01:00
{
2015-09-22 18:56:43 +02:00
genl_notify ( family , skb , info , 0 , GFP_KERNEL ) ;
2013-03-29 14:46:50 +01:00
}
2011-10-25 19:26:31 -07:00
/**
* DOC : Locking :
*
2013-04-15 13:23:03 -07:00
* All writes, e.g. writes to device state (add/remove datapath, port, set
* operations on vports, etc.) and writes to other state (flow table
* modifications, setting miscellaneous datapath parameters, etc.), are
* protected by ovs_mutex, which is taken with ovs_lock().
2011-10-25 19:26:31 -07:00
*
* Reads are protected by RCU .
*
* There are a few special cases ( mostly stats ) that have their own
* synchronization but they nest under all of above and don ' t interact with
* each other .
2013-04-15 13:23:03 -07:00
*
* The RTNL lock nests inside ovs_mutex .
2011-10-25 19:26:31 -07:00
*/
2013-04-15 13:23:03 -07:00
static DEFINE_MUTEX ( ovs_mutex ) ;
/* Take ovs_mutex; pairs with ovs_unlock(). */
void ovs_lock(void)
{
	mutex_lock(&ovs_mutex);
}
/* Release ovs_mutex; pairs with ovs_lock(). */
void ovs_unlock(void)
{
	mutex_unlock(&ovs_mutex);
}
# ifdef CONFIG_LOCKDEP
/* Report whether ovs_mutex is held, as far as lockdep can tell.
 * While debug_locks is set the answer comes from lockdep_is_held(); once it
 * is clear the function unconditionally returns 1.
 */
int lockdep_ovsl_is_held(void)
{
	return debug_locks ? lockdep_is_held(&ovs_mutex) : 1;
}
# endif
2011-10-25 19:26:31 -07:00
static struct vport * new_vport ( const struct vport_parms * ) ;
2013-12-13 15:22:20 +01:00
static int queue_gso_packets ( struct datapath * dp , struct sk_buff * ,
2014-11-06 06:57:27 -08:00
const struct sw_flow_key * ,
2016-06-10 11:49:33 -07:00
const struct dp_upcall_info * ,
uint32_t cutlen ) ;
2013-12-13 15:22:20 +01:00
static int queue_userspace_packet ( struct datapath * dp , struct sk_buff * ,
2014-11-06 06:57:27 -08:00
const struct sw_flow_key * ,
2016-06-10 11:49:33 -07:00
const struct dp_upcall_info * ,
uint32_t cutlen ) ;
2011-10-25 19:26:31 -07:00
2020-07-15 14:09:28 +02:00
static void ovs_dp_masks_rebalance ( struct work_struct * work ) ;
openvswitch: Introduce per-cpu upcall dispatch
The Open vSwitch kernel module uses the upcall mechanism to send
packets from kernel space to user space when it misses in the kernel
space flow table. The upcall sends packets via a Netlink socket.
Currently, a Netlink socket is created for every vport. In this way,
there is a 1:1 mapping between a vport and a Netlink socket.
When a packet is received by a vport, if it needs to be sent to
user space, it is sent via the corresponding Netlink socket.
This mechanism, with various iterations of the corresponding user
space code, has seen some limitations and issues:
* On systems with a large number of vports, there is a correspondingly
large number of Netlink sockets which can limit scaling.
(https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
* Packet reordering on upcalls.
(https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
* A thundering herd issue.
(https://bugzilla.redhat.com/show_bug.cgi?id=1834444)
This patch introduces an alternative, feature-negotiated, upcall
mode using a per-cpu dispatch rather than a per-vport dispatch.
In this mode, the Netlink socket to be used for the upcall is
selected based on the CPU of the thread that is executing the upcall.
In this way, it resolves the issues above as:
a) The number of Netlink sockets scales with the number of CPUs
rather than the number of vports.
b) Ordering per-flow is maintained as packets are distributed to
CPUs based on mechanisms such as RSS and flows are distributed
to a single user space thread.
c) Packets from a flow can only wake up one user space thread.
The corresponding user space code can be found at:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385139.html
Bugzilla: https://bugzilla.redhat.com/1844576
Signed-off-by: Mark Gray <mark.d.gray@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-15 08:27:54 -04:00
static int ovs_dp_set_upcall_portids ( struct datapath * , const struct nlattr * ) ;
2013-04-15 13:23:03 -07:00
/* Must be called with rcu_read_lock or ovs_mutex. */
2014-09-15 19:37:25 -07:00
const char * ovs_dp_name ( const struct datapath * dp )
2011-10-25 19:26:31 -07:00
{
2013-04-15 13:23:03 -07:00
struct vport * vport = ovs_vport_ovsl_rcu ( dp , OVSP_LOCAL ) ;
2015-07-21 10:44:05 +02:00
return ovs_vport_name ( vport ) ;
2011-10-25 19:26:31 -07:00
}
2014-11-06 06:58:52 -08:00
static int get_dpifindex ( const struct datapath * dp )
2011-10-25 19:26:31 -07:00
{
struct vport * local ;
int ifindex ;
rcu_read_lock ( ) ;
2012-08-23 12:40:54 -07:00
local = ovs_vport_rcu ( dp , OVSP_LOCAL ) ;
2011-10-25 19:26:31 -07:00
if ( local )
2015-07-21 10:44:04 +02:00
ifindex = local - > dev - > ifindex ;
2011-10-25 19:26:31 -07:00
else
ifindex = 0 ;
rcu_read_unlock ( ) ;
return ifindex ;
}
static void destroy_dp_rcu ( struct rcu_head * rcu )
{
struct datapath * dp = container_of ( rcu , struct datapath , rcu ) ;
2014-05-06 18:41:20 -07:00
ovs_flow_tbl_destroy ( & dp - > table ) ;
2011-10-25 19:26:31 -07:00
free_percpu ( dp - > stats_percpu ) ;
2012-08-23 12:40:54 -07:00
kfree ( dp - > ports ) ;
2017-11-10 12:09:42 -08:00
ovs_meters_exit ( dp ) ;
2021-07-23 10:24:14 -04:00
kfree ( rcu_dereference_raw ( dp - > upcall_portids ) ) ;
2011-10-25 19:26:31 -07:00
kfree ( dp ) ;
}
2012-08-23 12:40:54 -07:00
/* Return the hash bucket of @dp->ports that @port_no maps to.
 * The bucket index is @port_no masked with DP_VPORT_HASH_BUCKETS - 1.
 * NOTE(review): the mask only reaches every bucket if DP_VPORT_HASH_BUCKETS
 * is a power of two -- confirm against its definition.
 */
static struct hlist_head *vport_hash_bucket(const struct datapath *dp,
					    u16 port_no)
{
	unsigned int bucket = port_no & (DP_VPORT_HASH_BUCKETS - 1);

	return &dp->ports[bucket];
}
2014-05-05 11:32:17 -07:00
/* Called with ovs_mutex or RCU read lock. */
2012-08-23 12:40:54 -07:00
struct vport * ovs_lookup_vport ( const struct datapath * dp , u16 port_no )
{
struct vport * vport ;
struct hlist_head * head ;
head = vport_hash_bucket ( dp , port_no ) ;
2020-02-19 01:28:02 +05:30
hlist_for_each_entry_rcu ( vport , head , dp_hash_node ,
2020-09-01 20:26:12 +08:00
lockdep_ovsl_is_held ( ) ) {
2012-08-23 12:40:54 -07:00
if ( vport - > port_no = = port_no )
return vport ;
}
return NULL ;
}
2013-04-15 13:23:03 -07:00
/* Called with ovs_mutex. */
2011-10-25 19:26:31 -07:00
static struct vport * new_vport ( const struct vport_parms * parms )
{
struct vport * vport ;
vport = ovs_vport_add ( parms ) ;
if ( ! IS_ERR ( vport ) ) {
struct datapath * dp = parms - > dp ;
2012-08-23 12:40:54 -07:00
struct hlist_head * head = vport_hash_bucket ( dp , vport - > port_no ) ;
2011-10-25 19:26:31 -07:00
2012-08-23 12:40:54 -07:00
hlist_add_head_rcu ( & vport - > dp_hash_node , head ) ;
2011-10-25 19:26:31 -07:00
}
return vport ;
}
void ovs_dp_detach_port ( struct vport * p )
{
2013-04-15 13:23:03 -07:00
ASSERT_OVSL ( ) ;
2011-10-25 19:26:31 -07:00
/* First drop references to device. */
2012-08-23 12:40:54 -07:00
hlist_del_rcu ( & p - > dp_hash_node ) ;
2011-10-25 19:26:31 -07:00
/* Then destroy it. */
ovs_vport_del ( p ) ;
}
/* Must be called with rcu_read_lock. */
2014-09-15 19:28:44 -07:00
void ovs_dp_process_packet ( struct sk_buff * skb , struct sw_flow_key * key )
2011-10-25 19:26:31 -07:00
{
2014-09-15 19:20:31 -07:00
const struct vport * p = OVS_CB ( skb ) - > input_vport ;
2011-10-25 19:26:31 -07:00
struct datapath * dp = p - > dp ;
struct sw_flow * flow ;
2014-10-06 05:45:32 -07:00
struct sw_flow_actions * sf_acts ;
2011-10-25 19:26:31 -07:00
struct dp_stats_percpu * stats ;
u64 * stats_counter ;
2013-10-22 10:42:46 -07:00
u32 n_mask_hit ;
2020-07-31 14:20:56 +02:00
u32 n_cache_hit ;
2019-08-04 19:56:11 -07:00
int error ;
2011-10-25 19:26:31 -07:00
2012-11-13 09:52:25 +08:00
stats = this_cpu_ptr ( dp - > stats_percpu ) ;
2011-10-25 19:26:31 -07:00
/* Look up flow. */
2019-11-01 22:23:45 +08:00
flow = ovs_flow_tbl_lookup_stats ( & dp - > table , key , skb_get_hash ( skb ) ,
2020-07-31 14:20:56 +02:00
& n_mask_hit , & n_cache_hit ) ;
2011-10-25 19:26:31 -07:00
if ( unlikely ( ! flow ) ) {
struct dp_upcall_info upcall ;
2015-05-26 20:59:43 -07:00
memset ( & upcall , 0 , sizeof ( upcall ) ) ;
2011-10-25 19:26:31 -07:00
upcall . cmd = OVS_PACKET_CMD_MISS ;
openvswitch: Introduce per-cpu upcall dispatch
The Open vSwitch kernel module uses the upcall mechanism to send
packets from kernel space to user space when it misses in the kernel
space flow table. The upcall sends packets via a Netlink socket.
Currently, a Netlink socket is created for every vport. In this way,
there is a 1:1 mapping between a vport and a Netlink socket.
When a packet is received by a vport, if it needs to be sent to
user space, it is sent via the corresponding Netlink socket.
This mechanism, with various iterations of the corresponding user
space code, has seen some limitations and issues:
* On systems with a large number of vports, there is a correspondingly
large number of Netlink sockets which can limit scaling.
(https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
* Packet reordering on upcalls.
(https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
* A thundering herd issue.
(https://bugzilla.redhat.com/show_bug.cgi?id=1834444)
This patch introduces an alternative, feature-negotiated, upcall
mode using a per-cpu dispatch rather than a per-vport dispatch.
In this mode, the Netlink socket to be used for the upcall is
selected based on the CPU of the thread that is executing the upcall.
In this way, it resolves the issues above as:
a) The number of Netlink sockets scales with the number of CPUs
rather than the number of vports.
b) Ordering per-flow is maintained as packets are distributed to
CPUs based on mechanisms such as RSS and flows are distributed
to a single user space thread.
c) Packets from a flow can only wake up one user space thread.
The corresponding user space code can be found at:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385139.html
Bugzilla: https://bugzilla.redhat.com/1844576
Signed-off-by: Mark Gray <mark.d.gray@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-15 08:27:54 -04:00
if ( dp - > user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU )
2021-07-23 10:24:13 -04:00
upcall . portid =
ovs_dp_get_upcall_portid ( dp , smp_processor_id ( ) ) ;
openvswitch: Introduce per-cpu upcall dispatch
The Open vSwitch kernel module uses the upcall mechanism to send
packets from kernel space to user space when it misses in the kernel
space flow table. The upcall sends packets via a Netlink socket.
Currently, a Netlink socket is created for every vport. In this way,
there is a 1:1 mapping between a vport and a Netlink socket.
When a packet is received by a vport, if it needs to be sent to
user space, it is sent via the corresponding Netlink socket.
This mechanism, with various iterations of the corresponding user
space code, has seen some limitations and issues:
* On systems with a large number of vports, there is a correspondingly
large number of Netlink sockets which can limit scaling.
(https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
* Packet reordering on upcalls.
(https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
* A thundering herd issue.
(https://bugzilla.redhat.com/show_bug.cgi?id=1834444)
This patch introduces an alternative, feature-negotiated, upcall
mode using a per-cpu dispatch rather than a per-vport dispatch.
In this mode, the Netlink socket to be used for the upcall is
selected based on the CPU of the thread that is executing the upcall.
In this way, it resolves the issues above as:
a) The number of Netlink sockets scales with the number of CPUs
rather than the number of vports.
b) Ordering per-flow is maintained as packets are distributed to
CPUs based on mechanisms such as RSS and flows are distributed
to a single user space thread.
c) Packets from a flow can only wake up one user space thread.
The corresponding user space code can be found at:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385139.html
Bugzilla: https://bugzilla.redhat.com/1844576
Signed-off-by: Mark Gray <mark.d.gray@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-15 08:27:54 -04:00
else
upcall . portid = ovs_vport_find_upcall_portid ( p , skb ) ;
2015-08-26 11:31:48 -07:00
upcall . mru = OVS_CB ( skb ) - > mru ;
2016-06-10 11:49:33 -07:00
error = ovs_dp_upcall ( dp , skb , key , & upcall , 0 ) ;
2014-09-03 17:43:45 +08:00
if ( unlikely ( error ) )
kfree_skb ( skb ) ;
else
consume_skb ( skb ) ;
2011-10-25 19:26:31 -07:00
stats_counter = & stats - > n_missed ;
goto out ;
}
2014-10-06 05:45:32 -07:00
ovs_flow_stats_update ( flow , key - > tp . flags , skb ) ;
sf_acts = rcu_dereference ( flow - > sf_acts ) ;
2019-08-04 19:56:11 -07:00
error = ovs_execute_actions ( dp , skb , sf_acts , key ) ;
if ( unlikely ( error ) )
net_dbg_ratelimited ( " ovs: action execution error on datapath %s: %d \n " ,
2020-09-01 20:26:12 +08:00
ovs_dp_name ( dp ) , error ) ;
2011-10-25 19:26:31 -07:00
2013-10-29 17:22:21 -07:00
stats_counter = & stats - > n_hit ;
2011-10-25 19:26:31 -07:00
out :
/* Update datapath statistics. */
2014-02-14 15:10:46 -08:00
u64_stats_update_begin ( & stats - > syncp ) ;
2011-10-25 19:26:31 -07:00
( * stats_counter ) + + ;
2013-10-22 10:42:46 -07:00
stats - > n_mask_hit + = n_mask_hit ;
2020-07-31 14:20:56 +02:00
stats - > n_cache_hit + = n_cache_hit ;
2014-02-14 15:10:46 -08:00
u64_stats_update_end ( & stats - > syncp ) ;
2011-10-25 19:26:31 -07:00
}
int ovs_dp_upcall ( struct datapath * dp , struct sk_buff * skb ,
2014-11-06 06:57:27 -08:00
const struct sw_flow_key * key ,
2016-06-10 11:49:33 -07:00
const struct dp_upcall_info * upcall_info ,
uint32_t cutlen )
2011-10-25 19:26:31 -07:00
{
struct dp_stats_percpu * stats ;
int err ;
2021-06-22 10:02:33 -04:00
if ( trace_ovs_dp_upcall_enabled ( ) )
trace_ovs_dp_upcall ( dp , skb , key , upcall_info ) ;
2012-09-07 20:12:54 +00:00
if ( upcall_info - > portid = = 0 ) {
2011-10-25 19:26:31 -07:00
err = - ENOTCONN ;
goto err ;
}
if ( ! skb_is_gso ( skb ) )
2016-06-10 11:49:33 -07:00
err = queue_userspace_packet ( dp , skb , key , upcall_info , cutlen ) ;
2011-10-25 19:26:31 -07:00
else
2016-06-10 11:49:33 -07:00
err = queue_gso_packets ( dp , skb , key , upcall_info , cutlen ) ;
2011-10-25 19:26:31 -07:00
if ( err )
goto err ;
return 0 ;
err :
2012-11-13 09:52:25 +08:00
stats = this_cpu_ptr ( dp - > stats_percpu ) ;
2011-10-25 19:26:31 -07:00
2014-02-14 15:10:46 -08:00
u64_stats_update_begin ( & stats - > syncp ) ;
2011-10-25 19:26:31 -07:00
stats - > n_lost + + ;
2014-02-14 15:10:46 -08:00
u64_stats_update_end ( & stats - > syncp ) ;
2011-10-25 19:26:31 -07:00
return err ;
}
2013-12-13 15:22:20 +01:00
static int queue_gso_packets ( struct datapath * dp , struct sk_buff * skb ,
2014-11-06 06:57:27 -08:00
const struct sw_flow_key * key ,
2016-06-10 11:49:33 -07:00
const struct dp_upcall_info * upcall_info ,
2020-09-01 20:26:12 +08:00
uint32_t cutlen )
2011-10-25 19:26:31 -07:00
{
2017-11-25 13:14:40 -06:00
unsigned int gso_type = skb_shinfo ( skb ) - > gso_type ;
net: accept UFO datagrams from tuntap and packet
Tuntap and similar devices can inject GSO packets. Accept type
VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively.
Processes are expected to use feature negotiation such as TUNSETOFFLOAD
to detect supported offload types and refrain from injecting other
packets. This process breaks down with live migration: guest kernels
do not renegotiate flags, so destination hosts need to expose all
features that the source host does.
Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677.
This patch introduces nearly(*) no new code to simplify verification.
It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP
insertion and software UFO segmentation.
It does not reinstate protocol stack support, hardware offload
(NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception
of VIRTIO_NET_HDR_GSO_UDP packets in tuntap.
To support SKB_GSO_UDP reappearing in the stack, also reinstate
logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD
by squashing in commit 939912216fa8 ("net: skb_needs_check() removes
CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1
("net: avoid skb_warn_bad_offload false positives on UFO").
(*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id,
ipv6_proxy_select_ident is changed to return a __be32 and this is
assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted
at the end of the enum to minimize code churn.
Tested
Booted a v4.13 guest kernel with QEMU. On a host kernel before this
patch `ethtool -k eth0` shows UFO disabled. After the patch, it is
enabled, same as on a v4.13 host kernel.
A UFO packet sent from the guest appears on the tap device:
host:
nc -l -p -u 8000 &
tcpdump -n -i tap0
guest:
dd if=/dev/zero of=payload.txt bs=1 count=2000
nc -u 192.16.1.1 8000 < payload.txt
Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds,
packets arriving fragmented:
./with_tap_pair.sh ./tap_send_ufo tap0 tap1
(from https://github.com/wdebruij/kerneltools/tree/master/tests)
Changes
v1 -> v2
- simplified set_offload change (review comment)
- documented test procedure
Link: http://lkml.kernel.org/r/<CAF=yD-LuUeDuL9YWPJD9ykOZ0QCjNeznPDr6whqZ9NGMNF12Mw@mail.gmail.com>
Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.")
Reported-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-21 10:22:25 -05:00
struct sw_flow_key later_key ;
2011-10-25 19:26:31 -07:00
struct sk_buff * segs , * nskb ;
int err ;
2020-03-26 15:33:14 +08:00
BUILD_BUG_ON ( sizeof ( * OVS_CB ( skb ) ) > SKB_GSO_CB_OFFSET ) ;
2013-12-13 15:22:22 +01:00
segs = __skb_gso_segment ( skb , NETIF_F_SG , false ) ;
2012-07-20 14:46:29 -07:00
if ( IS_ERR ( segs ) )
return PTR_ERR ( segs ) ;
2014-10-20 13:49:17 +02:00
if ( segs = = NULL )
return - EINVAL ;
2011-10-25 19:26:31 -07:00
net: accept UFO datagrams from tuntap and packet
Tuntap and similar devices can inject GSO packets. Accept type
VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively.
Processes are expected to use feature negotiation such as TUNSETOFFLOAD
to detect supported offload types and refrain from injecting other
packets. This process breaks down with live migration: guest kernels
do not renegotiate flags, so destination hosts need to expose all
features that the source host does.
Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677.
This patch introduces nearly(*) no new code to simplify verification.
It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP
insertion and software UFO segmentation.
It does not reinstate protocol stack support, hardware offload
(NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception
of VIRTIO_NET_HDR_GSO_UDP packets in tuntap.
To support SKB_GSO_UDP reappearing in the stack, also reinstate
logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD
by squashing in commit 939912216fa8 ("net: skb_needs_check() removes
CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1
("net: avoid skb_warn_bad_offload false positives on UFO").
(*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id,
ipv6_proxy_select_ident is changed to return a __be32 and this is
assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted
at the end of the enum to minimize code churn.
Tested
Booted a v4.13 guest kernel with QEMU. On a host kernel before this
patch `ethtool -k eth0` shows UFO disabled. After the patch, it is
enabled, same as on a v4.13 host kernel.
A UFO packet sent from the guest appears on the tap device:
host:
nc -l -p -u 8000 &
tcpdump -n -i tap0
guest:
dd if=/dev/zero of=payload.txt bs=1 count=2000
nc -u 192.16.1.1 8000 < payload.txt
Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds,
packets arriving fragmented:
./with_tap_pair.sh ./tap_send_ufo tap0 tap1
(from https://github.com/wdebruij/kerneltools/tree/master/tests)
Changes
v1 -> v2
- simplified set_offload change (review comment)
- documented test procedure
Link: http://lkml.kernel.org/r/<CAF=yD-LuUeDuL9YWPJD9ykOZ0QCjNeznPDr6whqZ9NGMNF12Mw@mail.gmail.com>
Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.")
Reported-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-21 10:22:25 -05:00
if ( gso_type & SKB_GSO_UDP ) {
/* The initial flow key extracted by ovs_flow_key_extract()
* in this case is for a first fragment , so we need to
* properly mark later fragments .
*/
later_key = * key ;
later_key . ip . frag = OVS_FRAG_TYPE_LATER ;
}
2011-10-25 19:26:31 -07:00
/* Queue all of the segments. */
2020-01-13 18:42:29 -05:00
skb_list_walk_safe ( segs , skb , nskb ) {
net: accept UFO datagrams from tuntap and packet
Tuntap and similar devices can inject GSO packets. Accept type
VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively.
Processes are expected to use feature negotiation such as TUNSETOFFLOAD
to detect supported offload types and refrain from injecting other
packets. This process breaks down with live migration: guest kernels
do not renegotiate flags, so destination hosts need to expose all
features that the source host does.
Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677.
This patch introduces nearly(*) no new code to simplify verification.
It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP
insertion and software UFO segmentation.
It does not reinstate protocol stack support, hardware offload
(NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception
of VIRTIO_NET_HDR_GSO_UDP packets in tuntap.
To support SKB_GSO_UDP reappearing in the stack, also reinstate
logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD
by squashing in commit 939912216fa8 ("net: skb_needs_check() removes
CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1
("net: avoid skb_warn_bad_offload false positives on UFO").
(*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id,
ipv6_proxy_select_ident is changed to return a __be32 and this is
assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted
at the end of the enum to minimize code churn.
Tested
Booted a v4.13 guest kernel with QEMU. On a host kernel before this
patch `ethtool -k eth0` shows UFO disabled. After the patch, it is
enabled, same as on a v4.13 host kernel.
A UFO packet sent from the guest appears on the tap device:
host:
nc -l -p -u 8000 &
tcpdump -n -i tap0
guest:
dd if=/dev/zero of=payload.txt bs=1 count=2000
nc -u 192.16.1.1 8000 < payload.txt
Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds,
packets arriving fragmented:
./with_tap_pair.sh ./tap_send_ufo tap0 tap1
(from https://github.com/wdebruij/kerneltools/tree/master/tests)
Changes
v1 -> v2
- simplified set_offload change (review comment)
- documented test procedure
Link: http://lkml.kernel.org/r/<CAF=yD-LuUeDuL9YWPJD9ykOZ0QCjNeznPDr6whqZ9NGMNF12Mw@mail.gmail.com>
Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.")
Reported-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-21 10:22:25 -05:00
if ( gso_type & SKB_GSO_UDP & & skb ! = segs )
key = & later_key ;
2016-06-10 11:49:33 -07:00
err = queue_userspace_packet ( dp , skb , key , upcall_info , cutlen ) ;
2011-10-25 19:26:31 -07:00
if ( err )
break ;
2020-01-13 18:42:29 -05:00
}
2011-10-25 19:26:31 -07:00
/* Free all of the segments. */
2020-01-13 18:42:29 -05:00
skb_list_walk_safe ( segs , skb , nskb ) {
2011-10-25 19:26:31 -07:00
if ( err )
kfree_skb ( skb ) ;
else
consume_skb ( skb ) ;
2020-01-13 18:42:29 -05:00
}
2011-10-25 19:26:31 -07:00
return err ;
}
2014-11-06 06:51:24 -08:00
static size_t upcall_msg_size ( const struct dp_upcall_info * upcall_info ,
openvswitch: fix skb_panic due to the incorrect actions attrlen
For sw_flow_actions, the actions_len only represents the kernel part's
size, and when we dump the actions to userspace, we perform
conversions, so the true size may become bigger than actions_len.
But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len
to alloc the skbuff, so the user_skb's size may become insufficient and
oops will happen like this:
skbuff: skb_over_panic: text:ffffffff8148fabf len:1749 put:157 head:
ffff881300f39000 data:ffff881300f39000 tail:0x6d5 end:0x6c0 dev:<NULL>
------------[ cut here ]------------
kernel BUG at net/core/skbuff.c:129!
[...]
Call Trace:
<IRQ>
[<ffffffff8148be82>] skb_put+0x43/0x44
[<ffffffff8148fabf>] skb_zerocopy+0x6c/0x1f4
[<ffffffffa0290d36>] queue_userspace_packet+0x3a3/0x448 [openvswitch]
[<ffffffffa0292023>] ovs_dp_upcall+0x30/0x5c [openvswitch]
[<ffffffffa028d435>] output_userspace+0x132/0x158 [openvswitch]
[<ffffffffa01e6890>] ? ip6_rcv_finish+0x74/0x77 [ipv6]
[<ffffffffa028e277>] do_execute_actions+0xcc1/0xdc8 [openvswitch]
[<ffffffffa028e3f2>] ovs_execute_actions+0x74/0x106 [openvswitch]
[<ffffffffa0292130>] ovs_dp_process_packet+0xe1/0xfd [openvswitch]
[<ffffffffa0292b77>] ? key_extract+0x63c/0x8d5 [openvswitch]
[<ffffffffa029848b>] ovs_vport_receive+0xa1/0xc3 [openvswitch]
[...]
Also we can see that the actions_len is much smaller than the orig_len:
crash> struct sw_flow_actions 0xffff8812f539d000
struct sw_flow_actions {
rcu = {
next = 0xffff8812f5398800,
func = 0xffffe3b00035db32
},
orig_len = 1384,
actions_len = 592,
actions = 0xffff8812f539d01c
}
So as a quick fix, use the orig_len instead of the actions_len to alloc
the user_skb.
Finally, this oops happened on our system running a relatively old kernel, but
the same risk still exists on mainline, since we have used the wrong
actions_len from the beginning.
Fixes: ccea74457bbd ("openvswitch: include datapath actions with sampled-packet upcall to userspace")
Cc: Neil McKee <neil.mckee@inmon.com>
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-08-16 13:30:07 +08:00
unsigned int hdrlen , int actions_attrlen )
2013-03-29 14:46:49 +01:00
{
size_t size = NLMSG_ALIGN ( sizeof ( struct ovs_header ) )
2013-12-13 15:22:21 +01:00
+ nla_total_size ( hdrlen ) /* OVS_PACKET_ATTR_PACKET */
2016-06-20 07:26:17 -07:00
+ nla_total_size ( ovs_key_attr_size ( ) ) /* OVS_PACKET_ATTR_KEY */
net: openvswitch: add hash info to upcall
When using the kernel datapath, the upcall doesn't
include the related skb hash info. That introduces
some problems, because the hash of the skb is important
in the kernel stack. For example, the VXLAN module uses
it to select the UDP src port. The tx queue selection
may also use the hash in the stack.
The hash is computed in different ways. The hash is random
for a TCP socket, and the hash may be computed in hardware
or in the software stack. Recalculating the hash is not easy.
Hash of TCP socket is computed:
tcp_v4_connect
-> sk_set_txhash (is random)
__tcp_transmit_skb
-> skb_set_hash_from_sk
There will be one upcall, without the skb hash
information, to ovs-vswitchd, for the first packet of a TCP
session. The remaining packets will be processed in the Open vSwitch
module, with the hash kept. If this TCP session is forwarded to the
VXLAN module, then the UDP src port of the first TCP packet
is different from that of the remaining packets.
TCP packets may come from the host or dockers, to Open vSwitch.
To fix it, we store the hash info to upcall, and restore hash
when packets sent back.
+---------------+ +-------------------------+
| Docker/VMs | | ovs-vswitchd |
+----+----------+ +-+--------------------+--+
| ^ |
| | |
| | upcall v restore packet hash (not recalculate)
| +-+--------------------+--+
| tap netdev | | vxlan module
+---------------> +--> Open vSwitch ko +-->
or internal type | |
+-------------------------+
Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-13 23:04:49 +08:00
+ nla_total_size ( sizeof ( unsigned int ) ) /* OVS_PACKET_ATTR_LEN */
+ nla_total_size ( sizeof ( u64 ) ) ; /* OVS_PACKET_ATTR_HASH */
2013-03-29 14:46:49 +01:00
/* OVS_PACKET_ATTR_USERDATA */
2014-11-06 06:51:24 -08:00
if ( upcall_info - > userdata )
size + = NLA_ALIGN ( upcall_info - > userdata - > nla_len ) ;
/* OVS_PACKET_ATTR_EGRESS_TUN_KEY */
if ( upcall_info - > egress_tun_info )
size + = nla_total_size ( ovs_tun_key_attr_size ( ) ) ;
2013-03-29 14:46:49 +01:00
2015-05-26 20:59:43 -07:00
/* OVS_PACKET_ATTR_ACTIONS */
if ( upcall_info - > actions_len )
openvswitch: fix skb_panic due to the incorrect actions attrlen
For sw_flow_actions, the actions_len only represents the kernel part's
size, and when we dump the actions to userspace, we will do the
conversions, so its true size may become bigger than the actions_len.
But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len
to alloc the skbuff, so the user_skb's size may become insufficient and
oops will happen like this:
skbuff: skb_over_panic: text:ffffffff8148fabf len:1749 put:157 head:
ffff881300f39000 data:ffff881300f39000 tail:0x6d5 end:0x6c0 dev:<NULL>
------------[ cut here ]------------
kernel BUG at net/core/skbuff.c:129!
[...]
Call Trace:
<IRQ>
[<ffffffff8148be82>] skb_put+0x43/0x44
[<ffffffff8148fabf>] skb_zerocopy+0x6c/0x1f4
[<ffffffffa0290d36>] queue_userspace_packet+0x3a3/0x448 [openvswitch]
[<ffffffffa0292023>] ovs_dp_upcall+0x30/0x5c [openvswitch]
[<ffffffffa028d435>] output_userspace+0x132/0x158 [openvswitch]
[<ffffffffa01e6890>] ? ip6_rcv_finish+0x74/0x77 [ipv6]
[<ffffffffa028e277>] do_execute_actions+0xcc1/0xdc8 [openvswitch]
[<ffffffffa028e3f2>] ovs_execute_actions+0x74/0x106 [openvswitch]
[<ffffffffa0292130>] ovs_dp_process_packet+0xe1/0xfd [openvswitch]
[<ffffffffa0292b77>] ? key_extract+0x63c/0x8d5 [openvswitch]
[<ffffffffa029848b>] ovs_vport_receive+0xa1/0xc3 [openvswitch]
[...]
We can also see that the actions_len is much smaller than the orig_len:
crash> struct sw_flow_actions 0xffff8812f539d000
struct sw_flow_actions {
rcu = {
next = 0xffff8812f5398800,
func = 0xffffe3b00035db32
},
orig_len = 1384,
actions_len = 592,
actions = 0xffff8812f539d01c
}
So as a quick fix, use the orig_len instead of the actions_len to alloc
the user_skb.
Lastly, this oops happened on our system running a relatively old kernel, but
the same risk still exists on the mainline, since we have used the wrong
actions_len from the beginning.
Fixes: ccea74457bbd ("openvswitch: include datapath actions with sampled-packet upcall to userspace")
Cc: Neil McKee <neil.mckee@inmon.com>
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-08-16 13:30:07 +08:00
size + = nla_total_size ( actions_attrlen ) ;
2015-05-26 20:59:43 -07:00
2015-08-26 11:31:48 -07:00
/* OVS_PACKET_ATTR_MRU */
if ( upcall_info - > mru )
size + = nla_total_size ( sizeof ( upcall_info - > mru ) ) ;
2013-03-29 14:46:49 +01:00
return size ;
}
2015-08-26 11:31:48 -07:00
static void pad_packet ( struct datapath * dp , struct sk_buff * skb )
{
if ( ! ( dp - > user_features & OVS_DP_F_UNALIGNED ) ) {
size_t plen = NLA_ALIGN ( skb - > len ) - skb - > len ;
if ( plen > 0 )
networking: convert many more places to skb_put_zero()
There were many places that my previous spatch didn't find,
as pointed out by yuan linyu in various patches.
The following spatch found many more and also removes the
now unnecessary casts:
@@
identifier p, p2;
expression len;
expression skb;
type t, t2;
@@
(
-p = skb_put(skb, len);
+p = skb_put_zero(skb, len);
|
-p = (t)skb_put(skb, len);
+p = skb_put_zero(skb, len);
)
... when != p
(
p2 = (t2)p;
-memset(p2, 0, len);
|
-memset(p, 0, len);
)
@@
type t, t2;
identifier p, p2;
expression skb;
@@
t *p;
...
(
-p = skb_put(skb, sizeof(t));
+p = skb_put_zero(skb, sizeof(t));
|
-p = (t *)skb_put(skb, sizeof(t));
+p = skb_put_zero(skb, sizeof(t));
)
... when != p
(
p2 = (t2)p;
-memset(p2, 0, sizeof(*p));
|
-memset(p, 0, sizeof(*p));
)
@@
expression skb, len;
@@
-memset(skb_put(skb, len), 0, len);
+skb_put_zero(skb, len);
Apply it to the tree (with one manual fixup to keep the
comment in vxlan.c, which spatch removed.)
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-06-16 14:29:19 +02:00
skb_put_zero ( skb , plen ) ;
2015-08-26 11:31:48 -07:00
}
}
2013-12-13 15:22:20 +01:00
/* Build an upcall message for @skb and unicast it to user space.
 *
 * @dp:          datapath the packet belongs to; supplies the ifindex placed in
 *               the ovs_header, the user_features flags and the target netns.
 * @skb:         packet to report.  The caller's skb is not freed here; when a
 *               VLAN tag is present a clone is made and used instead.
 * @key:         flow key emitted as OVS_PACKET_ATTR_KEY.
 * @upcall_info: command, destination portid and the optional userdata,
 *               egress tunnel info, actions and MRU attributes.
 * @cutlen:      number of bytes by which the copied packet data is shortened;
 *               when non-zero the full length is reported in
 *               OVS_PACKET_ATTR_LEN.
 *
 * Returns 0 on success or a negative errno.  On any failure after the VLAN
 * handling, skb_tx_error() is called on the packet.
 */
static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
				  const struct sw_flow_key *key,
				  const struct dp_upcall_info *upcall_info,
				  uint32_t cutlen)
{
	struct sk_buff *vlan_skb = NULL;	/* clone used when a VLAN tag is pushed inside */
	struct sk_buff *msg = NULL;		/* to be queued to userspace */
	struct ovs_header *ovs_hdr;
	struct nlattr *attr;
	unsigned int linear_len;
	unsigned int pkt_len;
	size_t msg_size;
	int ifindex;
	int err;
	u64 hash;

	ifindex = get_dpifindex(dp);
	if (!ifindex)
		return -ENODEV;

	if (skb_vlan_tag_present(skb)) {
		vlan_skb = skb_clone(skb, GFP_ATOMIC);
		if (!vlan_skb)
			return -ENOMEM;

		vlan_skb = __vlan_hwaccel_push_inside(vlan_skb);
		if (!vlan_skb)
			return -ENOMEM;

		/* From here on work on the clone; it is released at "out". */
		skb = vlan_skb;
	}

	if (nla_attr_size(skb->len) > USHRT_MAX) {
		err = -EFBIG;
		goto out;
	}

	/* Complete checksum if needed */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		err = skb_csum_hwoffload_help(skb, 0);
		if (err)
			goto out;
	}

	/* Older versions of OVS user space enforce alignment of the last
	 * Netlink attribute to NLA_ALIGNTO which would require extensive
	 * padding logic. Only perform zerocopy if padding is not required.
	 */
	linear_len = (dp->user_features & OVS_DP_F_UNALIGNED) ?
			skb_zerocopy_headlen(skb) : skb->len;

	/* The actions attribute is sized from acts_origlen rather than
	 * actions_len: the actions are converted when dumped to user space
	 * and may then be larger than the kernel-side actions_len.
	 */
	msg_size = upcall_msg_size(upcall_info, linear_len - cutlen,
				   OVS_CB(skb)->acts_origlen);
	msg = genlmsg_new(msg_size, GFP_ATOMIC);
	if (!msg) {
		err = -ENOMEM;
		goto out;
	}

	ovs_hdr = genlmsg_put(msg, 0, 0, &dp_packet_genl_family,
			      0, upcall_info->cmd);
	if (!ovs_hdr) {
		err = -EINVAL;
		goto out;
	}
	ovs_hdr->dp_ifindex = ifindex;

	err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, msg);
	if (err)
		goto out;

	/* OVS_PACKET_ATTR_USERDATA (unchecked put: no return value is tested) */
	if (upcall_info->userdata)
		__nla_put(msg, OVS_PACKET_ATTR_USERDATA,
			  nla_len(upcall_info->userdata),
			  nla_data(upcall_info->userdata));

	/* OVS_PACKET_ATTR_EGRESS_TUN_KEY */
	if (upcall_info->egress_tun_info) {
		attr = nla_nest_start_noflag(msg,
					     OVS_PACKET_ATTR_EGRESS_TUN_KEY);
		if (!attr) {
			err = -EMSGSIZE;
			goto out;
		}

		err = ovs_nla_put_tunnel_info(msg,
					      upcall_info->egress_tun_info);
		if (err)
			goto out;

		nla_nest_end(msg, attr);
	}

	/* OVS_PACKET_ATTR_ACTIONS: if the actions cannot be written the nest
	 * is cancelled and the upcall is still sent without them.
	 */
	if (upcall_info->actions_len) {
		attr = nla_nest_start_noflag(msg, OVS_PACKET_ATTR_ACTIONS);
		if (!attr) {
			err = -EMSGSIZE;
			goto out;
		}

		err = ovs_nla_put_actions(upcall_info->actions,
					  upcall_info->actions_len,
					  msg);
		if (err)
			nla_nest_cancel(msg, attr);
		else
			nla_nest_end(msg, attr);
	}

	/* Add OVS_PACKET_ATTR_MRU */
	if (upcall_info->mru) {
		if (nla_put_u16(msg, OVS_PACKET_ATTR_MRU, upcall_info->mru)) {
			err = -ENOBUFS;
			goto out;
		}
	}

	/* Add OVS_PACKET_ATTR_LEN when packet is truncated */
	if (cutlen > 0) {
		if (nla_put_u32(msg, OVS_PACKET_ATTR_LEN, skb->len)) {
			err = -ENOBUFS;
			goto out;
		}
	}

	/* Add OVS_PACKET_ATTR_HASH: the raw skb hash plus the sw/l4 flag bits,
	 * so that user space can hand the hash back when it re-injects the
	 * packet instead of the hash being recalculated.
	 */
	hash = skb_get_hash_raw(skb);
	if (skb->sw_hash)
		hash |= OVS_PACKET_HASH_SW_BIT;

	if (skb->l4_hash)
		hash |= OVS_PACKET_HASH_L4_BIT;

	if (nla_put(msg, OVS_PACKET_ATTR_HASH, sizeof(u64), &hash)) {
		err = -ENOBUFS;
		goto out;
	}

	/* Only reserve room for attribute header, packet data is added
	 * in skb_zerocopy()
	 */
	attr = nla_reserve(msg, OVS_PACKET_ATTR_PACKET, 0);
	if (!attr) {
		err = -ENOBUFS;
		goto out;
	}

	pkt_len = skb->len - cutlen;
	attr->nla_len = nla_attr_size(pkt_len);

	err = skb_zerocopy(msg, skb, pkt_len, linear_len);
	if (err)
		goto out;

	/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
	pad_packet(dp, msg);

	/* The message grew after genlmsg_put(); fix up its total length. */
	((struct nlmsghdr *)msg->data)->nlmsg_len = msg->len;

	err = genlmsg_unicast(ovs_dp_get_net(dp), msg, upcall_info->portid);
	/* msg was handed to genlmsg_unicast(); do not free it below. */
	msg = NULL;
out:
	if (err)
		skb_tx_error(skb);
	kfree_skb(msg);
	kfree_skb(vlan_skb);
	return err;
}
static int ovs_packet_cmd_execute ( struct sk_buff * skb , struct genl_info * info )
{
struct ovs_header * ovs_header = info - > userhdr ;
2015-08-26 11:31:48 -07:00
struct net * net = sock_net ( skb - > sk ) ;
2011-10-25 19:26:31 -07:00
struct nlattr * * a = info - > attrs ;
struct sw_flow_actions * acts ;
struct sk_buff * packet ;
struct sw_flow * flow ;
2014-10-06 05:45:32 -07:00
struct sw_flow_actions * sf_acts ;
2011-10-25 19:26:31 -07:00
struct datapath * dp ;
2014-09-15 19:20:31 -07:00
struct vport * input_vport ;
2015-08-26 11:31:48 -07:00
u16 mru = 0 ;
net: openvswitch: add hash info to upcall
When using the kernel datapath, the upcall don't
include skb hash info relatived. That will introduce
some problem, because the hash of skb is important
in kernel stack. For example, VXLAN module uses
it to select UDP src port. The tx queue selection
may also use the hash in stack.
Hash is computed in different ways. Hash is random
for a TCP socket, and hash may be computed in hardware,
or software stack. Recalculation hash is not easy.
Hash of TCP socket is computed:
tcp_v4_connect
-> sk_set_txhash (is random)
__tcp_transmit_skb
-> skb_set_hash_from_sk
There will be one upcall, without information of skb
hash, to ovs-vswitchd, for the first packet of a TCP
session. The rest packets will be processed in Open vSwitch
modules, hash kept. If this tcp session is forward to
VXLAN module, then the UDP src port of first tcp packet
is different from rest packets.
TCP packets may come from the host or dockers, to Open vSwitch.
To fix it, we store the hash info to upcall, and restore hash
when packets sent back.
+---------------+ +-------------------------+
| Docker/VMs | | ovs-vswitchd |
+----+----------+ +-+--------------------+--+
| ^ |
| | |
| | upcall v restore packet hash (not recalculate)
| +-+--------------------+--+
| tap netdev | | vxlan module
+---------------> +--> Open vSwitch ko +-->
or internal type | |
+-------------------------+
Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-13 23:04:49 +08:00
u64 hash ;
2011-10-25 19:26:31 -07:00
int len ;
int err ;
2015-01-14 13:56:19 +00:00
bool log = ! a [ OVS_PACKET_ATTR_PROBE ] ;
2011-10-25 19:26:31 -07:00
err = - EINVAL ;
if ( ! a [ OVS_PACKET_ATTR_PACKET ] | | ! a [ OVS_PACKET_ATTR_KEY ] | |
2013-03-29 14:46:47 +01:00
! a [ OVS_PACKET_ATTR_ACTIONS ] )
2011-10-25 19:26:31 -07:00
goto err ;
len = nla_len ( a [ OVS_PACKET_ATTR_PACKET ] ) ;
packet = __dev_alloc_skb ( NET_IP_ALIGN + len , GFP_KERNEL ) ;
err = - ENOMEM ;
if ( ! packet )
goto err ;
skb_reserve ( packet , NET_IP_ALIGN ) ;
2013-03-29 14:46:48 +01:00
nla_memcpy ( __skb_put ( packet , len ) , a [ OVS_PACKET_ATTR_PACKET ] , len ) ;
2011-10-25 19:26:31 -07:00
2015-08-26 11:31:48 -07:00
/* Set packet's mru */
if ( a [ OVS_PACKET_ATTR_MRU ] ) {
mru = nla_get_u16 ( a [ OVS_PACKET_ATTR_MRU ] ) ;
packet - > ignore_df = 1 ;
}
OVS_CB ( packet ) - > mru = mru ;
net: openvswitch: add hash info to upcall
When using the kernel datapath, the upcall don't
include skb hash info relatived. That will introduce
some problem, because the hash of skb is important
in kernel stack. For example, VXLAN module uses
it to select UDP src port. The tx queue selection
may also use the hash in stack.
Hash is computed in different ways. Hash is random
for a TCP socket, and hash may be computed in hardware,
or software stack. Recalculation hash is not easy.
Hash of TCP socket is computed:
tcp_v4_connect
-> sk_set_txhash (is random)
__tcp_transmit_skb
-> skb_set_hash_from_sk
There will be one upcall, without information of skb
hash, to ovs-vswitchd, for the first packet of a TCP
session. The rest packets will be processed in Open vSwitch
modules, hash kept. If this tcp session is forward to
VXLAN module, then the UDP src port of first tcp packet
is different from rest packets.
TCP packets may come from the host or dockers, to Open vSwitch.
To fix it, we store the hash info to upcall, and restore hash
when packets sent back.
+---------------+ +-------------------------+
| Docker/VMs | | ovs-vswitchd |
+----+----------+ +-+--------------------+--+
| ^ |
| | |
| | upcall v restore packet hash (not recalculate)
| +-+--------------------+--+
| tap netdev | | vxlan module
+---------------> +--> Open vSwitch ko +-->
or internal type | |
+-------------------------+
Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-13 23:04:49 +08:00
if ( a [ OVS_PACKET_ATTR_HASH ] ) {
hash = nla_get_u64 ( a [ OVS_PACKET_ATTR_HASH ] ) ;
__skb_set_hash ( packet , hash & 0xFFFFFFFFULL ,
! ! ( hash & OVS_PACKET_HASH_SW_BIT ) ,
! ! ( hash & OVS_PACKET_HASH_L4_BIT ) ) ;
}
2011-10-25 19:26:31 -07:00
/* Build an sw_flow for sending this packet. */
2014-03-27 12:35:23 -07:00
flow = ovs_flow_alloc ( ) ;
2011-10-25 19:26:31 -07:00
err = PTR_ERR ( flow ) ;
if ( IS_ERR ( flow ) )
goto err_kfree_skb ;
2015-08-26 11:31:52 -07:00
err = ovs_flow_key_extract_userspace ( net , a [ OVS_PACKET_ATTR_KEY ] ,
packet , & flow - > key , log ) ;
2011-10-25 19:26:31 -07:00
if ( err )
goto err_flow_free ;
2015-08-26 11:31:48 -07:00
err = ovs_nla_copy_actions ( net , a [ OVS_PACKET_ATTR_ACTIONS ] ,
2014-11-06 07:03:05 -08:00
& flow - > key , & acts , log ) ;
2013-06-17 17:50:12 -07:00
if ( err )
goto err_flow_free ;
2011-10-25 19:26:31 -07:00
2014-10-03 15:35:33 -07:00
rcu_assign_pointer ( flow - > sf_acts , acts ) ;
2011-10-25 19:26:31 -07:00
packet - > priority = flow - > key . phy . priority ;
2012-11-26 11:24:11 -08:00
packet - > mark = flow - > key . phy . skb_mark ;
2011-10-25 19:26:31 -07:00
rcu_read_lock ( ) ;
2015-08-26 11:31:48 -07:00
dp = get_dp_rcu ( net , ovs_header - > dp_ifindex ) ;
2011-10-25 19:26:31 -07:00
err = - ENODEV ;
if ( ! dp )
goto err_unlock ;
2014-09-15 19:20:31 -07:00
input_vport = ovs_vport_rcu ( dp , flow - > key . phy . in_port ) ;
if ( ! input_vport )
input_vport = ovs_vport_rcu ( dp , OVSP_LOCAL ) ;
if ( ! input_vport )
goto err_unlock ;
2015-08-26 11:31:48 -07:00
packet - > dev = input_vport - > dev ;
2014-09-15 19:20:31 -07:00
OVS_CB ( packet ) - > input_vport = input_vport ;
2014-10-06 05:45:32 -07:00
sf_acts = rcu_dereference ( flow - > sf_acts ) ;
2014-09-15 19:20:31 -07:00
2011-10-25 19:26:31 -07:00
local_bh_disable ( ) ;
2014-10-06 05:45:32 -07:00
err = ovs_execute_actions ( dp , packet , sf_acts , & flow - > key ) ;
2011-10-25 19:26:31 -07:00
local_bh_enable ( ) ;
rcu_read_unlock ( ) ;
2013-08-07 20:01:00 -07:00
ovs_flow_free ( flow , false ) ;
2011-10-25 19:26:31 -07:00
return err ;
err_unlock :
rcu_read_unlock ( ) ;
err_flow_free :
2013-08-07 20:01:00 -07:00
ovs_flow_free ( flow , false ) ;
2011-10-25 19:26:31 -07:00
err_kfree_skb :
kfree_skb ( packet ) ;
err :
return err ;
}
static const struct nla_policy packet_policy [ OVS_PACKET_ATTR_MAX + 1 ] = {
2013-03-29 14:46:47 +01:00
[ OVS_PACKET_ATTR_PACKET ] = { . len = ETH_HLEN } ,
2011-10-25 19:26:31 -07:00
[ OVS_PACKET_ATTR_KEY ] = { . type = NLA_NESTED } ,
[ OVS_PACKET_ATTR_ACTIONS ] = { . type = NLA_NESTED } ,
2015-01-14 13:56:19 +00:00
[ OVS_PACKET_ATTR_PROBE ] = { . type = NLA_FLAG } ,
2015-08-26 11:31:48 -07:00
[ OVS_PACKET_ATTR_MRU ] = { . type = NLA_U16 } ,
2020-03-02 21:05:18 -08:00
[ OVS_PACKET_ATTR_HASH ] = { . type = NLA_U64 } ,
2011-10-25 19:26:31 -07:00
} ;
2020-10-02 14:49:54 -07:00
static const struct genl_small_ops dp_packet_genl_ops [ ] = {
2011-10-25 19:26:31 -07:00
{ . cmd = OVS_PACKET_CMD_EXECUTE ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2016-02-05 09:20:52 -07:00
. flags = GENL_UNS_ADMIN_PERM , /* Requires CAP_NET_ADMIN privilege. */
2011-10-25 19:26:31 -07:00
. doit = ovs_packet_cmd_execute
}
} ;
2016-10-24 14:40:05 +02:00
/* Generic Netlink family used for packet upcalls and packet execution. */
static struct genl_family dp_packet_genl_family __ro_after_init = {
	/* Identity */
	.name		= OVS_PACKET_FAMILY,
	.version	= OVS_PACKET_VERSION,
	.module		= THIS_MODULE,

	/* Message layout: every message starts with a struct ovs_header. */
	.hdrsize	= sizeof(struct ovs_header),

	/* maxattr is common to the whole family, so the policy is too. */
	.maxattr	= OVS_PACKET_ATTR_MAX,
	.policy		= packet_policy,

	/* Behaviour */
	.netnsok	= true,
	.parallel_ops	= true,

	/* Operations */
	.small_ops	= dp_packet_genl_ops,
	.n_small_ops	= ARRAY_SIZE(dp_packet_genl_ops),
};
2014-11-06 06:58:52 -08:00
static void get_dp_stats ( const struct datapath * dp , struct ovs_dp_stats * stats ,
2013-10-22 10:42:46 -07:00
struct ovs_dp_megaflow_stats * mega_stats )
2011-10-25 19:26:31 -07:00
{
int i ;
2013-10-22 10:42:46 -07:00
memset ( mega_stats , 0 , sizeof ( * mega_stats ) ) ;
2013-10-04 00:14:23 -07:00
stats - > n_flows = ovs_flow_tbl_count ( & dp - > table ) ;
2013-10-22 10:42:46 -07:00
mega_stats - > n_masks = ovs_flow_tbl_num_masks ( & dp - > table ) ;
2011-10-25 19:26:31 -07:00
stats - > n_hit = stats - > n_missed = stats - > n_lost = 0 ;
2013-10-22 10:42:46 -07:00
2011-10-25 19:26:31 -07:00
for_each_possible_cpu ( i ) {
const struct dp_stats_percpu * percpu_stats ;
struct dp_stats_percpu local_stats ;
unsigned int start ;
percpu_stats = per_cpu_ptr ( dp - > stats_percpu , i ) ;
do {
2014-03-13 21:26:42 -07:00
start = u64_stats_fetch_begin_irq ( & percpu_stats - > syncp ) ;
2011-10-25 19:26:31 -07:00
local_stats = * percpu_stats ;
2014-03-13 21:26:42 -07:00
} while ( u64_stats_fetch_retry_irq ( & percpu_stats - > syncp , start ) ) ;
2011-10-25 19:26:31 -07:00
stats - > n_hit + = local_stats . n_hit ;
stats - > n_missed + = local_stats . n_missed ;
stats - > n_lost + = local_stats . n_lost ;
2013-10-22 10:42:46 -07:00
mega_stats - > n_mask_hit + = local_stats . n_mask_hit ;
2020-07-31 14:20:56 +02:00
mega_stats - > n_cache_hit + = local_stats . n_cache_hit ;
2011-10-25 19:26:31 -07:00
}
}
2015-01-21 16:42:52 -08:00
/* The flow key is reported only for flows identified by a UFID, and only
 * when the request did not set OVS_UFID_F_OMIT_KEY.
 */
static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags)
{
	if (!ovs_identifier_is_ufid(sfid))
		return false;

	return !(ufid_flags & OVS_UFID_F_OMIT_KEY);
}
/* The mask is reported unless the request set OVS_UFID_F_OMIT_MASK. */
static bool should_fill_mask(uint32_t ufid_flags)
{
	return (ufid_flags & OVS_UFID_F_OMIT_MASK) == 0;
}
/* The actions are reported unless the request set OVS_UFID_F_OMIT_ACTIONS. */
static bool should_fill_actions(uint32_t ufid_flags)
{
	return (ufid_flags & OVS_UFID_F_OMIT_ACTIONS) == 0;
}
/* Compute the buffer size needed for a flow message.
 *
 * @acts:       actions of the flow; only read when the actions are reported.
 * @sfid:       flow identifier, may be NULL.
 * @ufid_flags: OVS_UFID_F_* flags selecting which attributes are omitted.
 *
 * Returns the size of the ovs_header plus the size of every attribute that
 * may be emitted.
 */
static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts,
				    const struct sw_flow_id *sfid,
				    uint32_t ufid_flags)
{
	size_t len = NLMSG_ALIGN(sizeof(struct ovs_header));

	/* Attributes whose room is always reserved. */
	len += nla_total_size_64bit(sizeof(struct ovs_flow_stats)); /* OVS_FLOW_ATTR_STATS */
	len += nla_total_size(1);				    /* OVS_FLOW_ATTR_TCP_FLAGS */
	len += nla_total_size_64bit(8);				    /* OVS_FLOW_ATTR_USED */

	/* OVS_FLOW_ATTR_UFID, or unmasked flow key as fallback
	 * see ovs_nla_put_identifier()
	 */
	if (sfid && ovs_identifier_is_ufid(sfid))
		len += nla_total_size(sfid->ufid_len);
	else
		len += nla_total_size(ovs_key_attr_size());

	/* OVS_FLOW_ATTR_KEY */
	if (!sfid || should_fill_key(sfid, ufid_flags))
		len += nla_total_size(ovs_key_attr_size());

	/* OVS_FLOW_ATTR_MASK */
	if (should_fill_mask(ufid_flags))
		len += nla_total_size(ovs_key_attr_size());

	/* OVS_FLOW_ATTR_ACTIONS */
	if (should_fill_actions(ufid_flags))
		len += nla_total_size(acts->orig_len);

	return len;
}
2014-09-08 13:09:37 -07:00
/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_stats ( const struct sw_flow * flow ,
struct sk_buff * skb )
{
struct ovs_flow_stats stats ;
__be16 tcp_flags ;
unsigned long used ;
2011-10-25 19:26:31 -07:00
2013-10-29 17:22:21 -07:00
ovs_flow_stats_get ( flow , & stats , & used , & tcp_flags ) ;
2014-05-05 14:28:07 -07:00
2012-03-29 23:20:48 -04:00
if ( used & &
2016-04-25 10:25:17 +02:00
nla_put_u64_64bit ( skb , OVS_FLOW_ATTR_USED , ovs_flow_used_time ( used ) ,
OVS_FLOW_ATTR_PAD ) )
2014-09-08 13:09:37 -07:00
return - EMSGSIZE ;
2011-10-25 19:26:31 -07:00
2012-03-29 23:20:48 -04:00
if ( stats . n_packets & &
2016-04-26 10:06:15 +02:00
nla_put_64bit ( skb , OVS_FLOW_ATTR_STATS ,
sizeof ( struct ovs_flow_stats ) , & stats ,
OVS_FLOW_ATTR_PAD ) )
2014-09-08 13:09:37 -07:00
return - EMSGSIZE ;
2011-10-25 19:26:31 -07:00
2013-10-29 17:22:21 -07:00
if ( ( u8 ) ntohs ( tcp_flags ) & &
nla_put_u8 ( skb , OVS_FLOW_ATTR_TCP_FLAGS , ( u8 ) ntohs ( tcp_flags ) ) )
2014-09-08 13:09:37 -07:00
return - EMSGSIZE ;
return 0 ;
}
/* Called with ovs_mutex or RCU read lock. */
static int ovs_flow_cmd_fill_actions ( const struct sw_flow * flow ,
struct sk_buff * skb , int skb_orig_len )
{
struct nlattr * start ;
int err ;
2011-10-25 19:26:31 -07:00
/* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
* this is the first flow to be dumped into ' skb ' . This is unusual for
* Netlink but individual action lists can be longer than
* NLMSG_GOODSIZE and thus entirely undumpable if we didn ' t do this .
* The userspace caller can always fetch the actions separately if it
* really wants them . ( Most userspace callers in fact don ' t care . )
*
* This can only fail for dump operations because the skb is always
* properly sized for single flows .
*/
2019-04-26 11:13:06 +02:00
start = nla_nest_start_noflag ( skb , OVS_FLOW_ATTR_ACTIONS ) ;
2013-06-17 17:50:12 -07:00
if ( start ) {
2013-07-30 15:39:39 -07:00
const struct sw_flow_actions * sf_acts ;
2013-12-03 10:58:53 -08:00
sf_acts = rcu_dereference_ovsl ( flow - > sf_acts ) ;
2013-10-03 18:16:47 -07:00
err = ovs_nla_put_actions ( sf_acts - > actions ,
sf_acts - > actions_len , skb ) ;
2014-05-05 14:28:07 -07:00
2013-06-17 17:50:12 -07:00
if ( ! err )
nla_nest_end ( skb , start ) ;
else {
if ( skb_orig_len )
2014-09-08 13:09:37 -07:00
return err ;
2013-06-17 17:50:12 -07:00
nla_nest_cancel ( skb , start ) ;
}
2014-09-08 13:09:37 -07:00
} else if ( skb_orig_len ) {
return - EMSGSIZE ;
}
return 0 ;
}
/* Build one complete flow message in 'skb'.
 *
 * The message consists of the generic netlink header, the OVS header carrying
 * 'dp_ifindex', the flow identifier, the flow statistics and - depending on
 * 'ufid_flags' - the masked key, the mask and the actions.
 *
 * Returns 0 once the message has been finalised.  If the header does not fit
 * -EMSGSIZE is returned; if any later step fails the partial message is
 * cancelled and that step's error is returned.
 *
 * Called with ovs_mutex or RCU read lock.
 */
static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
				  struct sk_buff *skb, u32 portid,
				  u32 seq, u32 flags, u8 cmd, u32 ufid_flags)
{
	/* Length of 'skb' before this message was started; passed on to
	 * ovs_flow_cmd_fill_actions().
	 */
	const int skb_orig_len = skb->len;
	struct ovs_header *hdr;
	int err;

	hdr = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd);
	if (!hdr)
		return -EMSGSIZE;

	hdr->dp_ifindex = dp_ifindex;

	/* Each step runs only if everything before it succeeded. */
	err = ovs_nla_put_identifier(flow, skb);
	if (!err && should_fill_key(&flow->id, ufid_flags))
		err = ovs_nla_put_masked_key(flow, skb);
	if (!err && should_fill_mask(ufid_flags))
		err = ovs_nla_put_mask(flow, skb);
	if (!err)
		err = ovs_flow_cmd_fill_stats(flow, skb);
	if (!err && should_fill_actions(ufid_flags))
		err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len);

	if (err) {
		genlmsg_cancel(skb, hdr);
		return err;
	}

	genlmsg_end(skb, hdr);
	return 0;
}
2014-05-05 14:28:07 -07:00
/* May not be called with RCU read lock. */
static struct sk_buff * ovs_flow_cmd_alloc_info ( const struct sw_flow_actions * acts ,
2015-01-21 16:42:52 -08:00
const struct sw_flow_id * sfid ,
2014-05-05 13:13:14 -07:00
struct genl_info * info ,
2015-01-21 16:42:52 -08:00
bool always ,
uint32_t ufid_flags )
2011-10-25 19:26:31 -07:00
{
2014-05-05 13:13:14 -07:00
struct sk_buff * skb ;
2015-01-21 16:42:52 -08:00
size_t len ;
2011-10-25 19:26:31 -07:00
2014-09-18 10:31:04 +02:00
if ( ! always & & ! ovs_must_notify ( & dp_flow_genl_family , info , 0 ) )
2014-05-05 13:13:14 -07:00
return NULL ;
2015-01-21 16:42:52 -08:00
len = ovs_flow_cmd_msg_size ( acts , sfid , ufid_flags ) ;
2016-02-18 15:03:25 +01:00
skb = genlmsg_new ( len , GFP_KERNEL ) ;
2014-05-05 13:13:14 -07:00
if ( ! skb )
return ERR_PTR ( - ENOMEM ) ;
return skb ;
2011-10-25 19:26:31 -07:00
}
2014-05-05 14:28:07 -07:00
/* Called with ovs_mutex. */
static struct sk_buff * ovs_flow_cmd_build_info ( const struct sw_flow * flow ,
int dp_ifindex ,
struct genl_info * info , u8 cmd ,
2015-01-21 16:42:52 -08:00
bool always , u32 ufid_flags )
2011-10-25 19:26:31 -07:00
{
struct sk_buff * skb ;
int retval ;
2015-01-21 16:42:52 -08:00
skb = ovs_flow_cmd_alloc_info ( ovsl_dereference ( flow - > sf_acts ) ,
& flow - > id , info , always , ufid_flags ) ;
2014-07-27 12:37:46 +05:30
if ( IS_ERR_OR_NULL ( skb ) )
2014-05-05 13:13:14 -07:00
return skb ;
2011-10-25 19:26:31 -07:00
2014-05-05 14:28:07 -07:00
retval = ovs_flow_cmd_fill_info ( flow , dp_ifindex , skb ,
info - > snd_portid , info - > snd_seq , 0 ,
2015-01-21 16:42:52 -08:00
cmd , ufid_flags ) ;
2019-12-01 18:41:24 +01:00
if ( WARN_ON_ONCE ( retval < 0 ) ) {
kfree_skb ( skb ) ;
skb = ERR_PTR ( retval ) ;
}
2011-10-25 19:26:31 -07:00
return skb ;
}
2014-05-05 14:53:51 -07:00
static int ovs_flow_cmd_new ( struct sk_buff * skb , struct genl_info * info )
2011-10-25 19:26:31 -07:00
{
2015-08-26 11:31:48 -07:00
struct net * net = sock_net ( skb - > sk ) ;
2011-10-25 19:26:31 -07:00
struct nlattr * * a = info - > attrs ;
struct ovs_header * ovs_header = info - > userhdr ;
2015-01-21 16:42:52 -08:00
struct sw_flow * flow = NULL , * new_flow ;
2013-08-07 20:01:00 -07:00
struct sw_flow_mask mask ;
2011-10-25 19:26:31 -07:00
struct sk_buff * reply ;
struct datapath * dp ;
2014-05-05 14:53:51 -07:00
struct sw_flow_actions * acts ;
2013-08-07 20:01:00 -07:00
struct sw_flow_match match ;
2015-01-21 16:42:52 -08:00
u32 ufid_flags = ovs_nla_get_ufid_flags ( a [ OVS_FLOW_ATTR_UFID_FLAGS ] ) ;
2011-10-25 19:26:31 -07:00
int error ;
2014-11-06 07:03:05 -08:00
bool log = ! a [ OVS_FLOW_ATTR_PROBE ] ;
2011-10-25 19:26:31 -07:00
2014-05-05 15:22:25 -07:00
/* Must have key and actions. */
2011-10-25 19:26:31 -07:00
error = - EINVAL ;
2014-10-06 05:08:38 -07:00
if ( ! a [ OVS_FLOW_ATTR_KEY ] ) {
2014-11-06 07:03:05 -08:00
OVS_NLERR ( log , " Flow key attr not present in new flow. " ) ;
2011-10-25 19:26:31 -07:00
goto error ;
2014-10-06 05:08:38 -07:00
}
if ( ! a [ OVS_FLOW_ATTR_ACTIONS ] ) {
2014-11-06 07:03:05 -08:00
OVS_NLERR ( log , " Flow actions attr not present in new flow. " ) ;
2014-05-05 15:22:25 -07:00
goto error ;
2014-10-06 05:08:38 -07:00
}
2013-08-07 20:01:00 -07:00
2014-05-05 15:22:25 -07:00
/* Most of the time we need to allocate a new flow, do it before
* locking .
*/
new_flow = ovs_flow_alloc ( ) ;
if ( IS_ERR ( new_flow ) ) {
error = PTR_ERR ( new_flow ) ;
goto error ;
}
/* Extract key. */
2016-09-19 13:51:00 -07:00
ovs_match_init ( & match , & new_flow - > key , false , & mask ) ;
2015-08-26 11:31:52 -07:00
error = ovs_nla_get_match ( net , & match , a [ OVS_FLOW_ATTR_KEY ] ,
2014-11-06 07:03:05 -08:00
a [ OVS_FLOW_ATTR_MASK ] , log ) ;
2011-10-25 19:26:31 -07:00
if ( error )
2014-05-05 15:22:25 -07:00
goto err_kfree_flow ;
2011-10-25 19:26:31 -07:00
2015-01-21 16:42:52 -08:00
/* Extract flow identifier. */
error = ovs_nla_get_identifier ( & new_flow - > id , a [ OVS_FLOW_ATTR_UFID ] ,
2016-09-19 13:50:59 -07:00
& new_flow - > key , log ) ;
2015-01-21 16:42:52 -08:00
if ( error )
goto err_kfree_flow ;
2013-06-17 17:50:12 -07:00
2016-09-19 13:50:59 -07:00
/* unmasked key is needed to match when ufid is not used. */
if ( ovs_identifier_is_key ( & new_flow - > id ) )
match . key = new_flow - > id . unmasked_key ;
ovs_flow_mask_key ( & new_flow - > key , & new_flow - > key , true , & mask ) ;
2014-05-05 15:22:25 -07:00
/* Validate actions. */
2015-08-26 11:31:48 -07:00
error = ovs_nla_copy_actions ( net , a [ OVS_FLOW_ATTR_ACTIONS ] ,
& new_flow - > key , & acts , log ) ;
2014-05-05 14:53:51 -07:00
if ( error ) {
2014-11-06 07:03:05 -08:00
OVS_NLERR ( log , " Flow actions may not be safe on all matching packets. " ) ;
2014-10-19 11:19:51 -07:00
goto err_kfree_flow ;
2014-05-05 15:22:25 -07:00
}
2015-01-21 16:42:52 -08:00
reply = ovs_flow_cmd_alloc_info ( acts , & new_flow - > id , info , false ,
ufid_flags ) ;
2014-05-05 15:22:25 -07:00
if ( IS_ERR ( reply ) ) {
error = PTR_ERR ( reply ) ;
goto err_kfree_acts ;
2011-10-25 19:26:31 -07:00
}
2013-04-15 13:23:03 -07:00
ovs_lock ( ) ;
2015-08-26 11:31:48 -07:00
dp = get_dp ( net , ovs_header - > dp_ifindex ) ;
2014-05-05 15:22:25 -07:00
if ( unlikely ( ! dp ) ) {
error = - ENODEV ;
2013-04-15 13:23:03 -07:00
goto err_unlock_ovs ;
2014-05-05 15:22:25 -07:00
}
2015-01-21 16:42:52 -08:00
2013-08-07 20:01:00 -07:00
/* Check if this is a duplicate flow */
2015-01-21 16:42:52 -08:00
if ( ovs_identifier_is_ufid ( & new_flow - > id ) )
flow = ovs_flow_tbl_lookup_ufid ( & dp - > table , & new_flow - > id ) ;
if ( ! flow )
2016-09-19 13:50:59 -07:00
flow = ovs_flow_tbl_lookup ( & dp - > table , & new_flow - > key ) ;
2014-05-05 15:22:25 -07:00
if ( likely ( ! flow ) ) {
rcu_assign_pointer ( new_flow - > sf_acts , acts ) ;
2011-10-25 19:26:31 -07:00
/* Put flow in bucket. */
2014-05-05 15:22:25 -07:00
error = ovs_flow_tbl_insert ( & dp - > table , new_flow , & mask ) ;
if ( unlikely ( error ) ) {
2013-10-04 00:17:42 -07:00
acts = NULL ;
2014-05-05 15:22:25 -07:00
goto err_unlock_ovs ;
}
if ( unlikely ( reply ) ) {
error = ovs_flow_cmd_fill_info ( new_flow ,
ovs_header - > dp_ifindex ,
reply , info - > snd_portid ,
info - > snd_seq , 0 ,
2015-01-21 16:42:52 -08:00
OVS_FLOW_CMD_NEW ,
ufid_flags ) ;
2014-05-05 15:22:25 -07:00
BUG_ON ( error < 0 ) ;
2013-10-04 00:17:42 -07:00
}
2014-05-05 15:22:25 -07:00
ovs_unlock ( ) ;
2011-10-25 19:26:31 -07:00
} else {
2014-05-05 14:53:51 -07:00
struct sw_flow_actions * old_acts ;
2011-10-25 19:26:31 -07:00
/* Bail out if we're not allowed to modify an existing flow.
* We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
* because Generic Netlink treats the latter as a dump
* request . We also accept NLM_F_EXCL in case that bug ever
* gets fixed .
*/
2014-05-05 15:22:25 -07:00
if ( unlikely ( info - > nlhdr - > nlmsg_flags & ( NLM_F_CREATE
| NLM_F_EXCL ) ) ) {
error = - EEXIST ;
2013-04-15 13:23:03 -07:00
goto err_unlock_ovs ;
2014-05-05 15:22:25 -07:00
}
2015-01-21 16:42:52 -08:00
/* The flow identifier has to be the same for flow updates.
* Look for any overlapping flow .
*/
if ( unlikely ( ! ovs_flow_cmp ( flow , & match ) ) ) {
if ( ovs_identifier_is_key ( & flow - > id ) )
flow = ovs_flow_tbl_lookup_exact ( & dp - > table ,
& match ) ;
else /* UFID matches but key is different */
flow = NULL ;
2014-06-30 20:30:29 -07:00
if ( ! flow ) {
error = - ENOENT ;
goto err_unlock_ovs ;
}
2014-05-05 15:22:25 -07:00
}
2014-05-05 14:53:51 -07:00
/* Update actions. */
old_acts = ovsl_dereference ( flow - > sf_acts ) ;
rcu_assign_pointer ( flow - > sf_acts , acts ) ;
2014-05-05 15:22:25 -07:00
if ( unlikely ( reply ) ) {
error = ovs_flow_cmd_fill_info ( flow ,
ovs_header - > dp_ifindex ,
reply , info - > snd_portid ,
info - > snd_seq , 0 ,
2015-01-21 16:42:52 -08:00
OVS_FLOW_CMD_NEW ,
ufid_flags ) ;
2014-05-05 15:22:25 -07:00
BUG_ON ( error < 0 ) ;
}
ovs_unlock ( ) ;
2014-05-05 14:53:51 -07:00
2015-07-21 10:44:03 +02:00
ovs_nla_free_flow_actions_rcu ( old_acts ) ;
2014-05-05 15:22:25 -07:00
ovs_flow_free ( new_flow , false ) ;
2014-05-05 14:53:51 -07:00
}
2014-05-05 15:22:25 -07:00
if ( reply )
ovs_notify ( & dp_flow_genl_family , reply , info ) ;
2014-05-05 14:53:51 -07:00
return 0 ;
err_unlock_ovs :
ovs_unlock ( ) ;
2014-05-05 15:22:25 -07:00
kfree_skb ( reply ) ;
err_kfree_acts :
2015-07-21 10:44:03 +02:00
ovs_nla_free_flow_actions ( acts ) ;
2014-05-05 15:22:25 -07:00
err_kfree_flow :
ovs_flow_free ( new_flow , false ) ;
2014-05-05 14:53:51 -07:00
error :
return error ;
}
2011-10-25 19:26:31 -07:00
2014-10-19 11:19:51 -07:00
/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */
2020-09-01 20:26:12 +08:00
static noinline_for_stack
struct sw_flow_actions * get_flow_actions ( struct net * net ,
const struct nlattr * a ,
const struct sw_flow_key * key ,
const struct sw_flow_mask * mask ,
bool log )
2014-10-03 15:35:32 -07:00
{
struct sw_flow_actions * acts ;
struct sw_flow_key masked_key ;
int error ;
2015-09-21 20:21:20 -07:00
ovs_flow_mask_key ( & masked_key , key , true , mask ) ;
2015-08-26 11:31:48 -07:00
error = ovs_nla_copy_actions ( net , a , & masked_key , & acts , log ) ;
2014-10-03 15:35:32 -07:00
if ( error ) {
2014-11-06 07:03:05 -08:00
OVS_NLERR ( log ,
" Actions may not be safe on all matching packets " ) ;
2014-10-03 15:35:32 -07:00
return ERR_PTR ( error ) ;
}
return acts ;
}
2017-06-29 17:27:44 -07:00
/* Factor out match-init and action-copy to avoid
* " Wframe-larger-than=1024 " warning . Because mask is only
* used to get actions , we new a function to save some
* stack space .
*
* If there are not key and action attrs , we return 0
* directly . In the case , the caller will also not use the
* match as before . If there is action attr , we try to get
* actions and save them to * acts . Before returning from
* the function , we reset the match - > mask pointer . Because
* we should not to return match object with dangling reference
* to mask .
* */
2019-07-22 17:00:01 +02:00
static noinline_for_stack int
ovs_nla_init_match_and_action ( struct net * net ,
struct sw_flow_match * match ,
struct sw_flow_key * key ,
struct nlattr * * a ,
struct sw_flow_actions * * acts ,
bool log )
2017-06-29 17:27:44 -07:00
{
struct sw_flow_mask mask ;
int error = 0 ;
if ( a [ OVS_FLOW_ATTR_KEY ] ) {
ovs_match_init ( match , key , true , & mask ) ;
error = ovs_nla_get_match ( net , match , a [ OVS_FLOW_ATTR_KEY ] ,
a [ OVS_FLOW_ATTR_MASK ] , log ) ;
if ( error )
goto error ;
}
if ( a [ OVS_FLOW_ATTR_ACTIONS ] ) {
if ( ! a [ OVS_FLOW_ATTR_KEY ] ) {
OVS_NLERR ( log ,
" Flow key attribute not present in set flow. " ) ;
2017-09-11 21:56:20 +02:00
error = - EINVAL ;
goto error ;
2017-06-29 17:27:44 -07:00
}
* acts = get_flow_actions ( net , a [ OVS_FLOW_ATTR_ACTIONS ] , key ,
& mask , log ) ;
if ( IS_ERR ( * acts ) ) {
error = PTR_ERR ( * acts ) ;
goto error ;
}
}
/* On success, error is 0. */
error :
match - > mask = NULL ;
return error ;
}
2014-05-05 14:53:51 -07:00
static int ovs_flow_cmd_set ( struct sk_buff * skb , struct genl_info * info )
{
2015-08-26 11:31:48 -07:00
struct net * net = sock_net ( skb - > sk ) ;
2014-05-05 14:53:51 -07:00
struct nlattr * * a = info - > attrs ;
struct ovs_header * ovs_header = info - > userhdr ;
2014-10-03 15:35:32 -07:00
struct sw_flow_key key ;
2014-05-05 14:53:51 -07:00
struct sw_flow * flow ;
struct sk_buff * reply = NULL ;
struct datapath * dp ;
2014-05-05 15:22:25 -07:00
struct sw_flow_actions * old_acts = NULL , * acts = NULL ;
2014-05-05 14:53:51 -07:00
struct sw_flow_match match ;
2015-01-21 16:42:52 -08:00
struct sw_flow_id sfid ;
u32 ufid_flags = ovs_nla_get_ufid_flags ( a [ OVS_FLOW_ATTR_UFID_FLAGS ] ) ;
2016-03-10 17:14:59 +01:00
int error = 0 ;
2014-11-06 07:03:05 -08:00
bool log = ! a [ OVS_FLOW_ATTR_PROBE ] ;
2015-01-21 16:42:52 -08:00
bool ufid_present ;
2014-05-05 14:53:51 -07:00
2015-01-21 16:42:52 -08:00
ufid_present = ovs_nla_get_ufid ( & sfid , a [ OVS_FLOW_ATTR_UFID ] , log ) ;
2017-06-29 17:27:44 -07:00
if ( ! a [ OVS_FLOW_ATTR_KEY ] & & ! ufid_present ) {
2016-03-10 17:14:59 +01:00
OVS_NLERR ( log ,
" Flow set message rejected, Key attribute missing. " ) ;
2017-06-29 17:27:44 -07:00
return - EINVAL ;
2016-03-10 17:14:59 +01:00
}
2017-06-29 17:27:44 -07:00
error = ovs_nla_init_match_and_action ( net , & match , & key , a ,
& acts , log ) ;
2014-05-05 14:53:51 -07:00
if ( error )
goto error ;
2017-06-29 17:27:44 -07:00
if ( acts ) {
2014-10-19 11:19:51 -07:00
/* Can allocate before locking if have acts. */
2015-01-21 16:42:52 -08:00
reply = ovs_flow_cmd_alloc_info ( acts , & sfid , info , false ,
ufid_flags ) ;
2014-05-05 15:22:25 -07:00
if ( IS_ERR ( reply ) ) {
error = PTR_ERR ( reply ) ;
goto err_kfree_acts ;
2014-05-05 09:59:40 -07:00
}
2014-05-05 14:53:51 -07:00
}
2014-05-05 14:28:07 -07:00
2014-05-05 14:53:51 -07:00
ovs_lock ( ) ;
2015-08-26 11:31:48 -07:00
dp = get_dp ( net , ovs_header - > dp_ifindex ) ;
2014-05-05 15:22:25 -07:00
if ( unlikely ( ! dp ) ) {
error = - ENODEV ;
2014-05-05 14:53:51 -07:00
goto err_unlock_ovs ;
2014-05-05 15:22:25 -07:00
}
2014-05-05 14:53:51 -07:00
/* Check that the flow exists. */
2015-01-21 16:42:52 -08:00
if ( ufid_present )
flow = ovs_flow_tbl_lookup_ufid ( & dp - > table , & sfid ) ;
else
flow = ovs_flow_tbl_lookup_exact ( & dp - > table , & match ) ;
2014-05-05 15:22:25 -07:00
if ( unlikely ( ! flow ) ) {
error = - ENOENT ;
2014-05-05 14:53:51 -07:00
goto err_unlock_ovs ;
2014-05-05 15:22:25 -07:00
}
2014-06-30 20:30:29 -07:00
2014-05-05 14:53:51 -07:00
/* Update actions, if present. */
2014-05-05 15:22:25 -07:00
if ( likely ( acts ) ) {
2014-05-05 14:53:51 -07:00
old_acts = ovsl_dereference ( flow - > sf_acts ) ;
rcu_assign_pointer ( flow - > sf_acts , acts ) ;
2014-05-05 15:22:25 -07:00
if ( unlikely ( reply ) ) {
error = ovs_flow_cmd_fill_info ( flow ,
ovs_header - > dp_ifindex ,
reply , info - > snd_portid ,
info - > snd_seq , 0 ,
2018-09-26 11:40:14 -07:00
OVS_FLOW_CMD_SET ,
2015-01-21 16:42:52 -08:00
ufid_flags ) ;
2014-05-05 15:22:25 -07:00
BUG_ON ( error < 0 ) ;
}
} else {
/* Could not alloc without acts before locking. */
reply = ovs_flow_cmd_build_info ( flow , ovs_header - > dp_ifindex ,
2018-09-26 11:40:14 -07:00
info , OVS_FLOW_CMD_SET , false ,
2015-01-21 16:42:52 -08:00
ufid_flags ) ;
2015-08-12 15:59:47 +05:30
if ( IS_ERR ( reply ) ) {
2014-05-05 15:22:25 -07:00
error = PTR_ERR ( reply ) ;
goto err_unlock_ovs ;
}
2011-10-25 19:26:31 -07:00
}
2014-05-05 14:53:51 -07:00
/* Clear stats. */
if ( a [ OVS_FLOW_ATTR_CLEAR ] )
ovs_flow_stats_clear ( flow ) ;
2013-04-15 13:23:03 -07:00
ovs_unlock ( ) ;
2011-10-25 19:26:31 -07:00
2014-05-05 15:22:25 -07:00
if ( reply )
ovs_notify ( & dp_flow_genl_family , reply , info ) ;
if ( old_acts )
2015-07-21 10:44:03 +02:00
ovs_nla_free_flow_actions_rcu ( old_acts ) ;
2014-05-05 13:13:14 -07:00
2011-10-25 19:26:31 -07:00
return 0 ;
2013-04-15 13:23:03 -07:00
err_unlock_ovs :
ovs_unlock ( ) ;
2014-05-05 15:22:25 -07:00
kfree_skb ( reply ) ;
err_kfree_acts :
2015-07-21 10:44:03 +02:00
ovs_nla_free_flow_actions ( acts ) ;
2011-10-25 19:26:31 -07:00
error :
return error ;
}
static int ovs_flow_cmd_get ( struct sk_buff * skb , struct genl_info * info )
{
struct nlattr * * a = info - > attrs ;
struct ovs_header * ovs_header = info - > userhdr ;
2015-08-26 11:31:52 -07:00
struct net * net = sock_net ( skb - > sk ) ;
2011-10-25 19:26:31 -07:00
struct sw_flow_key key ;
struct sk_buff * reply ;
struct sw_flow * flow ;
struct datapath * dp ;
2013-08-07 20:01:00 -07:00
struct sw_flow_match match ;
2015-01-21 16:42:52 -08:00
struct sw_flow_id ufid ;
u32 ufid_flags = ovs_nla_get_ufid_flags ( a [ OVS_FLOW_ATTR_UFID_FLAGS ] ) ;
int err = 0 ;
2014-11-06 07:03:05 -08:00
bool log = ! a [ OVS_FLOW_ATTR_PROBE ] ;
2015-01-21 16:42:52 -08:00
bool ufid_present ;
2011-10-25 19:26:31 -07:00
2015-01-21 16:42:52 -08:00
ufid_present = ovs_nla_get_ufid ( & ufid , a [ OVS_FLOW_ATTR_UFID ] , log ) ;
if ( a [ OVS_FLOW_ATTR_KEY ] ) {
2016-09-19 13:51:00 -07:00
ovs_match_init ( & match , & key , true , NULL ) ;
2015-08-26 11:31:52 -07:00
err = ovs_nla_get_match ( net , & match , a [ OVS_FLOW_ATTR_KEY ] , NULL ,
2015-01-21 16:42:52 -08:00
log ) ;
} else if ( ! ufid_present ) {
2014-11-06 07:03:05 -08:00
OVS_NLERR ( log ,
" Flow get message rejected, Key attribute missing. " ) ;
2015-01-21 16:42:52 -08:00
err = - EINVAL ;
2013-08-07 20:01:00 -07:00
}
2011-10-25 19:26:31 -07:00
if ( err )
return err ;
2013-04-15 13:23:03 -07:00
ovs_lock ( ) ;
2012-02-22 19:58:59 -08:00
dp = get_dp ( sock_net ( skb - > sk ) , ovs_header - > dp_ifindex ) ;
2013-04-15 13:23:03 -07:00
if ( ! dp ) {
err = - ENODEV ;
goto unlock ;
}
2011-10-25 19:26:31 -07:00
2015-01-21 16:42:52 -08:00
if ( ufid_present )
flow = ovs_flow_tbl_lookup_ufid ( & dp - > table , & ufid ) ;
else
flow = ovs_flow_tbl_lookup_exact ( & dp - > table , & match ) ;
2014-06-30 20:30:29 -07:00
if ( ! flow ) {
2013-04-15 13:23:03 -07:00
err = - ENOENT ;
goto unlock ;
}
2011-10-25 19:26:31 -07:00
2014-05-05 14:28:07 -07:00
reply = ovs_flow_cmd_build_info ( flow , ovs_header - > dp_ifindex , info ,
2018-09-26 11:40:14 -07:00
OVS_FLOW_CMD_GET , true , ufid_flags ) ;
2013-04-15 13:23:03 -07:00
if ( IS_ERR ( reply ) ) {
err = PTR_ERR ( reply ) ;
goto unlock ;
}
2011-10-25 19:26:31 -07:00
2013-04-15 13:23:03 -07:00
ovs_unlock ( ) ;
2011-10-25 19:26:31 -07:00
return genlmsg_reply ( reply , info ) ;
2013-04-15 13:23:03 -07:00
unlock :
ovs_unlock ( ) ;
return err ;
2011-10-25 19:26:31 -07:00
}
static int ovs_flow_cmd_del ( struct sk_buff * skb , struct genl_info * info )
{
struct nlattr * * a = info - > attrs ;
struct ovs_header * ovs_header = info - > userhdr ;
2015-08-26 11:31:52 -07:00
struct net * net = sock_net ( skb - > sk ) ;
2011-10-25 19:26:31 -07:00
struct sw_flow_key key ;
struct sk_buff * reply ;
2015-01-21 16:42:52 -08:00
struct sw_flow * flow = NULL ;
2011-10-25 19:26:31 -07:00
struct datapath * dp ;
2013-08-07 20:01:00 -07:00
struct sw_flow_match match ;
2015-01-21 16:42:52 -08:00
struct sw_flow_id ufid ;
u32 ufid_flags = ovs_nla_get_ufid_flags ( a [ OVS_FLOW_ATTR_UFID_FLAGS ] ) ;
2011-10-25 19:26:31 -07:00
int err ;
2014-11-06 07:03:05 -08:00
bool log = ! a [ OVS_FLOW_ATTR_PROBE ] ;
2015-01-21 16:42:52 -08:00
bool ufid_present ;
2011-10-25 19:26:31 -07:00
2015-01-21 16:42:52 -08:00
ufid_present = ovs_nla_get_ufid ( & ufid , a [ OVS_FLOW_ATTR_UFID ] , log ) ;
if ( a [ OVS_FLOW_ATTR_KEY ] ) {
2016-09-19 13:51:00 -07:00
ovs_match_init ( & match , & key , true , NULL ) ;
2015-08-26 11:31:52 -07:00
err = ovs_nla_get_match ( net , & match , a [ OVS_FLOW_ATTR_KEY ] ,
NULL , log ) ;
2014-05-05 14:40:13 -07:00
if ( unlikely ( err ) )
return err ;
}
2013-04-15 13:23:03 -07:00
ovs_lock ( ) ;
2012-02-22 19:58:59 -08:00
dp = get_dp ( sock_net ( skb - > sk ) , ovs_header - > dp_ifindex ) ;
2014-05-05 14:40:13 -07:00
if ( unlikely ( ! dp ) ) {
2013-04-15 13:23:03 -07:00
err = - ENODEV ;
goto unlock ;
}
2012-02-22 19:58:59 -08:00
2015-01-21 16:42:52 -08:00
if ( unlikely ( ! a [ OVS_FLOW_ATTR_KEY ] & & ! ufid_present ) ) {
2013-10-04 00:14:23 -07:00
err = ovs_flow_tbl_flush ( & dp - > table ) ;
2013-04-15 13:23:03 -07:00
goto unlock ;
}
2013-08-07 20:01:00 -07:00
2015-01-21 16:42:52 -08:00
if ( ufid_present )
flow = ovs_flow_tbl_lookup_ufid ( & dp - > table , & ufid ) ;
else
flow = ovs_flow_tbl_lookup_exact ( & dp - > table , & match ) ;
2014-06-30 20:30:29 -07:00
if ( unlikely ( ! flow ) ) {
2013-04-15 13:23:03 -07:00
err = - ENOENT ;
goto unlock ;
}
2011-10-25 19:26:31 -07:00
2013-10-04 00:14:23 -07:00
ovs_flow_tbl_remove ( & dp - > table , flow ) ;
2014-05-05 14:40:13 -07:00
ovs_unlock ( ) ;
2011-10-25 19:26:31 -07:00
2014-05-05 14:40:13 -07:00
reply = ovs_flow_cmd_alloc_info ( ( const struct sw_flow_actions __force * ) flow - > sf_acts ,
2015-01-21 16:42:52 -08:00
& flow - > id , info , false , ufid_flags ) ;
2014-05-05 14:40:13 -07:00
if ( likely ( reply ) ) {
2019-06-05 23:06:40 +02:00
if ( ! IS_ERR ( reply ) ) {
2014-05-05 14:40:13 -07:00
rcu_read_lock ( ) ; /*To keep RCU checker happy. */
err = ovs_flow_cmd_fill_info ( flow , ovs_header - > dp_ifindex ,
reply , info - > snd_portid ,
info - > snd_seq , 0 ,
2015-01-21 16:42:52 -08:00
OVS_FLOW_CMD_DEL ,
ufid_flags ) ;
2014-05-05 14:40:13 -07:00
rcu_read_unlock ( ) ;
2019-12-01 18:41:25 +01:00
if ( WARN_ON_ONCE ( err < 0 ) ) {
kfree_skb ( reply ) ;
goto out_free ;
}
2014-05-05 14:40:13 -07:00
ovs_notify ( & dp_flow_genl_family , reply , info ) ;
} else {
2020-09-01 20:26:12 +08:00
netlink_set_err ( sock_net ( skb - > sk ) - > genl_sock , 0 , 0 ,
PTR_ERR ( reply ) ) ;
2014-05-05 14:40:13 -07:00
}
2014-05-05 13:13:14 -07:00
}
2011-10-25 19:26:31 -07:00
2019-12-01 18:41:25 +01:00
out_free :
2014-05-05 14:40:13 -07:00
ovs_flow_free ( flow , true ) ;
2011-10-25 19:26:31 -07:00
return 0 ;
2013-04-15 13:23:03 -07:00
unlock :
ovs_unlock ( ) ;
return err ;
2011-10-25 19:26:31 -07:00
}
static int ovs_flow_cmd_dump ( struct sk_buff * skb , struct netlink_callback * cb )
{
2015-01-21 16:42:52 -08:00
struct nlattr * a [ __OVS_FLOW_ATTR_MAX ] ;
2011-10-25 19:26:31 -07:00
struct ovs_header * ovs_header = genlmsg_data ( nlmsg_data ( cb - > nlh ) ) ;
2013-10-04 00:14:23 -07:00
struct table_instance * ti ;
2011-10-25 19:26:31 -07:00
struct datapath * dp ;
2015-01-21 16:42:52 -08:00
u32 ufid_flags ;
int err ;
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 14:07:28 +02:00
err = genlmsg_parse_deprecated ( cb - > nlh , & dp_flow_genl_family , a ,
OVS_FLOW_ATTR_MAX , flow_policy , NULL ) ;
2015-01-21 16:42:52 -08:00
if ( err )
return err ;
ufid_flags = ovs_nla_get_ufid_flags ( a [ OVS_FLOW_ATTR_UFID_FLAGS ] ) ;
2011-10-25 19:26:31 -07:00
2013-07-30 15:39:39 -07:00
rcu_read_lock ( ) ;
2014-09-08 13:14:22 -07:00
dp = get_dp_rcu ( sock_net ( skb - > sk ) , ovs_header - > dp_ifindex ) ;
2013-04-15 13:23:03 -07:00
if ( ! dp ) {
2013-07-30 15:39:39 -07:00
rcu_read_unlock ( ) ;
2011-10-25 19:26:31 -07:00
return - ENODEV ;
2013-04-15 13:23:03 -07:00
}
2011-10-25 19:26:31 -07:00
2013-10-04 00:14:23 -07:00
ti = rcu_dereference ( dp - > table . ti ) ;
2011-10-25 19:26:31 -07:00
for ( ; ; ) {
struct sw_flow * flow ;
u32 bucket , obj ;
bucket = cb - > args [ 0 ] ;
obj = cb - > args [ 1 ] ;
2013-10-04 00:14:23 -07:00
flow = ovs_flow_tbl_dump_next ( ti , & bucket , & obj ) ;
2011-10-25 19:26:31 -07:00
if ( ! flow )
break ;
2014-05-05 14:28:07 -07:00
if ( ovs_flow_cmd_fill_info ( flow , ovs_header - > dp_ifindex , skb ,
2012-09-07 20:12:54 +00:00
NETLINK_CB ( cb - > skb ) . portid ,
2011-10-25 19:26:31 -07:00
cb - > nlh - > nlmsg_seq , NLM_F_MULTI ,
2018-09-26 11:40:14 -07:00
OVS_FLOW_CMD_GET , ufid_flags ) < 0 )
2011-10-25 19:26:31 -07:00
break ;
cb - > args [ 0 ] = bucket ;
cb - > args [ 1 ] = obj ;
}
2013-07-30 15:39:39 -07:00
rcu_read_unlock ( ) ;
2011-10-25 19:26:31 -07:00
return skb - > len ;
}
2014-05-06 16:44:50 -07:00
static const struct nla_policy flow_policy [ OVS_FLOW_ATTR_MAX + 1 ] = {
[ OVS_FLOW_ATTR_KEY ] = { . type = NLA_NESTED } ,
2014-11-06 07:03:05 -08:00
[ OVS_FLOW_ATTR_MASK ] = { . type = NLA_NESTED } ,
2014-05-06 16:44:50 -07:00
[ OVS_FLOW_ATTR_ACTIONS ] = { . type = NLA_NESTED } ,
[ OVS_FLOW_ATTR_CLEAR ] = { . type = NLA_FLAG } ,
2014-11-06 07:03:05 -08:00
[ OVS_FLOW_ATTR_PROBE ] = { . type = NLA_FLAG } ,
2015-01-21 16:42:52 -08:00
[ OVS_FLOW_ATTR_UFID ] = { . type = NLA_UNSPEC , . len = 1 } ,
[ OVS_FLOW_ATTR_UFID_FLAGS ] = { . type = NLA_U32 } ,
2014-05-06 16:44:50 -07:00
} ;
2020-10-02 14:49:54 -07:00
static const struct genl_small_ops dp_flow_genl_ops [ ] = {
2011-10-25 19:26:31 -07:00
{ . cmd = OVS_FLOW_CMD_NEW ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2016-02-05 09:20:52 -07:00
. flags = GENL_UNS_ADMIN_PERM , /* Requires CAP_NET_ADMIN privilege. */
2014-05-05 14:53:51 -07:00
. doit = ovs_flow_cmd_new
2011-10-25 19:26:31 -07:00
} ,
{ . cmd = OVS_FLOW_CMD_DEL ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2016-02-05 09:20:52 -07:00
. flags = GENL_UNS_ADMIN_PERM , /* Requires CAP_NET_ADMIN privilege. */
2011-10-25 19:26:31 -07:00
. doit = ovs_flow_cmd_del
} ,
{ . cmd = OVS_FLOW_CMD_GET ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2011-10-25 19:26:31 -07:00
. flags = 0 , /* OK for unprivileged users. */
. doit = ovs_flow_cmd_get ,
. dumpit = ovs_flow_cmd_dump
} ,
{ . cmd = OVS_FLOW_CMD_SET ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2016-02-05 09:20:52 -07:00
. flags = GENL_UNS_ADMIN_PERM , /* Requires CAP_NET_ADMIN privilege. */
2014-05-05 14:53:51 -07:00
. doit = ovs_flow_cmd_set ,
2011-10-25 19:26:31 -07:00
} ,
} ;
2016-10-24 14:40:05 +02:00
/* Generic netlink family through which userspace manages flows.  Every
 * message of this family carries a struct ovs_header as its family-specific
 * header, and all commands share flow_policy.
 */
static struct genl_family dp_flow_genl_family __ro_after_init = {
	/* Identity and message layout. */
	.hdrsize = sizeof(struct ovs_header),
	.name = OVS_FLOW_FAMILY,
	.version = OVS_FLOW_VERSION,

	/* Attribute parsing. */
	.maxattr = OVS_FLOW_ATTR_MAX,
	.policy = flow_policy,

	/* Behaviour switches. */
	.netnsok = true,
	.parallel_ops = true,

	/* Commands and multicast group. */
	.small_ops = dp_flow_genl_ops,
	.n_small_ops = ARRAY_SIZE(dp_flow_genl_ops),
	.mcgrps = &ovs_dp_flow_multicast_group,
	.n_mcgrps = 1,

	.module = THIS_MODULE,
};
2013-03-29 14:46:49 +01:00
static size_t ovs_dp_cmd_msg_size ( void )
{
size_t msgsize = NLMSG_ALIGN ( sizeof ( struct ovs_header ) ) ;
msgsize + = nla_total_size ( IFNAMSIZ ) ;
2016-04-26 10:06:15 +02:00
msgsize + = nla_total_size_64bit ( sizeof ( struct ovs_dp_stats ) ) ;
msgsize + = nla_total_size_64bit ( sizeof ( struct ovs_dp_megaflow_stats ) ) ;
2014-01-23 10:47:35 -08:00
msgsize + = nla_total_size ( sizeof ( u32 ) ) ; /* OVS_DP_ATTR_USER_FEATURES */
2020-07-31 14:21:34 +02:00
msgsize + = nla_total_size ( sizeof ( u32 ) ) ; /* OVS_DP_ATTR_MASKS_CACHE_SIZE */
2013-03-29 14:46:49 +01:00
return msgsize ;
}
2014-11-11 15:55:16 -08:00
/* Called with ovs_mutex. */
2011-10-25 19:26:31 -07:00
static int ovs_dp_cmd_fill_info ( struct datapath * dp , struct sk_buff * skb ,
2012-09-07 20:12:54 +00:00
u32 portid , u32 seq , u32 flags , u8 cmd )
2011-10-25 19:26:31 -07:00
{
struct ovs_header * ovs_header ;
struct ovs_dp_stats dp_stats ;
2013-10-22 10:42:46 -07:00
struct ovs_dp_megaflow_stats dp_megaflow_stats ;
2011-10-25 19:26:31 -07:00
int err ;
2012-09-07 20:12:54 +00:00
ovs_header = genlmsg_put ( skb , portid , seq , & dp_datapath_genl_family ,
2020-09-01 20:26:12 +08:00
flags , cmd ) ;
2011-10-25 19:26:31 -07:00
if ( ! ovs_header )
goto error ;
ovs_header - > dp_ifindex = get_dpifindex ( dp ) ;
err = nla_put_string ( skb , OVS_DP_ATTR_NAME , ovs_dp_name ( dp ) ) ;
if ( err )
goto nla_put_failure ;
2013-10-22 10:42:46 -07:00
get_dp_stats ( dp , & dp_stats , & dp_megaflow_stats ) ;
2016-04-26 10:06:15 +02:00
if ( nla_put_64bit ( skb , OVS_DP_ATTR_STATS , sizeof ( struct ovs_dp_stats ) ,
& dp_stats , OVS_DP_ATTR_PAD ) )
2013-10-22 10:42:46 -07:00
goto nla_put_failure ;
2016-04-26 10:06:15 +02:00
if ( nla_put_64bit ( skb , OVS_DP_ATTR_MEGAFLOW_STATS ,
sizeof ( struct ovs_dp_megaflow_stats ) ,
& dp_megaflow_stats , OVS_DP_ATTR_PAD ) )
2012-03-29 23:20:48 -04:00
goto nla_put_failure ;
2011-10-25 19:26:31 -07:00
2013-12-13 15:22:18 +01:00
if ( nla_put_u32 ( skb , OVS_DP_ATTR_USER_FEATURES , dp - > user_features ) )
goto nla_put_failure ;
2020-07-31 14:21:34 +02:00
if ( nla_put_u32 ( skb , OVS_DP_ATTR_MASKS_CACHE_SIZE ,
ovs_flow_tbl_masks_cache_size ( & dp - > table ) ) )
goto nla_put_failure ;
2015-01-16 22:09:00 +01:00
genlmsg_end ( skb , ovs_header ) ;
return 0 ;
2011-10-25 19:26:31 -07:00
nla_put_failure :
genlmsg_cancel ( skb , ovs_header ) ;
error :
return - EMSGSIZE ;
}
2016-02-18 15:03:26 +01:00
static struct sk_buff * ovs_dp_cmd_alloc_info ( void )
2011-10-25 19:26:31 -07:00
{
2016-02-18 15:03:25 +01:00
return genlmsg_new ( ovs_dp_cmd_msg_size ( ) , GFP_KERNEL ) ;
2011-10-25 19:26:31 -07:00
}
2014-05-05 11:32:17 -07:00
/* Called with rcu_read_lock or ovs_mutex. */
2012-02-22 19:58:59 -08:00
static struct datapath * lookup_datapath ( struct net * net ,
2014-11-06 06:58:52 -08:00
const struct ovs_header * ovs_header ,
2011-10-25 19:26:31 -07:00
struct nlattr * a [ OVS_DP_ATTR_MAX + 1 ] )
{
struct datapath * dp ;
if ( ! a [ OVS_DP_ATTR_NAME ] )
2012-02-22 19:58:59 -08:00
dp = get_dp ( net , ovs_header - > dp_ifindex ) ;
2011-10-25 19:26:31 -07:00
else {
struct vport * vport ;
2012-02-22 19:58:59 -08:00
vport = ovs_vport_locate ( net , nla_data ( a [ OVS_DP_ATTR_NAME ] ) ) ;
2011-10-25 19:26:31 -07:00
dp = vport & & vport - > port_no = = OVSP_LOCAL ? vport - > dp : NULL ;
}
return dp ? dp : ERR_PTR ( - ENODEV ) ;
}
2020-09-01 20:26:12 +08:00
static void ovs_dp_reset_user_features ( struct sk_buff * skb ,
struct genl_info * info )
2013-12-13 15:22:19 +01:00
{
struct datapath * dp ;
2020-09-01 20:26:12 +08:00
dp = lookup_datapath ( sock_net ( skb - > sk ) , info - > userhdr ,
info - > attrs ) ;
2014-02-14 11:42:36 +01:00
if ( IS_ERR ( dp ) )
2013-12-13 15:22:19 +01:00
return ;
WARN ( dp - > user_features , " Dropping previously announced user features \n " ) ;
dp - > user_features = 0 ;
}
openvswitch: Introduce per-cpu upcall dispatch
The Open vSwitch kernel module uses the upcall mechanism to send
packets from kernel space to user space when it misses in the kernel
space flow table. The upcall sends packets via a Netlink socket.
Currently, a Netlink socket is created for every vport. In this way,
there is a 1:1 mapping between a vport and a Netlink socket.
When a packet is received by a vport, if it needs to be sent to
user space, it is sent via the corresponding Netlink socket.
This mechanism, with various iterations of the corresponding user
space code, has seen some limitations and issues:
* On systems with a large number of vports, there is a correspondingly
large number of Netlink sockets which can limit scaling.
(https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
* Packet reordering on upcalls.
(https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
* A thundering herd issue.
(https://bugzilla.redhat.com/show_bug.cgi?id=1834444)
This patch introduces an alternative, feature-negotiated, upcall
mode using a per-cpu dispatch rather than a per-vport dispatch.
In this mode, the Netlink socket to be used for the upcall is
selected based on the CPU of the thread that is executing the upcall.
In this way, it resolves the issues above as:
a) The number of Netlink sockets scales with the number of CPUs
rather than the number of vports.
b) Ordering per-flow is maintained as packets are distributed to
CPUs based on mechanisms such as RSS and flows are distributed
to a single user space thread.
c) Packets from a flow can only wake up one user space thread.
The corresponding user space code can be found at:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385139.html
Bugzilla: https://bugzilla.redhat.com/1844576
Signed-off-by: Mark Gray <mark.d.gray@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-15 08:27:54 -04:00
static int ovs_dp_set_upcall_portids ( struct datapath * dp ,
const struct nlattr * ids )
{
struct dp_nlsk_pids * old , * dp_nlsk_pids ;
if ( ! nla_len ( ids ) | | nla_len ( ids ) % sizeof ( u32 ) )
return - EINVAL ;
old = ovsl_dereference ( dp - > upcall_portids ) ;
dp_nlsk_pids = kmalloc ( sizeof ( * dp_nlsk_pids ) + nla_len ( ids ) ,
GFP_KERNEL ) ;
if ( ! dp_nlsk_pids )
return - ENOMEM ;
dp_nlsk_pids - > n_pids = nla_len ( ids ) / sizeof ( u32 ) ;
nla_memcpy ( dp_nlsk_pids - > pids , ids , nla_len ( ids ) ) ;
rcu_assign_pointer ( dp - > upcall_portids , dp_nlsk_pids ) ;
kfree_rcu ( old , rcu ) ;
return 0 ;
}
/*
 * Pick the Netlink port ID from @dp's upcall table for @cpu_id.
 *
 * Returns 0 when no table is installed or the table has no entries.  If
 * @cpu_id lies beyond the end of a non-empty table (the number of port IDs
 * does not match the CPUs seen by the kernel), this is logged rate-limited
 * and the index wraps modulo the table size so that packets are not dropped.
 */
u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id)
{
	struct dp_nlsk_pids *table = rcu_dereference(dp->upcall_portids);

	if (!table)
		return 0;

	if (cpu_id < table->n_pids)
		return table->pids[cpu_id];

	if (table->n_pids > 0) {
		pr_info_ratelimited(" cpu_id mismatch with handler threads ");
		return table->pids[cpu_id % table->n_pids];
	}

	return 0;
}
net: openvswitch: Set OvS recirc_id from tc chain index
Offloaded OvS datapath rules are translated one to one to tc rules,
for example the following simplified OvS rule:
recirc_id(0),in_port(dev1),eth_type(0x0800),ct_state(-trk) actions:ct(),recirc(2)
Will be translated to the following tc rule:
$ tc filter add dev dev1 ingress \
prio 1 chain 0 proto ip \
flower tcp ct_state -trk \
action ct pipe \
action goto chain 2
Received packets will first travel through tc, and if they aren't stolen
by it, like in the above rule, they will continue to OvS datapath.
Since we already did some actions (action ct in this case) which might
modify the packets, and updated action stats, we would like to continue
the processing with the correct recirc_id in OvS (here recirc_id(2))
where we left off.
To support this, introduce a new skb extension for tc, which
will be used for translating tc chain to ovs recirc_id to
handle these miss cases. Last tc chain index will be set
by tc goto chain action and read by OvS datapath.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-09-04 16:56:37 +03:00
static int ovs_dp_change ( struct datapath * dp , struct nlattr * a [ ] )
2013-12-13 15:22:18 +01:00
{
2022-02-03 10:44:30 +02:00
u32 user_features = 0 , old_features = dp - > user_features ;
openvswitch: Introduce per-cpu upcall dispatch
The Open vSwitch kernel module uses the upcall mechanism to send
packets from kernel space to user space when it misses in the kernel
space flow table. The upcall sends packets via a Netlink socket.
Currently, a Netlink socket is created for every vport. In this way,
there is a 1:1 mapping between a vport and a Netlink socket.
When a packet is received by a vport, if it needs to be sent to
user space, it is sent via the corresponding Netlink socket.
This mechanism, with various iterations of the corresponding user
space code, has seen some limitations and issues:
* On systems with a large number of vports, there is a correspondingly
large number of Netlink sockets which can limit scaling.
(https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
* Packet reordering on upcalls.
(https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
* A thundering herd issue.
(https://bugzilla.redhat.com/show_bug.cgi?id=1834444)
This patch introduces an alternative, feature-negotiated, upcall
mode using a per-cpu dispatch rather than a per-vport dispatch.
In this mode, the Netlink socket to be used for the upcall is
selected based on the CPU of the thread that is executing the upcall.
In this way, it resolves the issues above as:
a) The number of Netlink sockets scales with the number of CPUs
rather than the number of vports.
b) Ordering per-flow is maintained as packets are distributed to
CPUs based on mechanisms such as RSS and flows are distributed
to a single user space thread.
c) Packets from a flow can only wake up one user space thread.
The corresponding user space code can be found at:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385139.html
Bugzilla: https://bugzilla.redhat.com/1844576
Signed-off-by: Mark Gray <mark.d.gray@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-15 08:27:54 -04:00
int err ;
net: openvswitch: Set OvS recirc_id from tc chain index
Offloaded OvS datapath rules are translated one to one to tc rules,
for example the following simplified OvS rule:
recirc_id(0),in_port(dev1),eth_type(0x0800),ct_state(-trk) actions:ct(),recirc(2)
Will be translated to the following tc rule:
$ tc filter add dev dev1 ingress \
prio 1 chain 0 proto ip \
flower tcp ct_state -trk \
action ct pipe \
action goto chain 2
Received packets will first travel though tc, and if they aren't stolen
by it, like in the above rule, they will continue to OvS datapath.
Since we already did some actions (action ct in this case) which might
modify the packets, and updated action stats, we would like to continue
the proccessing with the correct recirc_id in OvS (here recirc_id(2))
where we left off.
To support this, introduce a new skb extension for tc, which
will be used for translating tc chain to ovs recirc_id to
handle these miss cases. Last tc chain index will be set
by tc goto chain action and read by OvS datapath.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-09-04 16:56:37 +03:00
if ( a [ OVS_DP_ATTR_USER_FEATURES ] ) {
user_features = nla_get_u32 ( a [ OVS_DP_ATTR_USER_FEATURES ] ) ;
if ( user_features & ~ ( OVS_DP_F_VPORT_PIDS |
OVS_DP_F_UNALIGNED |
openvswitch: Introduce per-cpu upcall dispatch
The Open vSwitch kernel module uses the upcall mechanism to send
packets from kernel space to user space when it misses in the kernel
space flow table. The upcall sends packets via a Netlink socket.
Currently, a Netlink socket is created for every vport. In this way,
there is a 1:1 mapping between a vport and a Netlink socket.
When a packet is received by a vport, if it needs to be sent to
user space, it is sent via the corresponding Netlink socket.
This mechanism, with various iterations of the corresponding user
space code, has seen some limitations and issues:
* On systems with a large number of vports, there is a correspondingly
large number of Netlink sockets which can limit scaling.
(https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
* Packet reordering on upcalls.
(https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
* A thundering herd issue.
(https://bugzilla.redhat.com/show_bug.cgi?id=1834444)
This patch introduces an alternative, feature-negotiated, upcall
mode using a per-cpu dispatch rather than a per-vport dispatch.
In this mode, the Netlink socket to be used for the upcall is
selected based on the CPU of the thread that is executing the upcall.
In this way, it resolves the issues above as:
a) The number of Netlink sockets scales with the number of CPUs
rather than the number of vports.
b) Ordering per-flow is maintained as packets are distributed to
CPUs based on mechanisms such as RSS and flows are distributed
to a single user space thread.
c) Packets from a flow can only wake up one user space thread.
The corresponding user space code can be found at:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385139.html
Bugzilla: https://bugzilla.redhat.com/1844576
Signed-off-by: Mark Gray <mark.d.gray@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-15 08:27:54 -04:00
OVS_DP_F_TC_RECIRC_SHARING |
OVS_DP_F_DISPATCH_UPCALL_PER_CPU ) )
net: openvswitch: Set OvS recirc_id from tc chain index
Offloaded OvS datapath rules are translated one to one to tc rules,
for example the following simplified OvS rule:
recirc_id(0),in_port(dev1),eth_type(0x0800),ct_state(-trk) actions:ct(),recirc(2)
Will be translated to the following tc rule:
$ tc filter add dev dev1 ingress \
prio 1 chain 0 proto ip \
flower tcp ct_state -trk \
action ct pipe \
action goto chain 2
Received packets will first travel though tc, and if they aren't stolen
by it, like in the above rule, they will continue to OvS datapath.
Since we already did some actions (action ct in this case) which might
modify the packets, and updated action stats, we would like to continue
the proccessing with the correct recirc_id in OvS (here recirc_id(2))
where we left off.
To support this, introduce a new skb extension for tc, which
will be used for translating tc chain to ovs recirc_id to
handle these miss cases. Last tc chain index will be set
by tc goto chain action and read by OvS datapath.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-09-04 16:56:37 +03:00
return - EOPNOTSUPP ;
# if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
if ( user_features & OVS_DP_F_TC_RECIRC_SHARING )
return - EOPNOTSUPP ;
# endif
}
2020-07-31 14:21:34 +02:00
if ( a [ OVS_DP_ATTR_MASKS_CACHE_SIZE ] ) {
int err ;
u32 cache_size ;
cache_size = nla_get_u32 ( a [ OVS_DP_ATTR_MASKS_CACHE_SIZE ] ) ;
err = ovs_flow_tbl_masks_cache_resize ( & dp - > table , cache_size ) ;
if ( err )
return err ;
}
net: openvswitch: Set OvS recirc_id from tc chain index
Offloaded OvS datapath rules are translated one to one to tc rules,
for example the following simplified OvS rule:
recirc_id(0),in_port(dev1),eth_type(0x0800),ct_state(-trk) actions:ct(),recirc(2)
Will be translated to the following tc rule:
$ tc filter add dev dev1 ingress \
prio 1 chain 0 proto ip \
flower tcp ct_state -trk \
action ct pipe \
action goto chain 2
Received packets will first travel though tc, and if they aren't stolen
by it, like in the above rule, they will continue to OvS datapath.
Since we already did some actions (action ct in this case) which might
modify the packets, and updated action stats, we would like to continue
the proccessing with the correct recirc_id in OvS (here recirc_id(2))
where we left off.
To support this, introduce a new skb extension for tc, which
will be used for translating tc chain to ovs recirc_id to
handle these miss cases. Last tc chain index will be set
by tc goto chain action and read by OvS datapath.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-09-04 16:56:37 +03:00
dp - > user_features = user_features ;
openvswitch: Introduce per-cpu upcall dispatch
The Open vSwitch kernel module uses the upcall mechanism to send
packets from kernel space to user space when it misses in the kernel
space flow table. The upcall sends packets via a Netlink socket.
Currently, a Netlink socket is created for every vport. In this way,
there is a 1:1 mapping between a vport and a Netlink socket.
When a packet is received by a vport, if it needs to be sent to
user space, it is sent via the corresponding Netlink socket.
This mechanism, with various iterations of the corresponding user
space code, has seen some limitations and issues:
* On systems with a large number of vports, there is a correspondingly
large number of Netlink sockets which can limit scaling.
(https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
* Packet reordering on upcalls.
(https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
* A thundering herd issue.
(https://bugzilla.redhat.com/show_bug.cgi?id=1834444)
This patch introduces an alternative, feature-negotiated, upcall
mode using a per-cpu dispatch rather than a per-vport dispatch.
In this mode, the Netlink socket to be used for the upcall is
selected based on the CPU of the thread that is executing the upcall.
In this way, it resolves the issues above as:
a) The number of Netlink sockets scales with the number of CPUs
rather than the number of vports.
b) Ordering per-flow is maintained as packets are distributed to
CPUs based on mechanisms such as RSS and flows are distributed
to a single user space thread.
c) Packets from a flow can only wake up one user space thread.
The corresponding user space code can be found at:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385139.html
Bugzilla: https://bugzilla.redhat.com/1844576
Signed-off-by: Mark Gray <mark.d.gray@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-15 08:27:54 -04:00
if ( dp - > user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU & &
a [ OVS_DP_ATTR_PER_CPU_PIDS ] ) {
/* Upcall Netlink Port IDs have been updated */
err = ovs_dp_set_upcall_portids ( dp ,
a [ OVS_DP_ATTR_PER_CPU_PIDS ] ) ;
if ( err )
return err ;
}
2022-02-03 10:44:30 +02:00
if ( ( dp - > user_features & OVS_DP_F_TC_RECIRC_SHARING ) & &
! ( old_features & OVS_DP_F_TC_RECIRC_SHARING ) )
tc_skb_ext_tc_enable ( ) ;
else if ( ! ( dp - > user_features & OVS_DP_F_TC_RECIRC_SHARING ) & &
( old_features & OVS_DP_F_TC_RECIRC_SHARING ) )
tc_skb_ext_tc_disable ( ) ;
net: openvswitch: Set OvS recirc_id from tc chain index
Offloaded OvS datapath rules are translated one to one to tc rules,
for example the following simplified OvS rule:
recirc_id(0),in_port(dev1),eth_type(0x0800),ct_state(-trk) actions:ct(),recirc(2)
Will be translated to the following tc rule:
$ tc filter add dev dev1 ingress \
prio 1 chain 0 proto ip \
flower tcp ct_state -trk \
action ct pipe \
action goto chain 2
Received packets will first travel though tc, and if they aren't stolen
by it, like in the above rule, they will continue to OvS datapath.
Since we already did some actions (action ct in this case) which might
modify the packets, and updated action stats, we would like to continue
the proccessing with the correct recirc_id in OvS (here recirc_id(2))
where we left off.
To support this, introduce a new skb extension for tc, which
will be used for translating tc chain to ovs recirc_id to
handle these miss cases. Last tc chain index will be set
by tc goto chain action and read by OvS datapath.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-09-04 16:56:37 +03:00
return 0 ;
2013-12-13 15:22:18 +01:00
}
2019-11-01 22:23:54 +08:00
static int ovs_dp_stats_init ( struct datapath * dp )
{
dp - > stats_percpu = netdev_alloc_pcpu_stats ( struct dp_stats_percpu ) ;
if ( ! dp - > stats_percpu )
return - ENOMEM ;
return 0 ;
}
static int ovs_dp_vport_init ( struct datapath * dp )
{
int i ;
dp - > ports = kmalloc_array ( DP_VPORT_HASH_BUCKETS ,
sizeof ( struct hlist_head ) ,
GFP_KERNEL ) ;
if ( ! dp - > ports )
return - ENOMEM ;
for ( i = 0 ; i < DP_VPORT_HASH_BUCKETS ; i + + )
INIT_HLIST_HEAD ( & dp - > ports [ i ] ) ;
return 0 ;
}
2011-10-25 19:26:31 -07:00
static int ovs_dp_cmd_new ( struct sk_buff * skb , struct genl_info * info )
{
struct nlattr * * a = info - > attrs ;
struct vport_parms parms ;
struct sk_buff * reply ;
struct datapath * dp ;
struct vport * vport ;
2012-02-22 19:58:59 -08:00
struct ovs_net * ovs_net ;
2019-11-01 22:23:54 +08:00
int err ;
2011-10-25 19:26:31 -07:00
err = - EINVAL ;
if ( ! a [ OVS_DP_ATTR_NAME ] | | ! a [ OVS_DP_ATTR_UPCALL_PID ] )
goto err ;
2016-02-18 15:03:26 +01:00
reply = ovs_dp_cmd_alloc_info ( ) ;
2014-05-05 14:13:32 -07:00
if ( ! reply )
return - ENOMEM ;
2011-10-25 19:26:31 -07:00
err = - ENOMEM ;
dp = kzalloc ( sizeof ( * dp ) , GFP_KERNEL ) ;
if ( dp = = NULL )
2019-11-01 22:23:54 +08:00
goto err_destroy_reply ;
2012-02-22 19:58:59 -08:00
2015-03-11 23:04:08 -05:00
ovs_dp_set_net ( dp , sock_net ( skb - > sk ) ) ;
2011-10-25 19:26:31 -07:00
/* Allocate table. */
2013-10-04 00:14:23 -07:00
err = ovs_flow_tbl_init ( & dp - > table ) ;
if ( err )
2019-11-01 22:23:54 +08:00
goto err_destroy_dp ;
2011-10-25 19:26:31 -07:00
2019-11-01 22:23:54 +08:00
err = ovs_dp_stats_init ( dp ) ;
if ( err )
2011-10-25 19:26:31 -07:00
goto err_destroy_table ;
2019-11-01 22:23:54 +08:00
err = ovs_dp_vport_init ( dp ) ;
if ( err )
goto err_destroy_stats ;
2012-08-23 12:40:54 -07:00
2017-11-10 12:09:42 -08:00
err = ovs_meters_init ( dp ) ;
if ( err )
2019-11-01 22:23:54 +08:00
goto err_destroy_ports ;
2017-11-10 12:09:42 -08:00
2011-10-25 19:26:31 -07:00
/* Set up our datapath device. */
parms . name = nla_data ( a [ OVS_DP_ATTR_NAME ] ) ;
parms . type = OVS_VPORT_TYPE_INTERNAL ;
parms . options = NULL ;
parms . dp = dp ;
parms . port_no = OVSP_LOCAL ;
2014-07-17 15:14:13 -07:00
parms . upcall_portids = a [ OVS_DP_ATTR_UPCALL_PID ] ;
2011-10-25 19:26:31 -07:00
2014-05-05 14:13:32 -07:00
/* So far only local changes have been made, now need the lock. */
ovs_lock ( ) ;
2020-11-03 09:25:49 +01:00
err = ovs_dp_change ( dp , a ) ;
if ( err )
goto err_unlock_and_destroy_meters ;
2011-10-25 19:26:31 -07:00
vport = new_vport ( & parms ) ;
if ( IS_ERR ( vport ) ) {
err = PTR_ERR ( vport ) ;
if ( err = = - EBUSY )
err = - EEXIST ;
2013-12-13 15:22:19 +01:00
if ( err = = - EEXIST ) {
/* An outdated user space instance that does not understand
* the concept of user_features has attempted to create a new
* datapath and is likely to reuse it . Drop all user features .
*/
if ( info - > genlhdr - > version < OVS_DP_VER_FEATURES )
ovs_dp_reset_user_features ( skb , info ) ;
}
2020-11-03 09:25:49 +01:00
goto err_unlock_and_destroy_meters ;
2011-10-25 19:26:31 -07:00
}
2014-05-05 14:13:32 -07:00
err = ovs_dp_cmd_fill_info ( dp , reply , info - > snd_portid ,
info - > snd_seq , 0 , OVS_DP_CMD_NEW ) ;
BUG_ON ( err < 0 ) ;
2011-10-25 19:26:31 -07:00
2012-02-22 19:58:59 -08:00
ovs_net = net_generic ( ovs_dp_get_net ( dp ) , ovs_net_id ) ;
2013-07-30 15:42:19 -07:00
list_add_tail_rcu ( & dp - > list_node , & ovs_net - > dps ) ;
2013-04-15 13:23:03 -07:00
ovs_unlock ( ) ;
2011-10-25 19:26:31 -07:00
2013-11-19 15:19:39 +01:00
ovs_notify ( & dp_datapath_genl_family , reply , info ) ;
2011-10-25 19:26:31 -07:00
return 0 ;
2020-11-03 09:25:49 +01:00
err_unlock_and_destroy_meters :
ovs_unlock ( ) ;
2017-11-10 12:09:42 -08:00
ovs_meters_exit ( dp ) ;
2019-11-01 22:23:54 +08:00
err_destroy_ports :
2012-08-23 12:40:54 -07:00
kfree ( dp - > ports ) ;
2019-11-01 22:23:54 +08:00
err_destroy_stats :
2011-10-25 19:26:31 -07:00
free_percpu ( dp - > stats_percpu ) ;
err_destroy_table :
2014-05-06 18:41:20 -07:00
ovs_flow_tbl_destroy ( & dp - > table ) ;
2019-11-01 22:23:54 +08:00
err_destroy_dp :
2011-10-25 19:26:31 -07:00
kfree ( dp ) ;
2019-11-01 22:23:54 +08:00
err_destroy_reply :
2014-05-05 14:13:32 -07:00
kfree_skb ( reply ) ;
2011-10-25 19:26:31 -07:00
err :
return err ;
}
2013-04-15 13:23:03 -07:00
/* Called with ovs_mutex. */
2012-02-22 19:58:59 -08:00
static void __dp_destroy ( struct datapath * dp )
2011-10-25 19:26:31 -07:00
{
2020-08-12 17:56:39 +08:00
struct flow_table * table = & dp - > table ;
2012-08-23 12:40:54 -07:00
int i ;
2011-10-25 19:26:31 -07:00
2022-02-03 10:44:30 +02:00
if ( dp - > user_features & OVS_DP_F_TC_RECIRC_SHARING )
tc_skb_ext_tc_disable ( ) ;
2012-08-23 12:40:54 -07:00
for ( i = 0 ; i < DP_VPORT_HASH_BUCKETS ; i + + ) {
struct vport * vport ;
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-27 17:06:00 -08:00
struct hlist_node * n ;
2012-08-23 12:40:54 -07:00
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-27 17:06:00 -08:00
hlist_for_each_entry_safe ( vport , n , & dp - > ports [ i ] , dp_hash_node )
2012-08-23 12:40:54 -07:00
if ( vport - > port_no ! = OVSP_LOCAL )
ovs_dp_detach_port ( vport ) ;
}
2011-10-25 19:26:31 -07:00
2013-07-30 15:42:19 -07:00
list_del_rcu ( & dp - > list_node ) ;
2011-10-25 19:26:31 -07:00
2013-04-15 13:23:03 -07:00
/* OVSP_LOCAL is datapath internal port. We need to make sure that
2014-01-21 09:31:04 -08:00
* all ports in datapath are destroyed first before freeing datapath .
2011-10-25 19:26:31 -07:00
*/
2013-04-15 13:23:03 -07:00
ovs_dp_detach_port ( ovs_vport_ovsl ( dp , OVSP_LOCAL ) ) ;
2011-10-25 19:26:31 -07:00
2020-08-12 17:56:39 +08:00
/* Flush sw_flow in the tables. RCU cb only releases resource
* such as dp , ports and tables . That may avoid some issues
* such as RCU usage warning .
*/
table_instance_flow_flush ( table , ovsl_dereference ( table - > ti ) ,
ovsl_dereference ( table - > ufid_ti ) ) ;
/* RCU destroy the ports, meters and flow tables. */
2011-10-25 19:26:31 -07:00
call_rcu ( & dp - > rcu , destroy_dp_rcu ) ;
2012-02-22 19:58:59 -08:00
}
static int ovs_dp_cmd_del ( struct sk_buff * skb , struct genl_info * info )
{
struct sk_buff * reply ;
struct datapath * dp ;
int err ;
2016-02-18 15:03:26 +01:00
reply = ovs_dp_cmd_alloc_info ( ) ;
2014-05-05 14:13:32 -07:00
if ( ! reply )
return - ENOMEM ;
2013-04-15 13:23:03 -07:00
ovs_lock ( ) ;
2012-02-22 19:58:59 -08:00
dp = lookup_datapath ( sock_net ( skb - > sk ) , info - > userhdr , info - > attrs ) ;
err = PTR_ERR ( dp ) ;
if ( IS_ERR ( dp ) )
2014-05-05 14:13:32 -07:00
goto err_unlock_free ;
2012-02-22 19:58:59 -08:00
2014-05-05 14:13:32 -07:00
err = ovs_dp_cmd_fill_info ( dp , reply , info - > snd_portid ,
info - > snd_seq , 0 , OVS_DP_CMD_DEL ) ;
BUG_ON ( err < 0 ) ;
2012-02-22 19:58:59 -08:00
__dp_destroy ( dp ) ;
2013-04-15 13:23:03 -07:00
ovs_unlock ( ) ;
2011-10-25 19:26:31 -07:00
2013-11-19 15:19:39 +01:00
ovs_notify ( & dp_datapath_genl_family , reply , info ) ;
2011-10-25 19:26:31 -07:00
return 0 ;
2014-05-05 14:13:32 -07:00
err_unlock_free :
2013-04-15 13:23:03 -07:00
ovs_unlock ( ) ;
2014-05-05 14:13:32 -07:00
kfree_skb ( reply ) ;
2013-04-15 13:23:03 -07:00
return err ;
2011-10-25 19:26:31 -07:00
}
static int ovs_dp_cmd_set ( struct sk_buff * skb , struct genl_info * info )
{
struct sk_buff * reply ;
struct datapath * dp ;
int err ;
2016-02-18 15:03:26 +01:00
reply = ovs_dp_cmd_alloc_info ( ) ;
2014-05-05 14:13:32 -07:00
if ( ! reply )
return - ENOMEM ;
2013-04-15 13:23:03 -07:00
ovs_lock ( ) ;
2012-02-22 19:58:59 -08:00
dp = lookup_datapath ( sock_net ( skb - > sk ) , info - > userhdr , info - > attrs ) ;
2013-04-15 13:23:03 -07:00
err = PTR_ERR ( dp ) ;
2011-10-25 19:26:31 -07:00
if ( IS_ERR ( dp ) )
2014-05-05 14:13:32 -07:00
goto err_unlock_free ;
2011-10-25 19:26:31 -07:00
net: openvswitch: Set OvS recirc_id from tc chain index
Offloaded OvS datapath rules are translated one to one to tc rules,
for example the following simplified OvS rule:
recirc_id(0),in_port(dev1),eth_type(0x0800),ct_state(-trk) actions:ct(),recirc(2)
Will be translated to the following tc rule:
$ tc filter add dev dev1 ingress \
prio 1 chain 0 proto ip \
flower tcp ct_state -trk \
action ct pipe \
action goto chain 2
Received packets will first travel though tc, and if they aren't stolen
by it, like in the above rule, they will continue to OvS datapath.
Since we already did some actions (action ct in this case) which might
modify the packets, and updated action stats, we would like to continue
the proccessing with the correct recirc_id in OvS (here recirc_id(2))
where we left off.
To support this, introduce a new skb extension for tc, which
will be used for translating tc chain to ovs recirc_id to
handle these miss cases. Last tc chain index will be set
by tc goto chain action and read by OvS datapath.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-09-04 16:56:37 +03:00
err = ovs_dp_change ( dp , info - > attrs ) ;
if ( err )
goto err_unlock_free ;
2013-12-13 15:22:18 +01:00
2014-05-05 14:13:32 -07:00
err = ovs_dp_cmd_fill_info ( dp , reply , info - > snd_portid ,
2018-09-26 11:40:14 -07:00
info - > snd_seq , 0 , OVS_DP_CMD_SET ) ;
2014-05-05 14:13:32 -07:00
BUG_ON ( err < 0 ) ;
2011-10-25 19:26:31 -07:00
2013-04-15 13:23:03 -07:00
ovs_unlock ( ) ;
2013-11-19 15:19:39 +01:00
ovs_notify ( & dp_datapath_genl_family , reply , info ) ;
2011-10-25 19:26:31 -07:00
return 0 ;
2014-05-05 14:13:32 -07:00
err_unlock_free :
2013-04-15 13:23:03 -07:00
ovs_unlock ( ) ;
2014-05-05 14:13:32 -07:00
kfree_skb ( reply ) ;
2013-04-15 13:23:03 -07:00
return err ;
2011-10-25 19:26:31 -07:00
}
static int ovs_dp_cmd_get ( struct sk_buff * skb , struct genl_info * info )
{
struct sk_buff * reply ;
struct datapath * dp ;
2013-04-15 13:23:03 -07:00
int err ;
2011-10-25 19:26:31 -07:00
2016-02-18 15:03:26 +01:00
reply = ovs_dp_cmd_alloc_info ( ) ;
2014-05-05 14:13:32 -07:00
if ( ! reply )
return - ENOMEM ;
2014-11-11 15:55:16 -08:00
ovs_lock ( ) ;
2012-02-22 19:58:59 -08:00
dp = lookup_datapath ( sock_net ( skb - > sk ) , info - > userhdr , info - > attrs ) ;
2013-04-15 13:23:03 -07:00
if ( IS_ERR ( dp ) ) {
err = PTR_ERR ( dp ) ;
2014-05-05 14:13:32 -07:00
goto err_unlock_free ;
2013-04-15 13:23:03 -07:00
}
2014-05-05 14:13:32 -07:00
err = ovs_dp_cmd_fill_info ( dp , reply , info - > snd_portid ,
2018-09-26 11:40:14 -07:00
info - > snd_seq , 0 , OVS_DP_CMD_GET ) ;
2014-05-05 14:13:32 -07:00
BUG_ON ( err < 0 ) ;
2014-11-11 15:55:16 -08:00
ovs_unlock ( ) ;
2011-10-25 19:26:31 -07:00
return genlmsg_reply ( reply , info ) ;
2013-04-15 13:23:03 -07:00
2014-05-05 14:13:32 -07:00
err_unlock_free :
2014-11-11 15:55:16 -08:00
ovs_unlock ( ) ;
2014-05-05 14:13:32 -07:00
kfree_skb ( reply ) ;
2013-04-15 13:23:03 -07:00
return err ;
2011-10-25 19:26:31 -07:00
}
static int ovs_dp_cmd_dump ( struct sk_buff * skb , struct netlink_callback * cb )
{
2012-02-22 19:58:59 -08:00
struct ovs_net * ovs_net = net_generic ( sock_net ( skb - > sk ) , ovs_net_id ) ;
2011-10-25 19:26:31 -07:00
struct datapath * dp ;
int skip = cb - > args [ 0 ] ;
int i = 0 ;
2014-11-11 15:55:16 -08:00
ovs_lock ( ) ;
list_for_each_entry ( dp , & ovs_net - > dps , list_node ) {
2012-01-17 13:33:39 +00:00
if ( i > = skip & &
2012-09-07 20:12:54 +00:00
ovs_dp_cmd_fill_info ( dp , skb , NETLINK_CB ( cb - > skb ) . portid ,
2011-10-25 19:26:31 -07:00
cb - > nlh - > nlmsg_seq , NLM_F_MULTI ,
2018-09-26 11:40:14 -07:00
OVS_DP_CMD_GET ) < 0 )
2011-10-25 19:26:31 -07:00
break ;
i + + ;
}
2014-11-11 15:55:16 -08:00
ovs_unlock ( ) ;
2011-10-25 19:26:31 -07:00
cb - > args [ 0 ] = i ;
return skb - > len ;
}
2014-05-06 16:44:50 -07:00
static const struct nla_policy datapath_policy [ OVS_DP_ATTR_MAX + 1 ] = {
[ OVS_DP_ATTR_NAME ] = { . type = NLA_NUL_STRING , . len = IFNAMSIZ - 1 } ,
[ OVS_DP_ATTR_UPCALL_PID ] = { . type = NLA_U32 } ,
[ OVS_DP_ATTR_USER_FEATURES ] = { . type = NLA_U32 } ,
2020-07-31 14:21:34 +02:00
[ OVS_DP_ATTR_MASKS_CACHE_SIZE ] = NLA_POLICY_RANGE ( NLA_U32 , 0 ,
PCPU_MIN_UNIT_SIZE / sizeof ( struct mask_cache_entry ) ) ,
2014-05-06 16:44:50 -07:00
} ;
2020-10-02 14:49:54 -07:00
static const struct genl_small_ops dp_datapath_genl_ops [ ] = {
2011-10-25 19:26:31 -07:00
{ . cmd = OVS_DP_CMD_NEW ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2016-02-05 09:20:52 -07:00
. flags = GENL_UNS_ADMIN_PERM , /* Requires CAP_NET_ADMIN privilege. */
2011-10-25 19:26:31 -07:00
. doit = ovs_dp_cmd_new
} ,
{ . cmd = OVS_DP_CMD_DEL ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2016-02-05 09:20:52 -07:00
. flags = GENL_UNS_ADMIN_PERM , /* Requires CAP_NET_ADMIN privilege. */
2011-10-25 19:26:31 -07:00
. doit = ovs_dp_cmd_del
} ,
{ . cmd = OVS_DP_CMD_GET ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2011-10-25 19:26:31 -07:00
. flags = 0 , /* OK for unprivileged users. */
. doit = ovs_dp_cmd_get ,
. dumpit = ovs_dp_cmd_dump
} ,
{ . cmd = OVS_DP_CMD_SET ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2016-02-05 09:20:52 -07:00
. flags = GENL_UNS_ADMIN_PERM , /* Requires CAP_NET_ADMIN privilege. */
2011-10-25 19:26:31 -07:00
. doit = ovs_dp_cmd_set ,
} ,
} ;
2016-10-24 14:40:05 +02:00
/* Generic netlink family through which datapaths are created, deleted,
 * queried and modified.  A single attribute policy is shared by all of its
 * commands.
 */
static struct genl_family dp_datapath_genl_family __ro_after_init = {
	/* Identity and message layout. */
	.name		= OVS_DATAPATH_FAMILY,
	.version	= OVS_DATAPATH_VERSION,
	.hdrsize	= sizeof(struct ovs_header),
	.maxattr	= OVS_DP_ATTR_MAX,
	.policy		= datapath_policy,

	/* Commands and notification group. */
	.small_ops	= dp_datapath_genl_ops,
	.n_small_ops	= ARRAY_SIZE(dp_datapath_genl_ops),
	.mcgrps		= &ovs_dp_datapath_multicast_group,
	.n_mcgrps	= 1,

	.netnsok	= true,
	.parallel_ops	= true,
	.module		= THIS_MODULE,
};
2013-04-15 13:23:03 -07:00
/* Called with ovs_mutex or RCU read lock. */
2011-10-25 19:26:31 -07:00
static int ovs_vport_cmd_fill_info ( struct vport * vport , struct sk_buff * skb ,
2017-11-02 17:04:37 -02:00
struct net * net , u32 portid , u32 seq ,
netns: fix GFP flags in rtnl_net_notifyid()
In rtnl_net_notifyid(), we certainly can't pass a null GFP flag to
rtnl_notify(). A GFP_KERNEL flag would be fine in most circumstances,
but there are a few paths calling rtnl_net_notifyid() from atomic
context or from RCU critical sections. The later also precludes the use
of gfp_any() as it wouldn't detect the RCU case. Also, the nlmsg_new()
call is wrong too, as it uses GFP_KERNEL unconditionally.
Therefore, we need to pass the GFP flags as parameter and propagate it
through function calls until the proper flags can be determined.
In most cases, GFP_KERNEL is fine. The exceptions are:
* openvswitch: ovs_vport_cmd_get() and ovs_vport_cmd_dump()
indirectly call rtnl_net_notifyid() from RCU critical section,
* rtnetlink: rtmsg_ifinfo_build_skb() already receives GFP flags as
parameter.
Also, in ovs_vport_cmd_build_info(), let's change the GFP flags used
by nlmsg_new(). The function is allowed to sleep, so better make the
flags consistent with the ones used in the following
ovs_vport_cmd_fill_info() call.
Found by code inspection.
Fixes: 9a9634545c70 ("netns: notify netns id events")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-23 18:39:04 +02:00
u32 flags , u8 cmd , gfp_t gfp )
2011-10-25 19:26:31 -07:00
{
struct ovs_header * ovs_header ;
struct ovs_vport_stats vport_stats ;
int err ;
2012-09-07 20:12:54 +00:00
ovs_header = genlmsg_put ( skb , portid , seq , & dp_vport_genl_family ,
2011-10-25 19:26:31 -07:00
flags , cmd ) ;
if ( ! ovs_header )
return - EMSGSIZE ;
ovs_header - > dp_ifindex = get_dpifindex ( vport - > dp ) ;
2012-03-29 23:20:48 -04:00
if ( nla_put_u32 ( skb , OVS_VPORT_ATTR_PORT_NO , vport - > port_no ) | |
nla_put_u32 ( skb , OVS_VPORT_ATTR_TYPE , vport - > ops - > type ) | |
2014-07-17 15:14:13 -07:00
nla_put_string ( skb , OVS_VPORT_ATTR_NAME ,
2017-11-02 17:04:37 -02:00
ovs_vport_name ( vport ) ) | |
nla_put_u32 ( skb , OVS_VPORT_ATTR_IFINDEX , vport - > dev - > ifindex ) )
2012-03-29 23:20:48 -04:00
goto nla_put_failure ;
2011-10-25 19:26:31 -07:00
2017-11-02 17:04:37 -02:00
if ( ! net_eq ( net , dev_net ( vport - > dev ) ) ) {
netns: fix GFP flags in rtnl_net_notifyid()
In rtnl_net_notifyid(), we certainly can't pass a null GFP flag to
rtnl_notify(). A GFP_KERNEL flag would be fine in most circumstances,
but there are a few paths calling rtnl_net_notifyid() from atomic
context or from RCU critical sections. The later also precludes the use
of gfp_any() as it wouldn't detect the RCU case. Also, the nlmsg_new()
call is wrong too, as it uses GFP_KERNEL unconditionally.
Therefore, we need to pass the GFP flags as parameter and propagate it
through function calls until the proper flags can be determined.
In most cases, GFP_KERNEL is fine. The exceptions are:
* openvswitch: ovs_vport_cmd_get() and ovs_vport_cmd_dump()
indirectly call rtnl_net_notifyid() from RCU critical section,
* rtnetlink: rtmsg_ifinfo_build_skb() already receives GFP flags as
parameter.
Also, in ovs_vport_cmd_build_info(), let's change the GFP flags used
by nlmsg_new(). The function is allowed to sleep, so better make the
flags consistent with the ones used in the following
ovs_vport_cmd_fill_info() call.
Found by code inspection.
Fixes: 9a9634545c70 ("netns: notify netns id events")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-23 18:39:04 +02:00
int id = peernet2id_alloc ( net , dev_net ( vport - > dev ) , gfp ) ;
2017-11-02 17:04:37 -02:00
if ( nla_put_s32 ( skb , OVS_VPORT_ATTR_NETNSID , id ) )
goto nla_put_failure ;
}
2011-10-25 19:26:31 -07:00
ovs_vport_get_stats ( vport , & vport_stats ) ;
2016-04-26 10:06:15 +02:00
if ( nla_put_64bit ( skb , OVS_VPORT_ATTR_STATS ,
sizeof ( struct ovs_vport_stats ) , & vport_stats ,
OVS_VPORT_ATTR_PAD ) )
2012-03-29 23:20:48 -04:00
goto nla_put_failure ;
2011-10-25 19:26:31 -07:00
2014-07-17 15:14:13 -07:00
if ( ovs_vport_get_upcall_portids ( vport , skb ) )
goto nla_put_failure ;
2011-10-25 19:26:31 -07:00
err = ovs_vport_get_options ( vport , skb ) ;
if ( err = = - EMSGSIZE )
goto error ;
2015-01-16 22:09:00 +01:00
genlmsg_end ( skb , ovs_header ) ;
return 0 ;
2011-10-25 19:26:31 -07:00
nla_put_failure :
err = - EMSGSIZE ;
error :
genlmsg_cancel ( skb , ovs_header ) ;
return err ;
}
2014-05-05 14:13:32 -07:00
/* Allocate a default-sized netlink message for a vport reply.  Uses
 * GFP_KERNEL; returns NULL when nlmsg_new() fails.
 */
static struct sk_buff *ovs_vport_cmd_alloc_info(void)
{
	struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);

	return msg;
}
/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */
2017-11-02 17:04:37 -02:00
struct sk_buff * ovs_vport_cmd_build_info ( struct vport * vport , struct net * net ,
u32 portid , u32 seq , u8 cmd )
2011-10-25 19:26:31 -07:00
{
struct sk_buff * skb ;
int retval ;
netns: fix GFP flags in rtnl_net_notifyid()
In rtnl_net_notifyid(), we certainly can't pass a null GFP flag to
rtnl_notify(). A GFP_KERNEL flag would be fine in most circumstances,
but there are a few paths calling rtnl_net_notifyid() from atomic
context or from RCU critical sections. The later also precludes the use
of gfp_any() as it wouldn't detect the RCU case. Also, the nlmsg_new()
call is wrong too, as it uses GFP_KERNEL unconditionally.
Therefore, we need to pass the GFP flags as parameter and propagate it
through function calls until the proper flags can be determined.
In most cases, GFP_KERNEL is fine. The exceptions are:
* openvswitch: ovs_vport_cmd_get() and ovs_vport_cmd_dump()
indirectly call rtnl_net_notifyid() from RCU critical section,
* rtnetlink: rtmsg_ifinfo_build_skb() already receives GFP flags as
parameter.
Also, in ovs_vport_cmd_build_info(), let's change the GFP flags used
by nlmsg_new(). The function is allowed to sleep, so better make the
flags consistent with the ones used in the following
ovs_vport_cmd_fill_info() call.
Found by code inspection.
Fixes: 9a9634545c70 ("netns: notify netns id events")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-23 18:39:04 +02:00
skb = nlmsg_new ( NLMSG_DEFAULT_SIZE , GFP_KERNEL ) ;
2011-10-25 19:26:31 -07:00
if ( ! skb )
return ERR_PTR ( - ENOMEM ) ;
netns: fix GFP flags in rtnl_net_notifyid()
In rtnl_net_notifyid(), we certainly can't pass a null GFP flag to
rtnl_notify(). A GFP_KERNEL flag would be fine in most circumstances,
but there are a few paths calling rtnl_net_notifyid() from atomic
context or from RCU critical sections. The later also precludes the use
of gfp_any() as it wouldn't detect the RCU case. Also, the nlmsg_new()
call is wrong too, as it uses GFP_KERNEL unconditionally.
Therefore, we need to pass the GFP flags as parameter and propagate it
through function calls until the proper flags can be determined.
In most cases, GFP_KERNEL is fine. The exceptions are:
* openvswitch: ovs_vport_cmd_get() and ovs_vport_cmd_dump()
indirectly call rtnl_net_notifyid() from RCU critical section,
* rtnetlink: rtmsg_ifinfo_build_skb() already receives GFP flags as
parameter.
Also, in ovs_vport_cmd_build_info(), let's change the GFP flags used
by nlmsg_new(). The function is allowed to sleep, so better make the
flags consistent with the ones used in the following
ovs_vport_cmd_fill_info() call.
Found by code inspection.
Fixes: 9a9634545c70 ("netns: notify netns id events")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-23 18:39:04 +02:00
retval = ovs_vport_cmd_fill_info ( vport , skb , net , portid , seq , 0 , cmd ,
GFP_KERNEL ) ;
2013-03-26 15:48:38 -07:00
BUG_ON ( retval < 0 ) ;
2011-10-25 19:26:31 -07:00
return skb ;
}
2013-04-15 13:23:03 -07:00
/* Called with ovs_mutex or RCU read lock. */
2012-02-22 19:58:59 -08:00
static struct vport * lookup_vport ( struct net * net ,
2014-11-06 06:58:52 -08:00
const struct ovs_header * ovs_header ,
2011-10-25 19:26:31 -07:00
struct nlattr * a [ OVS_VPORT_ATTR_MAX + 1 ] )
{
struct datapath * dp ;
struct vport * vport ;
2017-11-02 17:04:37 -02:00
if ( a [ OVS_VPORT_ATTR_IFINDEX ] )
return ERR_PTR ( - EOPNOTSUPP ) ;
2011-10-25 19:26:31 -07:00
if ( a [ OVS_VPORT_ATTR_NAME ] ) {
2012-02-22 19:58:59 -08:00
vport = ovs_vport_locate ( net , nla_data ( a [ OVS_VPORT_ATTR_NAME ] ) ) ;
2011-10-25 19:26:31 -07:00
if ( ! vport )
return ERR_PTR ( - ENODEV ) ;
2012-03-06 15:04:04 -08:00
if ( ovs_header - > dp_ifindex & &
ovs_header - > dp_ifindex ! = get_dpifindex ( vport - > dp ) )
return ERR_PTR ( - ENODEV ) ;
2011-10-25 19:26:31 -07:00
return vport ;
} else if ( a [ OVS_VPORT_ATTR_PORT_NO ] ) {
u32 port_no = nla_get_u32 ( a [ OVS_VPORT_ATTR_PORT_NO ] ) ;
if ( port_no > = DP_MAX_PORTS )
return ERR_PTR ( - EFBIG ) ;
2012-02-22 19:58:59 -08:00
dp = get_dp ( net , ovs_header - > dp_ifindex ) ;
2011-10-25 19:26:31 -07:00
if ( ! dp )
return ERR_PTR ( - ENODEV ) ;
2013-04-15 13:23:03 -07:00
vport = ovs_vport_ovsl_rcu ( dp , port_no ) ;
2011-10-25 19:26:31 -07:00
if ( ! vport )
2013-01-09 14:27:35 -08:00
return ERR_PTR ( - ENODEV ) ;
2011-10-25 19:26:31 -07:00
return vport ;
} else
return ERR_PTR ( - EINVAL ) ;
2017-11-02 17:04:37 -02:00
2011-10-25 19:26:31 -07:00
}
2019-07-06 01:08:09 +09:00
static unsigned int ovs_get_max_headroom ( struct datapath * dp )
2016-02-26 10:45:39 +01:00
{
2019-07-06 01:08:09 +09:00
unsigned int dev_headroom , max_headroom = 0 ;
2016-02-26 10:45:39 +01:00
struct net_device * dev ;
struct vport * vport ;
int i ;
for ( i = 0 ; i < DP_VPORT_HASH_BUCKETS ; i + + ) {
2020-02-19 01:28:02 +05:30
hlist_for_each_entry_rcu ( vport , & dp - > ports [ i ] , dp_hash_node ,
2020-09-01 20:26:12 +08:00
lockdep_ovsl_is_held ( ) ) {
2016-02-26 10:45:39 +01:00
dev = vport - > dev ;
dev_headroom = netdev_get_fwd_headroom ( dev ) ;
if ( dev_headroom > max_headroom )
max_headroom = dev_headroom ;
}
}
2019-07-06 01:08:09 +09:00
return max_headroom ;
}
/* Called with ovs_mutex */
static void ovs_update_headroom ( struct datapath * dp , unsigned int new_headroom )
{
struct vport * vport ;
int i ;
dp - > max_headroom = new_headroom ;
2020-09-01 20:26:12 +08:00
for ( i = 0 ; i < DP_VPORT_HASH_BUCKETS ; i + + ) {
2020-02-19 01:28:02 +05:30
hlist_for_each_entry_rcu ( vport , & dp - > ports [ i ] , dp_hash_node ,
2020-09-01 20:26:12 +08:00
lockdep_ovsl_is_held ( ) )
2019-07-06 01:08:09 +09:00
netdev_set_rx_headroom ( vport - > dev , new_headroom ) ;
2020-09-01 20:26:12 +08:00
}
2016-02-26 10:45:39 +01:00
}
2011-10-25 19:26:31 -07:00
static int ovs_vport_cmd_new ( struct sk_buff * skb , struct genl_info * info )
{
struct nlattr * * a = info - > attrs ;
struct ovs_header * ovs_header = info - > userhdr ;
struct vport_parms parms ;
struct sk_buff * reply ;
struct vport * vport ;
struct datapath * dp ;
2019-07-06 01:08:09 +09:00
unsigned int new_headroom ;
2011-10-25 19:26:31 -07:00
u32 port_no ;
int err ;
if ( ! a [ OVS_VPORT_ATTR_NAME ] | | ! a [ OVS_VPORT_ATTR_TYPE ] | |
! a [ OVS_VPORT_ATTR_UPCALL_PID ] )
2014-05-05 14:13:32 -07:00
return - EINVAL ;
2017-11-02 17:04:37 -02:00
if ( a [ OVS_VPORT_ATTR_IFINDEX ] )
return - EOPNOTSUPP ;
2014-05-05 14:13:32 -07:00
port_no = a [ OVS_VPORT_ATTR_PORT_NO ]
? nla_get_u32 ( a [ OVS_VPORT_ATTR_PORT_NO ] ) : 0 ;
if ( port_no > = DP_MAX_PORTS )
return - EFBIG ;
reply = ovs_vport_cmd_alloc_info ( ) ;
if ( ! reply )
return - ENOMEM ;
2011-10-25 19:26:31 -07:00
2013-04-15 13:23:03 -07:00
ovs_lock ( ) ;
2014-10-22 17:29:06 +02:00
restart :
2012-02-22 19:58:59 -08:00
dp = get_dp ( sock_net ( skb - > sk ) , ovs_header - > dp_ifindex ) ;
2011-10-25 19:26:31 -07:00
err = - ENODEV ;
if ( ! dp )
2014-05-05 14:13:32 -07:00
goto exit_unlock_free ;
2011-10-25 19:26:31 -07:00
2014-05-05 14:13:32 -07:00
if ( port_no ) {
2013-04-15 13:23:03 -07:00
vport = ovs_vport_ovsl ( dp , port_no ) ;
2011-10-25 19:26:31 -07:00
err = - EBUSY ;
if ( vport )
2014-05-05 14:13:32 -07:00
goto exit_unlock_free ;
2011-10-25 19:26:31 -07:00
} else {
for ( port_no = 1 ; ; port_no + + ) {
if ( port_no > = DP_MAX_PORTS ) {
err = - EFBIG ;
2014-05-05 14:13:32 -07:00
goto exit_unlock_free ;
2011-10-25 19:26:31 -07:00
}
2013-04-15 13:23:03 -07:00
vport = ovs_vport_ovsl ( dp , port_no ) ;
2011-10-25 19:26:31 -07:00
if ( ! vport )
break ;
}
}
parms . name = nla_data ( a [ OVS_VPORT_ATTR_NAME ] ) ;
parms . type = nla_get_u32 ( a [ OVS_VPORT_ATTR_TYPE ] ) ;
parms . options = a [ OVS_VPORT_ATTR_OPTIONS ] ;
parms . dp = dp ;
parms . port_no = port_no ;
2014-07-17 15:14:13 -07:00
parms . upcall_portids = a [ OVS_VPORT_ATTR_UPCALL_PID ] ;
2011-10-25 19:26:31 -07:00
vport = new_vport ( & parms ) ;
err = PTR_ERR ( vport ) ;
2014-10-22 17:29:06 +02:00
if ( IS_ERR ( vport ) ) {
if ( err = = - EAGAIN )
goto restart ;
2014-05-05 14:13:32 -07:00
goto exit_unlock_free ;
2014-10-22 17:29:06 +02:00
}
2011-10-25 19:26:31 -07:00
2017-11-02 17:04:37 -02:00
err = ovs_vport_cmd_fill_info ( vport , reply , genl_info_net ( info ) ,
info - > snd_portid , info - > snd_seq , 0 ,
netns: fix GFP flags in rtnl_net_notifyid()
In rtnl_net_notifyid(), we certainly can't pass a null GFP flag to
rtnl_notify(). A GFP_KERNEL flag would be fine in most circumstances,
but there are a few paths calling rtnl_net_notifyid() from atomic
context or from RCU critical sections. The later also precludes the use
of gfp_any() as it wouldn't detect the RCU case. Also, the nlmsg_new()
call is wrong too, as it uses GFP_KERNEL unconditionally.
Therefore, we need to pass the GFP flags as parameter and propagate it
through function calls until the proper flags can be determined.
In most cases, GFP_KERNEL is fine. The exceptions are:
* openvswitch: ovs_vport_cmd_get() and ovs_vport_cmd_dump()
indirectly call rtnl_net_notifyid() from RCU critical section,
* rtnetlink: rtmsg_ifinfo_build_skb() already receives GFP flags as
parameter.
Also, in ovs_vport_cmd_build_info(), let's change the GFP flags used
by nlmsg_new(). The function is allowed to sleep, so better make the
flags consistent with the ones used in the following
ovs_vport_cmd_fill_info() call.
Found by code inspection.
Fixes: 9a9634545c70 ("netns: notify netns id events")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-23 18:39:04 +02:00
OVS_VPORT_CMD_NEW , GFP_KERNEL ) ;
2016-02-26 10:45:39 +01:00
2019-07-06 01:08:09 +09:00
new_headroom = netdev_get_fwd_headroom ( vport - > dev ) ;
if ( new_headroom > dp - > max_headroom )
ovs_update_headroom ( dp , new_headroom ) ;
2016-02-26 10:45:39 +01:00
else
netdev_set_rx_headroom ( vport - > dev , dp - > max_headroom ) ;
2014-05-05 14:13:32 -07:00
BUG_ON ( err < 0 ) ;
ovs_unlock ( ) ;
2013-03-29 14:46:50 +01:00
2013-11-19 15:19:39 +01:00
ovs_notify ( & dp_vport_genl_family , reply , info ) ;
2014-05-05 14:13:32 -07:00
return 0 ;
2011-10-25 19:26:31 -07:00
2014-05-05 14:13:32 -07:00
exit_unlock_free :
2013-04-15 13:23:03 -07:00
ovs_unlock ( ) ;
2014-05-05 14:13:32 -07:00
kfree_skb ( reply ) ;
2011-10-25 19:26:31 -07:00
return err ;
}
static int ovs_vport_cmd_set ( struct sk_buff * skb , struct genl_info * info )
{
struct nlattr * * a = info - > attrs ;
struct sk_buff * reply ;
struct vport * vport ;
int err ;
2014-05-05 14:13:32 -07:00
reply = ovs_vport_cmd_alloc_info ( ) ;
if ( ! reply )
return - ENOMEM ;
2013-04-15 13:23:03 -07:00
ovs_lock ( ) ;
2012-02-22 19:58:59 -08:00
vport = lookup_vport ( sock_net ( skb - > sk ) , info - > userhdr , a ) ;
2011-10-25 19:26:31 -07:00
err = PTR_ERR ( vport ) ;
if ( IS_ERR ( vport ) )
2014-05-05 14:13:32 -07:00
goto exit_unlock_free ;
2011-10-25 19:26:31 -07:00
if ( a [ OVS_VPORT_ATTR_TYPE ] & &
2013-05-13 08:15:26 -07:00
nla_get_u32 ( a [ OVS_VPORT_ATTR_TYPE ] ) ! = vport - > ops - > type ) {
2011-10-25 19:26:31 -07:00
err = - EINVAL ;
2014-05-05 14:13:32 -07:00
goto exit_unlock_free ;
2013-03-26 15:48:38 -07:00
}
2013-05-13 08:15:26 -07:00
if ( a [ OVS_VPORT_ATTR_OPTIONS ] ) {
2011-10-25 19:26:31 -07:00
err = ovs_vport_set_options ( vport , a [ OVS_VPORT_ATTR_OPTIONS ] ) ;
2013-05-13 08:15:26 -07:00
if ( err )
2014-05-05 14:13:32 -07:00
goto exit_unlock_free ;
2013-05-13 08:15:26 -07:00
}
2013-03-26 15:48:38 -07:00
2014-07-17 15:14:13 -07:00
if ( a [ OVS_VPORT_ATTR_UPCALL_PID ] ) {
struct nlattr * ids = a [ OVS_VPORT_ATTR_UPCALL_PID ] ;
err = ovs_vport_set_upcall_portids ( vport , ids ) ;
if ( err )
goto exit_unlock_free ;
}
2011-10-25 19:26:31 -07:00
2017-11-02 17:04:37 -02:00
err = ovs_vport_cmd_fill_info ( vport , reply , genl_info_net ( info ) ,
info - > snd_portid , info - > snd_seq , 0 ,
netns: fix GFP flags in rtnl_net_notifyid()
In rtnl_net_notifyid(), we certainly can't pass a null GFP flag to
rtnl_notify(). A GFP_KERNEL flag would be fine in most circumstances,
but there are a few paths calling rtnl_net_notifyid() from atomic
context or from RCU critical sections. The later also precludes the use
of gfp_any() as it wouldn't detect the RCU case. Also, the nlmsg_new()
call is wrong too, as it uses GFP_KERNEL unconditionally.
Therefore, we need to pass the GFP flags as parameter and propagate it
through function calls until the proper flags can be determined.
In most cases, GFP_KERNEL is fine. The exceptions are:
* openvswitch: ovs_vport_cmd_get() and ovs_vport_cmd_dump()
indirectly call rtnl_net_notifyid() from RCU critical section,
* rtnetlink: rtmsg_ifinfo_build_skb() already receives GFP flags as
parameter.
Also, in ovs_vport_cmd_build_info(), let's change the GFP flags used
by nlmsg_new(). The function is allowed to sleep, so better make the
flags consistent with the ones used in the following
ovs_vport_cmd_fill_info() call.
Found by code inspection.
Fixes: 9a9634545c70 ("netns: notify netns id events")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-23 18:39:04 +02:00
OVS_VPORT_CMD_SET , GFP_KERNEL ) ;
2013-03-26 15:48:38 -07:00
BUG_ON ( err < 0 ) ;
2011-10-25 19:26:31 -07:00
2013-04-15 13:23:03 -07:00
ovs_unlock ( ) ;
2013-11-19 15:19:39 +01:00
ovs_notify ( & dp_vport_genl_family , reply , info ) ;
2013-04-15 13:23:03 -07:00
return 0 ;
2011-10-25 19:26:31 -07:00
2014-05-05 14:13:32 -07:00
exit_unlock_free :
2013-04-15 13:23:03 -07:00
ovs_unlock ( ) ;
2014-05-05 14:13:32 -07:00
kfree_skb ( reply ) ;
2011-10-25 19:26:31 -07:00
return err ;
}
static int ovs_vport_cmd_del ( struct sk_buff * skb , struct genl_info * info )
{
2019-07-06 01:08:09 +09:00
bool update_headroom = false ;
2011-10-25 19:26:31 -07:00
struct nlattr * * a = info - > attrs ;
struct sk_buff * reply ;
2016-02-26 10:45:39 +01:00
struct datapath * dp ;
2011-10-25 19:26:31 -07:00
struct vport * vport ;
2019-07-06 01:08:09 +09:00
unsigned int new_headroom ;
2011-10-25 19:26:31 -07:00
int err ;
2014-05-05 14:13:32 -07:00
reply = ovs_vport_cmd_alloc_info ( ) ;
if ( ! reply )
return - ENOMEM ;
2013-04-15 13:23:03 -07:00
ovs_lock ( ) ;
2012-02-22 19:58:59 -08:00
vport = lookup_vport ( sock_net ( skb - > sk ) , info - > userhdr , a ) ;
2011-10-25 19:26:31 -07:00
err = PTR_ERR ( vport ) ;
if ( IS_ERR ( vport ) )
2014-05-05 14:13:32 -07:00
goto exit_unlock_free ;
2011-10-25 19:26:31 -07:00
if ( vport - > port_no = = OVSP_LOCAL ) {
err = - EINVAL ;
2014-05-05 14:13:32 -07:00
goto exit_unlock_free ;
2011-10-25 19:26:31 -07:00
}
2017-11-02 17:04:37 -02:00
err = ovs_vport_cmd_fill_info ( vport , reply , genl_info_net ( info ) ,
info - > snd_portid , info - > snd_seq , 0 ,
netns: fix GFP flags in rtnl_net_notifyid()
In rtnl_net_notifyid(), we certainly can't pass a null GFP flag to
rtnl_notify(). A GFP_KERNEL flag would be fine in most circumstances,
but there are a few paths calling rtnl_net_notifyid() from atomic
context or from RCU critical sections. The later also precludes the use
of gfp_any() as it wouldn't detect the RCU case. Also, the nlmsg_new()
call is wrong too, as it uses GFP_KERNEL unconditionally.
Therefore, we need to pass the GFP flags as parameter and propagate it
through function calls until the proper flags can be determined.
In most cases, GFP_KERNEL is fine. The exceptions are:
* openvswitch: ovs_vport_cmd_get() and ovs_vport_cmd_dump()
indirectly call rtnl_net_notifyid() from RCU critical section,
* rtnetlink: rtmsg_ifinfo_build_skb() already receives GFP flags as
parameter.
Also, in ovs_vport_cmd_build_info(), let's change the GFP flags used
by nlmsg_new(). The function is allowed to sleep, so better make the
flags consistent with the ones used in the following
ovs_vport_cmd_fill_info() call.
Found by code inspection.
Fixes: 9a9634545c70 ("netns: notify netns id events")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-23 18:39:04 +02:00
OVS_VPORT_CMD_DEL , GFP_KERNEL ) ;
2014-05-05 14:13:32 -07:00
BUG_ON ( err < 0 ) ;
2016-02-26 10:45:39 +01:00
/* the vport deletion may trigger dp headroom update */
dp = vport - > dp ;
if ( netdev_get_fwd_headroom ( vport - > dev ) = = dp - > max_headroom )
2019-07-06 01:08:09 +09:00
update_headroom = true ;
2016-02-26 10:45:39 +01:00
netdev_reset_rx_headroom ( vport - > dev ) ;
2011-10-25 19:26:31 -07:00
ovs_dp_detach_port ( vport ) ;
2016-02-26 10:45:39 +01:00
2019-07-06 01:08:09 +09:00
if ( update_headroom ) {
new_headroom = ovs_get_max_headroom ( dp ) ;
if ( new_headroom < dp - > max_headroom )
ovs_update_headroom ( dp , new_headroom ) ;
}
2014-05-05 14:13:32 -07:00
ovs_unlock ( ) ;
2011-10-25 19:26:31 -07:00
2013-11-19 15:19:39 +01:00
ovs_notify ( & dp_vport_genl_family , reply , info ) ;
2014-05-05 14:13:32 -07:00
return 0 ;
2011-10-25 19:26:31 -07:00
2014-05-05 14:13:32 -07:00
exit_unlock_free :
2013-04-15 13:23:03 -07:00
ovs_unlock ( ) ;
2014-05-05 14:13:32 -07:00
kfree_skb ( reply ) ;
2011-10-25 19:26:31 -07:00
return err ;
}
static int ovs_vport_cmd_get ( struct sk_buff * skb , struct genl_info * info )
{
struct nlattr * * a = info - > attrs ;
struct ovs_header * ovs_header = info - > userhdr ;
struct sk_buff * reply ;
struct vport * vport ;
int err ;
2014-05-05 14:13:32 -07:00
reply = ovs_vport_cmd_alloc_info ( ) ;
if ( ! reply )
return - ENOMEM ;
2011-10-25 19:26:31 -07:00
rcu_read_lock ( ) ;
2012-02-22 19:58:59 -08:00
vport = lookup_vport ( sock_net ( skb - > sk ) , ovs_header , a ) ;
2011-10-25 19:26:31 -07:00
err = PTR_ERR ( vport ) ;
if ( IS_ERR ( vport ) )
2014-05-05 14:13:32 -07:00
goto exit_unlock_free ;
2017-11-02 17:04:37 -02:00
err = ovs_vport_cmd_fill_info ( vport , reply , genl_info_net ( info ) ,
info - > snd_portid , info - > snd_seq , 0 ,
netns: fix GFP flags in rtnl_net_notifyid()
In rtnl_net_notifyid(), we certainly can't pass a null GFP flag to
rtnl_notify(). A GFP_KERNEL flag would be fine in most circumstances,
but there are a few paths calling rtnl_net_notifyid() from atomic
context or from RCU critical sections. The later also precludes the use
of gfp_any() as it wouldn't detect the RCU case. Also, the nlmsg_new()
call is wrong too, as it uses GFP_KERNEL unconditionally.
Therefore, we need to pass the GFP flags as parameter and propagate it
through function calls until the proper flags can be determined.
In most cases, GFP_KERNEL is fine. The exceptions are:
* openvswitch: ovs_vport_cmd_get() and ovs_vport_cmd_dump()
indirectly call rtnl_net_notifyid() from RCU critical section,
* rtnetlink: rtmsg_ifinfo_build_skb() already receives GFP flags as
parameter.
Also, in ovs_vport_cmd_build_info(), let's change the GFP flags used
by nlmsg_new(). The function is allowed to sleep, so better make the
flags consistent with the ones used in the following
ovs_vport_cmd_fill_info() call.
Found by code inspection.
Fixes: 9a9634545c70 ("netns: notify netns id events")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-23 18:39:04 +02:00
OVS_VPORT_CMD_GET , GFP_ATOMIC ) ;
2014-05-05 14:13:32 -07:00
BUG_ON ( err < 0 ) ;
2011-10-25 19:26:31 -07:00
rcu_read_unlock ( ) ;
return genlmsg_reply ( reply , info ) ;
2014-05-05 14:13:32 -07:00
exit_unlock_free :
2011-10-25 19:26:31 -07:00
rcu_read_unlock ( ) ;
2014-05-05 14:13:32 -07:00
kfree_skb ( reply ) ;
2011-10-25 19:26:31 -07:00
return err ;
}
static int ovs_vport_cmd_dump ( struct sk_buff * skb , struct netlink_callback * cb )
{
struct ovs_header * ovs_header = genlmsg_data ( nlmsg_data ( cb - > nlh ) ) ;
struct datapath * dp ;
2012-08-23 12:40:54 -07:00
int bucket = cb - > args [ 0 ] , skip = cb - > args [ 1 ] ;
int i , j = 0 ;
2011-10-25 19:26:31 -07:00
2014-02-15 17:42:29 -08:00
rcu_read_lock ( ) ;
2014-09-08 13:14:22 -07:00
dp = get_dp_rcu ( sock_net ( skb - > sk ) , ovs_header - > dp_ifindex ) ;
2014-02-15 17:42:29 -08:00
if ( ! dp ) {
rcu_read_unlock ( ) ;
2011-10-25 19:26:31 -07:00
return - ENODEV ;
2014-02-15 17:42:29 -08:00
}
2012-08-23 12:40:54 -07:00
for ( i = bucket ; i < DP_VPORT_HASH_BUCKETS ; i + + ) {
2011-10-25 19:26:31 -07:00
struct vport * vport ;
2012-08-23 12:40:54 -07:00
j = 0 ;
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-27 17:06:00 -08:00
hlist_for_each_entry_rcu ( vport , & dp - > ports [ i ] , dp_hash_node ) {
2012-08-23 12:40:54 -07:00
if ( j > = skip & &
ovs_vport_cmd_fill_info ( vport , skb ,
2017-11-02 17:04:37 -02:00
sock_net ( skb - > sk ) ,
2012-09-07 20:12:54 +00:00
NETLINK_CB ( cb - > skb ) . portid ,
2012-08-23 12:40:54 -07:00
cb - > nlh - > nlmsg_seq ,
NLM_F_MULTI ,
netns: fix GFP flags in rtnl_net_notifyid()
In rtnl_net_notifyid(), we certainly can't pass a null GFP flag to
rtnl_notify(). A GFP_KERNEL flag would be fine in most circumstances,
but there are a few paths calling rtnl_net_notifyid() from atomic
context or from RCU critical sections. The later also precludes the use
of gfp_any() as it wouldn't detect the RCU case. Also, the nlmsg_new()
call is wrong too, as it uses GFP_KERNEL unconditionally.
Therefore, we need to pass the GFP flags as parameter and propagate it
through function calls until the proper flags can be determined.
In most cases, GFP_KERNEL is fine. The exceptions are:
* openvswitch: ovs_vport_cmd_get() and ovs_vport_cmd_dump()
indirectly call rtnl_net_notifyid() from RCU critical section,
* rtnetlink: rtmsg_ifinfo_build_skb() already receives GFP flags as
parameter.
Also, in ovs_vport_cmd_build_info(), let's change the GFP flags used
by nlmsg_new(). The function is allowed to sleep, so better make the
flags consistent with the ones used in the following
ovs_vport_cmd_fill_info() call.
Found by code inspection.
Fixes: 9a9634545c70 ("netns: notify netns id events")
Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-23 18:39:04 +02:00
OVS_VPORT_CMD_GET ,
GFP_ATOMIC ) < 0 )
2012-08-23 12:40:54 -07:00
goto out ;
j + + ;
}
skip = 0 ;
2011-10-25 19:26:31 -07:00
}
2012-08-23 12:40:54 -07:00
out :
2011-10-25 19:26:31 -07:00
rcu_read_unlock ( ) ;
2012-08-23 12:40:54 -07:00
cb - > args [ 0 ] = i ;
cb - > args [ 1 ] = j ;
2011-10-25 19:26:31 -07:00
2012-08-23 12:40:54 -07:00
return skb - > len ;
2011-10-25 19:26:31 -07:00
}
2020-07-15 14:09:28 +02:00
static void ovs_dp_masks_rebalance ( struct work_struct * work )
{
2020-07-24 10:20:59 +02:00
struct ovs_net * ovs_net = container_of ( work , struct ovs_net ,
masks_rebalance . work ) ;
struct datapath * dp ;
2020-07-15 14:09:28 +02:00
ovs_lock ( ) ;
2020-07-24 10:20:59 +02:00
list_for_each_entry ( dp , & ovs_net - > dps , list_node )
ovs_flow_masks_rebalance ( & dp - > table ) ;
2020-07-15 14:09:28 +02:00
ovs_unlock ( ) ;
2020-07-24 10:20:59 +02:00
schedule_delayed_work ( & ovs_net - > masks_rebalance ,
2020-07-15 14:09:28 +02:00
msecs_to_jiffies ( DP_MASKS_REBALANCE_INTERVAL ) ) ;
}
2014-05-06 16:44:50 -07:00
static const struct nla_policy vport_policy [ OVS_VPORT_ATTR_MAX + 1 ] = {
[ OVS_VPORT_ATTR_NAME ] = { . type = NLA_NUL_STRING , . len = IFNAMSIZ - 1 } ,
[ OVS_VPORT_ATTR_STATS ] = { . len = sizeof ( struct ovs_vport_stats ) } ,
[ OVS_VPORT_ATTR_PORT_NO ] = { . type = NLA_U32 } ,
[ OVS_VPORT_ATTR_TYPE ] = { . type = NLA_U32 } ,
2019-09-24 19:11:52 +08:00
[ OVS_VPORT_ATTR_UPCALL_PID ] = { . type = NLA_UNSPEC } ,
2014-05-06 16:44:50 -07:00
[ OVS_VPORT_ATTR_OPTIONS ] = { . type = NLA_NESTED } ,
2017-11-02 17:04:37 -02:00
[ OVS_VPORT_ATTR_IFINDEX ] = { . type = NLA_U32 } ,
[ OVS_VPORT_ATTR_NETNSID ] = { . type = NLA_S32 } ,
2014-05-06 16:44:50 -07:00
} ;
2020-10-02 14:49:54 -07:00
static const struct genl_small_ops dp_vport_genl_ops [ ] = {
2011-10-25 19:26:31 -07:00
{ . cmd = OVS_VPORT_CMD_NEW ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2016-02-05 09:20:52 -07:00
. flags = GENL_UNS_ADMIN_PERM , /* Requires CAP_NET_ADMIN privilege. */
2011-10-25 19:26:31 -07:00
. doit = ovs_vport_cmd_new
} ,
{ . cmd = OVS_VPORT_CMD_DEL ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2016-02-05 09:20:52 -07:00
. flags = GENL_UNS_ADMIN_PERM , /* Requires CAP_NET_ADMIN privilege. */
2011-10-25 19:26:31 -07:00
. doit = ovs_vport_cmd_del
} ,
{ . cmd = OVS_VPORT_CMD_GET ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2011-10-25 19:26:31 -07:00
. flags = 0 , /* OK for unprivileged users. */
. doit = ovs_vport_cmd_get ,
. dumpit = ovs_vport_cmd_dump
} ,
{ . cmd = OVS_VPORT_CMD_SET ,
2019-04-26 14:07:31 +02:00
. validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP ,
2016-02-05 09:20:52 -07:00
. flags = GENL_UNS_ADMIN_PERM , /* Requires CAP_NET_ADMIN privilege. */
2011-10-25 19:26:31 -07:00
. doit = ovs_vport_cmd_set ,
} ,
} ;
2016-10-24 14:40:05 +02:00
/*
 * Generic netlink family descriptor for vport commands.  It ties together
 * the command table, the attribute policy and the single multicast group
 * used for vport notifications.
 */
struct genl_family dp_vport_genl_family __ro_after_init = {
	/* Identity and message layout. */
	.hdrsize	= sizeof(struct ovs_header),
	.name		= OVS_VPORT_FAMILY,
	.version	= OVS_VPORT_VERSION,

	/* Attribute parsing. */
	.maxattr	= OVS_VPORT_ATTR_MAX,
	.policy		= vport_policy,

	.netnsok	= true,
	.parallel_ops	= true,

	/* Commands and multicast groups. */
	.small_ops	= dp_vport_genl_ops,
	.n_small_ops	= ARRAY_SIZE(dp_vport_genl_ops),
	.mcgrps		= &ovs_dp_vport_multicast_group,
	.n_mcgrps	= 1,

	.module		= THIS_MODULE,
};
2014-05-06 16:44:50 -07:00
static struct genl_family * const dp_genl_families [ ] = {
& dp_datapath_genl_family ,
& dp_vport_genl_family ,
& dp_flow_genl_family ,
& dp_packet_genl_family ,
2017-11-10 12:09:42 -08:00
& dp_meter_genl_family ,
openvswitch: Support conntrack zone limit
Currently, nf_conntrack_max is used to limit the maximum number of
conntrack entries in the conntrack table for every network namespace.
For the VMs and containers that reside in the same namespace,
they share the same conntrack table, and the total # of conntrack entries
for all the VMs and containers are limited by nf_conntrack_max. In this
case, if one of the VM/container abuses the usage the conntrack entries,
it blocks the others from committing valid conntrack entries into the
conntrack table. Even if we can possibly put the VM in different network
namespace, the current nf_conntrack_max configuration is kind of rigid
that we cannot limit different VM/container to have different # conntrack
entries.
To address the aforementioned issue, this patch proposes to have a
fine-grained mechanism that could further limit the # of conntrack entries
per-zone. For example, we can designate different zone to different VM,
and set conntrack limit to each zone. By providing this isolation, a
mis-behaved VM only consumes the conntrack entries in its own zone, and
it will not influence other well-behaved VMs. Moreover, the users can
set various conntrack limit to different zone based on their preference.
The proposed implementation utilizes Netfilter's nf_conncount backend
to count the number of connections in a particular zone. If the number of
connection is above a configured limitation, ovs will return ENOMEM to the
userspace. If userspace does not configure the zone limit, the limit
defaults to zero that is no limitation, which is backward compatible to
the behavior without this patch.
The following high leve APIs are provided to the userspace:
- OVS_CT_LIMIT_CMD_SET:
* set default connection limit for all zones
* set the connection limit for a particular zone
- OVS_CT_LIMIT_CMD_DEL:
* remove the connection limit for a particular zone
- OVS_CT_LIMIT_CMD_GET:
* get the default connection limit for all zones
* get the connection limit for a particular zone
Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-24 17:56:43 -07:00
# if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
& dp_ct_limit_genl_family ,
# endif
2011-10-25 19:26:31 -07:00
} ;
static void dp_unregister_genl ( int n_families )
{
int i ;
for ( i = 0 ; i < n_families ; i + + )
2014-05-06 16:44:50 -07:00
genl_unregister_family ( dp_genl_families [ i ] ) ;
2011-10-25 19:26:31 -07:00
}
2016-10-24 14:40:05 +02:00
/*
 * Register every family listed in dp_genl_families[].
 *
 * If one registration fails, the families registered so far are unregistered
 * again and that error is returned; otherwise returns 0.
 */
static int __init dp_register_genl(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
		int err = genl_register_family(dp_genl_families[i]);

		if (err) {
			/* Entries [0, i) were registered successfully. */
			dp_unregister_genl(i);
			return err;
		}
	}

	return 0;
}
2012-02-22 19:58:59 -08:00
/*
 * Per-network-namespace init hook.
 *
 * Initialises the namespace's datapath list and work items, runs
 * ovs_ct_init() and, only if that succeeded, schedules the first periodic
 * mask rebalance.  Returns 0 or the error from ovs_ct_init().
 */
static int __net_init ovs_init_net(struct net *net)
{
	struct ovs_net *onet = net_generic(net, ovs_net_id);
	int rc;

	INIT_LIST_HEAD(&onet->dps);
	INIT_WORK(&onet->dp_notify_work, ovs_dp_notify_wq);
	INIT_DELAYED_WORK(&onet->masks_rebalance, ovs_dp_masks_rebalance);

	rc = ovs_ct_init(net);
	if (!rc)
		schedule_delayed_work(&onet->masks_rebalance,
				      msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL));

	return rc;
}
2015-02-17 11:23:10 -08:00
/*
 * Collect onto @head every internal-type vport that belongs to a datapath of
 * namespace @net but whose net device lives in namespace @dnet.  Matching
 * vports are linked through their detach_list member.
 */
static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
					    struct list_head *head)
{
	struct ovs_net *onet = net_generic(net, ovs_net_id);
	struct datapath *dp;

	list_for_each_entry(dp, &onet->dps, list_node) {
		int bucket;

		for (bucket = 0; bucket < DP_VPORT_HASH_BUCKETS; bucket++) {
			struct vport *port;

			hlist_for_each_entry(port, &dp->ports[bucket],
					     dp_hash_node) {
				/* Only internal ports are of interest here. */
				if (port->ops->type == OVS_VPORT_TYPE_INTERNAL &&
				    dev_net(port->dev) == dnet)
					list_add(&port->detach_list, head);
			}
		}
	}
}
/*
 * Per-network-namespace exit hook for namespace @dnet.
 *
 * With ovs_lock held: runs ovs_ct_exit(), destroys every datapath owned by
 * the namespace, then scans all namespaces (under net_rwsem) for internal
 * vports whose device lives in @dnet and detaches them.  After the lock is
 * dropped, the namespace's pending work items are cancelled synchronously.
 */
static void __net_exit ovs_exit_net(struct net *dnet)
{
	struct ovs_net *onet = net_generic(dnet, ovs_net_id);
	struct datapath *dp, *dp_tmp;
	struct vport *port, *port_tmp;
	struct net *net;
	LIST_HEAD(detach);

	ovs_lock();

	ovs_ct_exit(dnet);

	list_for_each_entry_safe(dp, dp_tmp, &onet->dps, list_node) {
		__dp_destroy(dp);
	}

	/* net_rwsem is held for reading while walking the namespace list. */
	down_read(&net_rwsem);
	for_each_net(net) {
		list_vports_from_net(net, dnet, &detach);
	}
	up_read(&net_rwsem);

	/* Detach all vports from given namespace. */
	list_for_each_entry_safe(port, port_tmp, &detach, detach_list) {
		list_del(&port->detach_list);
		ovs_dp_detach_port(port);
	}

	ovs_unlock();

	cancel_delayed_work_sync(&onet->masks_rebalance);
	cancel_work_sync(&onet->dp_notify_work);
}
/*
 * Per-network-namespace operations for this module: init/exit hooks plus the
 * id and size of the per-namespace struct ovs_net storage.
 */
static struct pernet_operations ovs_net_ops = {
	.init	= ovs_init_net,
	.exit	= ovs_exit_net,
	.id	= &ovs_net_id,
	.size	= sizeof(struct ovs_net),
};
2011-10-25 19:26:31 -07:00
/*
 * Module init entry point.
 *
 * Brings up the subsystems in a fixed order: action fifos, the internal
 * device rtnl link ops, flow support, vport support, the per-netns
 * operations, the netdevice notifier, netdev vport support and finally the
 * generic netlink families.  If any step fails, the steps that already
 * succeeded are undone in reverse order through the chain of labels at the
 * end of the function and the failing step's error is returned.
 *
 * Returns 0 on success.
 */
static int __init dp_init ( void )
{
int err ;
2020-09-01 20:26:12 +08:00
/* Compile-time check: struct ovs_skb_cb must fit in the cb field of sk_buff. */
BUILD_BUG_ON ( sizeof ( struct ovs_skb_cb ) >
sizeof_field ( struct sk_buff , cb ) ) ;
2011-10-25 19:26:31 -07:00
pr_info ( " Open vSwitch switching datapath \n " ) ;
2014-09-15 19:37:25 -07:00
err = action_fifos_init ( ) ;
2011-10-25 19:26:31 -07:00
if ( err )
goto error ;
2014-09-15 19:37:25 -07:00
err = ovs_internal_dev_rtnl_link_register ( ) ;
if ( err )
goto error_action_fifos_exit ;
2014-06-26 09:58:26 +02:00
err = ovs_flow_init ( ) ;
if ( err )
goto error_unreg_rtnl_link ;
2011-10-25 19:26:31 -07:00
err = ovs_vport_init ( ) ;
if ( err )
goto error_flow_exit ;
2012-02-22 19:58:59 -08:00
err = register_pernet_device ( & ovs_net_ops ) ;
2011-10-25 19:26:31 -07:00
if ( err )
goto error_vport_exit ;
2012-02-22 19:58:59 -08:00
err = register_netdevice_notifier ( & ovs_dp_device_notifier ) ;
if ( err )
goto error_netns_exit ;
2014-10-22 17:29:06 +02:00
err = ovs_netdev_init ( ) ;
if ( err )
goto error_unreg_notifier ;
2011-10-25 19:26:31 -07:00
/* The netlink families are registered last, after everything else is up. */
err = dp_register_genl ( ) ;
if ( err < 0 )
2014-10-22 17:29:06 +02:00
goto error_unreg_netdev ;
2011-10-25 19:26:31 -07:00
return 0 ;
2014-10-22 17:29:06 +02:00
/*
 * Unwind chain: each label undoes one successful step and falls through to
 * the labels for the steps that preceded it.
 */
error_unreg_netdev :
ovs_netdev_exit ( ) ;
2011-10-25 19:26:31 -07:00
error_unreg_notifier :
unregister_netdevice_notifier ( & ovs_dp_device_notifier ) ;
2012-02-22 19:58:59 -08:00
error_netns_exit :
unregister_pernet_device ( & ovs_net_ops ) ;
2011-10-25 19:26:31 -07:00
error_vport_exit :
ovs_vport_exit ( ) ;
error_flow_exit :
ovs_flow_exit ( ) ;
2014-06-26 09:58:26 +02:00
error_unreg_rtnl_link :
ovs_internal_dev_rtnl_link_unregister ( ) ;
2014-09-15 19:37:25 -07:00
error_action_fifos_exit :
action_fifos_exit ( ) ;
2011-10-25 19:26:31 -07:00
error :
return err ;
}
static void dp_cleanup ( void )
{
dp_unregister_genl ( ARRAY_SIZE ( dp_genl_families ) ) ;
2014-10-22 17:29:06 +02:00
ovs_netdev_exit ( ) ;
2011-10-25 19:26:31 -07:00
unregister_netdevice_notifier ( & ovs_dp_device_notifier ) ;
2012-02-22 19:58:59 -08:00
unregister_pernet_device ( & ovs_net_ops ) ;
rcu_barrier ( ) ;
2011-10-25 19:26:31 -07:00
ovs_vport_exit ( ) ;
ovs_flow_exit ( ) ;
2014-06-26 09:58:26 +02:00
ovs_internal_dev_rtnl_link_unregister ( ) ;
2014-09-15 19:37:25 -07:00
action_fifos_exit ( ) ;
2011-10-25 19:26:31 -07:00
}
/* Module entry and exit points. */
module_init(dp_init);
module_exit(dp_cleanup);

MODULE_DESCRIPTION("Open vSwitch switching datapath");
/*
 * The license ident must be spelled exactly (no surrounding whitespace);
 * otherwise the module is not recognised as GPL-compatible.
 */
MODULE_LICENSE("GPL");

/* One module alias per generic netlink family this module provides. */
MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY);
MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
openvswitch: Support conntrack zone limit
Currently, nf_conntrack_max is used to limit the maximum number of
conntrack entries in the conntrack table for every network namespace.
For the VMs and containers that reside in the same namespace,
they share the same conntrack table, and the total # of conntrack entries
for all the VMs and containers are limited by nf_conntrack_max. In this
case, if one of the VMs/containers abuses the usage of the conntrack entries,
it blocks the others from committing valid conntrack entries into the
conntrack table. Even if we can possibly put the VM in different network
namespace, the current nf_conntrack_max configuration is kind of rigid
that we cannot limit different VM/container to have different # conntrack
entries.
To address the aforementioned issue, this patch proposes to have a
fine-grained mechanism that could further limit the # of conntrack entries
per-zone. For example, we can designate different zone to different VM,
and set conntrack limit to each zone. By providing this isolation, a
mis-behaved VM only consumes the conntrack entries in its own zone, and
it will not influence other well-behaved VMs. Moreover, the users can
set various conntrack limit to different zone based on their preference.
The proposed implementation utilizes Netfilter's nf_conncount backend
to count the number of connections in a particular zone. If the number of
connections is above the configured limit, ovs will return ENOMEM to the
userspace. If userspace does not configure the zone limit, the limit
defaults to zero that is no limitation, which is backward compatible to
the behavior without this patch.
The following high-level APIs are provided to the userspace:
- OVS_CT_LIMIT_CMD_SET:
* set default connection limit for all zones
* set the connection limit for a particular zone
- OVS_CT_LIMIT_CMD_DEL:
* remove the connection limit for a particular zone
- OVS_CT_LIMIT_CMD_GET:
* get the default connection limit for all zones
* get the connection limit for a particular zone
Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-24 17:56:43 -07:00
/* Module alias for the OVS_CT_LIMIT_FAMILY generic netlink family (presumably for module autoloading by family name -- confirm against the macro's definition). */
MODULE_ALIAS_GENL_FAMILY ( OVS_CT_LIMIT_FAMILY ) ;