2019-05-27 09:55:01 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2005-04-17 02:20:36 +04:00
/*
 * net/sched/cls_api.c	Packet classifier API.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Changes:
 *
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 */
# include <linux/module.h>
# include <linux/types.h>
# include <linux/kernel.h>
# include <linux/string.h>
# include <linux/errno.h>
2017-02-09 16:38:57 +03:00
# include <linux/err.h>
2005-04-17 02:20:36 +04:00
# include <linux/skbuff.h>
# include <linux/init.h>
# include <linux/kmod.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 11:04:11 +03:00
# include <linux/slab.h>
2018-01-17 13:46:46 +03:00
# include <linux/idr.h>
2019-11-02 17:17:47 +03:00
# include <linux/jhash.h>
2020-02-16 13:01:23 +03:00
# include <linux/rculist.h>
2023-02-18 01:36:14 +03:00
# include <linux/rhashtable.h>
2007-11-30 16:21:31 +03:00
# include <net/net_namespace.h>
# include <net/sock.h>
2007-03-26 10:06:12 +04:00
# include <net/netlink.h>
2005-04-17 02:20:36 +04:00
# include <net/pkt_sched.h>
# include <net/pkt_cls.h>
2019-02-02 14:50:45 +03:00
# include <net/tc_act/tc_pedit.h>
2019-02-02 14:50:46 +03:00
# include <net/tc_act/tc_mirred.h>
# include <net/tc_act/tc_vlan.h>
# include <net/tc_act/tc_tunnel_key.h>
# include <net/tc_act/tc_csum.h>
# include <net/tc_act/tc_gact.h>
2019-05-04 14:46:22 +03:00
# include <net/tc_act/tc_police.h>
2019-05-04 14:46:16 +03:00
# include <net/tc_act/tc_sample.h>
2019-02-02 14:50:46 +03:00
# include <net/tc_act/tc_skbedit.h>
2019-07-09 10:30:48 +03:00
# include <net/tc_act/tc_ct.h>
2019-07-23 17:33:59 +03:00
# include <net/tc_act/tc_mpls.h>
2020-05-01 03:53:16 +03:00
# include <net/tc_act/tc_gate.h>
2019-08-07 04:13:52 +03:00
# include <net/flow_offload.h>
2022-12-06 16:55:13 +03:00
# include <net/tc_wrapper.h>
2005-04-17 02:20:36 +04:00
/* The list of all installed classifier types */
2013-12-16 08:15:11 +04:00
static LIST_HEAD ( tcf_proto_base ) ;
2005-04-17 02:20:36 +04:00
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK ( cls_mod_lock ) ;
2023-02-18 01:36:14 +03:00
/* XArray of miss-cookie nodes, indexed by each node's miss_cookie_base. */
static struct xarray tcf_exts_miss_cookies_xa ;
/* One entry of tcf_exts_miss_cookies_xa: records which filter (chain, tp,
 * handle) and which exts a given miss_cookie_base was allocated for.
 */
struct tcf_exts_miss_cookie_node {
const struct tcf_chain * chain ; /* tp->chain at allocation time */
const struct tcf_proto * tp ; /* classifier instance the cookie was allocated for */
const struct tcf_exts * exts ; /* exts that point back at this node */
u32 chain_index ; /* copy of tp->chain->index */
u32 tp_prio ; /* copy of tp->prio */
u32 handle ; /* filter handle passed to the allocator (non-zero) */
u32 miss_cookie_base ; /* id handed out by xa_alloc_cyclic(); also the xarray index */
struct rcu_head rcu ; /* used by kfree_rcu() when the node is destroyed */
} ;
/* Each tc action entry cookie is composed of the 32-bit miss_cookie_base
 * plus the action index in the exts tc actions array. The union lets the
 * two 32-bit halves be read or written as a single 64-bit miss_cookie.
 */
union tcf_exts_miss_cookie {
struct {
u32 miss_cookie_base ; /* id of the owning tcf_exts_miss_cookie_node */
u32 act_index ; /* index of the action within the exts */
} ;
u64 miss_cookie ; /* both fields viewed as one 64-bit value */
} ;
# if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
/*
 * Allocate a miss-cookie node describing (@tp, @handle, @exts), insert it
 * into tcf_exts_miss_cookies_xa under a cyclically allocated 32-bit id and
 * attach it to @exts->miss_cookie_node.
 *
 * Returns 0 on success, -EINVAL when @handle is zero or the classifier has
 * no ->get_exts() callback, -ENOMEM when the node cannot be allocated, or
 * the negative error reported by xa_alloc_cyclic().
 */
static int
tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp,
				u32 handle)
{
	struct tcf_exts_miss_cookie_node *n;
	static u32 next;
	int err;

	if (WARN_ON(!handle || !tp->ops->get_exts))
		return -EINVAL;

	n = kzalloc(sizeof(*n), GFP_KERNEL);
	if (!n)
		return -ENOMEM;

	n->chain_index = tp->chain->index;
	n->chain = tp->chain;
	n->tp_prio = tp->prio;
	n->tp = tp;
	n->exts = exts;
	n->handle = handle;

	err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base,
			      n, xa_limit_32b, &next, GFP_KERNEL);
	/* xa_alloc_cyclic() returns 1 when the allocation succeeded after the
	 * id space wrapped around. Only negative values are failures; treating
	 * 1 as an error would free a node that is already stored in the xarray
	 * and leak a positive "error" code to the callers.
	 */
	if (err < 0)
		goto err_xa_alloc;

	exts->miss_cookie_node = n;
	return 0;

err_xa_alloc:
	kfree(n);
	return err;
}
static void tcf_exts_miss_cookie_base_destroy ( struct tcf_exts * exts )
{
struct tcf_exts_miss_cookie_node * n ;
if ( ! exts - > miss_cookie_node )
return ;
n = exts - > miss_cookie_node ;
xa_erase ( & tcf_exts_miss_cookies_xa , n - > miss_cookie_base ) ;
kfree_rcu ( n , rcu ) ;
}
/*
 * Split @miss_cookie into its base and action index. The action index is
 * stored in *@act_index; the return value is whatever xa_load() finds in
 * tcf_exts_miss_cookies_xa for the base (NULL when nothing is stored).
 */
static struct tcf_exts_miss_cookie_node *
tcf_exts_miss_cookie_lookup(u64 miss_cookie, int *act_index)
{
	union tcf_exts_miss_cookie cookie;

	cookie.miss_cookie = miss_cookie;
	*act_index = cookie.act_index;

	return xa_load(&tcf_exts_miss_cookies_xa, cookie.miss_cookie_base);
}
# else /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */
/*
 * Stub used when CONFIG_NET_TC_SKB_EXT is disabled: nothing is allocated
 * and success is always reported.
 */
static int
tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp,
				u32 handle)
{
	return 0;
}
/* Stub used when CONFIG_NET_TC_SKB_EXT is disabled: nothing to release. */
static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts)
{
}
# endif /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */
/*
 * Build the 64-bit miss cookie for action @act_index of the exts whose
 * cookie base is @miss_cookie_base. A zero base means "no cookie" and
 * yields 0.
 */
static u64 tcf_exts_miss_cookie_get(u32 miss_cookie_base, int act_index)
{
	union tcf_exts_miss_cookie cookie;

	if (miss_cookie_base == 0)
		return 0;

	cookie.miss_cookie_base = miss_cookie_base;
	cookie.act_index = act_index;

	return cookie.miss_cookie;
}
2022-02-03 11:44:30 +03:00
# ifdef CONFIG_NET_CLS_ACT
/* Static key, initially false, counted up/down by the helpers below. */
DEFINE_STATIC_KEY_FALSE(tc_skb_ext_tc);
EXPORT_SYMBOL(tc_skb_ext_tc);

/* Take one reference on the tc_skb_ext_tc static key. */
void tc_skb_ext_tc_enable(void)
{
	static_branch_inc(&tc_skb_ext_tc);
}
EXPORT_SYMBOL(tc_skb_ext_tc_enable);
/* Drop one reference on the tc_skb_ext_tc static key. */
void tc_skb_ext_tc_disable(void)
{
	static_branch_dec(&tc_skb_ext_tc);
}
EXPORT_SYMBOL(tc_skb_ext_tc_disable);
# endif
2019-11-02 17:17:47 +03:00
static u32 destroy_obj_hashfn ( const struct tcf_proto * tp )
{
return jhash_3words ( tp - > chain - > index , tp - > prio ,
( __force __u32 ) tp - > protocol , 0 ) ;
}
static void tcf_proto_signal_destroying ( struct tcf_chain * chain ,
struct tcf_proto * tp )
{
struct tcf_block * block = chain - > block ;
mutex_lock ( & block - > proto_destroy_lock ) ;
hash_add_rcu ( block - > proto_destroy_ht , & tp - > destroy_ht_node ,
destroy_obj_hashfn ( tp ) ) ;
mutex_unlock ( & block - > proto_destroy_lock ) ;
}
static bool tcf_proto_cmp ( const struct tcf_proto * tp1 ,
const struct tcf_proto * tp2 )
{
return tp1 - > chain - > index = = tp2 - > chain - > index & &
tp1 - > prio = = tp2 - > prio & &
tp1 - > protocol = = tp2 - > protocol ;
}
static bool tcf_proto_exists_destroying ( struct tcf_chain * chain ,
struct tcf_proto * tp )
{
u32 hash = destroy_obj_hashfn ( tp ) ;
struct tcf_proto * iter ;
bool found = false ;
rcu_read_lock ( ) ;
hash_for_each_possible_rcu ( chain - > block - > proto_destroy_ht , iter ,
destroy_ht_node , hash ) {
if ( tcf_proto_cmp ( tp , iter ) ) {
found = true ;
break ;
}
}
rcu_read_unlock ( ) ;
return found ;
}
static void
tcf_proto_signal_destroyed ( struct tcf_chain * chain , struct tcf_proto * tp )
{
struct tcf_block * block = chain - > block ;
mutex_lock ( & block - > proto_destroy_lock ) ;
if ( hash_hashed ( & tp - > destroy_ht_node ) )
hash_del_rcu ( & tp - > destroy_ht_node ) ;
mutex_unlock ( & block - > proto_destroy_lock ) ;
}
2005-04-17 02:20:36 +04:00
/* Find classifier type by string name */
2018-07-23 10:23:04 +03:00
static const struct tcf_proto_ops * __tcf_proto_lookup_ops ( const char * kind )
2005-04-17 02:20:36 +04:00
{
2013-12-20 22:04:18 +04:00
const struct tcf_proto_ops * t , * res = NULL ;
2005-04-17 02:20:36 +04:00
if ( kind ) {
read_lock ( & cls_mod_lock ) ;
2013-12-16 08:15:11 +04:00
list_for_each_entry ( t , & tcf_proto_base , head ) {
2017-02-09 16:38:57 +03:00
if ( strcmp ( kind , t - > kind ) = = 0 ) {
2013-12-20 22:04:18 +04:00
if ( try_module_get ( t - > owner ) )
res = t ;
2005-04-17 02:20:36 +04:00
break ;
}
}
read_unlock ( & cls_mod_lock ) ;
}
2013-12-20 22:04:18 +04:00
return res ;
2005-04-17 02:20:36 +04:00
}
2018-07-23 10:23:04 +03:00
static const struct tcf_proto_ops *
2019-02-11 11:55:45 +03:00
tcf_proto_lookup_ops ( const char * kind , bool rtnl_held ,
struct netlink_ext_ack * extack )
2018-07-23 10:23:04 +03:00
{
const struct tcf_proto_ops * ops ;
ops = __tcf_proto_lookup_ops ( kind ) ;
if ( ops )
return ops ;
# ifdef CONFIG_MODULES
2019-02-11 11:55:45 +03:00
if ( rtnl_held )
rtnl_unlock ( ) ;
2024-02-01 16:09:42 +03:00
request_module ( NET_CLS_ALIAS_PREFIX " %s " , kind ) ;
2019-02-11 11:55:45 +03:00
if ( rtnl_held )
rtnl_lock ( ) ;
2018-07-23 10:23:04 +03:00
ops = __tcf_proto_lookup_ops ( kind ) ;
/* We dropped the RTNL semaphore in order to perform
* the module load . So , even if we succeeded in loading
* the module we have to replay the request . We indicate
* this using - EAGAIN .
*/
if ( ops ) {
module_put ( ops - > owner ) ;
return ERR_PTR ( - EAGAIN ) ;
}
# endif
NL_SET_ERR_MSG ( extack , " TC classifier not found " ) ;
return ERR_PTR ( - ENOENT ) ;
}
2005-04-17 02:20:36 +04:00
/* Register(unregister) new classifier type */
int register_tcf_proto_ops ( struct tcf_proto_ops * ops )
{
2013-12-16 08:15:11 +04:00
struct tcf_proto_ops * t ;
2005-04-17 02:20:36 +04:00
int rc = - EEXIST ;
write_lock ( & cls_mod_lock ) ;
2013-12-16 08:15:11 +04:00
list_for_each_entry ( t , & tcf_proto_base , head )
2005-04-17 02:20:36 +04:00
if ( ! strcmp ( ops - > kind , t - > kind ) )
goto out ;
2013-12-16 08:15:11 +04:00
list_add_tail ( & ops - > head , & tcf_proto_base ) ;
2005-04-17 02:20:36 +04:00
rc = 0 ;
out :
write_unlock ( & cls_mod_lock ) ;
return rc ;
}
2008-01-21 13:26:41 +03:00
EXPORT_SYMBOL ( register_tcf_proto_ops ) ;
2005-04-17 02:20:36 +04:00
2017-10-27 04:24:28 +03:00
/* Workqueue used by tcf_queue_work(); flushed in unregister_tcf_proto_ops(). */
static struct workqueue_struct * tc_filter_wq ;
2022-07-13 04:54:38 +03:00
void unregister_tcf_proto_ops ( struct tcf_proto_ops * ops )
2005-04-17 02:20:36 +04:00
{
2013-12-16 08:15:11 +04:00
struct tcf_proto_ops * t ;
2005-04-17 02:20:36 +04:00
int rc = - ENOENT ;
net: sched: fix call_rcu() race on classifier module unloads
Vijay reported that a loop as simple as ...
while true; do
tc qdisc add dev foo root handle 1: prio
tc filter add dev foo parent 1: u32 match u32 0 0 flowid 1
tc qdisc del dev foo root
rmmod cls_u32
done
... will panic the kernel. Moreover, he bisected the change
apparently introducing it to 78fd1d0ab072 ("netlink: Re-add
locking to netlink_lookup() and seq walker").
The removal of synchronize_net() from the netlink socket
triggering the qdisc to be removed, seems to have uncovered
an RCU resp. module reference count race from the tc API.
Given that RCU conversion was done after e341694e3eb5 ("netlink:
Convert netlink_lookup() to use RCU protected hash table")
which added the synchronize_net() originally, occasion of
hitting the bug was less likely (not impossible though):
When qdiscs that i) support attaching classifiers and,
ii) have at least one of them attached, get deleted, they
invoke tcf_destroy_chain(), and thus call into ->destroy()
handler from a classifier module.
After RCU conversion, all classifier that have an internal
prio list, unlink them and initiate freeing via call_rcu()
deferral.
Meanhile, tcf_destroy() releases already reference to the
tp->ops->owner module before the queued RCU callback handler
has been invoked.
Subsequent rmmod on the classifier module is then not prevented
since all module references are already dropped.
By the time, the kernel invokes the RCU callback handler from
the module, that function address is then invalid.
One way to fix it would be to add an rcu_barrier() to
unregister_tcf_proto_ops() to wait for all pending call_rcu()s
to complete.
synchronize_rcu() is not appropriate as under heavy RCU
callback load, registered call_rcu()s could be deferred
longer than a grace period. In case we don't have any pending
call_rcu()s, the barrier is allowed to return immediately.
Since we came here via unregister_tcf_proto_ops(), there
are no users of a given classifier anymore. Further nested
call_rcu()s pointing into the module space are not being
done anywhere.
Only cls_bpf_delete_prog() may schedule a work item, to
unlock pages eventually, but that is not in the range/context
of cls_bpf anymore.
Fixes: 25d8c0d55f24 ("net: rcu-ify tcf_proto")
Fixes: 9888faefe132 ("net: sched: cls_basic use RCU")
Reported-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: John Fastabend <john.r.fastabend@intel.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 18:13:33 +03:00
/* Wait for outstanding call_rcu()s, if any, from a
* tcf_proto_ops ' s destroy ( ) handler .
*/
rcu_barrier ( ) ;
2017-10-27 04:24:28 +03:00
flush_workqueue ( tc_filter_wq ) ;
net: sched: fix call_rcu() race on classifier module unloads
Vijay reported that a loop as simple as ...
while true; do
tc qdisc add dev foo root handle 1: prio
tc filter add dev foo parent 1: u32 match u32 0 0 flowid 1
tc qdisc del dev foo root
rmmod cls_u32
done
... will panic the kernel. Moreover, he bisected the change
apparently introducing it to 78fd1d0ab072 ("netlink: Re-add
locking to netlink_lookup() and seq walker").
The removal of synchronize_net() from the netlink socket
triggering the qdisc to be removed, seems to have uncovered
an RCU resp. module reference count race from the tc API.
Given that RCU conversion was done after e341694e3eb5 ("netlink:
Convert netlink_lookup() to use RCU protected hash table")
which added the synchronize_net() originally, occasion of
hitting the bug was less likely (not impossible though):
When qdiscs that i) support attaching classifiers and,
ii) have at least one of them attached, get deleted, they
invoke tcf_destroy_chain(), and thus call into ->destroy()
handler from a classifier module.
After RCU conversion, all classifier that have an internal
prio list, unlink them and initiate freeing via call_rcu()
deferral.
Meanhile, tcf_destroy() releases already reference to the
tp->ops->owner module before the queued RCU callback handler
has been invoked.
Subsequent rmmod on the classifier module is then not prevented
since all module references are already dropped.
By the time, the kernel invokes the RCU callback handler from
the module, that function address is then invalid.
One way to fix it would be to add an rcu_barrier() to
unregister_tcf_proto_ops() to wait for all pending call_rcu()s
to complete.
synchronize_rcu() is not appropriate as under heavy RCU
callback load, registered call_rcu()s could be deferred
longer than a grace period. In case we don't have any pending
call_rcu()s, the barrier is allowed to return immediately.
Since we came here via unregister_tcf_proto_ops(), there
are no users of a given classifier anymore. Further nested
call_rcu()s pointing into the module space are not being
done anywhere.
Only cls_bpf_delete_prog() may schedule a work item, to
unlock pages eventually, but that is not in the range/context
of cls_bpf anymore.
Fixes: 25d8c0d55f24 ("net: rcu-ify tcf_proto")
Fixes: 9888faefe132 ("net: sched: cls_basic use RCU")
Reported-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: John Fastabend <john.r.fastabend@intel.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-05-20 18:13:33 +03:00
2005-04-17 02:20:36 +04:00
write_lock ( & cls_mod_lock ) ;
2013-12-20 22:04:18 +04:00
list_for_each_entry ( t , & tcf_proto_base , head ) {
if ( t = = ops ) {
list_del ( & t - > head ) ;
rc = 0 ;
2005-04-17 02:20:36 +04:00
break ;
2013-12-20 22:04:18 +04:00
}
}
2005-04-17 02:20:36 +04:00
write_unlock ( & cls_mod_lock ) ;
2022-07-13 04:54:38 +03:00
WARN ( rc , " unregister tc filter kind(%s) failed %d \n " , ops - > kind , rc ) ;
2005-04-17 02:20:36 +04:00
}
2008-01-21 13:26:41 +03:00
EXPORT_SYMBOL ( unregister_tcf_proto_ops ) ;
2005-04-17 02:20:36 +04:00
2018-05-24 01:26:53 +03:00
/*
 * Initialise @rwork to run @func and queue it on tc_filter_wq as RCU work.
 * Returns the result of queue_rcu_work().
 */
bool tcf_queue_work(struct rcu_work *rwork, work_func_t func)
{
	INIT_RCU_WORK(rwork, func);

	return queue_rcu_work(tc_filter_wq, rwork);
}
EXPORT_SYMBOL(tcf_queue_work);
2005-04-17 02:20:36 +04:00
/* Select new prio value from the range, managed by kernel. */
2008-01-21 13:26:41 +03:00
static inline u32 tcf_auto_prio ( struct tcf_proto * tp )
2005-04-17 02:20:36 +04:00
{
2008-01-21 13:26:41 +03:00
u32 first = TC_H_MAKE ( 0xC0000000U , 0U ) ;
2005-04-17 02:20:36 +04:00
if ( tp )
2011-01-19 22:26:56 +03:00
first = tp - > prio - 1 ;
2005-04-17 02:20:36 +04:00
2017-05-17 12:07:58 +03:00
return TC_H_MAJ ( first ) ;
2005-04-17 02:20:36 +04:00
}
2019-10-07 23:26:28 +03:00
static bool tcf_proto_check_kind ( struct nlattr * kind , char * name )
{
if ( kind )
2020-11-15 20:08:06 +03:00
return nla_strscpy ( name , kind , IFNAMSIZ ) < 0 ;
2019-10-07 23:26:28 +03:00
memset ( name , 0 , IFNAMSIZ ) ;
return false ;
}
2019-02-11 11:55:48 +03:00
static bool tcf_proto_is_unlocked ( const char * kind )
{
const struct tcf_proto_ops * ops ;
bool ret ;
2019-10-07 23:26:28 +03:00
if ( strlen ( kind ) = = 0 )
return false ;
2019-02-11 11:55:48 +03:00
ops = tcf_proto_lookup_ops ( kind , false , NULL ) ;
/* On error return false to take rtnl lock. Proto lookup/create
* functions will perform lookup again and properly handle errors .
*/
if ( IS_ERR ( ops ) )
return false ;
ret = ! ! ( ops - > flags & TCF_PROTO_OPS_DOIT_UNLOCKED ) ;
module_put ( ops - > owner ) ;
return ret ;
}
2017-02-09 16:38:57 +03:00
static struct tcf_proto * tcf_proto_create ( const char * kind , u32 protocol ,
2018-01-18 19:20:50 +03:00
u32 prio , struct tcf_chain * chain ,
2019-02-11 11:55:45 +03:00
bool rtnl_held ,
2018-01-18 19:20:50 +03:00
struct netlink_ext_ack * extack )
2017-02-09 16:38:57 +03:00
{
struct tcf_proto * tp ;
int err ;
tp = kzalloc ( sizeof ( * tp ) , GFP_KERNEL ) ;
if ( ! tp )
return ERR_PTR ( - ENOBUFS ) ;
2019-02-11 11:55:45 +03:00
tp - > ops = tcf_proto_lookup_ops ( kind , rtnl_held , extack ) ;
2018-07-23 10:23:04 +03:00
if ( IS_ERR ( tp - > ops ) ) {
err = PTR_ERR ( tp - > ops ) ;
2018-05-11 18:45:32 +03:00
goto errout ;
2017-02-09 16:38:57 +03:00
}
tp - > classify = tp - > ops - > classify ;
tp - > protocol = protocol ;
tp - > prio = prio ;
2017-05-17 12:08:01 +03:00
tp - > chain = chain ;
2019-02-11 11:55:41 +03:00
spin_lock_init ( & tp - > lock ) ;
2019-02-11 11:55:39 +03:00
refcount_set ( & tp - > refcnt , 1 ) ;
2017-02-09 16:38:57 +03:00
err = tp - > ops - > init ( tp ) ;
if ( err ) {
module_put ( tp - > ops - > owner ) ;
goto errout ;
}
return tp ;
errout :
kfree ( tp ) ;
return ERR_PTR ( err ) ;
}
2019-02-11 11:55:39 +03:00
static void tcf_proto_get ( struct tcf_proto * tp )
{
refcount_inc ( & tp - > refcnt ) ;
}
net: sched: make skip_sw actually skip software
TC filters come in 3 variants:
- no flag (try to process in hardware, but fall back to software)
- skip_hw (do not process filter by hardware)
- skip_sw (do not process filter by software)
However skip_sw is implemented so that the skip_sw
flag can first be checked, after it has been matched.
IMHO it's common when using skip_sw, to use it on all rules.
So if all filters in a block is skip_sw filters, then
we can bail early, we can thus avoid having to match
the filters, just to check for the skip_sw flag.
This patch adds a bypass, for when only TC skip_sw rules
are used. The bypass is guarded by a static key, to avoid
harming other workloads.
There are 3 ways that a packet from a skip_sw ruleset can
end up in the kernel path. Although the case of sending packets
to a non-existent chain is only improved by a few percent,
I believe it's worth optimizing the trap and fall-through
use-cases.
+----------------------------+--------+--------+--------+
| Test description | Pre- | Post- | Rel. |
| | kpps | kpps | chg. |
+----------------------------+--------+--------+--------+
| basic forwarding + notrack | 3589.3 | 3587.9 | 1.00x |
| switch to eswitch mode | 3081.8 | 3094.7 | 1.00x |
| add ingress qdisc | 3042.9 | 3063.6 | 1.01x |
| tc forward in hw / skip_sw |37024.7 |37028.4 | 1.00x |
| tc forward in sw / skip_hw | 3245.0 | 3245.3 | 1.00x |
+----------------------------+--------+--------+--------+
| tests with only skip_sw rules below: |
+----------------------------+--------+--------+--------+
| 1 non-matching rule | 2694.7 | 3058.7 | 1.14x |
| 1 n-m rule, match trap | 2611.2 | 3323.1 | 1.27x |
| 1 n-m rule, goto non-chain | 2886.8 | 2945.9 | 1.02x |
| 5 non-matching rules | 1958.2 | 3061.3 | 1.56x |
| 5 n-m rules, match trap | 1911.9 | 3327.0 | 1.74x |
| 5 n-m rules, goto non-chain| 2883.1 | 2947.5 | 1.02x |
| 10 non-matching rules | 1466.3 | 3062.8 | 2.09x |
| 10 n-m rules, match trap | 1444.3 | 3317.9 | 2.30x |
| 10 n-m rules,goto non-chain| 2883.1 | 2939.5 | 1.02x |
| 25 non-matching rules | 838.5 | 3058.9 | 3.65x |
| 25 n-m rules, match trap | 824.5 | 3323.0 | 4.03x |
| 25 n-m rules,goto non-chain| 2875.8 | 2944.7 | 1.02x |
| 50 non-matching rules | 488.1 | 3054.7 | 6.26x |
| 50 n-m rules, match trap | 484.9 | 3318.5 | 6.84x |
| 50 n-m rules,goto non-chain| 2884.1 | 2939.7 | 1.02x |
+----------------------------+--------+--------+--------+
perf top (25 n-m skip_sw rules - pre patch):
20.39% [kernel] [k] __skb_flow_dissect
16.43% [kernel] [k] rhashtable_jhash2
10.58% [kernel] [k] fl_classify
10.23% [kernel] [k] fl_mask_lookup
4.79% [kernel] [k] memset_orig
2.58% [kernel] [k] tcf_classify
1.47% [kernel] [k] __x86_indirect_thunk_rax
1.42% [kernel] [k] __dev_queue_xmit
1.36% [kernel] [k] nft_do_chain
1.21% [kernel] [k] __rcu_read_lock
perf top (25 n-m skip_sw rules - post patch):
5.12% [kernel] [k] __dev_queue_xmit
4.77% [kernel] [k] nft_do_chain
3.65% [kernel] [k] dev_gro_receive
3.41% [kernel] [k] check_preemption_disabled
3.14% [kernel] [k] mlx5e_skb_from_cqe_mpwrq_nonlinear
2.88% [kernel] [k] __netif_receive_skb_core.constprop.0
2.49% [kernel] [k] mlx5e_xmit
2.15% [kernel] [k] ip_forward
1.95% [kernel] [k] mlx5e_tc_restore_tunnel
1.92% [kernel] [k] vlan_gro_receive
Test setup:
DUT: Intel Xeon D-1518 (2.20GHz) w/ Nvidia/Mellanox ConnectX-6 Dx 2x100G
Data rate measured on switch (Extreme X690), and DUT connected as
a router on a stick, with pktgen and pktsink as VLANs.
Pktgen-dpdk was in range 36.6-37.7 Mpps 64B packets across all tests.
Full test data at https://files.fiberby.net/ast/2024/tc_skip_sw/v2_tests/
Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-03-25 23:47:36 +03:00
static void tcf_maintain_bypass ( struct tcf_block * block )
{
int filtercnt = atomic_read ( & block - > filtercnt ) ;
int skipswcnt = atomic_read ( & block - > skipswcnt ) ;
bool bypass_wanted = filtercnt > 0 & & filtercnt = = skipswcnt ;
if ( bypass_wanted ! = block - > bypass_wanted ) {
# ifdef CONFIG_NET_CLS_ACT
if ( bypass_wanted )
static_branch_inc ( & tcf_bypass_check_needed_key ) ;
else
static_branch_dec ( & tcf_bypass_check_needed_key ) ;
# endif
block - > bypass_wanted = bypass_wanted ;
}
}
2024-03-25 23:47:35 +03:00
/*
 * Bring the block's filter count in line with @add for one proto.
 *
 * *@counted records whether the proto is currently included in
 * block->filtercnt. The counter is changed only when that differs from
 * @add. The bypass state is re-evaluated on every call. Must be called
 * without cb_lock held; the lock is taken for writing here.
 */
static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, bool add)
{
	lockdep_assert_not_held(&block->cb_lock);

	down_write(&block->cb_lock);
	if (*counted != add) {
		if (add)
			atomic_inc(&block->filtercnt);
		else
			atomic_dec(&block->filtercnt);
		*counted = add;
	}
	tcf_maintain_bypass(block);
	up_write(&block->cb_lock);
}
2019-02-11 11:55:39 +03:00
/* Forward declaration: tcf_proto_destroy() calls tcf_chain_put(). */
static void tcf_chain_put ( struct tcf_chain * chain ) ;
2019-02-11 11:55:45 +03:00
/*
 * Tear down @tp.
 *
 * Calls the classifier's ->destroy(), removes the proto from the block's
 * filter count, optionally (@sig_destroy) removes it from the "being
 * destroyed" hash table, drops the chain and module references and frees
 * the proto via kfree_rcu().
 */
static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held,
			      bool sig_destroy, struct netlink_ext_ack *extack)
{
	struct tcf_chain *chain;

	tp->ops->destroy(tp, rtnl_held, extack);

	chain = tp->chain;
	tcf_block_filter_cnt_update(chain->block, &tp->counted, false);
	if (sig_destroy)
		tcf_proto_signal_destroyed(chain, tp);

	tcf_chain_put(chain);
	module_put(tp->ops->owner);
	kfree_rcu(tp, rcu);
}
2019-02-11 11:55:45 +03:00
/*
 * Drop one reference on @tp. When the last reference goes away, the proto
 * is destroyed with sig_destroy set to true.
 */
static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held,
			  struct netlink_ext_ack *extack)
{
	if (!refcount_dec_and_test(&tp->refcnt))
		return;

	tcf_proto_destroy(tp, rtnl_held, true, extack);
}
net/sched: add delete_empty() to filters and use it in cls_flower
Revert "net/sched: cls_u32: fix refcount leak in the error path of
u32_change()", and fix the u32 refcount leak in a more generic way that
preserves the semantics of rule dumping.
On tc filters that don't support lockless insertion/removal, there is no
need to guard against concurrent insertion when a removal is in progress.
Therefore, for most of them we can avoid a full walk() when deleting, and
just decrease the refcount, like it was done on older Linux kernels.
This fixes situations where walk() was wrongly detecting a non-empty
filter, like it happened with cls_u32 in the error path of change(), thus
leading to failures in the following tdc selftests:
6aa7: (filter, u32) Add/Replace u32 with source match and invalid indev
6658: (filter, u32) Add/Replace u32 with custom hash table and invalid handle
74c2: (filter, u32) Add/Replace u32 filter with invalid hash table id
On cls_flower, and on (future) lockless filters, this check is necessary:
move all the check_empty() logic into a callback so that each filter
can have its own implementation. For cls_flower, it's sufficient to check
if no IDRs have been allocated.
This reverts commit 275c44aa194b7159d1191817b20e076f55f0e620.
Changes since v1:
- document the need for delete_empty() when TCF_PROTO_OPS_DOIT_UNLOCKED
is used, thanks to Vlad Buslov
- implement delete_empty() without doing fl_walk(), thanks to Vlad Buslov
- squash revert and new fix in a single patch, to be nice with bisect
tests that run tdc on u32 filter, thanks to Dave Miller
Fixes: 275c44aa194b ("net/sched: cls_u32: fix refcount leak in the error path of u32_change()")
Fixes: 6676d5e416ee ("net: sched: set dedicated tcf_walker flag when tp is empty")
Suggested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Suggested-by: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Vlad Buslov <vladbu@mellanox.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-28 18:36:58 +03:00
static bool tcf_proto_check_delete ( struct tcf_proto * tp )
2019-02-11 11:55:41 +03:00
{
net/sched: add delete_empty() to filters and use it in cls_flower
Revert "net/sched: cls_u32: fix refcount leak in the error path of
u32_change()", and fix the u32 refcount leak in a more generic way that
preserves the semantic of rule dumping.
On tc filters that don't support lockless insertion/removal, there is no
need to guard against concurrent insertion when a removal is in progress.
Therefore, for most of them we can avoid a full walk() when deleting, and
just decrease the refcount, like it was done on older Linux kernels.
This fixes situations where walk() was wrongly detecting a non-empty
filter, like it happened with cls_u32 in the error path of change(), thus
leading to failures in the following tdc selftests:
6aa7: (filter, u32) Add/Replace u32 with source match and invalid indev
6658: (filter, u32) Add/Replace u32 with custom hash table and invalid handle
74c2: (filter, u32) Add/Replace u32 filter with invalid hash table id
On cls_flower, and on (future) lockless filters, this check is necessary:
move all the check_empty() logic in a callback so that each filter
can have its own implementation. For cls_flower, it's sufficient to check
if no IDRs have been allocated.
This reverts commit 275c44aa194b7159d1191817b20e076f55f0e620.
Changes since v1:
- document the need for delete_empty() when TCF_PROTO_OPS_DOIT_UNLOCKED
is used, thanks to Vlad Buslov
- implement delete_empty() without doing fl_walk(), thanks to Vlad Buslov
- squash revert and new fix in a single patch, to be nice with bisect
tests that run tdc on u32 filter, thanks to Dave Miller
Fixes: 275c44aa194b ("net/sched: cls_u32: fix refcount leak in the error path of u32_change()")
Fixes: 6676d5e416ee ("net: sched: set dedicated tcf_walker flag when tp is empty")
Suggested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Suggested-by: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Vlad Buslov <vladbu@mellanox.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-28 18:36:58 +03:00
if ( tp - > ops - > delete_empty )
return tp - > ops - > delete_empty ( tp ) ;
2019-02-11 11:55:41 +03:00
net/sched: add delete_empty() to filters and use it in cls_flower
Revert "net/sched: cls_u32: fix refcount leak in the error path of
u32_change()", and fix the u32 refcount leak in a more generic way that
preserves the semantic of rule dumping.
On tc filters that don't support lockless insertion/removal, there is no
need to guard against concurrent insertion when a removal is in progress.
Therefore, for most of them we can avoid a full walk() when deleting, and
just decrease the refcount, like it was done on older Linux kernels.
This fixes situations where walk() was wrongly detecting a non-empty
filter, like it happened with cls_u32 in the error path of change(), thus
leading to failures in the following tdc selftests:
6aa7: (filter, u32) Add/Replace u32 with source match and invalid indev
6658: (filter, u32) Add/Replace u32 with custom hash table and invalid handle
74c2: (filter, u32) Add/Replace u32 filter with invalid hash table id
On cls_flower, and on (future) lockless filters, this check is necessary:
move all the check_empty() logic in a callback so that each filter
can have its own implementation. For cls_flower, it's sufficient to check
if no IDRs have been allocated.
This reverts commit 275c44aa194b7159d1191817b20e076f55f0e620.
Changes since v1:
- document the need for delete_empty() when TCF_PROTO_OPS_DOIT_UNLOCKED
is used, thanks to Vlad Buslov
- implement delete_empty() without doing fl_walk(), thanks to Vlad Buslov
- squash revert and new fix in a single patch, to be nice with bisect
tests that run tdc on u32 filter, thanks to Dave Miller
Fixes: 275c44aa194b ("net/sched: cls_u32: fix refcount leak in the error path of u32_change()")
Fixes: 6676d5e416ee ("net: sched: set dedicated tcf_walker flag when tp is empty")
Suggested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Suggested-by: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Vlad Buslov <vladbu@mellanox.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-28 18:36:58 +03:00
tp - > deleting = true ;
2019-02-11 11:55:41 +03:00
return tp - > deleting ;
}
static void tcf_proto_mark_delete ( struct tcf_proto * tp )
{
spin_lock ( & tp - > lock ) ;
tp - > deleting = true ;
spin_unlock ( & tp - > lock ) ;
}
static bool tcf_proto_is_deleting ( struct tcf_proto * tp )
{
bool deleting ;
spin_lock ( & tp - > lock ) ;
deleting = tp - > deleting ;
spin_unlock ( & tp - > lock ) ;
return deleting ;
}
2019-02-11 11:55:32 +03:00
/* Lockdep assertion that the caller holds @block's ->lock. */
# define ASSERT_BLOCK_LOCKED(block) \
lockdep_assert_held ( & ( block ) - > lock )
2018-01-17 13:46:45 +03:00
/* One subscriber to chain 0 head changes of a block. */
struct tcf_filter_chain_list_item {
/* Linkage on the block's chain0.filter_chain_list. */
struct list_head list ;
/* Optional callback invoked with the new head proto; may be NULL. */
tcf_chain_head_change_t * chain_head_change ;
/* Opaque cookie handed back to chain_head_change as second argument. */
void * chain_head_change_priv ;
} ;
2017-05-17 12:08:01 +03:00
/* Allocate a new chain with index @chain_index and attach it to @block.
 *
 * The caller must hold block->lock.  The new chain starts with a
 * reference count of 1; a chain with index 0 is additionally recorded in
 * block->chain0.chain.
 *
 * Returns the new chain, or NULL if the allocation failed.
 */
static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
					  u32 chain_index)
{
	struct tcf_chain *chain;

	ASSERT_BLOCK_LOCKED(block);

	chain = kzalloc(sizeof(*chain), GFP_KERNEL);
	if (!chain)
		return NULL;

	mutex_init(&chain->filter_chain_lock);
	chain->block = block;
	chain->index = chain_index;
	chain->refcnt = 1;

	/* Publish only after the chain is fully initialised: block->chain_list
	 * is also walked by RCU readers that compare chain->index without
	 * taking block->lock, and a still-zeroed chain would otherwise be
	 * mistaken for chain 0.
	 */
	list_add_tail_rcu(&chain->list, &block->chain_list);
	if (!chain->index)
		block->chain0.chain = chain;

	return chain;
}
2018-01-17 13:46:45 +03:00
static void tcf_chain_head_change_item ( struct tcf_filter_chain_list_item * item ,
struct tcf_proto * tp_head )
{
if ( item - > chain_head_change )
item - > chain_head_change ( tp_head , item - > chain_head_change_priv ) ;
}
2018-07-23 10:23:05 +03:00
static void tcf_chain0_head_change ( struct tcf_chain * chain ,
struct tcf_proto * tp_head )
2017-11-03 13:46:24 +03:00
{
2018-01-17 13:46:45 +03:00
struct tcf_filter_chain_list_item * item ;
2018-07-23 10:23:05 +03:00
struct tcf_block * block = chain - > block ;
2018-01-17 13:46:45 +03:00
2018-07-23 10:23:05 +03:00
if ( chain - > index )
return ;
2019-02-11 11:55:35 +03:00
mutex_lock ( & block - > lock ) ;
2018-07-23 10:23:05 +03:00
list_for_each_entry ( item , & block - > chain0 . filter_chain_list , list )
2018-01-17 13:46:45 +03:00
tcf_chain_head_change_item ( item , tp_head ) ;
2019-02-11 11:55:35 +03:00
mutex_unlock ( & block - > lock ) ;
2017-11-03 13:46:24 +03:00
}
2019-02-11 11:55:32 +03:00
/* Returns true if block can be safely freed. */
static bool tcf_chain_detach ( struct tcf_chain * chain )
2017-05-20 16:01:32 +03:00
{
2017-12-04 21:48:18 +03:00
struct tcf_block * block = chain - > block ;
2019-02-11 11:55:32 +03:00
ASSERT_BLOCK_LOCKED ( block ) ;
2020-02-16 13:01:23 +03:00
list_del_rcu ( & chain - > list ) ;
2018-07-23 10:23:05 +03:00
if ( ! chain - > index )
block - > chain0 . chain = NULL ;
2019-02-11 11:55:32 +03:00
if ( list_empty ( & block - > chain_list ) & &
refcount_read ( & block - > refcnt ) = = 0 )
return true ;
return false ;
}
static void tcf_block_destroy ( struct tcf_block * block )
{
mutex_destroy ( & block - > lock ) ;
2019-11-02 17:17:47 +03:00
mutex_destroy ( & block - > proto_destroy_lock ) ;
2023-12-19 21:16:19 +03:00
xa_destroy ( & block - > ports ) ;
2019-02-11 11:55:32 +03:00
kfree_rcu ( block , rcu ) ;
}
/* Free @chain via kfree_rcu() and, when @free_block is set, destroy the
 * block it belonged to as well.
 */
static void tcf_chain_destroy(struct tcf_chain *chain, bool free_block)
{
	/* Remember the block before the chain is handed over for freeing. */
	struct tcf_block *owner = chain->block;

	mutex_destroy(&chain->filter_chain_lock);
	kfree_rcu(chain, rcu);

	if (!free_block)
		return;

	tcf_block_destroy(owner);
}
2017-08-22 23:46:49 +03:00
2017-09-12 02:33:31 +03:00
static void tcf_chain_hold ( struct tcf_chain * chain )
{
2019-02-11 11:55:32 +03:00
ASSERT_BLOCK_LOCKED ( chain - > block ) ;
2017-09-12 02:33:31 +03:00
+ + chain - > refcnt ;
2017-05-17 12:07:59 +03:00
}
2018-08-01 13:36:55 +03:00
static bool tcf_chain_held_by_acts_only ( struct tcf_chain * chain )
2018-07-27 10:45:05 +03:00
{
2019-02-11 11:55:32 +03:00
ASSERT_BLOCK_LOCKED ( chain - > block ) ;
2018-07-27 10:45:05 +03:00
/* In case all the references are action references, this
2018-08-01 13:36:55 +03:00
* chain should not be shown to the user .
2018-07-27 10:45:05 +03:00
*/
return chain - > refcnt = = chain - > action_refcnt ;
}
2018-07-23 10:23:06 +03:00
/* Find the chain with index @chain_index on @block's chain list.
 * Caller holds block->lock.  No reference is taken.
 *
 * Returns the matching chain, or NULL if there is none.
 */
static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block,
					  u32 chain_index)
{
	struct tcf_chain *pos;

	ASSERT_BLOCK_LOCKED(block);

	list_for_each_entry(pos, &block->chain_list, list) {
		if (pos->index != chain_index)
			continue;
		return pos;
	}

	return NULL;
}
2020-02-16 13:01:24 +03:00
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
/* RCU-list variant of tcf_chain_lookup(): walks block->chain_list with
 * list_for_each_entry_rcu() and does not assert that block->lock is held.
 *
 * Returns the chain with index @chain_index, or NULL if there is none.
 */
static struct tcf_chain *tcf_chain_lookup_rcu(const struct tcf_block *block,
					      u32 chain_index)
{
	struct tcf_chain *pos;

	list_for_each_entry_rcu(pos, &block->chain_list, list) {
		if (pos->index != chain_index)
			continue;
		return pos;
	}

	return NULL;
}
#endif
2018-07-23 10:23:06 +03:00
static int tc_chain_notify ( struct tcf_chain * chain , struct sk_buff * oskb ,
2023-01-13 06:43:53 +03:00
u32 seq , u16 flags , int event , bool unicast ,
struct netlink_ext_ack * extack ) ;
2018-07-23 10:23:06 +03:00
2018-08-01 13:36:56 +03:00
/* Look up chain @chain_index on @block and take a reference on it,
 * optionally creating the chain when it does not exist yet.
 *
 * @create: allocate the chain if the lookup fails
 * @by_act: the reference is also counted in chain->action_refcnt
 *
 * Returns the chain, or NULL when it does not exist and @create is false,
 * or when creating it failed.
 */
static struct tcf_chain *__tcf_chain_get(struct tcf_block *block,
					 u32 chain_index, bool create,
					 bool by_act)
{
	struct tcf_chain *chain;
	bool first_non_act_ref;

	mutex_lock(&block->lock);

	chain = tcf_chain_lookup(block, chain_index);
	if (chain) {
		tcf_chain_hold(chain);
	} else if (!create) {
		goto out_unlock;
	} else {
		chain = tcf_chain_create(block, chain_index);
		if (!chain)
			goto out_unlock;
	}

	if (by_act)
		++chain->action_refcnt;
	first_non_act_ref = (chain->refcnt - chain->action_refcnt) == 1;

	mutex_unlock(&block->lock);

	/* Send notification only in case we got the first
	 * non-action reference. Until then, the chain acts only as
	 * a placeholder for actions pointing to it and user ought
	 * not know about them.
	 */
	if (first_non_act_ref && !by_act)
		tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
				RTM_NEWCHAIN, false, NULL);

	return chain;

out_unlock:
	mutex_unlock(&block->lock);
	return NULL;
}
2018-08-01 13:36:56 +03:00
2018-08-01 13:36:57 +03:00
/* Get a non-action reference on chain @chain_index of @block; the chain
 * is created on demand only when @create is true.
 */
static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
				       bool create)
{
	const bool by_act = false;

	return __tcf_chain_get(block, chain_index, create, by_act);
}
2017-05-17 12:08:01 +03:00
2018-07-27 10:45:05 +03:00
/* Get an action reference on chain @chain_index of @block, creating the
 * chain if it does not exist yet.
 */
struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index)
{
	const bool create = true;
	const bool by_act = true;

	return __tcf_chain_get(block, chain_index, create, by_act);
}
EXPORT_SYMBOL(tcf_chain_get_by_act);
2019-02-11 11:55:37 +03:00
static void tc_chain_tmplt_del ( const struct tcf_proto_ops * tmplt_ops ,
void * tmplt_priv ) ;
static int tc_chain_notify_delete ( const struct tcf_proto_ops * tmplt_ops ,
void * tmplt_priv , u32 chain_index ,
struct tcf_block * block , struct sk_buff * oskb ,
2023-12-08 22:28:46 +03:00
u32 seq , u16 flags ) ;
2018-07-23 10:23:07 +03:00
2019-02-11 11:55:33 +03:00
/* Drop one reference on @chain.
 *
 * @by_act:             the reference was also counted in action_refcnt
 * @explicitly_created: drop the reference that belongs to an explicitly
 *                      created chain; if the chain is not flagged as
 *                      explicitly created this call does nothing
 *
 * When the last reference goes away the chain is detached from its
 * block, its template is deleted and the chain (and possibly the block)
 * is destroyed.
 */
static void __tcf_chain_put(struct tcf_chain *chain, bool by_act,
			    bool explicitly_created)
{
	struct tcf_block *block = chain->block;
	unsigned int remaining, remaining_non_act;
	const struct tcf_proto_ops *ops;
	bool release_block = false;
	void *priv;

	mutex_lock(&block->lock);

	if (explicitly_created && !chain->explicitly_created) {
		mutex_unlock(&block->lock);
		return;
	}
	if (explicitly_created)
		chain->explicitly_created = false;

	if (by_act)
		chain->action_refcnt--;

	/* tc_chain_notify_delete can't be called while holding block lock.
	 * However, when block is unlocked chain can be changed concurrently, so
	 * save these to temporary variables.
	 */
	remaining = --chain->refcnt;
	remaining_non_act = remaining - chain->action_refcnt;
	ops = chain->tmplt_ops;
	priv = chain->tmplt_priv;

	if (remaining_non_act == chain->explicitly_created && !by_act) {
		if (remaining_non_act == 0)
			tc_chain_notify_delete(ops, priv, chain->index,
					       block, NULL, 0, 0);
		/* Last reference to chain, no need to lock. */
		chain->flushing = false;
	}

	if (remaining == 0)
		release_block = tcf_chain_detach(chain);

	mutex_unlock(&block->lock);

	if (remaining != 0)
		return;

	tc_chain_tmplt_del(ops, priv);
	tcf_chain_destroy(chain, release_block);
}
2018-08-01 13:36:56 +03:00
2018-08-01 13:36:57 +03:00
static void tcf_chain_put ( struct tcf_chain * chain )
2018-08-01 13:36:56 +03:00
{
2019-02-11 11:55:33 +03:00
__tcf_chain_put ( chain , false , false ) ;
2018-08-01 13:36:56 +03:00
}
2017-05-17 12:08:01 +03:00
2018-07-27 10:45:05 +03:00
void tcf_chain_put_by_act ( struct tcf_chain * chain )
{
2019-02-11 11:55:33 +03:00
__tcf_chain_put ( chain , true , false ) ;
2018-07-27 10:45:05 +03:00
}
EXPORT_SYMBOL ( tcf_chain_put_by_act ) ;
2018-07-23 10:23:06 +03:00
static void tcf_chain_put_explicitly_created ( struct tcf_chain * chain )
{
2019-02-11 11:55:33 +03:00
__tcf_chain_put ( chain , false , true ) ;
2018-07-23 10:23:06 +03:00
}
2019-02-11 11:55:45 +03:00
/* Remove every filter proto from @chain.
 *
 * Under chain->filter_chain_lock each proto is first signalled as being
 * destroyed, then the chain's list head is cleared, chain 0 subscribers
 * are told the head is now NULL and the chain is flagged as flushing.
 * The detached protos are released after the lock has been dropped.
 */
static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held)
{
	struct tcf_proto *tp, *tp_next;

	mutex_lock(&chain->filter_chain_lock);

	for (tp = tcf_chain_dereference(chain->filter_chain, chain); tp;
	     tp = tp_next) {
		tp_next = rcu_dereference_protected(tp->next, 1);
		tcf_proto_signal_destroying(chain, tp);
	}

	/* Keep the old head so the list can be walked once it is unhooked. */
	tp = tcf_chain_dereference(chain->filter_chain, chain);
	RCU_INIT_POINTER(chain->filter_chain, NULL);
	tcf_chain0_head_change(chain, NULL);
	chain->flushing = true;

	mutex_unlock(&chain->filter_chain_lock);

	for (; tp; tp = tp_next) {
		/* Fetch the successor first: the put below may free tp. */
		tp_next = rcu_dereference_protected(tp->next, 1);
		tcf_proto_put(tp, rtnl_held, NULL);
	}
}
2019-07-09 23:55:45 +03:00
static int tcf_block_setup ( struct tcf_block * block ,
struct flow_block_offload * bo ) ;
2020-05-29 03:25:36 +03:00
/* Fill in a flow_block_offload request.
 *
 * The arguments are copied into the matching fields of @bo; the network
 * namespace is taken from @dev, cb_list_head is pointed at @flow_block's
 * callback list and bo->cb_list is initialised as an empty list.
 */
static void tcf_block_offload_init(struct flow_block_offload *bo,
				   struct net_device *dev, struct Qdisc *sch,
				   enum flow_block_command command,
				   enum flow_block_binder_type binder_type,
				   struct flow_block *flow_block,
				   bool shared, struct netlink_ext_ack *extack)
{
	/* What is requested, and how the block is bound. */
	bo->net = dev_net(dev);
	bo->command = command;
	bo->binder_type = binder_type;

	/* The block the request applies to. */
	bo->block = flow_block;
	bo->block_shared = shared;

	/* Error reporting and owning qdisc. */
	bo->extack = extack;
	bo->sch = sch;

	/* Callback lists. */
	bo->cb_list_head = &flow_block->cb_list;
	INIT_LIST_HEAD(&bo->cb_list);
}
2020-05-29 03:25:37 +03:00
static void tcf_block_unbind ( struct tcf_block * block ,
struct flow_block_offload * bo ) ;
static void tc_block_indr_cleanup ( struct flow_block_cb * block_cb )
2018-11-10 08:21:26 +03:00
{
2020-05-29 03:25:37 +03:00
struct tcf_block * block = block_cb - > indr . data ;
struct net_device * dev = block_cb - > indr . dev ;
2020-07-11 00:55:03 +03:00
struct Qdisc * sch = block_cb - > indr . sch ;
2020-05-29 03:25:37 +03:00
struct netlink_ext_ack extack = { } ;
2021-04-01 07:52:48 +03:00
struct flow_block_offload bo = { } ;
2018-11-10 08:21:26 +03:00
2020-07-11 00:55:03 +03:00
tcf_block_offload_init ( & bo , dev , sch , FLOW_BLOCK_UNBIND ,
2020-05-29 03:25:37 +03:00
block_cb - > indr . binder_type ,
& block - > flow_block , tcf_block_shared ( block ) ,
& extack ) ;
2020-10-26 15:33:27 +03:00
rtnl_lock ( ) ;
2020-05-29 03:25:37 +03:00
down_write ( & block - > cb_lock ) ;
2020-06-18 15:49:10 +03:00
list_del ( & block_cb - > driver_list ) ;
2020-05-29 03:25:37 +03:00
list_move ( & block_cb - > list , & bo . cb_list ) ;
tcf_block_unbind ( block , & bo ) ;
2020-10-26 15:33:27 +03:00
up_write ( & block - > cb_lock ) ;
2020-05-29 03:25:37 +03:00
rtnl_unlock ( ) ;
2018-11-10 08:21:26 +03:00
}
2018-01-17 13:46:50 +03:00
static bool tcf_block_offload_in_use ( struct tcf_block * block )
{
2019-08-26 16:44:58 +03:00
return atomic_read ( & block - > offloadcnt ) ;
2018-01-17 13:46:50 +03:00
}
static int tcf_block_offload_cmd ( struct tcf_block * block ,
2020-07-11 00:55:03 +03:00
struct net_device * dev , struct Qdisc * sch ,
2018-01-17 13:46:50 +03:00
struct tcf_block_ext_info * ei ,
2019-07-09 23:55:40 +03:00
enum flow_block_command command ,
2018-06-26 00:30:04 +03:00
struct netlink_ext_ack * extack )
2017-10-19 16:50:29 +03:00
{
2019-07-09 23:55:46 +03:00
struct flow_block_offload bo = { } ;
2017-10-19 16:50:29 +03:00
2020-07-11 00:55:03 +03:00
tcf_block_offload_init ( & bo , dev , sch , command , ei - > binder_type ,
2020-05-29 03:25:36 +03:00
& block - > flow_block , tcf_block_shared ( block ) ,
extack ) ;
2019-07-09 23:55:45 +03:00
net/sched: cls_api: fix nooffloaddevcnt warning dmesg log
block->nooffloaddevcnt should always be counted for an indr block,
even if the indr block offload was successful. The representor may
have gone away and the ingress qdisc can work in software mode.
block->nooffloaddevcnt warning with the following dmesg log:
[ 760.667058] #####################################################
[ 760.668186] ## TEST test-ecmp-add-vxlan-encap-disable-sriov.sh ##
[ 760.669179] #####################################################
[ 761.780655] :test: Fedora 30 (Thirty)
[ 761.783794] :test: Linux reg-r-vrt-018-180 5.7.0+
[ 761.822890] :test: NIC ens1f0 FW 16.26.6000 PCI 0000:81:00.0 DEVICE 0x1019 ConnectX-5 Ex
[ 761.860244] mlx5_core 0000:81:00.0 ens1f0: Link up
[ 761.880693] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready
[ 762.059732] mlx5_core 0000:81:00.1 ens1f1: Link up
[ 762.234341] :test: unbind vfs of ens1f0
[ 762.257825] :test: Change ens1f0 eswitch (0000:81:00.0) mode to switchdev
[ 762.291363] :test: unbind vfs of ens1f1
[ 762.306914] :test: Change ens1f1 eswitch (0000:81:00.1) mode to switchdev
[ 762.309237] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(LEGACY), nvfs(2), active vports(3)
[ 763.282598] mlx5_core 0000:81:00.1: E-Switch: Supported tc offload range - chains: 4294967294, prios: 4294967295
[ 763.362825] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
[ 763.444465] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0
[ 763.460088] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
[ 763.502586] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
[ 763.552429] ens1f1_0: renamed from eth0
[ 763.569569] mlx5_core 0000:81:00.1: E-Switch: Enable: mode(OFFLOADS), nvfs(2), active vports(3)
[ 763.629694] ens1f1_1: renamed from eth1
[ 764.631552] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_0: link becomes ready
[ 764.670841] :test: unbind vfs of ens1f0
[ 764.681966] :test: unbind vfs of ens1f1
[ 764.726762] mlx5_core 0000:81:00.0 ens1f0: Link up
[ 764.766511] mlx5_core 0000:81:00.1 ens1f1: Link up
[ 764.797325] :test: Add multipath vxlan encap rule and disable sriov
[ 764.798544] :test: config multipath route
[ 764.812732] mlx5_core 0000:81:00.0: lag map port 1:2 port 2:2
[ 764.874556] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:2
[ 765.603681] :test: OK
[ 765.659048] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_1: link becomes ready
[ 765.675085] :test: verify rule in hw
[ 765.694237] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready
[ 765.711892] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1: link becomes ready
[ 766.979230] :test: OK
[ 768.125419] :test: OK
[ 768.127519] :test: - disable sriov ens1f1
[ 768.131160] pci 0000:81:02.2: Removing from iommu group 75
[ 768.132646] pci 0000:81:02.3: Removing from iommu group 76
[ 769.179749] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3)
[ 769.455627] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:1
[ 769.703990] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
[ 769.988637] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0
[ 769.990022] :test: - disable sriov ens1f0
[ 769.994922] pci 0000:81:00.2: Removing from iommu group 73
[ 769.997048] pci 0000:81:00.3: Removing from iommu group 74
[ 771.035813] mlx5_core 0000:81:00.0: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3)
[ 771.339091] ------------[ cut here ]------------
[ 771.340812] WARNING: CPU: 6 PID: 3448 at net/sched/cls_api.c:749 tcf_block_offload_unbind.isra.0+0x5c/0x60
[ 771.341728] Modules linked in: act_mirred act_tunnel_key cls_flower dummy vxlan ip6_udp_tunnel udp_tunnel sch_ingress nfsv3 nfs_acl nfs lockd grace fscache tun bridge stp llc sunrpc rdma_ucm rdma_cm iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp mlxfw act_ct nf_flow_table kvm_intel nf_nat kvm nf_conntrack irqbypass crct10dif_pclmul igb crc32_pclmul nf_defrag_ipv6 libcrc32c nf_defrag_ipv4 crc32c_intel ghash_clmulni_intel ptp ipmi_ssif intel_cstate pps_c
ore ses intel_uncore mei_me iTCO_wdt joydev ipmi_si iTCO_vendor_support i2c_i801 enclosure mei ioatdma dca lpc_ich wmi ipmi_devintf pcspkr acpi_power_meter ipmi_msghandler acpi_pad ast i2c_algo_bit drm_vram_helper drm_kms_helper drm_ttm_helper ttm drm mpt3sas raid_class scsi_transport_sas
[ 771.347818] CPU: 6 PID: 3448 Comm: test-ecmp-add-v Not tainted 5.7.0+ #1146
[ 771.348727] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017
[ 771.349646] RIP: 0010:tcf_block_offload_unbind.isra.0+0x5c/0x60
[ 771.350553] Code: 4a fd ff ff 83 f8 a1 74 0e 5b 4c 89 e7 5d 41 5c 41 5d e9 07 93 89 ff 8b 83 a0 00 00 00 8d 50 ff 89 93 a0 00 00 00 85 c0 75 df <0f> 0b eb db 0f 1f 44 00 00 41 57 41 56 41 55 41 89 cd 41 54 49 89
[ 771.352420] RSP: 0018:ffffb33144cd3b00 EFLAGS: 00010246
[ 771.353353] RAX: 0000000000000000 RBX: ffff8b37cf4b2800 RCX: 0000000000000000
[ 771.354294] RDX: 00000000ffffffff RSI: ffff8b3b9aad0000 RDI: ffffffff8d5c6e20
[ 771.355245] RBP: ffff8b37eb546948 R08: ffffffffc0b7a348 R09: ffff8b3b9aad0000
[ 771.356189] R10: 0000000000000001 R11: ffff8b3ba7a0a1c0 R12: ffff8b37cf4b2850
[ 771.357123] R13: ffff8b3b9aad0000 R14: ffff8b37cf4b2820 R15: ffff8b37cf4b2820
[ 771.358039] FS: 00007f8a19b6e740(0000) GS:ffff8b3befa00000(0000) knlGS:0000000000000000
[ 771.358965] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 771.359885] CR2: 00007f3afb91c1a0 CR3: 000000045133c004 CR4: 00000000001606e0
[ 771.360825] Call Trace:
[ 771.361764] __tcf_block_put+0x84/0x150
[ 771.362712] ingress_destroy+0x1b/0x20 [sch_ingress]
[ 771.363658] qdisc_destroy+0x3e/0xc0
[ 771.364594] dev_shutdown+0x7a/0xa5
[ 771.365522] rollback_registered_many+0x20d/0x530
[ 771.366458] ? netdev_upper_dev_unlink+0x15d/0x1c0
[ 771.367387] unregister_netdevice_many.part.0+0xf/0x70
[ 771.368310] vxlan_netdevice_event+0xa4/0x110 [vxlan]
[ 771.369454] notifier_call_chain+0x4c/0x70
[ 771.370579] rollback_registered_many+0x2f5/0x530
[ 771.371719] rollback_registered+0x56/0x90
[ 771.372843] unregister_netdevice_queue+0x73/0xb0
[ 771.373982] unregister_netdev+0x18/0x20
[ 771.375168] mlx5e_vport_rep_unload+0x56/0xc0 [mlx5_core]
[ 771.376327] esw_offloads_disable+0x81/0x90 [mlx5_core]
[ 771.377512] mlx5_eswitch_disable_locked.cold+0xcb/0x1af [mlx5_core]
[ 771.378679] mlx5_eswitch_disable+0x44/0x60 [mlx5_core]
[ 771.379822] mlx5_device_disable_sriov+0xad/0xb0 [mlx5_core]
[ 771.380968] mlx5_core_sriov_configure+0xc1/0xe0 [mlx5_core]
[ 771.382087] sriov_numvfs_store+0xfc/0x130
[ 771.383195] kernfs_fop_write+0xce/0x1b0
[ 771.384302] vfs_write+0xb6/0x1a0
[ 771.385410] ksys_write+0x5f/0xe0
[ 771.386500] do_syscall_64+0x5b/0x1d0
[ 771.387569] entry_SYSCALL_64_after_hwframe+0x44/0xa9
Fixes: 0fdcf78d5973 ("net: use flow_indr_dev_setup_offload()")
Signed-off-by: wenxu <wenxu@ucloud.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-18 15:49:11 +03:00
if ( dev - > netdev_ops - > ndo_setup_tc ) {
int err ;
2020-05-29 03:25:37 +03:00
err = dev - > netdev_ops - > ndo_setup_tc ( dev , TC_SETUP_BLOCK , & bo ) ;
net/sched: cls_api: fix nooffloaddevcnt warning dmesg log
The block->nooffloaddevcnt should always be counted for an indr block,
even if the indr block offload is successful. The representor may have
gone away and the ingress qdisc can work in software mode.
block->nooffloaddevcnt warning with the following dmesg log:
[ 760.667058] #####################################################
[ 760.668186] ## TEST test-ecmp-add-vxlan-encap-disable-sriov.sh ##
[ 760.669179] #####################################################
[ 761.780655] :test: Fedora 30 (Thirty)
[ 761.783794] :test: Linux reg-r-vrt-018-180 5.7.0+
[ 761.822890] :test: NIC ens1f0 FW 16.26.6000 PCI 0000:81:00.0 DEVICE 0x1019 ConnectX-5 Ex
[ 761.860244] mlx5_core 0000:81:00.0 ens1f0: Link up
[ 761.880693] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready
[ 762.059732] mlx5_core 0000:81:00.1 ens1f1: Link up
[ 762.234341] :test: unbind vfs of ens1f0
[ 762.257825] :test: Change ens1f0 eswitch (0000:81:00.0) mode to switchdev
[ 762.291363] :test: unbind vfs of ens1f1
[ 762.306914] :test: Change ens1f1 eswitch (0000:81:00.1) mode to switchdev
[ 762.309237] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(LEGACY), nvfs(2), active vports(3)
[ 763.282598] mlx5_core 0000:81:00.1: E-Switch: Supported tc offload range - chains: 4294967294, prios: 4294967295
[ 763.362825] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
[ 763.444465] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0
[ 763.460088] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
[ 763.502586] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
[ 763.552429] ens1f1_0: renamed from eth0
[ 763.569569] mlx5_core 0000:81:00.1: E-Switch: Enable: mode(OFFLOADS), nvfs(2), active vports(3)
[ 763.629694] ens1f1_1: renamed from eth1
[ 764.631552] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_0: link becomes ready
[ 764.670841] :test: unbind vfs of ens1f0
[ 764.681966] :test: unbind vfs of ens1f1
[ 764.726762] mlx5_core 0000:81:00.0 ens1f0: Link up
[ 764.766511] mlx5_core 0000:81:00.1 ens1f1: Link up
[ 764.797325] :test: Add multipath vxlan encap rule and disable sriov
[ 764.798544] :test: config multipath route
[ 764.812732] mlx5_core 0000:81:00.0: lag map port 1:2 port 2:2
[ 764.874556] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:2
[ 765.603681] :test: OK
[ 765.659048] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_1: link becomes ready
[ 765.675085] :test: verify rule in hw
[ 765.694237] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready
[ 765.711892] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1: link becomes ready
[ 766.979230] :test: OK
[ 768.125419] :test: OK
[ 768.127519] :test: - disable sriov ens1f1
[ 768.131160] pci 0000:81:02.2: Removing from iommu group 75
[ 768.132646] pci 0000:81:02.3: Removing from iommu group 76
[ 769.179749] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3)
[ 769.455627] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:1
[ 769.703990] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
[ 769.988637] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0
[ 769.990022] :test: - disable sriov ens1f0
[ 769.994922] pci 0000:81:00.2: Removing from iommu group 73
[ 769.997048] pci 0000:81:00.3: Removing from iommu group 74
[ 771.035813] mlx5_core 0000:81:00.0: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3)
[ 771.339091] ------------[ cut here ]------------
[ 771.340812] WARNING: CPU: 6 PID: 3448 at net/sched/cls_api.c:749 tcf_block_offload_unbind.isra.0+0x5c/0x60
[ 771.341728] Modules linked in: act_mirred act_tunnel_key cls_flower dummy vxlan ip6_udp_tunnel udp_tunnel sch_ingress nfsv3 nfs_acl nfs lockd grace fscache tun bridge stp llc sunrpc rdma_ucm rdma_cm iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp mlxfw act_ct nf_flow_table kvm_intel nf_nat kvm nf_conntrack irqbypass crct10dif_pclmul igb crc32_pclmul nf_defrag_ipv6 libcrc32c nf_defrag_ipv4 crc32c_intel ghash_clmulni_intel ptp ipmi_ssif intel_cstate pps_c
ore ses intel_uncore mei_me iTCO_wdt joydev ipmi_si iTCO_vendor_support i2c_i801 enclosure mei ioatdma dca lpc_ich wmi ipmi_devintf pcspkr acpi_power_meter ipmi_msghandler acpi_pad ast i2c_algo_bit drm_vram_helper drm_kms_helper drm_ttm_helper ttm drm mpt3sas raid_class scsi_transport_sas
[ 771.347818] CPU: 6 PID: 3448 Comm: test-ecmp-add-v Not tainted 5.7.0+ #1146
[ 771.348727] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017
[ 771.349646] RIP: 0010:tcf_block_offload_unbind.isra.0+0x5c/0x60
[ 771.350553] Code: 4a fd ff ff 83 f8 a1 74 0e 5b 4c 89 e7 5d 41 5c 41 5d e9 07 93 89 ff 8b 83 a0 00 00 00 8d 50 ff 89 93 a0 00 00 00 85 c0 75 df <0f> 0b eb db 0f 1f 44 00 00 41 57 41 56 41 55 41 89 cd 41 54 49 89
[ 771.352420] RSP: 0018:ffffb33144cd3b00 EFLAGS: 00010246
[ 771.353353] RAX: 0000000000000000 RBX: ffff8b37cf4b2800 RCX: 0000000000000000
[ 771.354294] RDX: 00000000ffffffff RSI: ffff8b3b9aad0000 RDI: ffffffff8d5c6e20
[ 771.355245] RBP: ffff8b37eb546948 R08: ffffffffc0b7a348 R09: ffff8b3b9aad0000
[ 771.356189] R10: 0000000000000001 R11: ffff8b3ba7a0a1c0 R12: ffff8b37cf4b2850
[ 771.357123] R13: ffff8b3b9aad0000 R14: ffff8b37cf4b2820 R15: ffff8b37cf4b2820
[ 771.358039] FS: 00007f8a19b6e740(0000) GS:ffff8b3befa00000(0000) knlGS:0000000000000000
[ 771.358965] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 771.359885] CR2: 00007f3afb91c1a0 CR3: 000000045133c004 CR4: 00000000001606e0
[ 771.360825] Call Trace:
[ 771.361764] __tcf_block_put+0x84/0x150
[ 771.362712] ingress_destroy+0x1b/0x20 [sch_ingress]
[ 771.363658] qdisc_destroy+0x3e/0xc0
[ 771.364594] dev_shutdown+0x7a/0xa5
[ 771.365522] rollback_registered_many+0x20d/0x530
[ 771.366458] ? netdev_upper_dev_unlink+0x15d/0x1c0
[ 771.367387] unregister_netdevice_many.part.0+0xf/0x70
[ 771.368310] vxlan_netdevice_event+0xa4/0x110 [vxlan]
[ 771.369454] notifier_call_chain+0x4c/0x70
[ 771.370579] rollback_registered_many+0x2f5/0x530
[ 771.371719] rollback_registered+0x56/0x90
[ 771.372843] unregister_netdevice_queue+0x73/0xb0
[ 771.373982] unregister_netdev+0x18/0x20
[ 771.375168] mlx5e_vport_rep_unload+0x56/0xc0 [mlx5_core]
[ 771.376327] esw_offloads_disable+0x81/0x90 [mlx5_core]
[ 771.377512] mlx5_eswitch_disable_locked.cold+0xcb/0x1af [mlx5_core]
[ 771.378679] mlx5_eswitch_disable+0x44/0x60 [mlx5_core]
[ 771.379822] mlx5_device_disable_sriov+0xad/0xb0 [mlx5_core]
[ 771.380968] mlx5_core_sriov_configure+0xc1/0xe0 [mlx5_core]
[ 771.382087] sriov_numvfs_store+0xfc/0x130
[ 771.383195] kernfs_fop_write+0xce/0x1b0
[ 771.384302] vfs_write+0xb6/0x1a0
[ 771.385410] ksys_write+0x5f/0xe0
[ 771.386500] do_syscall_64+0x5b/0x1d0
[ 771.387569] entry_SYSCALL_64_after_hwframe+0x44/0xa9
Fixes: 0fdcf78d5973 ("net: use flow_indr_dev_setup_offload()")
Signed-off-by: wenxu <wenxu@ucloud.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-18 15:49:11 +03:00
if ( err < 0 ) {
if ( err ! = - EOPNOTSUPP )
NL_SET_ERR_MSG ( extack , " Driver ndo_setup_tc failed " ) ;
return err ;
}
return tcf_block_setup ( block , & bo ) ;
2020-04-23 17:57:45 +03:00
}
2019-07-09 23:55:45 +03:00
2020-07-11 00:55:03 +03:00
flow_indr_dev_setup_offload ( dev , sch , TC_SETUP_BLOCK , block , & bo ,
net/sched: cls_api: fix nooffloaddevcnt warning dmesg log
The block->nooffloaddevcnt should always be counted for an indr block,
even if the indr block offload is successful. The representor may have
gone away and the ingress qdisc can work in software mode.
block->nooffloaddevcnt warning with the following dmesg log:
[ 760.667058] #####################################################
[ 760.668186] ## TEST test-ecmp-add-vxlan-encap-disable-sriov.sh ##
[ 760.669179] #####################################################
[ 761.780655] :test: Fedora 30 (Thirty)
[ 761.783794] :test: Linux reg-r-vrt-018-180 5.7.0+
[ 761.822890] :test: NIC ens1f0 FW 16.26.6000 PCI 0000:81:00.0 DEVICE 0x1019 ConnectX-5 Ex
[ 761.860244] mlx5_core 0000:81:00.0 ens1f0: Link up
[ 761.880693] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready
[ 762.059732] mlx5_core 0000:81:00.1 ens1f1: Link up
[ 762.234341] :test: unbind vfs of ens1f0
[ 762.257825] :test: Change ens1f0 eswitch (0000:81:00.0) mode to switchdev
[ 762.291363] :test: unbind vfs of ens1f1
[ 762.306914] :test: Change ens1f1 eswitch (0000:81:00.1) mode to switchdev
[ 762.309237] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(LEGACY), nvfs(2), active vports(3)
[ 763.282598] mlx5_core 0000:81:00.1: E-Switch: Supported tc offload range - chains: 4294967294, prios: 4294967295
[ 763.362825] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
[ 763.444465] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0
[ 763.460088] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
[ 763.502586] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
[ 763.552429] ens1f1_0: renamed from eth0
[ 763.569569] mlx5_core 0000:81:00.1: E-Switch: Enable: mode(OFFLOADS), nvfs(2), active vports(3)
[ 763.629694] ens1f1_1: renamed from eth1
[ 764.631552] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_0: link becomes ready
[ 764.670841] :test: unbind vfs of ens1f0
[ 764.681966] :test: unbind vfs of ens1f1
[ 764.726762] mlx5_core 0000:81:00.0 ens1f0: Link up
[ 764.766511] mlx5_core 0000:81:00.1 ens1f1: Link up
[ 764.797325] :test: Add multipath vxlan encap rule and disable sriov
[ 764.798544] :test: config multipath route
[ 764.812732] mlx5_core 0000:81:00.0: lag map port 1:2 port 2:2
[ 764.874556] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:2
[ 765.603681] :test: OK
[ 765.659048] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1_1: link becomes ready
[ 765.675085] :test: verify rule in hw
[ 765.694237] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f0: link becomes ready
[ 765.711892] IPv6: ADDRCONF(NETDEV_CHANGE): ens1f1: link becomes ready
[ 766.979230] :test: OK
[ 768.125419] :test: OK
[ 768.127519] :test: - disable sriov ens1f1
[ 768.131160] pci 0000:81:02.2: Removing from iommu group 75
[ 768.132646] pci 0000:81:02.3: Removing from iommu group 76
[ 769.179749] mlx5_core 0000:81:00.1: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3)
[ 769.455627] mlx5_core 0000:81:00.0: modify lag map port 1:1 port 2:1
[ 769.703990] mlx5_core 0000:81:00.1: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0)
[ 769.988637] mlx5_core 0000:81:00.1 ens1f1: renamed from eth0
[ 769.990022] :test: - disable sriov ens1f0
[ 769.994922] pci 0000:81:00.2: Removing from iommu group 73
[ 769.997048] pci 0000:81:00.3: Removing from iommu group 74
[ 771.035813] mlx5_core 0000:81:00.0: E-Switch: Disable: mode(OFFLOADS), nvfs(2), active vports(3)
[ 771.339091] ------------[ cut here ]------------
[ 771.340812] WARNING: CPU: 6 PID: 3448 at net/sched/cls_api.c:749 tcf_block_offload_unbind.isra.0+0x5c/0x60
[ 771.341728] Modules linked in: act_mirred act_tunnel_key cls_flower dummy vxlan ip6_udp_tunnel udp_tunnel sch_ingress nfsv3 nfs_acl nfs lockd grace fscache tun bridge stp llc sunrpc rdma_ucm rdma_cm iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp mlxfw act_ct nf_flow_table kvm_intel nf_nat kvm nf_conntrack irqbypass crct10dif_pclmul igb crc32_pclmul nf_defrag_ipv6 libcrc32c nf_defrag_ipv4 crc32c_intel ghash_clmulni_intel ptp ipmi_ssif intel_cstate pps_c
ore ses intel_uncore mei_me iTCO_wdt joydev ipmi_si iTCO_vendor_support i2c_i801 enclosure mei ioatdma dca lpc_ich wmi ipmi_devintf pcspkr acpi_power_meter ipmi_msghandler acpi_pad ast i2c_algo_bit drm_vram_helper drm_kms_helper drm_ttm_helper ttm drm mpt3sas raid_class scsi_transport_sas
[ 771.347818] CPU: 6 PID: 3448 Comm: test-ecmp-add-v Not tainted 5.7.0+ #1146
[ 771.348727] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017
[ 771.349646] RIP: 0010:tcf_block_offload_unbind.isra.0+0x5c/0x60
[ 771.350553] Code: 4a fd ff ff 83 f8 a1 74 0e 5b 4c 89 e7 5d 41 5c 41 5d e9 07 93 89 ff 8b 83 a0 00 00 00 8d 50 ff 89 93 a0 00 00 00 85 c0 75 df <0f> 0b eb db 0f 1f 44 00 00 41 57 41 56 41 55 41 89 cd 41 54 49 89
[ 771.352420] RSP: 0018:ffffb33144cd3b00 EFLAGS: 00010246
[ 771.353353] RAX: 0000000000000000 RBX: ffff8b37cf4b2800 RCX: 0000000000000000
[ 771.354294] RDX: 00000000ffffffff RSI: ffff8b3b9aad0000 RDI: ffffffff8d5c6e20
[ 771.355245] RBP: ffff8b37eb546948 R08: ffffffffc0b7a348 R09: ffff8b3b9aad0000
[ 771.356189] R10: 0000000000000001 R11: ffff8b3ba7a0a1c0 R12: ffff8b37cf4b2850
[ 771.357123] R13: ffff8b3b9aad0000 R14: ffff8b37cf4b2820 R15: ffff8b37cf4b2820
[ 771.358039] FS: 00007f8a19b6e740(0000) GS:ffff8b3befa00000(0000) knlGS:0000000000000000
[ 771.358965] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 771.359885] CR2: 00007f3afb91c1a0 CR3: 000000045133c004 CR4: 00000000001606e0
[ 771.360825] Call Trace:
[ 771.361764] __tcf_block_put+0x84/0x150
[ 771.362712] ingress_destroy+0x1b/0x20 [sch_ingress]
[ 771.363658] qdisc_destroy+0x3e/0xc0
[ 771.364594] dev_shutdown+0x7a/0xa5
[ 771.365522] rollback_registered_many+0x20d/0x530
[ 771.366458] ? netdev_upper_dev_unlink+0x15d/0x1c0
[ 771.367387] unregister_netdevice_many.part.0+0xf/0x70
[ 771.368310] vxlan_netdevice_event+0xa4/0x110 [vxlan]
[ 771.369454] notifier_call_chain+0x4c/0x70
[ 771.370579] rollback_registered_many+0x2f5/0x530
[ 771.371719] rollback_registered+0x56/0x90
[ 771.372843] unregister_netdevice_queue+0x73/0xb0
[ 771.373982] unregister_netdev+0x18/0x20
[ 771.375168] mlx5e_vport_rep_unload+0x56/0xc0 [mlx5_core]
[ 771.376327] esw_offloads_disable+0x81/0x90 [mlx5_core]
[ 771.377512] mlx5_eswitch_disable_locked.cold+0xcb/0x1af [mlx5_core]
[ 771.378679] mlx5_eswitch_disable+0x44/0x60 [mlx5_core]
[ 771.379822] mlx5_device_disable_sriov+0xad/0xb0 [mlx5_core]
[ 771.380968] mlx5_core_sriov_configure+0xc1/0xe0 [mlx5_core]
[ 771.382087] sriov_numvfs_store+0xfc/0x130
[ 771.383195] kernfs_fop_write+0xce/0x1b0
[ 771.384302] vfs_write+0xb6/0x1a0
[ 771.385410] ksys_write+0x5f/0xe0
[ 771.386500] do_syscall_64+0x5b/0x1d0
[ 771.387569] entry_SYSCALL_64_after_hwframe+0x44/0xa9
Fixes: 0fdcf78d5973 ("net: use flow_indr_dev_setup_offload()")
Signed-off-by: wenxu <wenxu@ucloud.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-18 15:49:11 +03:00
tc_block_indr_cleanup ) ;
tcf_block_setup ( block , & bo ) ;
return - EOPNOTSUPP ;
2017-10-19 16:50:29 +03:00
}
2018-01-17 13:46:50 +03:00
/* Bind @block for offload on the device that @q is attached to.
 *
 * block->cb_lock is held for writing for the whole operation. Returns 0 on
 * success or a negative errno. If tcf_block_offload_cmd() returns
 * -EOPNOTSUPP and tcf_block_offload_in_use(block) is false, the bind still
 * succeeds: 0 is returned and block->nooffloaddevcnt is incremented.
 */
static int tcf_block_offload_bind ( struct tcf_block * block , struct Qdisc * q ,
2018-06-26 00:30:04 +03:00
struct tcf_block_ext_info * ei ,
struct netlink_ext_ack * extack )
2017-10-19 16:50:29 +03:00
{
2018-01-17 13:46:50 +03:00
struct net_device * dev = q - > dev_queue - > dev ;
int err ;
2019-08-26 16:44:57 +03:00
down_write ( & block - > cb_lock ) ;
2018-01-17 13:46:50 +03:00
/* If tc offload feature is disabled and the block we try to bind
* to already has some offloaded filters , forbid to bind .
*/
2020-05-29 03:25:37 +03:00
if ( dev - > netdev_ops - > ndo_setup_tc & &
! tc_can_offload ( dev ) & &
tcf_block_offload_in_use ( block ) ) {
2018-06-26 00:30:04 +03:00
NL_SET_ERR_MSG ( extack , " Bind to offloaded block failed as dev has offload disabled " ) ;
2019-08-26 16:44:57 +03:00
err = - EOPNOTSUPP ;
goto err_unlock ;
2018-06-26 00:30:04 +03:00
}
2018-01-17 13:46:50 +03:00
2020-07-11 00:55:03 +03:00
err = tcf_block_offload_cmd ( block , dev , q , ei , FLOW_BLOCK_BIND , extack ) ;
2018-01-17 13:46:50 +03:00
/* -EOPNOTSUPP is handled separately from other errors. */
if ( err = = - EOPNOTSUPP )
goto no_offload_dev_inc ;
2018-11-10 08:21:26 +03:00
if ( err )
2019-08-26 16:44:57 +03:00
goto err_unlock ;
2018-11-10 08:21:26 +03:00
2019-08-26 16:44:57 +03:00
up_write ( & block - > cb_lock ) ;
2018-11-10 08:21:26 +03:00
return 0 ;
2018-01-17 13:46:50 +03:00
no_offload_dev_inc :
2020-05-29 03:25:37 +03:00
/* err is still -EOPNOTSUPP here and is returned as-is when the block
 * already has offload in use.
 */
if ( tcf_block_offload_in_use ( block ) )
2019-08-26 16:44:57 +03:00
goto err_unlock ;
2020-05-29 03:25:37 +03:00
2019-08-26 16:44:57 +03:00
err = 0 ;
2018-01-17 13:46:50 +03:00
/* Count this bind as one that is not offloaded. */
block - > nooffloaddevcnt + + ;
2019-08-26 16:44:57 +03:00
err_unlock :
up_write ( & block - > cb_lock ) ;
return err ;
2017-10-19 16:50:29 +03:00
}
/* Issue FLOW_BLOCK_UNBIND for @block on the device that @q is attached to.
 *
 * Runs under block->cb_lock held for writing. If tcf_block_offload_cmd()
 * returns -EOPNOTSUPP, block->nooffloaddevcnt is decremented, with a warning
 * if it was already zero. Any other return value is ignored.
 */
static void tcf_block_offload_unbind ( struct tcf_block * block , struct Qdisc * q ,
struct tcf_block_ext_info * ei )
{
2018-01-17 13:46:50 +03:00
struct net_device * dev = q - > dev_queue - > dev ;
int err ;
2019-08-26 16:44:57 +03:00
down_write ( & block - > cb_lock ) ;
2020-07-11 00:55:03 +03:00
err = tcf_block_offload_cmd ( block , dev , q , ei , FLOW_BLOCK_UNBIND , NULL ) ;
2018-01-17 13:46:50 +03:00
if ( err = = - EOPNOTSUPP )
goto no_offload_dev_dec ;
2019-08-26 16:44:57 +03:00
up_write ( & block - > cb_lock ) ;
2018-01-17 13:46:50 +03:00
return ;
no_offload_dev_dec :
/* Post-decrement: the test sees the value before the decrement, so the
 * warning fires when the counter was already 0.
 */
WARN_ON ( block - > nooffloaddevcnt - - = = 0 ) ;
2019-08-26 16:44:57 +03:00
up_write ( & block - > cb_lock ) ;
2017-10-19 16:50:29 +03:00
}
2018-01-17 13:46:45 +03:00
/* Register the chain 0 head-change callback described by @ei on @block.
 *
 * Allocates a list item carrying ei->chain_head_change and its priv pointer
 * and adds it to block->chain0.filter_chain_list. If chain 0 already exists,
 * tcf_chain_head_change_item() is first called for the new item with the
 * current head tp (when there is one) while chain0->filter_chain_lock is
 * held. Returns 0 on success or -ENOMEM if the item cannot be allocated.
 */
static int
2018-07-23 10:23:05 +03:00
tcf_chain0_head_change_cb_add ( struct tcf_block * block ,
struct tcf_block_ext_info * ei ,
struct netlink_ext_ack * extack )
2018-01-17 13:46:45 +03:00
{
struct tcf_filter_chain_list_item * item ;
2019-02-11 11:55:35 +03:00
struct tcf_chain * chain0 ;
2018-01-17 13:46:45 +03:00
item = kmalloc ( sizeof ( * item ) , GFP_KERNEL ) ;
if ( ! item ) {
NL_SET_ERR_MSG ( extack , " Memory allocation for head change callback item failed " ) ;
return - ENOMEM ;
}
item - > chain_head_change = ei - > chain_head_change ;
item - > chain_head_change_priv = ei - > chain_head_change_priv ;
2019-02-11 11:55:35 +03:00
mutex_lock ( & block - > lock ) ;
chain0 = block - > chain0 . chain ;
2019-02-11 11:55:38 +03:00
/* With a chain 0 present, take a reference on it before block->lock is
 * dropped; without one, the item is added to the list right away.
 */
if ( chain0 )
tcf_chain_hold ( chain0 ) ;
else
list_add ( & item - > list , & block - > chain0 . filter_chain_list ) ;
2019-02-11 11:55:35 +03:00
mutex_unlock ( & block - > lock ) ;
2019-02-11 11:55:38 +03:00
if ( chain0 ) {
struct tcf_proto * tp_head ;
/* Lock order in this branch: filter_chain_lock first, then block->lock. */
mutex_lock ( & chain0 - > filter_chain_lock ) ;
tp_head = tcf_chain_dereference ( chain0 - > filter_chain , chain0 ) ;
if ( tp_head )
tcf_chain_head_change_item ( item , tp_head ) ;
mutex_lock ( & block - > lock ) ;
list_add ( & item - > list , & block - > chain0 . filter_chain_list ) ;
mutex_unlock ( & block - > lock ) ;
mutex_unlock ( & chain0 - > filter_chain_lock ) ;
/* Drop the reference taken above. */
tcf_chain_put ( chain0 ) ;
}
2018-01-17 13:46:45 +03:00
return 0 ;
}
/* Remove the chain 0 head-change callback item matching @ei from @block.
 *
 * An item matches when both its callback and its priv pointer equal those in
 * @ei; if @ei has neither set, the first item on the list matches. For the
 * matched item, tcf_chain_head_change_item(item, NULL) is called when
 * block->chain0.chain is set, then the item is unlinked and freed.
 * Triggers WARN_ON(1) if no item matched.
 */
static void
2018-07-23 10:23:05 +03:00
tcf_chain0_head_change_cb_del ( struct tcf_block * block ,
struct tcf_block_ext_info * ei )
2018-01-17 13:46:45 +03:00
{
struct tcf_filter_chain_list_item * item ;
2019-02-11 11:55:35 +03:00
mutex_lock ( & block - > lock ) ;
2018-07-23 10:23:05 +03:00
list_for_each_entry ( item , & block - > chain0 . filter_chain_list , list ) {
2018-01-17 13:46:45 +03:00
if ( ( ! ei - > chain_head_change & & ! ei - > chain_head_change_priv ) | |
( item - > chain_head_change = = ei - > chain_head_change & &
item - > chain_head_change_priv = = ei - > chain_head_change_priv ) ) {
2019-02-11 11:55:35 +03:00
if ( block - > chain0 . chain )
2018-07-23 10:23:05 +03:00
tcf_chain_head_change_item ( item , NULL ) ;
2018-01-17 13:46:45 +03:00
/* Deleting inside the non-safe iterator is fine here because the
 * function returns right after, never advancing the loop.
 */
list_del ( & item - > list ) ;
2019-02-11 11:55:35 +03:00
mutex_unlock ( & block - > lock ) ;
2018-01-17 13:46:45 +03:00
kfree ( item ) ;
return ;
}
}
2019-02-11 11:55:35 +03:00
mutex_unlock ( & block - > lock ) ;
2018-01-17 13:46:45 +03:00
/* No matching item was registered. */
WARN_ON ( 1 ) ;
}
2018-01-17 13:46:46 +03:00
/* State obtained via net_generic(net, tcf_net_id): an idr of blocks keyed
 * by block index, plus the spinlock that guards modifications to it.
 */
struct tcf_net {
2018-09-24 19:22:56 +03:00
spinlock_t idr_lock ; /* Protects idr */
2018-01-17 13:46:46 +03:00
struct idr idr ;
} ;
/* Id passed to net_generic() to retrieve the struct tcf_net of a netns. */
static unsigned int tcf_net_id ;
/* Insert @block into the block idr of @net.
 *
 * idr_alloc_u32() is given block->index both as the starting id (via the
 * in/out pointer) and as the maximum id, so per the idr API only that exact
 * index can be allocated. @extack is not used in this function.
 * Returns the result of idr_alloc_u32().
 */
static int tcf_block_insert ( struct tcf_block * block , struct net * net ,
2018-02-13 14:00:16 +03:00
struct netlink_ext_ack * extack )
2018-01-17 13:46:45 +03:00
{
2018-01-17 13:46:46 +03:00
struct tcf_net * tn = net_generic ( net , tcf_net_id ) ;
2018-09-24 19:22:56 +03:00
int err ;
/* Preload with GFP_KERNEL before taking the spinlock; the allocation
 * under the lock then uses GFP_NOWAIT.
 */
idr_preload ( GFP_KERNEL ) ;
spin_lock ( & tn - > idr_lock ) ;
err = idr_alloc_u32 ( & tn - > idr , block , & block - > index , block - > index ,
GFP_NOWAIT ) ;
spin_unlock ( & tn - > idr_lock ) ;
idr_preload_end ( ) ;
2018-01-17 13:46:46 +03:00
2018-09-24 19:22:56 +03:00
return err ;
2018-01-17 13:46:45 +03:00
}
2018-01-17 13:46:46 +03:00
/* Remove the entry at block->index from the block idr of @net, under
 * tn->idr_lock.
 */
static void tcf_block_remove ( struct tcf_block * block , struct net * net )
{
struct tcf_net * tn = net_generic ( net , tcf_net_id ) ;
2018-09-24 19:22:56 +03:00
spin_lock ( & tn - > idr_lock ) ;
2017-11-28 17:48:43 +03:00
idr_remove ( & tn - > idr , block - > index ) ;
2018-09-24 19:22:56 +03:00
spin_unlock ( & tn - > idr_lock ) ;
2018-01-17 13:46:46 +03:00
}
/* Allocate and initialise a new tcf_block for @net with index @block_index.
 *
 * The block is zero-allocated, its locks, lists, flow_block and ports xarray
 * are initialised, and its reference count starts at 1. @q is stored in
 * block->q only when tcf_block_shared(block) is false.
 * Returns the new block, or ERR_PTR(-ENOMEM) if allocation fails.
 */
static struct tcf_block * tcf_block_create ( struct net * net , struct Qdisc * q ,
2018-02-13 14:00:16 +03:00
u32 block_index ,
2018-01-17 13:46:46 +03:00
struct netlink_ext_ack * extack )
2017-05-17 12:07:55 +03:00
{
2018-01-17 13:46:46 +03:00
struct tcf_block * block ;
2017-05-17 12:07:55 +03:00
2018-01-17 13:46:46 +03:00
block = kzalloc ( sizeof ( * block ) , GFP_KERNEL ) ;
2017-12-20 20:35:19 +03:00
if ( ! block ) {
NL_SET_ERR_MSG ( extack , " Memory allocation for block failed " ) ;
2018-01-17 13:46:46 +03:00
return ERR_PTR ( - ENOMEM ) ;
2017-12-20 20:35:19 +03:00
}
2019-02-11 11:55:32 +03:00
mutex_init ( & block - > lock ) ;
2019-11-02 17:17:47 +03:00
mutex_init ( & block - > proto_destroy_lock ) ;
2019-08-26 16:44:57 +03:00
init_rwsem ( & block - > cb_lock ) ;
2019-07-19 19:20:16 +03:00
flow_block_init ( & block - > flow_block ) ;
2017-05-17 12:08:01 +03:00
INIT_LIST_HEAD ( & block - > chain_list ) ;
2018-01-17 13:46:48 +03:00
INIT_LIST_HEAD ( & block - > owner_list ) ;
2018-07-23 10:23:05 +03:00
INIT_LIST_HEAD ( & block - > chain0 . filter_chain_list ) ;
2017-10-19 16:50:31 +03:00
2018-09-24 19:22:54 +03:00
/* The caller owns the initial reference. */
refcount_set ( & block - > refcnt , 1 ) ;
2018-01-17 13:46:46 +03:00
block - > net = net ;
2018-02-13 14:00:16 +03:00
/* The index must be set before tcf_block_shared() is evaluated below. */
block - > index = block_index ;
2023-12-19 21:16:19 +03:00
xa_init ( & block - > ports ) ;
2018-02-13 14:00:16 +03:00
/* Don't store q pointer for blocks which are shared */
if ( ! tcf_block_shared ( block ) )
block - > q = q ;
2018-01-17 13:46:46 +03:00
return block ;
}
2023-12-19 21:16:20 +03:00
/* Look up a block by @block_index in the block idr of @net.
 *
 * Returns whatever idr_find() returns. This function takes no reference on
 * the block and acquires no lock itself.
 */
struct tcf_block * tcf_block_lookup ( struct net * net , u32 block_index )
2018-01-17 13:46:46 +03:00
{
struct tcf_net * tn = net_generic ( net , tcf_net_id ) ;
2017-11-28 18:01:24 +03:00
return idr_find ( & tn - > idr , block_index ) ;
2018-01-17 13:46:46 +03:00
}
2023-12-19 21:16:20 +03:00
EXPORT_SYMBOL ( tcf_block_lookup ) ;
2018-01-17 13:46:46 +03:00
2018-09-24 19:22:57 +03:00
/* Look up the block with @block_index in @net and take a reference on it.
 *
 * The lookup and the reference grab both happen inside an RCU read-side
 * section. Returns the referenced block, or NULL when no block is found or
 * its reference count has already dropped to zero.
 */
static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index)
{
	struct tcf_block *held = NULL;
	struct tcf_block *found;

	rcu_read_lock();
	found = tcf_block_lookup(net, block_index);
	if (found) {
		/* Only hand the block out if a reference could be taken. */
		if (refcount_inc_not_zero(&found->refcnt))
			held = found;
	}
	rcu_read_unlock();

	return held;
}
2019-02-11 11:55:36 +03:00
/* Return the chain following @chain on block->chain_list, or the first chain
 * when @chain is NULL, skipping every chain for which
 * tcf_chain_held_by_acts_only() is true.
 *
 * The list is walked under block->lock. The returned chain has a reference
 * taken with tcf_chain_hold(); NULL is returned when the list is exhausted.
 * The reference on @chain itself is not touched here.
 */
static struct tcf_chain *
__tcf_get_next_chain ( struct tcf_block * block , struct tcf_chain * chain )
2018-09-24 19:22:55 +03:00
{
2019-02-11 11:55:36 +03:00
mutex_lock ( & block - > lock ) ;
if ( chain )
chain = list_is_last ( & chain - > list , & block - > chain_list ) ?
NULL : list_next_entry ( chain , list ) ;
else
chain = list_first_entry_or_null ( & block - > chain_list ,
struct tcf_chain , list ) ;
2018-09-24 19:22:55 +03:00
2019-02-11 11:55:36 +03:00
/* skip all action-only chains */
while ( chain & & tcf_chain_held_by_acts_only ( chain ) )
chain = list_is_last ( & chain - > list , & block - > chain_list ) ?
NULL : list_next_entry ( chain , list ) ;
if ( chain )
2018-09-24 19:22:55 +03:00
tcf_chain_hold ( chain ) ;
2019-02-11 11:55:36 +03:00
mutex_unlock ( & block - > lock ) ;
2018-09-24 19:22:55 +03:00
2019-02-11 11:55:36 +03:00
return chain ;
2018-09-24 19:22:55 +03:00
}
2019-02-11 11:55:36 +03:00
/* Function to be used by all clients that want to iterate over all chains on
* block . It properly obtains block - > lock and takes reference to chain before
* returning it . Users of this function must be tolerant to concurrent chain
* insertion / deletion or ensure that no concurrent chain modification is
* possible . Note that all netlink dump callbacks cannot guarantee to provide
* consistent dump because rtnl lock is released each time skb is filled with
* data and sent to user - space .
*
* Pass a NULL chain to start the iteration. The reference on the chain that
* was passed in is dropped before returning; NULL is returned when there is
* no further chain.
*/
struct tcf_chain *
tcf_get_next_chain ( struct tcf_block * block , struct tcf_chain * chain )
2018-09-24 19:22:55 +03:00
{
2019-02-11 11:55:36 +03:00
/* Fetch the successor (with its own reference) before releasing the
 * current chain.
 */
struct tcf_chain * chain_next = __tcf_get_next_chain ( block , chain ) ;
2018-09-24 19:22:55 +03:00
2019-02-11 11:55:36 +03:00
if ( chain )
2018-09-24 19:22:55 +03:00
tcf_chain_put ( chain ) ;
2019-02-11 11:55:36 +03:00
return chain_next ;
}
EXPORT_SYMBOL ( tcf_get_next_chain ) ;
2019-02-11 11:55:40 +03:00
/* Return the tp following @tp on @chain, or the head of the chain when @tp
 * is NULL, with a reference taken via tcf_proto_get(). Returns NULL when
 * there is no further tp.
 *
 * Asserts that RTNL is held and walks the list under
 * chain->filter_chain_lock. The reference on @tp itself is not touched here.
 */
static struct tcf_proto *
__tcf_get_next_proto ( struct tcf_chain * chain , struct tcf_proto * tp )
{
2019-02-11 11:55:41 +03:00
u32 prio = 0 ;
2019-02-11 11:55:40 +03:00
ASSERT_RTNL ( ) ;
mutex_lock ( & chain - > filter_chain_lock ) ;
2019-02-11 11:55:41 +03:00
if ( ! tp ) {
2019-02-11 11:55:40 +03:00
tp = tcf_chain_dereference ( chain - > filter_chain , chain ) ;
2019-02-11 11:55:41 +03:00
} else if ( tcf_proto_is_deleting ( tp ) ) {
/* 'deleting' flag is set and chain->filter_chain_lock was
* unlocked , which means next pointer could be invalid . Restart
* search .
*/
/* Resume at the first non-deleting tp whose prio is strictly above
 * that of the tp being deleted.
 */
prio = tp - > prio + 1 ;
tp = tcf_chain_dereference ( chain - > filter_chain , chain ) ;
for ( ; tp ; tp = tcf_chain_dereference ( tp - > next , chain ) )
if ( ! tp - > deleting & & tp - > prio > = prio )
break ;
} else {
2019-02-11 11:55:40 +03:00
tp = tcf_chain_dereference ( tp - > next , chain ) ;
2019-02-11 11:55:41 +03:00
}
2019-02-11 11:55:40 +03:00
if ( tp )
tcf_proto_get ( tp ) ;
mutex_unlock ( & chain - > filter_chain_lock ) ;
return tp ;
}
/* Function to be used by all clients that want to iterate over all tp's on
* chain . Users of this function must be tolerant to concurrent tp
* insertion / deletion or ensure that no concurrent chain modification is
* possible . Note that all netlink dump callbacks cannot guarantee to provide
* consistent dump because rtnl lock is released each time skb is filled with
* data and sent to user - space .
*
* Pass a NULL tp to start the iteration. The returned tp carries a reference;
* the reference on the tp that was passed in is dropped before returning.
*/
struct tcf_proto *
2020-11-27 18:12:05 +03:00
tcf_get_next_proto ( struct tcf_chain * chain , struct tcf_proto * tp )
2019-02-11 11:55:40 +03:00
{
struct tcf_proto * tp_next = __tcf_get_next_proto ( chain , tp ) ;
if ( tp )
2020-11-27 18:12:05 +03:00
/* NOTE(review): the 'true' argument is presumably rtnl_held, matching
 * the ASSERT_RTNL() in __tcf_get_next_proto() - confirm against
 * tcf_proto_put().
 */
tcf_proto_put ( tp , true , NULL ) ;
2019-02-11 11:55:40 +03:00
return tp_next ;
}
EXPORT_SYMBOL ( tcf_get_next_proto ) ;
2019-02-11 11:55:45 +03:00
/* Walk every chain of @block with tcf_get_next_chain() and, for each one,
 * call tcf_chain_put_explicitly_created() followed by
 * tcf_chain_flush(chain, rtnl_held).
 */
static void tcf_block_flush_all_chains ( struct tcf_block * block , bool rtnl_held )
2019-02-11 11:55:36 +03:00
{
struct tcf_chain * chain ;
/* Last reference to block. At this point chains cannot be added or
* removed concurrently .
*/
for ( chain = tcf_get_next_chain ( block , NULL ) ;
chain ;
chain = tcf_get_next_chain ( block , chain ) ) {
tcf_chain_put_explicitly_created ( chain ) ;
2019-02-11 11:55:45 +03:00
tcf_chain_flush ( chain , rtnl_held ) ;
2018-09-24 19:22:55 +03:00
}
}
2019-02-11 11:55:47 +03:00
/* Lookup Qdisc and increments its reference counter.
* Set parent , if necessary .
*
* Returns 0 immediately, without touching *q, when @ifindex is
* TCM_IFINDEX_MAGIC_BLOCK. Otherwise returns 0 with *q referenced, or:
*   -ENODEV      no device with @ifindex in @net;
*   -EINVAL      qdisc_lookup_rcu() or qdisc_refcount_inc_nz() returned NULL
*                (*q is NULL), or the qdisc has no class ops;
*   -EOPNOTSUPP  the class ops have no tcf_block callback.
* On the "no class ops" and -EOPNOTSUPP paths the reference just taken is
* dropped again (qdisc_put() or qdisc_put_unlocked() depending on
* @rtnl_held) and *q is set to NULL.
*/
static int __tcf_qdisc_find ( struct net * net , struct Qdisc * * q ,
u32 * parent , int ifindex , bool rtnl_held ,
struct netlink_ext_ack * extack )
{
const struct Qdisc_class_ops * cops ;
struct net_device * dev ;
int err = 0 ;
if ( ifindex = = TCM_IFINDEX_MAGIC_BLOCK )
return 0 ;
rcu_read_lock ( ) ;
/* Find link */
dev = dev_get_by_index_rcu ( net , ifindex ) ;
if ( ! dev ) {
rcu_read_unlock ( ) ;
return - ENODEV ;
}
/* Find qdisc */
if ( ! * parent ) {
2022-02-11 23:06:23 +03:00
/* No parent given: use the device's qdisc and report its handle back. */
* q = rcu_dereference ( dev - > qdisc ) ;
2019-02-11 11:55:47 +03:00
* parent = ( * q ) - > handle ;
} else {
* q = qdisc_lookup_rcu ( dev , TC_H_MAJ ( * parent ) ) ;
if ( ! * q ) {
NL_SET_ERR_MSG ( extack , " Parent Qdisc doesn't exists " ) ;
err = - EINVAL ;
goto errout_rcu ;
}
}
* q = qdisc_refcount_inc_nz ( * q ) ;
if ( ! * q ) {
NL_SET_ERR_MSG ( extack , " Parent Qdisc doesn't exists " ) ;
err = - EINVAL ;
goto errout_rcu ;
}
/* Is it classful? */
cops = ( * q ) - > ops - > cl_ops ;
if ( ! cops ) {
NL_SET_ERR_MSG ( extack , " Qdisc not classful " ) ;
err = - EINVAL ;
goto errout_qdisc ;
}
if ( ! cops - > tcf_block ) {
NL_SET_ERR_MSG ( extack , " Class doesn't support blocks " ) ;
err = - EOPNOTSUPP ;
goto errout_qdisc ;
}
/* Success falls through to errout_rcu with err == 0; the early -EINVAL
 * paths also jump here with *q already NULL.
 */
errout_rcu :
/* At this point we know that qdisc is not noop_qdisc,
* which means that qdisc holds a reference to net_device
* and we hold a reference to qdisc , so it is safe to release
* rcu read lock .
*/
rcu_read_unlock ( ) ;
return err ;
errout_qdisc :
rcu_read_unlock ( ) ;
if ( rtnl_held )
qdisc_put ( * q ) ;
else
qdisc_put_unlocked ( * q ) ;
* q = NULL ;
return err ;
}
/* Resolve the class that @parent refers to on @q and store it in *cl.
 *
 * Nothing is looked up (and *cl is left untouched) when @ifindex is
 * TCM_IFINDEX_MAGIC_BLOCK or when the minor part of @parent is zero.
 * Returns 0 on success, or -ENOENT when the class ops' find() callback
 * returns 0 for @parent.
 */
static int __tcf_qdisc_cl_find(struct Qdisc *q, u32 parent, unsigned long *cl,
			       int ifindex, struct netlink_ext_ack *extack)
{
	const struct Qdisc_class_ops *class_ops;

	/* Filter is not attached to a class: no class lookup is needed. */
	if (ifindex == TCM_IFINDEX_MAGIC_BLOCK || !TC_H_MIN(parent))
		return 0;

	class_ops = q->ops->cl_ops;
	*cl = class_ops->find(q, parent);
	if (*cl != 0)
		return 0;

	NL_SET_ERR_MSG(extack, " Specified class doesn't exist ");
	return -ENOENT;
}
/* Find the block to operate on and return it with a reference held.
 *
 * When @ifindex is TCM_IFINDEX_MAGIC_BLOCK the block is looked up by
 * @block_index; otherwise it is obtained from the tcf_block() class op of
 * @q for class @cl. Returns the block, ERR_PTR(-EINVAL) when no block is
 * found, or ERR_PTR(-EOPNOTSUPP) when the block reached through the qdisc
 * is a shared one.
 */
static struct tcf_block *__tcf_block_find(struct net *net, struct Qdisc *q,
					  unsigned long cl, int ifindex,
					  u32 block_index,
					  struct netlink_ext_ack *extack)
{
	const struct Qdisc_class_ops *class_ops;
	struct tcf_block *blk;

	if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
		/* Lookup by index already returns the block referenced. */
		blk = tcf_block_refcnt_get(net, block_index);
		if (blk)
			return blk;

		NL_SET_ERR_MSG(extack, " Block of given index was not found ");
		return ERR_PTR(-EINVAL);
	}

	class_ops = q->ops->cl_ops;
	blk = class_ops->tcf_block(q, cl, extack);
	if (!blk)
		return ERR_PTR(-EINVAL);

	if (tcf_block_shared(blk)) {
		NL_SET_ERR_MSG(extack, " This filter block is shared. Please use the block index to manipulate the filters ");
		return ERR_PTR(-EOPNOTSUPP);
	}

	/* Take a reference here as well, so that both lookup paths hand the
	 * caller a referenced block; this supports running the rules update
	 * path of the cls API without rtnl lock. The caller must release the
	 * block when it is done with it.
	 */
	refcount_inc(&blk->refcnt);

	return blk;
}
2018-09-24 19:22:57 +03:00
/* Drop one reference on @block and, when @q is non-NULL, unbind the block's
 * offload for @q via tcf_block_offload_unbind().
 *
 * When this was the last reference, the block is removed from the idr if it
 * is shared, and is then either destroyed directly (chain_list was empty) or
 * has all of its chains flushed.
 */
static void __tcf_block_put ( struct tcf_block * block , struct Qdisc * q ,
2019-02-11 11:55:45 +03:00
struct tcf_block_ext_info * ei , bool rtnl_held )
2018-09-24 19:22:57 +03:00
{
2019-02-11 11:55:32 +03:00
/* Returns true, with block->lock held, only when the count reached 0. */
if ( refcount_dec_and_mutex_lock ( & block - > refcnt , & block - > lock ) ) {
2018-09-24 19:22:57 +03:00
/* Flushing/putting all chains will cause the block to be
* deallocated when last chain is freed . However , if chain_list
* is empty , block has to be manually deallocated . After block
* reference counter reached 0 , it is no longer possible to
* increment it or add new chains to block .
*/
bool free_block = list_empty ( & block - > chain_list ) ;
2019-02-11 11:55:32 +03:00
mutex_unlock ( & block - > lock ) ;
2018-09-24 19:22:57 +03:00
if ( tcf_block_shared ( block ) )
tcf_block_remove ( block , block - > net ) ;
if ( q )
tcf_block_offload_unbind ( block , q , ei ) ;
if ( free_block )
2019-02-11 11:55:32 +03:00
tcf_block_destroy ( block ) ;
2018-09-24 19:22:57 +03:00
else
2019-02-11 11:55:45 +03:00
tcf_block_flush_all_chains ( block , rtnl_held ) ;
2018-09-24 19:22:57 +03:00
} else if ( q ) {
/* Other references remain: only the offload binding is undone. */
tcf_block_offload_unbind ( block , q , ei ) ;
}
}
2019-02-11 11:55:45 +03:00
/* Drop one reference on @block without an associated Qdisc: calls
 * __tcf_block_put() with NULL q and ei, so no offload unbind is attempted.
 */
static void tcf_block_refcnt_put ( struct tcf_block * block , bool rtnl_held )
2018-09-24 19:22:57 +03:00
{
2019-02-11 11:55:45 +03:00
__tcf_block_put ( block , NULL , NULL , rtnl_held ) ;
2018-09-24 19:22:57 +03:00
}
2018-05-31 09:52:53 +03:00
/* Find tcf block.
* Set q , parent , cl when appropriate .
*
* Asserts that RTNL is held. Chains __tcf_qdisc_find(),
* __tcf_qdisc_cl_find() and __tcf_block_find(). Returns the block on
* success, or an ERR_PTR() carrying the first error encountered; on error
* any qdisc reference held in *q is dropped and *q is set to NULL.
*/
static struct tcf_block * tcf_block_find ( struct net * net , struct Qdisc * * q ,
u32 * parent , unsigned long * cl ,
int ifindex , u32 block_index ,
struct netlink_ext_ack * extack )
{
struct tcf_block * block ;
2018-09-24 19:22:53 +03:00
int err = 0 ;
2018-05-31 09:52:53 +03:00
2019-02-11 11:55:47 +03:00
ASSERT_RTNL ( ) ;
2018-09-24 19:22:53 +03:00
2019-02-11 11:55:47 +03:00
/* rtnl_held is passed as true, consistent with the assertion above. */
err = __tcf_qdisc_find ( net , q , parent , ifindex , true , extack ) ;
if ( err )
goto errout ;
2018-05-31 09:52:53 +03:00
2019-02-11 11:55:47 +03:00
err = __tcf_qdisc_cl_find ( * q , * parent , cl , ifindex , extack ) ;
if ( err )
goto errout_qdisc ;
2018-09-24 19:22:58 +03:00
2019-02-11 11:55:47 +03:00
block = __tcf_block_find ( net , * q , * cl , ifindex , block_index , extack ) ;
2019-02-18 12:26:32 +03:00
if ( IS_ERR ( block ) ) {
err = PTR_ERR ( block ) ;
2019-02-11 11:55:47 +03:00
goto errout_qdisc ;
2019-02-18 12:26:32 +03:00
}
2018-05-31 09:52:53 +03:00
return block ;
2018-09-24 19:22:53 +03:00
errout_qdisc :
2019-02-11 11:55:47 +03:00
/* *q is checked for NULL because the TCM_IFINDEX_MAGIC_BLOCK path of
 * __tcf_qdisc_find() returns without setting it.
 */
if ( * q )
2018-09-24 19:22:53 +03:00
qdisc_put ( * q ) ;
2019-02-11 11:55:47 +03:00
errout :
* q = NULL ;
2018-09-24 19:22:53 +03:00
return ERR_PTR ( err ) ;
}
2019-02-11 11:55:45 +03:00
/* Release what a block lookup handed out: the block reference (skipped when
 * @block is NULL or an ERR_PTR) and then the qdisc reference (skipped when
 * @q is NULL). @rtnl_held selects between qdisc_put() and
 * qdisc_put_unlocked().
 */
static void tcf_block_release(struct Qdisc *q, struct tcf_block *block,
			      bool rtnl_held)
{
	if (!IS_ERR_OR_NULL(block))
		tcf_block_refcnt_put(block, rtnl_held);

	if (!q)
		return;

	if (rtnl_held)
		qdisc_put(q);
	else
		qdisc_put_unlocked(q);
}
2018-01-17 13:46:48 +03:00
struct tcf_block_owner_item {
struct list_head list ;
struct Qdisc * q ;
2019-07-09 23:55:41 +03:00
enum flow_block_binder_type binder_type ;
2018-01-17 13:46:48 +03:00
} ;
static void
tcf_block_owner_netif_keep_dst ( struct tcf_block * block ,
struct Qdisc * q ,
2019-07-09 23:55:41 +03:00
enum flow_block_binder_type binder_type )
2018-01-17 13:46:48 +03:00
{
if ( block - > keep_dst & &
2019-07-09 23:55:41 +03:00
binder_type ! = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS & &
binder_type ! = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS )
2018-01-17 13:46:48 +03:00
netif_keep_dst ( qdisc_dev ( q ) ) ;
}
void tcf_block_netif_keep_dst ( struct tcf_block * block )
{
struct tcf_block_owner_item * item ;
block - > keep_dst = true ;
list_for_each_entry ( item , & block - > owner_list , list )
tcf_block_owner_netif_keep_dst ( block , item - > q ,
item - > binder_type ) ;
}
EXPORT_SYMBOL ( tcf_block_netif_keep_dst ) ;
static int tcf_block_owner_add ( struct tcf_block * block ,
struct Qdisc * q ,
2019-07-09 23:55:41 +03:00
enum flow_block_binder_type binder_type )
2018-01-17 13:46:48 +03:00
{
struct tcf_block_owner_item * item ;
item = kmalloc ( sizeof ( * item ) , GFP_KERNEL ) ;
if ( ! item )
return - ENOMEM ;
item - > q = q ;
item - > binder_type = binder_type ;
list_add ( & item - > list , & block - > owner_list ) ;
return 0 ;
}
static void tcf_block_owner_del ( struct tcf_block * block ,
struct Qdisc * q ,
2019-07-09 23:55:41 +03:00
enum flow_block_binder_type binder_type )
2018-01-17 13:46:48 +03:00
{
struct tcf_block_owner_item * item ;
list_for_each_entry ( item , & block - > owner_list , list ) {
if ( item - > q = = q & & item - > binder_type = = binder_type ) {
list_del ( & item - > list ) ;
kfree ( item ) ;
return ;
}
}
WARN_ON ( 1 ) ;
}
2024-01-12 14:39:30 +03:00
static bool tcf_block_tracks_dev ( struct tcf_block * block ,
struct tcf_block_ext_info * ei )
{
return tcf_block_shared ( block ) & &
( ei - > binder_type = = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS | |
ei - > binder_type = = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS ) ;
}
2018-01-17 13:46:46 +03:00
int tcf_block_get_ext ( struct tcf_block * * p_block , struct Qdisc * q ,
struct tcf_block_ext_info * ei ,
struct netlink_ext_ack * extack )
{
2024-01-04 15:58:44 +03:00
struct net_device * dev = qdisc_dev ( q ) ;
2018-01-17 13:46:46 +03:00
struct net * net = qdisc_net ( q ) ;
struct tcf_block * block = NULL ;
int err ;
2018-09-24 19:22:58 +03:00
if ( ei - > block_index )
2018-01-17 13:46:46 +03:00
/* block_index not 0 means the shared block is requested */
2018-09-24 19:22:58 +03:00
block = tcf_block_refcnt_get ( net , ei - > block_index ) ;
2018-01-17 13:46:46 +03:00
if ( ! block ) {
2018-02-13 14:00:16 +03:00
block = tcf_block_create ( net , q , ei - > block_index , extack ) ;
2018-01-17 13:46:46 +03:00
if ( IS_ERR ( block ) )
return PTR_ERR ( block ) ;
2018-02-13 14:00:16 +03:00
if ( tcf_block_shared ( block ) ) {
err = tcf_block_insert ( block , net , extack ) ;
2018-01-17 13:46:46 +03:00
if ( err )
goto err_block_insert ;
}
}
2018-01-17 13:46:48 +03:00
err = tcf_block_owner_add ( block , q , ei - > binder_type ) ;
if ( err )
goto err_block_owner_add ;
tcf_block_owner_netif_keep_dst ( block , q , ei - > binder_type ) ;
2018-07-23 10:23:05 +03:00
err = tcf_chain0_head_change_cb_add ( block , ei , extack ) ;
2018-01-17 13:46:45 +03:00
if ( err )
2018-07-23 10:23:05 +03:00
goto err_chain0_head_change_cb_add ;
2018-01-17 13:46:50 +03:00
2018-06-26 00:30:04 +03:00
err = tcf_block_offload_bind ( block , q , ei , extack ) ;
2018-01-17 13:46:50 +03:00
if ( err )
goto err_block_offload_bind ;
2024-01-12 14:39:30 +03:00
if ( tcf_block_tracks_dev ( block , ei ) ) {
2024-01-04 15:58:44 +03:00
err = xa_insert ( & block - > ports , dev - > ifindex , dev , GFP_KERNEL ) ;
if ( err ) {
NL_SET_ERR_MSG ( extack , " block dev insert failed " ) ;
goto err_dev_insert ;
}
}
2017-05-17 12:07:55 +03:00
* p_block = block ;
return 0 ;
2017-05-17 12:07:59 +03:00
2024-01-04 15:58:44 +03:00
err_dev_insert :
2018-01-17 13:46:50 +03:00
err_block_offload_bind :
2018-07-23 10:23:05 +03:00
tcf_chain0_head_change_cb_del ( block , ei ) ;
err_chain0_head_change_cb_add :
2018-01-17 13:46:48 +03:00
tcf_block_owner_del ( block , q , ei - > binder_type ) ;
err_block_owner_add :
2018-01-17 13:46:46 +03:00
err_block_insert :
2019-02-11 11:55:45 +03:00
tcf_block_refcnt_put ( block , true ) ;
2017-05-17 12:07:59 +03:00
return err ;
2017-05-17 12:07:55 +03:00
}
2017-10-19 16:50:29 +03:00
EXPORT_SYMBOL ( tcf_block_get_ext ) ;
2017-11-03 13:46:24 +03:00
static void tcf_chain_head_change_dflt ( struct tcf_proto * tp_head , void * priv )
{
struct tcf_proto __rcu * * p_filter_chain = priv ;
rcu_assign_pointer ( * p_filter_chain , tp_head ) ;
}
2017-10-19 16:50:29 +03:00
int tcf_block_get ( struct tcf_block * * p_block ,
2017-12-20 20:35:19 +03:00
struct tcf_proto __rcu * * p_filter_chain , struct Qdisc * q ,
struct netlink_ext_ack * extack )
2017-10-19 16:50:29 +03:00
{
2017-11-03 13:46:24 +03:00
struct tcf_block_ext_info ei = {
. chain_head_change = tcf_chain_head_change_dflt ,
. chain_head_change_priv = p_filter_chain ,
} ;
2017-10-19 16:50:29 +03:00
2017-11-03 13:46:24 +03:00
WARN_ON ( ! p_filter_chain ) ;
2017-12-20 20:35:19 +03:00
return tcf_block_get_ext ( p_block , q , & ei , extack ) ;
2017-10-19 16:50:29 +03:00
}
2017-05-17 12:07:55 +03:00
EXPORT_SYMBOL ( tcf_block_get ) ;
2017-10-27 04:24:28 +03:00
/* XXX: Standalone actions are not allowed to jump to any chain, and bound
2017-11-24 14:27:58 +03:00
* actions should be all removed after flushing .
2017-10-27 04:24:28 +03:00
*/
2017-11-03 13:46:24 +03:00
void tcf_block_put_ext ( struct tcf_block * block , struct Qdisc * q ,
2017-10-30 08:10:01 +03:00
struct tcf_block_ext_info * ei )
2017-10-27 04:24:28 +03:00
{
2024-01-04 15:58:44 +03:00
struct net_device * dev = qdisc_dev ( q ) ;
2017-12-17 06:11:55 +03:00
if ( ! block )
return ;
2024-01-12 14:39:30 +03:00
if ( tcf_block_tracks_dev ( block , ei ) )
2024-01-04 15:58:44 +03:00
xa_erase ( & block - > ports , dev - > ifindex ) ;
2018-07-23 10:23:05 +03:00
tcf_chain0_head_change_cb_del ( block , ei ) ;
2018-01-17 13:46:48 +03:00
tcf_block_owner_del ( block , q , ei - > binder_type ) ;
2017-11-24 14:27:58 +03:00
2019-02-11 11:55:45 +03:00
__tcf_block_put ( block , q , ei , true ) ;
2017-05-17 12:07:55 +03:00
}
2017-10-19 16:50:29 +03:00
EXPORT_SYMBOL ( tcf_block_put_ext ) ;
void tcf_block_put ( struct tcf_block * block )
{
struct tcf_block_ext_info ei = { 0 , } ;
2017-12-21 15:13:59 +03:00
if ( ! block )
return ;
2017-11-03 13:46:24 +03:00
tcf_block_put_ext ( block , block - > q , & ei ) ;
2017-10-19 16:50:29 +03:00
}
2017-10-30 08:10:01 +03:00
2017-05-17 12:07:55 +03:00
EXPORT_SYMBOL ( tcf_block_put ) ;
2017-02-09 16:38:56 +03:00
2018-06-26 00:30:10 +03:00
static int
2019-07-19 19:20:15 +03:00
tcf_block_playback_offloads ( struct tcf_block * block , flow_setup_cb_t * cb ,
2018-06-26 00:30:10 +03:00
void * cb_priv , bool add , bool offload_in_use ,
struct netlink_ext_ack * extack )
{
2019-02-11 11:55:36 +03:00
struct tcf_chain * chain , * chain_prev ;
2019-02-11 11:55:40 +03:00
struct tcf_proto * tp , * tp_prev ;
2018-06-26 00:30:10 +03:00
int err ;
2019-08-26 16:44:57 +03:00
lockdep_assert_held ( & block - > cb_lock ) ;
2019-02-11 11:55:36 +03:00
for ( chain = __tcf_get_next_chain ( block , NULL ) ;
chain ;
chain_prev = chain ,
chain = __tcf_get_next_chain ( block , chain ) ,
tcf_chain_put ( chain_prev ) ) {
net/sched: flower: Fix chain template offload
When a qdisc is deleted from a net device the stack instructs the
underlying driver to remove its flow offload callback from the
associated filter block using the 'FLOW_BLOCK_UNBIND' command. The stack
then continues to replay the removal of the filters in the block for
this driver by iterating over the chains in the block and invoking the
'reoffload' operation of the classifier being used. In turn, the
classifier in its 'reoffload' operation prepares and emits a
'FLOW_CLS_DESTROY' command for each filter.
However, the stack does not do the same for chain templates and the
underlying driver never receives a 'FLOW_CLS_TMPLT_DESTROY' command when
a qdisc is deleted. This results in a memory leak [1] which can be
reproduced using [2].
Fix by introducing a 'tmplt_reoffload' operation and have the stack
invoke it with the appropriate arguments as part of the replay.
Implement the operation in the sole classifier that supports chain
templates (flower) by emitting the 'FLOW_CLS_TMPLT_{CREATE,DESTROY}'
command based on whether a flow offload callback is being bound to a
filter block or being unbound from one.
As far as I can tell, the issue happens since cited commit which
reordered tcf_block_offload_unbind() before tcf_block_flush_all_chains()
in __tcf_block_put(). The order cannot be reversed as the filter block
is expected to be freed after flushing all the chains.
[1]
unreferenced object 0xffff888107e28800 (size 2048):
comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s)
hex dump (first 32 bytes):
b1 a6 7c 11 81 88 ff ff e0 5b b3 10 81 88 ff ff ..|......[......
01 00 00 00 00 00 00 00 e0 aa b0 84 ff ff ff ff ................
backtrace:
[<ffffffff81c06a68>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff81ab374e>] __kmalloc+0x4e/0x90
[<ffffffff832aec6d>] mlxsw_sp_acl_ruleset_get+0x34d/0x7a0
[<ffffffff832bc195>] mlxsw_sp_flower_tmplt_create+0x145/0x180
[<ffffffff832b2e1a>] mlxsw_sp_flow_block_cb+0x1ea/0x280
[<ffffffff83a10613>] tc_setup_cb_call+0x183/0x340
[<ffffffff83a9f85a>] fl_tmplt_create+0x3da/0x4c0
[<ffffffff83a22435>] tc_ctl_chain+0xa15/0x1170
[<ffffffff838a863c>] rtnetlink_rcv_msg+0x3cc/0xed0
[<ffffffff83ac87f0>] netlink_rcv_skb+0x170/0x440
[<ffffffff83ac6270>] netlink_unicast+0x540/0x820
[<ffffffff83ac6e28>] netlink_sendmsg+0x8d8/0xda0
[<ffffffff83793def>] ____sys_sendmsg+0x30f/0xa80
[<ffffffff8379d29a>] ___sys_sendmsg+0x13a/0x1e0
[<ffffffff8379d50c>] __sys_sendmsg+0x11c/0x1f0
[<ffffffff843b9ce0>] do_syscall_64+0x40/0xe0
unreferenced object 0xffff88816d2c0400 (size 1024):
comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s)
hex dump (first 32 bytes):
40 00 00 00 00 00 00 00 57 f6 38 be 00 00 00 00 @.......W.8.....
10 04 2c 6d 81 88 ff ff 10 04 2c 6d 81 88 ff ff ..,m......,m....
backtrace:
[<ffffffff81c06a68>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff81ab36c1>] __kmalloc_node+0x51/0x90
[<ffffffff81a8ed96>] kvmalloc_node+0xa6/0x1f0
[<ffffffff82827d03>] bucket_table_alloc.isra.0+0x83/0x460
[<ffffffff82828d2b>] rhashtable_init+0x43b/0x7c0
[<ffffffff832aed48>] mlxsw_sp_acl_ruleset_get+0x428/0x7a0
[<ffffffff832bc195>] mlxsw_sp_flower_tmplt_create+0x145/0x180
[<ffffffff832b2e1a>] mlxsw_sp_flow_block_cb+0x1ea/0x280
[<ffffffff83a10613>] tc_setup_cb_call+0x183/0x340
[<ffffffff83a9f85a>] fl_tmplt_create+0x3da/0x4c0
[<ffffffff83a22435>] tc_ctl_chain+0xa15/0x1170
[<ffffffff838a863c>] rtnetlink_rcv_msg+0x3cc/0xed0
[<ffffffff83ac87f0>] netlink_rcv_skb+0x170/0x440
[<ffffffff83ac6270>] netlink_unicast+0x540/0x820
[<ffffffff83ac6e28>] netlink_sendmsg+0x8d8/0xda0
[<ffffffff83793def>] ____sys_sendmsg+0x30f/0xa80
[2]
# tc qdisc add dev swp1 clsact
# tc chain add dev swp1 ingress proto ip chain 1 flower dst_ip 0.0.0.0/32
# tc qdisc del dev swp1 clsact
# devlink dev reload pci/0000:06:00.0
Fixes: bbf73830cd48 ("net: sched: traverse chains in block with tcf_get_next_chain()")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-01-22 16:28:43 +03:00
if ( chain - > tmplt_ops & & add )
chain - > tmplt_ops - > tmplt_reoffload ( chain , true , cb ,
cb_priv ) ;
2019-02-11 11:55:40 +03:00
for ( tp = __tcf_get_next_proto ( chain , NULL ) ; tp ;
tp_prev = tp ,
tp = __tcf_get_next_proto ( chain , tp ) ,
2019-02-11 11:55:45 +03:00
tcf_proto_put ( tp_prev , true , NULL ) ) {
2018-06-26 00:30:10 +03:00
if ( tp - > ops - > reoffload ) {
err = tp - > ops - > reoffload ( tp , add , cb , cb_priv ,
extack ) ;
if ( err & & add )
goto err_playback_remove ;
} else if ( add & & offload_in_use ) {
err = - EOPNOTSUPP ;
NL_SET_ERR_MSG ( extack , " Filter HW offload failed - classifier without re-offloading support " ) ;
goto err_playback_remove ;
}
}
net/sched: flower: Fix chain template offload
When a qdisc is deleted from a net device the stack instructs the
underlying driver to remove its flow offload callback from the
associated filter block using the 'FLOW_BLOCK_UNBIND' command. The stack
then continues to replay the removal of the filters in the block for
this driver by iterating over the chains in the block and invoking the
'reoffload' operation of the classifier being used. In turn, the
classifier in its 'reoffload' operation prepares and emits a
'FLOW_CLS_DESTROY' command for each filter.
However, the stack does not do the same for chain templates and the
underlying driver never receives a 'FLOW_CLS_TMPLT_DESTROY' command when
a qdisc is deleted. This results in a memory leak [1] which can be
reproduced using [2].
Fix by introducing a 'tmplt_reoffload' operation and have the stack
invoke it with the appropriate arguments as part of the replay.
Implement the operation in the sole classifier that supports chain
templates (flower) by emitting the 'FLOW_CLS_TMPLT_{CREATE,DESTROY}'
command based on whether a flow offload callback is being bound to a
filter block or being unbound from one.
As far as I can tell, the issue happens since cited commit which
reordered tcf_block_offload_unbind() before tcf_block_flush_all_chains()
in __tcf_block_put(). The order cannot be reversed as the filter block
is expected to be freed after flushing all the chains.
[1]
unreferenced object 0xffff888107e28800 (size 2048):
comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s)
hex dump (first 32 bytes):
b1 a6 7c 11 81 88 ff ff e0 5b b3 10 81 88 ff ff ..|......[......
01 00 00 00 00 00 00 00 e0 aa b0 84 ff ff ff ff ................
backtrace:
[<ffffffff81c06a68>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff81ab374e>] __kmalloc+0x4e/0x90
[<ffffffff832aec6d>] mlxsw_sp_acl_ruleset_get+0x34d/0x7a0
[<ffffffff832bc195>] mlxsw_sp_flower_tmplt_create+0x145/0x180
[<ffffffff832b2e1a>] mlxsw_sp_flow_block_cb+0x1ea/0x280
[<ffffffff83a10613>] tc_setup_cb_call+0x183/0x340
[<ffffffff83a9f85a>] fl_tmplt_create+0x3da/0x4c0
[<ffffffff83a22435>] tc_ctl_chain+0xa15/0x1170
[<ffffffff838a863c>] rtnetlink_rcv_msg+0x3cc/0xed0
[<ffffffff83ac87f0>] netlink_rcv_skb+0x170/0x440
[<ffffffff83ac6270>] netlink_unicast+0x540/0x820
[<ffffffff83ac6e28>] netlink_sendmsg+0x8d8/0xda0
[<ffffffff83793def>] ____sys_sendmsg+0x30f/0xa80
[<ffffffff8379d29a>] ___sys_sendmsg+0x13a/0x1e0
[<ffffffff8379d50c>] __sys_sendmsg+0x11c/0x1f0
[<ffffffff843b9ce0>] do_syscall_64+0x40/0xe0
unreferenced object 0xffff88816d2c0400 (size 1024):
comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s)
hex dump (first 32 bytes):
40 00 00 00 00 00 00 00 57 f6 38 be 00 00 00 00 @.......W.8.....
10 04 2c 6d 81 88 ff ff 10 04 2c 6d 81 88 ff ff ..,m......,m....
backtrace:
[<ffffffff81c06a68>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff81ab36c1>] __kmalloc_node+0x51/0x90
[<ffffffff81a8ed96>] kvmalloc_node+0xa6/0x1f0
[<ffffffff82827d03>] bucket_table_alloc.isra.0+0x83/0x460
[<ffffffff82828d2b>] rhashtable_init+0x43b/0x7c0
[<ffffffff832aed48>] mlxsw_sp_acl_ruleset_get+0x428/0x7a0
[<ffffffff832bc195>] mlxsw_sp_flower_tmplt_create+0x145/0x180
[<ffffffff832b2e1a>] mlxsw_sp_flow_block_cb+0x1ea/0x280
[<ffffffff83a10613>] tc_setup_cb_call+0x183/0x340
[<ffffffff83a9f85a>] fl_tmplt_create+0x3da/0x4c0
[<ffffffff83a22435>] tc_ctl_chain+0xa15/0x1170
[<ffffffff838a863c>] rtnetlink_rcv_msg+0x3cc/0xed0
[<ffffffff83ac87f0>] netlink_rcv_skb+0x170/0x440
[<ffffffff83ac6270>] netlink_unicast+0x540/0x820
[<ffffffff83ac6e28>] netlink_sendmsg+0x8d8/0xda0
[<ffffffff83793def>] ____sys_sendmsg+0x30f/0xa80
[2]
# tc qdisc add dev swp1 clsact
# tc chain add dev swp1 ingress proto ip chain 1 flower dst_ip 0.0.0.0/32
# tc qdisc del dev swp1 clsact
# devlink dev reload pci/0000:06:00.0
Fixes: bbf73830cd48 ("net: sched: traverse chains in block with tcf_get_next_chain()")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-01-22 16:28:43 +03:00
if ( chain - > tmplt_ops & & ! add )
chain - > tmplt_ops - > tmplt_reoffload ( chain , false , cb ,
cb_priv ) ;
2018-06-26 00:30:10 +03:00
}
return 0 ;
err_playback_remove :
2019-02-11 11:55:45 +03:00
tcf_proto_put ( tp , true , NULL ) ;
2019-02-11 11:55:36 +03:00
tcf_chain_put ( chain ) ;
2018-06-26 00:30:10 +03:00
tcf_block_playback_offloads ( block , cb , cb_priv , false , offload_in_use ,
extack ) ;
return err ;
}
2019-07-09 23:55:45 +03:00
static int tcf_block_bind ( struct tcf_block * block ,
struct flow_block_offload * bo )
{
struct flow_block_cb * block_cb , * next ;
int err , i = 0 ;
2019-08-26 16:44:57 +03:00
lockdep_assert_held ( & block - > cb_lock ) ;
2019-07-09 23:55:45 +03:00
list_for_each_entry ( block_cb , & bo - > cb_list , list ) {
err = tcf_block_playback_offloads ( block , block_cb - > cb ,
block_cb - > cb_priv , true ,
tcf_block_offload_in_use ( block ) ,
bo - > extack ) ;
if ( err )
goto err_unroll ;
2019-08-26 16:45:01 +03:00
if ( ! bo - > unlocked_driver_cb )
block - > lockeddevcnt + + ;
2019-07-09 23:55:45 +03:00
i + + ;
}
2019-07-19 19:20:16 +03:00
list_splice ( & bo - > cb_list , & block - > flow_block . cb_list ) ;
2019-07-09 23:55:45 +03:00
return 0 ;
err_unroll :
list_for_each_entry_safe ( block_cb , next , & bo - > cb_list , list ) {
2023-04-26 15:31:11 +03:00
list_del ( & block_cb - > driver_list ) ;
2019-07-09 23:55:45 +03:00
if ( i - - > 0 ) {
list_del ( & block_cb - > list ) ;
tcf_block_playback_offloads ( block , block_cb - > cb ,
block_cb - > cb_priv , false ,
tcf_block_offload_in_use ( block ) ,
NULL ) ;
2019-08-26 16:45:01 +03:00
if ( ! bo - > unlocked_driver_cb )
block - > lockeddevcnt - - ;
2019-07-09 23:55:45 +03:00
}
flow_block_cb_free ( block_cb ) ;
}
return err ;
}
static void tcf_block_unbind ( struct tcf_block * block ,
struct flow_block_offload * bo )
{
struct flow_block_cb * block_cb , * next ;
2019-08-26 16:44:57 +03:00
lockdep_assert_held ( & block - > cb_lock ) ;
2019-07-09 23:55:45 +03:00
list_for_each_entry_safe ( block_cb , next , & bo - > cb_list , list ) {
tcf_block_playback_offloads ( block , block_cb - > cb ,
block_cb - > cb_priv , false ,
tcf_block_offload_in_use ( block ) ,
NULL ) ;
list_del ( & block_cb - > list ) ;
flow_block_cb_free ( block_cb ) ;
2019-08-26 16:45:01 +03:00
if ( ! bo - > unlocked_driver_cb )
block - > lockeddevcnt - - ;
2019-07-09 23:55:45 +03:00
}
}
static int tcf_block_setup ( struct tcf_block * block ,
struct flow_block_offload * bo )
{
int err ;
switch ( bo - > command ) {
case FLOW_BLOCK_BIND :
err = tcf_block_bind ( block , bo ) ;
break ;
case FLOW_BLOCK_UNBIND :
err = 0 ;
tcf_block_unbind ( block , bo ) ;
break ;
default :
WARN_ON_ONCE ( 1 ) ;
err = - EOPNOTSUPP ;
}
return err ;
}
2017-05-17 12:07:54 +03:00
/* Main classifier routine: scans classifier chain attached
* to this qdisc , ( optionally ) tests for protocol and asks
* specific classifiers .
*/
2020-02-16 13:01:21 +03:00
static inline int __tcf_classify ( struct sk_buff * skb ,
const struct tcf_proto * tp ,
2020-02-16 13:01:24 +03:00
const struct tcf_proto * orig_tp ,
2020-02-16 13:01:21 +03:00
struct tcf_result * res ,
bool compat_mode ,
2023-02-18 01:36:14 +03:00
struct tcf_exts_miss_cookie_node * n ,
int act_index ,
2020-02-16 13:01:21 +03:00
u32 * last_executed_chain )
2017-05-17 12:07:54 +03:00
{
# ifdef CONFIG_NET_CLS_ACT
2021-05-19 16:17:21 +03:00
const int max_reclassify_loop = 16 ;
2017-05-23 10:11:59 +03:00
const struct tcf_proto * first_tp ;
2017-05-17 12:07:54 +03:00
int limit = 0 ;
reclassify :
# endif
for ( ; tp ; tp = rcu_dereference_bh ( tp - > next ) ) {
sched: consistently handle layer3 header accesses in the presence of VLANs
There are a couple of places in net/sched/ that check skb->protocol and act
on the value there. However, in the presence of VLAN tags, the value stored
in skb->protocol can be inconsistent based on whether VLAN acceleration is
enabled. The commit quoted in the Fixes tag below fixed the users of
skb->protocol to use a helper that will always see the VLAN ethertype.
However, most of the callers don't actually handle the VLAN ethertype, but
expect to find the IP header type in the protocol field. This means that
things like changing the ECN field, or parsing diffserv values, stops
working if there's a VLAN tag, or if there are multiple nested VLAN
tags (QinQ).
To fix this, change the helper to take an argument that indicates whether
the caller wants to skip the VLAN tags or not. When skipping VLAN tags, we
make sure to skip all of them, so behaviour is consistent even in QinQ
mode.
To make the helper usable from the ECN code, move it to if_vlan.h instead
of pkt_sched.h.
v3:
- Remove empty lines
- Move vlan variable definitions inside loop in skb_protocol()
- Also use skb_protocol() helper in IP{,6}_ECN_decapsulate() and
bpf_skb_ecn_set_ce()
v2:
- Use eth_type_vlan() helper in skb_protocol()
- Also fix code that reads skb->protocol directly
- Change a couple of 'if/else if' statements to switch constructs to avoid
calling the helper twice
Reported-by: Ilya Ponetayev <i.ponetaev@ndmsystems.com>
Fixes: d8b9605d2697 ("net: sched: fix skb->protocol use in case of accelerated vlan path")
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-07-03 23:26:43 +03:00
__be16 protocol = skb_protocol ( skb , false ) ;
2023-02-18 01:36:14 +03:00
int err = 0 ;
2017-05-17 12:07:54 +03:00
2023-02-18 01:36:14 +03:00
if ( n ) {
struct tcf_exts * exts ;
if ( n - > tp_prio ! = tp - > prio )
continue ;
/* We re-lookup the tp and chain based on index instead
* of having hard refs and locks to them , so do a sanity
* check if any of tp , chain , exts was replaced by the
* time we got here with a cookie from hardware .
*/
if ( unlikely ( n - > tp ! = tp | | n - > tp - > chain ! = n - > chain | |
2023-10-09 12:26:55 +03:00
! tp - > ops - > get_exts ) ) {
2023-12-16 23:44:36 +03:00
tcf_set_drop_reason ( skb ,
SKB_DROP_REASON_TC_COOKIE_ERROR ) ;
2023-02-18 01:36:14 +03:00
return TC_ACT_SHOT ;
2023-10-09 12:26:55 +03:00
}
2023-02-18 01:36:14 +03:00
exts = tp - > ops - > get_exts ( tp , n - > handle ) ;
2023-10-09 12:26:55 +03:00
if ( unlikely ( ! exts | | n - > exts ! = exts ) ) {
2023-12-16 23:44:36 +03:00
tcf_set_drop_reason ( skb ,
SKB_DROP_REASON_TC_COOKIE_ERROR ) ;
2023-02-18 01:36:14 +03:00
return TC_ACT_SHOT ;
2023-10-09 12:26:55 +03:00
}
2017-05-17 12:07:54 +03:00
2023-02-18 01:36:14 +03:00
n = NULL ;
err = tcf_exts_exec_ex ( skb , exts , act_index , res ) ;
} else {
if ( tp - > protocol ! = protocol & &
tp - > protocol ! = htons ( ETH_P_ALL ) )
continue ;
err = tc_classify ( skb , tp , res ) ;
}
2017-05-17 12:07:54 +03:00
# ifdef CONFIG_NET_CLS_ACT
2017-05-17 12:08:03 +03:00
if ( unlikely ( err = = TC_ACT_RECLASSIFY & & ! compat_mode ) ) {
2017-05-23 10:11:59 +03:00
first_tp = orig_tp ;
2020-02-16 13:01:21 +03:00
* last_executed_chain = first_tp - > chain - > index ;
2017-05-17 12:07:54 +03:00
goto reset ;
2017-05-17 12:08:03 +03:00
} else if ( unlikely ( TC_ACT_EXT_CMP ( err , TC_ACT_GOTO_CHAIN ) ) ) {
2017-05-23 10:11:59 +03:00
first_tp = res - > goto_tp ;
2020-02-16 13:01:21 +03:00
* last_executed_chain = err & TC_ACT_EXT_VAL_MASK ;
2017-05-17 12:08:03 +03:00
goto reset ;
}
2017-05-17 12:07:54 +03:00
# endif
2023-12-16 23:44:34 +03:00
if ( err > = 0 )
2017-05-17 12:07:54 +03:00
return err ;
}
2023-10-09 12:26:55 +03:00
if ( unlikely ( n ) ) {
2023-12-16 23:44:36 +03:00
tcf_set_drop_reason ( skb ,
SKB_DROP_REASON_TC_COOKIE_ERROR ) ;
2023-02-18 01:36:14 +03:00
return TC_ACT_SHOT ;
2023-10-09 12:26:55 +03:00
}
2023-02-18 01:36:14 +03:00
2017-05-17 12:07:54 +03:00
return TC_ACT_UNSPEC ; /* signal: continue lookup */
# ifdef CONFIG_NET_CLS_ACT
reset :
if ( unlikely ( limit + + > = max_reclassify_loop ) ) {
2018-01-17 13:46:47 +03:00
net_notice_ratelimited ( " %u: reclassify loop, rule prio %u, protocol %02x \n " ,
tp - > chain - > block - > index ,
tp - > prio & 0xffff ,
2017-05-17 12:07:54 +03:00
ntohs ( tp - > protocol ) ) ;
2023-12-16 23:44:36 +03:00
tcf_set_drop_reason ( skb ,
SKB_DROP_REASON_TC_RECLASSIFY_LOOP ) ;
2017-05-17 12:07:54 +03:00
return TC_ACT_SHOT ;
}
2017-05-23 10:11:59 +03:00
tp = first_tp ;
2017-05-17 12:07:54 +03:00
goto reclassify ;
# endif
}
2020-02-16 13:01:21 +03:00
2021-07-28 21:08:00 +03:00
/* Classify @skb against the filter chain starting at @tp.
 *
 * Without CONFIG_NET_TC_SKB_EXT this is a plain __tcf_classify() call.
 *
 * With it, and when @block is given, a TC skb extension carrying a chain
 * index or an action-miss cookie redirects the scan to that chain (the
 * extension is removed from the skb once consumed). After classification,
 * if tc_skb_ext_tc_enabled() and the result is TC_ACT_UNSPEC on a non-zero
 * chain, a new extension is attached recording that chain and the
 * conntrack-related fields of the skb's tc control block.
 *
 * Returns the classification verdict, or TC_ACT_SHOT with a drop reason set
 * when the cookie or chain lookup fails or the extension cannot be allocated.
 */
int tcf_classify(struct sk_buff *skb,
		 const struct tcf_block *block,
		 const struct tcf_proto *tp,
		 struct tcf_result *res, bool compat_mode)
{
#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
	u32 last_executed_chain = 0;

	return __tcf_classify(skb, tp, tp, res, compat_mode, NULL, 0,
			      &last_executed_chain);
#else
	struct tcf_exts_miss_cookie_node *miss_node = NULL;
	const struct tcf_proto *orig_tp = tp;
	struct tc_skb_ext *ext = NULL;
	u32 last_executed_chain = 0;
	struct tc_skb_cb *cb;
	int act_index = 0;
	int ret;

	if (tp)
		last_executed_chain = tp->chain->index;

	if (block)
		ext = skb_ext_find(skb, TC_SKB_EXT);

	if (ext && (ext->chain || ext->act_miss)) {
		struct tcf_chain *fchain;
		u32 chain_idx;

		if (ext->act_miss) {
			miss_node = tcf_exts_miss_cookie_lookup(ext->act_miss_cookie,
								&act_index);
			if (!miss_node) {
				tcf_set_drop_reason(skb,
						    SKB_DROP_REASON_TC_COOKIE_ERROR);
				return TC_ACT_SHOT;
			}

			chain_idx = miss_node->chain_index;
		} else {
			chain_idx = ext->chain;
		}

		fchain = tcf_chain_lookup_rcu(block, chain_idx);
		if (!fchain) {
			tcf_set_drop_reason(skb,
					    SKB_DROP_REASON_TC_CHAIN_NOTFOUND);
			return TC_ACT_SHOT;
		}

		/* Consume, so cloned/redirect skbs won't inherit ext */
		skb_ext_del(skb, TC_SKB_EXT);

		tp = rcu_dereference_bh(fchain->filter_chain);
		last_executed_chain = fchain->index;
	}

	ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, miss_node,
			     act_index, &last_executed_chain);

	if (!tc_skb_ext_tc_enabled())
		return ret;

	/* Only a miss on some non-zero chain needs to be recorded. */
	if (ret != TC_ACT_UNSPEC || !last_executed_chain)
		return ret;

	cb = tc_skb_cb(skb);
	ext = tc_skb_ext_alloc(skb);
	if (WARN_ON_ONCE(!ext)) {
		tcf_set_drop_reason(skb, SKB_DROP_REASON_NOMEM);
		return TC_ACT_SHOT;
	}

	ext->chain = last_executed_chain;
	ext->mru = cb->mru;
	ext->post_ct = cb->post_ct;
	ext->post_ct_snat = cb->post_ct_snat;
	ext->post_ct_dnat = cb->post_ct_dnat;
	ext->zone = cb->zone;

	return ret;
#endif
}
2021-07-28 21:08:00 +03:00
EXPORT_SYMBOL ( tcf_classify ) ;
2020-02-16 13:01:21 +03:00
2017-05-17 12:07:59 +03:00
/* Position of a tcf_proto inside a chain's singly linked filter list, as
 * used by the tcf_chain_tp_*() helpers.
 */
struct tcf_chain_info {
	struct tcf_proto __rcu **pprev;	/* slot that points at the position */
	struct tcf_proto __rcu *next;	/* element following the position */
};
2019-02-11 11:55:38 +03:00
static struct tcf_proto * tcf_chain_tp_prev ( struct tcf_chain * chain ,
struct tcf_chain_info * chain_info )
2017-05-17 12:07:59 +03:00
{
2019-02-11 11:55:38 +03:00
return tcf_chain_dereference ( * chain_info - > pprev , chain ) ;
2017-05-17 12:07:59 +03:00
}
2019-02-11 11:55:42 +03:00
static int tcf_chain_tp_insert ( struct tcf_chain * chain ,
struct tcf_chain_info * chain_info ,
struct tcf_proto * tp )
2017-05-17 12:07:59 +03:00
{
2019-02-11 11:55:42 +03:00
if ( chain - > flushing )
return - EAGAIN ;
net/sched: fix initialization order when updating chain 0 head
Currently, when inserting a new filter that needs to sit at the head
of chain 0, it will first update the heads pointer on all devices using
the (shared) block, and only then complete the initialization of the new
element so that it has a "next" element.
This can lead to a situation that the chain 0 head is propagated to
another CPU before the "next" initialization is done. When this race
condition is triggered, packets being matched on that CPU will simply
miss all other filters, and will flow through the stack as if there were
no other filters installed. If the system is using OVS + TC, such
packets will get handled by vswitchd via upcall, which results in much
higher latency and reordering. For other applications it may result in
packet drops.
This is reproducible with a tc only setup, but it varies from system to
system. It could be reproduced with a shared block amongst 10 veth
tunnels, and an ingress filter mirroring packets to another veth.
That's because using the last added veth tunnel to the shared block to
do the actual traffic, it makes the race window bigger and easier to
trigger.
The fix is rather simple, to just initialize the next pointer of the new
filter instance (tp) before propagating the head change.
The fixes tag is pointing to the original code though this issue should
only be observed when using it unlocked.
Fixes: 2190d1d0944f ("net: sched: introduce helpers to work with filter chains")
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Davide Caratti <dcaratti@redhat.com>
Link: https://lore.kernel.org/r/b97d5f4eaffeeb9d058155bcab63347527261abf.1649341369.git.marcelo.leitner@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-04-07 17:29:23 +03:00
RCU_INIT_POINTER ( tp - > next , tcf_chain_tp_prev ( chain , chain_info ) ) ;
2017-11-03 13:46:24 +03:00
if ( * chain_info - > pprev = = chain - > filter_chain )
2018-07-23 10:23:05 +03:00
tcf_chain0_head_change ( chain , tp ) ;
2019-02-11 11:55:39 +03:00
tcf_proto_get ( tp ) ;
2017-05-17 12:07:59 +03:00
rcu_assign_pointer ( * chain_info - > pprev , tp ) ;
2019-02-11 11:55:42 +03:00
return 0 ;
2017-05-17 12:07:59 +03:00
}
/* Unlink tp from chain.
 *
 * tp is marked as deleted, then the link *chain_info->pprev is rewritten
 * to point at chain_info->next, which drops tp from the list.
 * NOTE(review): chain_info presumably describes tp's position as filled in
 * by tcf_chain_tp_find() (pprev = link pointing at tp, next = tp's
 * successor), and the caller presumably holds chain->filter_chain_lock --
 * confirm against callers.
 */
static void tcf_chain_tp_remove ( struct tcf_chain * chain ,
struct tcf_chain_info * chain_info ,
struct tcf_proto * tp )
{
2019-02-11 11:55:38 +03:00
struct tcf_proto * next = tcf_chain_dereference ( chain_info - > next , chain ) ;
2017-05-17 12:07:59 +03:00
2019-02-11 11:55:41 +03:00
tcf_proto_mark_delete ( tp ) ;
2017-11-03 13:46:24 +03:00
/* tp is the first filter of the chain: report the new head (its
 * successor, possibly NULL) before the link itself is rewritten.
 */
if ( tp = = chain - > filter_chain )
2018-07-23 10:23:05 +03:00
tcf_chain0_head_change ( chain , next ) ;
2017-05-17 12:07:59 +03:00
RCU_INIT_POINTER ( * chain_info - > pprev , next ) ;
}
2019-02-11 11:55:41 +03:00
/* Forward declaration: tcf_chain_tp_find() is defined further down but is
 * already needed by tcf_chain_tp_insert_unique().
 */
static struct tcf_proto * tcf_chain_tp_find ( struct tcf_chain * chain ,
struct tcf_chain_info * chain_info ,
u32 protocol , u32 prio ,
bool prio_allocate ) ;
/* Try to insert new proto.
 * If proto with specified priority already exists , free new proto
 * and return existing one .
 *
 * The lookup and the insertion both happen under chain->filter_chain_lock.
 * tp_new is destroyed on every path that does not return it.
 *
 * Returns:
 *  - tp_new when it was inserted successfully;
 *  - the value returned by tcf_chain_tp_find() when that is non-NULL. This
 *    is either an existing proto (on which tcf_chain_tp_find() took a
 *    reference) or an ERR_PTR() from that lookup;
 *  - ERR_PTR(-EAGAIN) when tcf_proto_exists_destroying() reports true;
 *  - ERR_PTR(err) when tcf_chain_tp_insert() failed with err.
 */
static struct tcf_proto * tcf_chain_tp_insert_unique ( struct tcf_chain * chain ,
struct tcf_proto * tp_new ,
2019-02-11 11:55:45 +03:00
u32 protocol , u32 prio ,
bool rtnl_held )
2019-02-11 11:55:41 +03:00
{
struct tcf_chain_info chain_info ;
struct tcf_proto * tp ;
2019-02-11 11:55:42 +03:00
int err = 0 ;
2019-02-11 11:55:41 +03:00
mutex_lock ( & chain - > filter_chain_lock ) ;
2019-11-02 17:17:47 +03:00
/* Back off with -EAGAIN; the lock is released before tp_new is destroyed. */
if ( tcf_proto_exists_destroying ( chain , tp_new ) ) {
mutex_unlock ( & chain - > filter_chain_lock ) ;
tcf_proto_destroy ( tp_new , rtnl_held , false , NULL ) ;
return ERR_PTR ( - EAGAIN ) ;
}
2019-02-11 11:55:41 +03:00
tp = tcf_chain_tp_find ( chain , & chain_info ,
protocol , prio , false ) ;
/* Only a NULL result (nothing found, no error) leads to an insertion. */
if ( ! tp )
2019-02-11 11:55:42 +03:00
err = tcf_chain_tp_insert ( chain , & chain_info , tp_new ) ;
2019-02-11 11:55:41 +03:00
mutex_unlock ( & chain - > filter_chain_lock ) ;
/* tp_new is destroyed outside the lock in both failure branches below. */
if ( tp ) {
2019-11-02 17:17:47 +03:00
tcf_proto_destroy ( tp_new , rtnl_held , false , NULL ) ;
2019-02-11 11:55:41 +03:00
tp_new = tp ;
2019-02-11 11:55:42 +03:00
} else if ( err ) {
2019-11-02 17:17:47 +03:00
tcf_proto_destroy ( tp_new , rtnl_held , false , NULL ) ;
2019-02-11 11:55:42 +03:00
tp_new = ERR_PTR ( err ) ;
2019-02-11 11:55:41 +03:00
}
return tp_new ;
}
/* Remove tp from chain if it is still linked there and
 * tcf_proto_check_delete() agrees to the deletion.
 *
 * The search, the check and the unlinking are all done under
 * chain->filter_chain_lock. When tp is not found in the chain, or the check
 * fails, the function returns without changing anything. Otherwise tp is
 * unlinked and, after the lock is dropped, tcf_proto_put() is called on it.
 */
static void tcf_chain_tp_delete_empty ( struct tcf_chain * chain ,
2019-02-11 11:55:45 +03:00
struct tcf_proto * tp , bool rtnl_held ,
2019-02-11 11:55:41 +03:00
struct netlink_ext_ack * extack )
{
struct tcf_chain_info chain_info ;
struct tcf_proto * tp_iter ;
struct tcf_proto * * pprev ;
struct tcf_proto * next ;
mutex_lock ( & chain - > filter_chain_lock ) ;
/* Atomically find and remove tp from chain. */
for ( pprev = & chain - > filter_chain ;
( tp_iter = tcf_chain_dereference ( * pprev , chain ) ) ;
pprev = & tp_iter - > next ) {
if ( tp_iter = = tp ) {
/* chain_info is filled in only here; the !tp_iter test after the
 * loop keeps it from being read when tp was not found.
 */
chain_info . pprev = pprev ;
chain_info . next = tp_iter - > next ;
/* A tp still linked in the chain is not expected to be marked deleting. */
WARN_ON ( tp_iter - > deleting ) ;
break ;
}
}
/* Verify that tp still exists and no new filters were inserted
* concurrently .
* Mark tp for deletion if it is empty .
*/
net/sched: add delete_empty() to filters and use it in cls_flower
Revert "net/sched: cls_u32: fix refcount leak in the error path of
u32_change()", and fix the u32 refcount leak in a more generic way that
preserves the semantic of rule dumping.
On tc filters that don't support lockless insertion/removal, there is no
need to guard against concurrent insertion when a removal is in progress.
Therefore, for most of them we can avoid a full walk() when deleting, and
just decrease the refcount, like it was done on older Linux kernels.
This fixes situations where walk() was wrongly detecting a non-empty
filter, like it happened with cls_u32 in the error path of change(), thus
leading to failures in the following tdc selftests:
6aa7: (filter, u32) Add/Replace u32 with source match and invalid indev
6658: (filter, u32) Add/Replace u32 with custom hash table and invalid handle
74c2: (filter, u32) Add/Replace u32 filter with invalid hash table id
On cls_flower, and on (future) lockless filters, this check is necessary:
move all the check_empty() logic in a callback so that each filter
can have its own implementation. For cls_flower, it's sufficient to check
if no IDRs have been allocated.
This reverts commit 275c44aa194b7159d1191817b20e076f55f0e620.
Changes since v1:
- document the need for delete_empty() when TCF_PROTO_OPS_DOIT_UNLOCKED
is used, thanks to Vlad Buslov
- implement delete_empty() without doing fl_walk(), thanks to Vlad Buslov
- squash revert and new fix in a single patch, to be nice with bisect
tests that run tdc on u32 filter, thanks to Dave Miller
Fixes: 275c44aa194b ("net/sched: cls_u32: fix refcount leak in the error path of u32_change()")
Fixes: 6676d5e416ee ("net: sched: set dedicated tcf_walker flag when tp is empty")
Suggested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Suggested-by: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Vlad Buslov <vladbu@mellanox.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-28 18:36:58 +03:00
if ( ! tp_iter | | ! tcf_proto_check_delete ( tp ) ) {
2019-02-11 11:55:41 +03:00
mutex_unlock ( & chain - > filter_chain_lock ) ;
return ;
}
2019-11-02 17:17:47 +03:00
tcf_proto_signal_destroying ( chain , tp ) ;
2019-02-11 11:55:41 +03:00
next = tcf_chain_dereference ( chain_info . next , chain ) ;
/* tp is the first filter of the chain: report the new head before the
 * link is rewritten.
 */
if ( tp = = chain - > filter_chain )
tcf_chain0_head_change ( chain , next ) ;
RCU_INIT_POINTER ( * chain_info . pprev , next ) ;
mutex_unlock ( & chain - > filter_chain_lock ) ;
2019-02-11 11:55:45 +03:00
/* The put happens only after the lock has been released. */
tcf_proto_put ( tp , rtnl_held , extack ) ;
2019-02-11 11:55:41 +03:00
}
2017-05-17 12:07:59 +03:00
/* Look up the proto with priority prio in chain.
 *
 * The walk starts at chain->filter_chain and stops at the first proto whose
 * prio is greater than or equal to the requested one.
 *
 * Returns:
 *  - the matching proto, with a reference taken via tcf_proto_get();
 *    chain_info->pprev is the link pointing at it and chain_info->next is
 *    its successor;
 *  - NULL when no proto has this prio; chain_info->pprev is then the link
 *    at which the walk stopped and chain_info->next is NULL;
 *  - ERR_PTR(-EINVAL) when a proto with this prio exists and either
 *    prio_allocate is set or protocol is non-zero and differs from the
 *    proto's protocol. chain_info is not written on this path.
 *
 * NOTE(review): the early stop on "prio >= requested" presumably relies on
 * the chain being kept in ascending prio order -- confirm.
 */
static struct tcf_proto * tcf_chain_tp_find ( struct tcf_chain * chain ,
struct tcf_chain_info * chain_info ,
u32 protocol , u32 prio ,
bool prio_allocate )
{
struct tcf_proto * * pprev ;
struct tcf_proto * tp ;
/* Check the chain for existence of proto-tcf with this priority */
for ( pprev = & chain - > filter_chain ;
2019-02-11 11:55:38 +03:00
( tp = tcf_chain_dereference ( * pprev , chain ) ) ;
pprev = & tp - > next ) {
2017-05-17 12:07:59 +03:00
if ( tp - > prio > = prio ) {
if ( tp - > prio = = prio ) {
if ( prio_allocate | |
( tp - > protocol ! = protocol & & protocol ) )
return ERR_PTR ( - EINVAL ) ;
} else {
/* First proto with a higher prio: no match, pprev stays on this link. */
tp = NULL ;
}
break ;
}
}
chain_info - > pprev = pprev ;
2019-02-11 11:55:39 +03:00
if ( tp ) {
chain_info - > next = tp - > next ;
tcf_proto_get ( tp ) ;
} else {
chain_info - > next = NULL ;
}
2017-05-17 12:07:59 +03:00
return tp ;
}
2017-08-08 01:26:50 +03:00
/* Append one netlink message describing filter tp (and optionally the
 * filter handle fh) to skb.
 *
 * The message carries a struct tcmsg header followed by TCA_KIND and
 * TCA_CHAIN attributes, the classifier's own dump when fh is non-NULL, and
 * a TCA_EXT_WARN_MSG attribute when extack carries a message.
 *
 * When q is non-NULL the header identifies the qdisc's device and parent;
 * otherwise it carries TCM_IFINDEX_MAGIC_BLOCK and block->index.
 *
 * Returns skb->len on success. On any failure the skb is trimmed back to
 * where it was on entry and -1 is returned.
 */
static int tcf_fill_node ( struct net * net , struct sk_buff * skb ,
2018-01-17 13:46:51 +03:00
struct tcf_proto * tp , struct tcf_block * block ,
struct Qdisc * q , u32 parent , void * fh ,
2019-02-11 11:55:45 +03:00
u32 portid , u32 seq , u16 flags , int event ,
2023-01-13 06:43:53 +03:00
bool terse_dump , bool rtnl_held ,
struct netlink_ext_ack * extack )
2017-08-08 01:26:50 +03:00
{
struct tcmsg * tcm ;
struct nlmsghdr * nlh ;
/* Remember the current tail so the message can be undone on failure. */
unsigned char * b = skb_tail_pointer ( skb ) ;
nlh = nlmsg_put ( skb , portid , seq , event , sizeof ( * tcm ) , flags ) ;
if ( ! nlh )
goto out_nlmsg_trim ;
tcm = nlmsg_data ( nlh ) ;
tcm - > tcm_family = AF_UNSPEC ;
tcm - > tcm__pad1 = 0 ;
tcm - > tcm__pad2 = 0 ;
2018-01-17 13:46:51 +03:00
if ( q ) {
tcm - > tcm_ifindex = qdisc_dev ( q ) - > ifindex ;
tcm - > tcm_parent = parent ;
} else {
tcm - > tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK ;
tcm - > tcm_block_index = block - > index ;
}
2017-08-08 01:26:50 +03:00
tcm - > tcm_info = TC_H_MAKE ( tp - > prio , tp - > protocol ) ;
if ( nla_put_string ( skb , TCA_KIND , tp - > ops - > kind ) )
goto nla_put_failure ;
if ( nla_put_u32 ( skb , TCA_CHAIN , tp - > chain - > index ) )
goto nla_put_failure ;
/* No handle: only the proto itself is described. */
if ( ! fh ) {
tcm - > tcm_handle = 0 ;
2020-05-15 14:40:11 +03:00
} else if ( terse_dump ) {
/* A terse dump fails when the classifier has no terse_dump op. */
if ( tp - > ops - > terse_dump ) {
if ( tp - > ops - > terse_dump ( net , tp , fh , skb , tcm ,
rtnl_held ) < 0 )
goto nla_put_failure ;
} else {
goto cls_op_not_supp ;
}
2017-08-08 01:26:50 +03:00
} else {
/* A classifier without a dump op is not an error here. */
2019-02-11 11:55:45 +03:00
if ( tp - > ops - > dump & &
tp - > ops - > dump ( net , tp , fh , skb , tcm , rtnl_held ) < 0 )
2017-08-08 01:26:50 +03:00
goto nla_put_failure ;
}
2023-01-13 06:43:53 +03:00
if ( extack & & extack - > _msg & &
nla_put_string ( skb , TCA_EXT_WARN_MSG , extack - > _msg ) )
goto nla_put_failure ;
2017-08-08 01:26:50 +03:00
/* Final message length covers everything appended since entry. */
nlh - > nlmsg_len = skb_tail_pointer ( skb ) - b ;
2023-01-13 06:43:53 +03:00
2017-08-08 01:26:50 +03:00
return skb - > len ;
/* All three labels share the same cleanup. */
out_nlmsg_trim :
nla_put_failure :
2020-05-15 14:40:11 +03:00
cls_op_not_supp :
2017-08-08 01:26:50 +03:00
nlmsg_trim ( skb , b ) ;
return - 1 ;
}
/* Build a filter notification for tp / fh with tcf_fill_node() and send it.
 *
 * With unicast set, the message goes to portid only (via rtnl_unicast());
 * otherwise it is sent with rtnetlink_send() to RTNLGRP_TC, passing the
 * request's NLM_F_ECHO flag. portid is taken from oskb, or is 0 when oskb
 * is NULL.
 *
 * Returns 0 without building anything when the message is not unicast and
 * rtnl_notify_needed() reports false, -ENOBUFS when the skb cannot be
 * allocated, -EINVAL when the message cannot be filled in, and otherwise
 * the result of the send call.
 */
static int tfilter_notify ( struct net * net , struct sk_buff * oskb ,
struct nlmsghdr * n , struct tcf_proto * tp ,
2018-01-17 13:46:51 +03:00
struct tcf_block * block , struct Qdisc * q ,
2019-02-11 11:55:45 +03:00
u32 parent , void * fh , int event , bool unicast ,
2023-01-13 06:43:53 +03:00
bool rtnl_held , struct netlink_ext_ack * extack )
2017-08-08 01:26:50 +03:00
{
struct sk_buff * skb ;
u32 portid = oskb ? NETLINK_CB ( oskb ) . portid : 0 ;
2019-03-11 13:15:54 +03:00
int err = 0 ;
2017-08-08 01:26:50 +03:00
2023-12-08 22:28:47 +03:00
if ( ! unicast & & ! rtnl_notify_needed ( net , n - > nlmsg_flags , RTNLGRP_TC ) )
return 0 ;
2017-08-08 01:26:50 +03:00
skb = alloc_skb ( NLMSG_GOODSIZE , GFP_KERNEL ) ;
if ( ! skb )
return - ENOBUFS ;
2018-01-17 13:46:51 +03:00
/* terse_dump is always false here: a full dump is requested. */
if ( tcf_fill_node ( net , skb , tp , block , q , parent , fh , portid ,
2019-02-11 11:55:45 +03:00
n - > nlmsg_seq , n - > nlmsg_flags , event ,
2023-01-13 06:43:53 +03:00
false , rtnl_held , extack ) < = 0 ) {
2017-08-08 01:26:50 +03:00
kfree_skb ( skb ) ;
return - EINVAL ;
}
if ( unicast )
2021-07-15 15:24:24 +03:00
err = rtnl_unicast ( skb , net , portid ) ;
2019-03-11 13:15:54 +03:00
else
err = rtnetlink_send ( skb , net , portid , RTNLGRP_TC ,
n - > nlmsg_flags & NLM_F_ECHO ) ;
return err ;
2017-08-08 01:26:50 +03:00
}
/* Delete filter handle fh from tp through the classifier's delete op and
 * send an RTM_DELTFILTER notification for it.
 *
 * When rtnl_notify_needed() reports false, no message is built and the
 * result of the delete op is returned directly. Otherwise the notification
 * is filled in before the delete op runs and is sent only if the delete
 * succeeded.
 *
 * Returns -ENOBUFS when the skb cannot be allocated, -EINVAL when the
 * message cannot be filled in (the delete op is not called in either
 * case), the delete op's error when it fails, and otherwise the result of
 * rtnetlink_send(). *last is passed through to the delete op.
 */
static int tfilter_del_notify ( struct net * net , struct sk_buff * oskb ,
struct nlmsghdr * n , struct tcf_proto * tp ,
2018-01-17 13:46:51 +03:00
struct tcf_block * block , struct Qdisc * q ,
2023-12-08 22:28:46 +03:00
u32 parent , void * fh , bool * last , bool rtnl_held ,
struct netlink_ext_ack * extack )
2017-08-08 01:26:50 +03:00
{
struct sk_buff * skb ;
u32 portid = oskb ? NETLINK_CB ( oskb ) . portid : 0 ;
int err ;
2023-12-08 22:28:47 +03:00
if ( ! rtnl_notify_needed ( net , n - > nlmsg_flags , RTNLGRP_TC ) )
return tp - > ops - > delete ( tp , fh , last , rtnl_held , extack ) ;
2017-08-08 01:26:50 +03:00
skb = alloc_skb ( NLMSG_GOODSIZE , GFP_KERNEL ) ;
if ( ! skb )
return - ENOBUFS ;
2018-01-17 13:46:51 +03:00
/* NOTE(review): the message is built before the delete, presumably
 * because fh can no longer be dumped afterwards -- confirm.
 */
if ( tcf_fill_node ( net , skb , tp , block , q , parent , fh , portid ,
2019-02-11 11:55:45 +03:00
n - > nlmsg_seq , n - > nlmsg_flags , RTM_DELTFILTER ,
2023-01-13 06:43:53 +03:00
false , rtnl_held , extack ) < = 0 ) {
2018-01-18 19:20:50 +03:00
NL_SET_ERR_MSG ( extack , " Failed to build del event notification " ) ;
2017-08-08 01:26:50 +03:00
kfree_skb ( skb ) ;
return - EINVAL ;
}
2019-02-11 11:55:45 +03:00
err = tp - > ops - > delete ( tp , fh , last , rtnl_held , extack ) ;
2017-08-08 01:26:50 +03:00
/* Delete failed: drop the prepared notification unsent. */
if ( err ) {
kfree_skb ( skb ) ;
return err ;
}
2023-12-08 22:28:46 +03:00
err = rtnetlink_send ( skb , net , portid , RTNLGRP_TC ,
n - > nlmsg_flags & NLM_F_ECHO ) ;
2018-01-18 19:20:50 +03:00
if ( err < 0 )
NL_SET_ERR_MSG ( extack , " Failed to send filter delete notification " ) ;
2019-03-11 13:15:54 +03:00
2018-01-18 19:20:50 +03:00
return err ;
2017-08-08 01:26:50 +03:00
}
/* Send one notification of type event for every proto in chain.
 *
 * Protos are visited with tcf_get_next_proto(). Each notification is sent
 * with a NULL filter handle, unicast = false and rtnl_held = true. The
 * return value of tfilter_notify() is ignored, so a failure for one proto
 * does not stop the walk.
 */
static void tfilter_notify_chain ( struct net * net , struct sk_buff * oskb ,
2018-01-17 13:46:51 +03:00
struct tcf_block * block , struct Qdisc * q ,
u32 parent , struct nlmsghdr * n ,
2023-01-13 06:43:53 +03:00
struct tcf_chain * chain , int event ,
struct netlink_ext_ack * extack )
2017-08-08 01:26:50 +03:00
{
struct tcf_proto * tp ;
2020-11-27 18:12:05 +03:00
for ( tp = tcf_get_next_proto ( chain , NULL ) ;
tp ; tp = tcf_get_next_proto ( chain , tp ) )
2023-01-13 06:43:53 +03:00
tfilter_notify ( net , oskb , n , tp , block , q , parent , NULL ,
event , false , true , extack ) ;
2017-08-08 01:26:50 +03:00
}
2019-02-11 11:55:44 +03:00
static void tfilter_put ( struct tcf_proto * tp , void * fh )
{
if ( tp - > ops - > put & & fh )
tp - > ops - > put ( tp , fh ) ;
}
2022-10-21 10:58:39 +03:00
/* Report whether the minor part of @classid equals the minor part of
 * TC_H_MIN_INGRESS.
 */
static bool is_qdisc_ingress(__u32 classid)
{
	const __u32 ingress_minor = TC_H_MIN(TC_H_MIN_INGRESS);

	return TC_H_MIN(classid) == ingress_minor;
}
2018-05-31 09:52:53 +03:00
static int tc_new_tfilter ( struct sk_buff * skb , struct nlmsghdr * n ,
2017-04-16 19:48:24 +03:00
struct netlink_ext_ack * extack )
2005-04-17 02:20:36 +04:00
{
2008-03-25 20:26:21 +03:00
struct net * net = sock_net ( skb - > sk ) ;
2008-01-23 09:11:33 +03:00
struct nlattr * tca [ TCA_MAX + 1 ] ;
2019-10-07 23:26:28 +03:00
char name [ IFNAMSIZ ] ;
2005-04-17 02:20:36 +04:00
struct tcmsg * t ;
u32 protocol ;
u32 prio ;
2017-05-17 12:07:57 +03:00
bool prio_allocate ;
2005-04-17 02:20:36 +04:00
u32 parent ;
2017-05-17 12:08:01 +03:00
u32 chain_index ;
net: sched: fix use-after-free in tc_new_tfilter()
Whenever tc_new_tfilter() jumps back to replay: label,
we need to make sure @q and @chain local variables are cleared again,
or risk use-after-free as in [1]
For consistency, apply the same fix in tc_ctl_chain()
BUG: KASAN: use-after-free in mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581
Write of size 8 at addr ffff8880985c4b08 by task syz-executor.4/1945
CPU: 0 PID: 1945 Comm: syz-executor.4 Not tainted 5.17.0-rc1-syzkaller-00495-gff58831fa02d #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581
tcf_chain_head_change_item net/sched/cls_api.c:372 [inline]
tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:386
tcf_chain_tp_insert net/sched/cls_api.c:1657 [inline]
tcf_chain_tp_insert_unique net/sched/cls_api.c:1707 [inline]
tc_new_tfilter+0x1e67/0x2350 net/sched/cls_api.c:2086
rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f2647172059
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f2645aa5168 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
RAX: ffffffffffffffda RBX: 00007f2647285100 RCX: 00007f2647172059
RDX: 040000000000009f RSI: 00000000200002c0 RDI: 0000000000000006
RBP: 00007f26471cc08d R08: 0000000000000000 R09: 0000000000000000
R10: 9e00000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007fffb3f7f02f R14: 00007f2645aa5300 R15: 0000000000022000
</TASK>
Allocated by task 1944:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:45 [inline]
set_alloc_info mm/kasan/common.c:436 [inline]
____kasan_kmalloc mm/kasan/common.c:515 [inline]
____kasan_kmalloc mm/kasan/common.c:474 [inline]
__kasan_kmalloc+0xa9/0xd0 mm/kasan/common.c:524
kmalloc_node include/linux/slab.h:604 [inline]
kzalloc_node include/linux/slab.h:726 [inline]
qdisc_alloc+0xac/0xa10 net/sched/sch_generic.c:941
qdisc_create.constprop.0+0xce/0x10f0 net/sched/sch_api.c:1211
tc_modify_qdisc+0x4c5/0x1980 net/sched/sch_api.c:1660
rtnetlink_rcv_msg+0x413/0xb80 net/core/rtnetlink.c:5592
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Freed by task 3609:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
kasan_set_track+0x21/0x30 mm/kasan/common.c:45
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free+0x130/0x160 mm/kasan/common.c:328
kasan_slab_free include/linux/kasan.h:236 [inline]
slab_free_hook mm/slub.c:1728 [inline]
slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1754
slab_free mm/slub.c:3509 [inline]
kfree+0xcb/0x280 mm/slub.c:4562
rcu_do_batch kernel/rcu/tree.c:2527 [inline]
rcu_core+0x7b8/0x1540 kernel/rcu/tree.c:2778
__do_softirq+0x29b/0x9c2 kernel/softirq.c:558
Last potentially related work creation:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
__kasan_record_aux_stack+0xbe/0xd0 mm/kasan/generic.c:348
__call_rcu kernel/rcu/tree.c:3026 [inline]
call_rcu+0xb1/0x740 kernel/rcu/tree.c:3106
qdisc_put_unlocked+0x6f/0x90 net/sched/sch_generic.c:1109
tcf_block_release+0x86/0x90 net/sched/cls_api.c:1238
tc_new_tfilter+0xc0d/0x2350 net/sched/cls_api.c:2148
rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
The buggy address belongs to the object at ffff8880985c4800
which belongs to the cache kmalloc-1k of size 1024
The buggy address is located 776 bytes inside of
1024-byte region [ffff8880985c4800, ffff8880985c4c00)
The buggy address belongs to the page:
page:ffffea0002617000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x985c0
head:ffffea0002617000 order:3 compound_mapcount:0 compound_pincount:0
flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff)
raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888010c41dc0
raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 3, migratetype Unmovable, gfp_mask 0x1d20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC|__GFP_HARDWALL), pid 1941, ts 1038999441284, free_ts 1033444432829
prep_new_page mm/page_alloc.c:2434 [inline]
get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165
__alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389
alloc_pages+0x1aa/0x310 mm/mempolicy.c:2271
alloc_slab_page mm/slub.c:1799 [inline]
allocate_slab mm/slub.c:1944 [inline]
new_slab+0x28a/0x3b0 mm/slub.c:2004
___slab_alloc+0x87c/0xe90 mm/slub.c:3018
__slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3105
slab_alloc_node mm/slub.c:3196 [inline]
slab_alloc mm/slub.c:3238 [inline]
__kmalloc+0x2fb/0x340 mm/slub.c:4420
kmalloc include/linux/slab.h:586 [inline]
kzalloc include/linux/slab.h:715 [inline]
__register_sysctl_table+0x112/0x1090 fs/proc/proc_sysctl.c:1335
neigh_sysctl_register+0x2c8/0x5e0 net/core/neighbour.c:3787
devinet_sysctl_register+0xb1/0x230 net/ipv4/devinet.c:2618
inetdev_init+0x286/0x580 net/ipv4/devinet.c:278
inetdev_event+0xa8a/0x15d0 net/ipv4/devinet.c:1532
notifier_call_chain+0xb5/0x200 kernel/notifier.c:84
call_netdevice_notifiers_info+0xb5/0x130 net/core/dev.c:1919
call_netdevice_notifiers_extack net/core/dev.c:1931 [inline]
call_netdevice_notifiers net/core/dev.c:1945 [inline]
register_netdevice+0x1073/0x1500 net/core/dev.c:9698
veth_newlink+0x59c/0xa90 drivers/net/veth.c:1722
page last free stack trace:
reset_page_owner include/linux/page_owner.h:24 [inline]
free_pages_prepare mm/page_alloc.c:1352 [inline]
free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404
free_unref_page_prepare mm/page_alloc.c:3325 [inline]
free_unref_page+0x19/0x690 mm/page_alloc.c:3404
release_pages+0x748/0x1220 mm/swap.c:956
tlb_batch_pages_flush mm/mmu_gather.c:50 [inline]
tlb_flush_mmu_free mm/mmu_gather.c:243 [inline]
tlb_flush_mmu+0xe9/0x6b0 mm/mmu_gather.c:250
zap_pte_range mm/memory.c:1441 [inline]
zap_pmd_range mm/memory.c:1490 [inline]
zap_pud_range mm/memory.c:1519 [inline]
zap_p4d_range mm/memory.c:1540 [inline]
unmap_page_range+0x1d1d/0x2a30 mm/memory.c:1561
unmap_single_vma+0x198/0x310 mm/memory.c:1606
unmap_vmas+0x16b/0x2f0 mm/memory.c:1638
exit_mmap+0x201/0x670 mm/mmap.c:3178
__mmput+0x122/0x4b0 kernel/fork.c:1114
mmput+0x56/0x60 kernel/fork.c:1135
exit_mm kernel/exit.c:507 [inline]
do_exit+0xa3c/0x2a30 kernel/exit.c:793
do_group_exit+0xd2/0x2f0 kernel/exit.c:935
__do_sys_exit_group kernel/exit.c:946 [inline]
__se_sys_exit_group kernel/exit.c:944 [inline]
__x64_sys_exit_group+0x3a/0x50 kernel/exit.c:944
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Memory state around the buggy address:
ffff8880985c4a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8880985c4a80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8880985c4b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff8880985c4b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8880985c4c00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
Fixes: 470502de5bdb ("net: sched: unlock rules update API")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Vlad Buslov <vladbu@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Link: https://lore.kernel.org/r/20220131172018.3704490-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-01-31 20:20:18 +03:00
struct Qdisc * q ;
2017-05-17 12:07:59 +03:00
struct tcf_chain_info chain_info ;
net: sched: fix use-after-free in tc_new_tfilter()
Whenever tc_new_tfilter() jumps back to replay: label,
we need to make sure @q and @chain local variables are cleared again,
or risk use-after-free as in [1]
For consistency, apply the same fix in tc_ctl_chain()
BUG: KASAN: use-after-free in mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581
Write of size 8 at addr ffff8880985c4b08 by task syz-executor.4/1945
CPU: 0 PID: 1945 Comm: syz-executor.4 Not tainted 5.17.0-rc1-syzkaller-00495-gff58831fa02d #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581
tcf_chain_head_change_item net/sched/cls_api.c:372 [inline]
tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:386
tcf_chain_tp_insert net/sched/cls_api.c:1657 [inline]
tcf_chain_tp_insert_unique net/sched/cls_api.c:1707 [inline]
tc_new_tfilter+0x1e67/0x2350 net/sched/cls_api.c:2086
rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f2647172059
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f2645aa5168 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
RAX: ffffffffffffffda RBX: 00007f2647285100 RCX: 00007f2647172059
RDX: 040000000000009f RSI: 00000000200002c0 RDI: 0000000000000006
RBP: 00007f26471cc08d R08: 0000000000000000 R09: 0000000000000000
R10: 9e00000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007fffb3f7f02f R14: 00007f2645aa5300 R15: 0000000000022000
</TASK>
Allocated by task 1944:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:45 [inline]
set_alloc_info mm/kasan/common.c:436 [inline]
____kasan_kmalloc mm/kasan/common.c:515 [inline]
____kasan_kmalloc mm/kasan/common.c:474 [inline]
__kasan_kmalloc+0xa9/0xd0 mm/kasan/common.c:524
kmalloc_node include/linux/slab.h:604 [inline]
kzalloc_node include/linux/slab.h:726 [inline]
qdisc_alloc+0xac/0xa10 net/sched/sch_generic.c:941
qdisc_create.constprop.0+0xce/0x10f0 net/sched/sch_api.c:1211
tc_modify_qdisc+0x4c5/0x1980 net/sched/sch_api.c:1660
rtnetlink_rcv_msg+0x413/0xb80 net/core/rtnetlink.c:5592
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Freed by task 3609:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
kasan_set_track+0x21/0x30 mm/kasan/common.c:45
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free+0x130/0x160 mm/kasan/common.c:328
kasan_slab_free include/linux/kasan.h:236 [inline]
slab_free_hook mm/slub.c:1728 [inline]
slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1754
slab_free mm/slub.c:3509 [inline]
kfree+0xcb/0x280 mm/slub.c:4562
rcu_do_batch kernel/rcu/tree.c:2527 [inline]
rcu_core+0x7b8/0x1540 kernel/rcu/tree.c:2778
__do_softirq+0x29b/0x9c2 kernel/softirq.c:558
Last potentially related work creation:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
__kasan_record_aux_stack+0xbe/0xd0 mm/kasan/generic.c:348
__call_rcu kernel/rcu/tree.c:3026 [inline]
call_rcu+0xb1/0x740 kernel/rcu/tree.c:3106
qdisc_put_unlocked+0x6f/0x90 net/sched/sch_generic.c:1109
tcf_block_release+0x86/0x90 net/sched/cls_api.c:1238
tc_new_tfilter+0xc0d/0x2350 net/sched/cls_api.c:2148
rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
The buggy address belongs to the object at ffff8880985c4800
which belongs to the cache kmalloc-1k of size 1024
The buggy address is located 776 bytes inside of
1024-byte region [ffff8880985c4800, ffff8880985c4c00)
The buggy address belongs to the page:
page:ffffea0002617000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x985c0
head:ffffea0002617000 order:3 compound_mapcount:0 compound_pincount:0
flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff)
raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888010c41dc0
raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 3, migratetype Unmovable, gfp_mask 0x1d20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC|__GFP_HARDWALL), pid 1941, ts 1038999441284, free_ts 1033444432829
prep_new_page mm/page_alloc.c:2434 [inline]
get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165
__alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389
alloc_pages+0x1aa/0x310 mm/mempolicy.c:2271
alloc_slab_page mm/slub.c:1799 [inline]
allocate_slab mm/slub.c:1944 [inline]
new_slab+0x28a/0x3b0 mm/slub.c:2004
___slab_alloc+0x87c/0xe90 mm/slub.c:3018
__slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3105
slab_alloc_node mm/slub.c:3196 [inline]
slab_alloc mm/slub.c:3238 [inline]
__kmalloc+0x2fb/0x340 mm/slub.c:4420
kmalloc include/linux/slab.h:586 [inline]
kzalloc include/linux/slab.h:715 [inline]
__register_sysctl_table+0x112/0x1090 fs/proc/proc_sysctl.c:1335
neigh_sysctl_register+0x2c8/0x5e0 net/core/neighbour.c:3787
devinet_sysctl_register+0xb1/0x230 net/ipv4/devinet.c:2618
inetdev_init+0x286/0x580 net/ipv4/devinet.c:278
inetdev_event+0xa8a/0x15d0 net/ipv4/devinet.c:1532
notifier_call_chain+0xb5/0x200 kernel/notifier.c:84
call_netdevice_notifiers_info+0xb5/0x130 net/core/dev.c:1919
call_netdevice_notifiers_extack net/core/dev.c:1931 [inline]
call_netdevice_notifiers net/core/dev.c:1945 [inline]
register_netdevice+0x1073/0x1500 net/core/dev.c:9698
veth_newlink+0x59c/0xa90 drivers/net/veth.c:1722
page last free stack trace:
reset_page_owner include/linux/page_owner.h:24 [inline]
free_pages_prepare mm/page_alloc.c:1352 [inline]
free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404
free_unref_page_prepare mm/page_alloc.c:3325 [inline]
free_unref_page+0x19/0x690 mm/page_alloc.c:3404
release_pages+0x748/0x1220 mm/swap.c:956
tlb_batch_pages_flush mm/mmu_gather.c:50 [inline]
tlb_flush_mmu_free mm/mmu_gather.c:243 [inline]
tlb_flush_mmu+0xe9/0x6b0 mm/mmu_gather.c:250
zap_pte_range mm/memory.c:1441 [inline]
zap_pmd_range mm/memory.c:1490 [inline]
zap_pud_range mm/memory.c:1519 [inline]
zap_p4d_range mm/memory.c:1540 [inline]
unmap_page_range+0x1d1d/0x2a30 mm/memory.c:1561
unmap_single_vma+0x198/0x310 mm/memory.c:1606
unmap_vmas+0x16b/0x2f0 mm/memory.c:1638
exit_mmap+0x201/0x670 mm/mmap.c:3178
__mmput+0x122/0x4b0 kernel/fork.c:1114
mmput+0x56/0x60 kernel/fork.c:1135
exit_mm kernel/exit.c:507 [inline]
do_exit+0xa3c/0x2a30 kernel/exit.c:793
do_group_exit+0xd2/0x2f0 kernel/exit.c:935
__do_sys_exit_group kernel/exit.c:946 [inline]
__se_sys_exit_group kernel/exit.c:944 [inline]
__x64_sys_exit_group+0x3a/0x50 kernel/exit.c:944
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Memory state around the buggy address:
ffff8880985c4a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8880985c4a80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8880985c4b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff8880985c4b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8880985c4c00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
Fixes: 470502de5bdb ("net: sched: unlock rules update API")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Vlad Buslov <vladbu@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Link: https://lore.kernel.org/r/20220131172018.3704490-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-01-31 20:20:18 +03:00
struct tcf_chain * chain ;
2017-05-17 12:07:55 +03:00
struct tcf_block * block ;
2005-04-17 02:20:36 +04:00
struct tcf_proto * tp ;
unsigned long cl ;
2017-08-05 07:31:43 +03:00
void * fh ;
2005-04-17 02:20:36 +04:00
int err ;
net, sched: fix soft lockup in tc_classify
Shahar reported a soft lockup in tc_classify(), where we run into an
endless loop when walking the classifier chain due to tp->next == tp
which is a state we should never run into. The issue only seems to
trigger under load in the tc control path.
What happens is that in tc_ctl_tfilter(), thread A allocates a new
tp, initializes it, sets tp_created to 1, and calls into tp->ops->change()
with it. In that classifier callback we had to unlock/lock the rtnl
mutex and returned with -EAGAIN. One reason why we need to drop there
is, for example, that we need to request an action module to be loaded.
This happens via tcf_exts_validate() -> tcf_action_init/_1() meaning
after we loaded and found the requested action, we need to redo the
whole request so we don't race against others. While we had to unlock
rtnl in that time, thread B's request was processed next on that CPU.
Thread B added a new tp instance successfully to the classifier chain.
When thread A returned grabbing the rtnl mutex again, propagating -EAGAIN
and destroying its tp instance which never got linked, we goto replay
and redo A's request.
This time when walking the classifier chain in tc_ctl_tfilter() for
checking for existing tp instances we had a priority match and found
the tp instance that was created and linked by thread B. Now calling
again into tp->ops->change() with that tp was successful and returned
without error.
tp_created was never cleared in the second round, thus the kernel thinks
that we need to link it into the classifier chain (once again). tp and
*back point to the same object due to the match we had earlier on. Thus
for thread B's already public tp, we reset tp->next to tp itself and
link it into the chain, which eventually causes the mentioned endless
loop in tc_classify() once a packet hits the data path.
Fix is to clear tp_created at the beginning of each request, also when
we replay it. On the paths that can cause -EAGAIN we already destroy
the original tp instance we had and on replay we really need to start
from scratch. It seems that this issue was first introduced in commit
12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining
and avoid kernel panic when we use cls_cgroup").
Fixes: 12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid kernel panic when we use cls_cgroup")
Reported-by: Shahar Klein <shahark@mellanox.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Tested-by: Shahar Klein <shahark@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-12-21 20:04:11 +03:00
int tp_created ;
2019-02-11 11:55:48 +03:00
bool rtnl_held = false ;
2021-08-10 06:43:05 +03:00
u32 flags ;
2005-04-17 02:20:36 +04:00
replay :
net, sched: fix soft lockup in tc_classify
Shahar reported a soft lockup in tc_classify(), where we run into an
endless loop when walking the classifier chain due to tp->next == tp
which is a state we should never run into. The issue only seems to
trigger under load in the tc control path.
What happens is that in tc_ctl_tfilter(), thread A allocates a new
tp, initializes it, sets tp_created to 1, and calls into tp->ops->change()
with it. In that classifier callback we had to unlock/lock the rtnl
mutex and returned with -EAGAIN. One reason why we need to drop there
is, for example, that we need to request an action module to be loaded.
This happens via tcf_exts_validate() -> tcf_action_init/_1() meaning
after we loaded and found the requested action, we need to redo the
whole request so we don't race against others. While we had to unlock
rtnl in that time, thread B's request was processed next on that CPU.
Thread B added a new tp instance successfully to the classifier chain.
When thread A returned grabbing the rtnl mutex again, propagating -EAGAIN
and destroying its tp instance which never got linked, we goto replay
and redo A's request.
This time when walking the classifier chain in tc_ctl_tfilter() for
checking for existing tp instances we had a priority match and found
the tp instance that was created and linked by thread B. Now calling
again into tp->ops->change() with that tp was successful and returned
without error.
tp_created was never cleared in the second round, thus the kernel thinks
that we need to link it into the classifier chain (once again). tp and
*back point to the same object due to the match we had earlier on. Thus
for thread B's already public tp, we reset tp->next to tp itself and
link it into the chain, which eventually causes the mentioned endless
loop in tc_classify() once a packet hits the data path.
Fix is to clear tp_created at the beginning of each request, also when
we replay it. On the paths that can cause -EAGAIN we already destroy
the original tp instance we had and on replay we really need to start
from scratch. It seems that this issue was first introduced in commit
12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining
and avoid kernel panic when we use cls_cgroup").
Fixes: 12186be7d2e1 ("net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid kernel panic when we use cls_cgroup")
Reported-by: Shahar Klein <shahark@mellanox.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Tested-by: Shahar Klein <shahark@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-12-21 20:04:11 +03:00
tp_created = 0 ;
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then, going
forward, prevent forgetting attribute entries. The same can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 15:07:28 +03:00
err = nlmsg_parse_deprecated ( n , sizeof ( * t ) , tca , TCA_MAX ,
rtm_tca_policy , extack ) ;
2013-03-25 21:36:33 +04:00
if ( err < 0 )
return err ;
2012-06-27 08:48:50 +04:00
t = nlmsg_data ( n ) ;
2005-04-17 02:20:36 +04:00
protocol = TC_H_MIN ( t - > tcm_info ) ;
prio = TC_H_MAJ ( t - > tcm_info ) ;
2017-05-17 12:07:57 +03:00
prio_allocate = false ;
2005-04-17 02:20:36 +04:00
parent = t - > tcm_parent ;
2019-02-11 11:55:39 +03:00
tp = NULL ;
2005-04-17 02:20:36 +04:00
cl = 0 ;
2019-02-11 11:55:48 +03:00
block = NULL ;
net: sched: fix use-after-free in tc_new_tfilter()
Whenever tc_new_tfilter() jumps back to replay: label,
we need to make sure @q and @chain local variables are cleared again,
or risk use-after-free as in [1]
For consistency, apply the same fix in tc_ctl_chain()
BUG: KASAN: use-after-free in mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581
Write of size 8 at addr ffff8880985c4b08 by task syz-executor.4/1945
CPU: 0 PID: 1945 Comm: syz-executor.4 Not tainted 5.17.0-rc1-syzkaller-00495-gff58831fa02d #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581
tcf_chain_head_change_item net/sched/cls_api.c:372 [inline]
tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:386
tcf_chain_tp_insert net/sched/cls_api.c:1657 [inline]
tcf_chain_tp_insert_unique net/sched/cls_api.c:1707 [inline]
tc_new_tfilter+0x1e67/0x2350 net/sched/cls_api.c:2086
rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f2647172059
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f2645aa5168 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
RAX: ffffffffffffffda RBX: 00007f2647285100 RCX: 00007f2647172059
RDX: 040000000000009f RSI: 00000000200002c0 RDI: 0000000000000006
RBP: 00007f26471cc08d R08: 0000000000000000 R09: 0000000000000000
R10: 9e00000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007fffb3f7f02f R14: 00007f2645aa5300 R15: 0000000000022000
</TASK>
Allocated by task 1944:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:45 [inline]
set_alloc_info mm/kasan/common.c:436 [inline]
____kasan_kmalloc mm/kasan/common.c:515 [inline]
____kasan_kmalloc mm/kasan/common.c:474 [inline]
__kasan_kmalloc+0xa9/0xd0 mm/kasan/common.c:524
kmalloc_node include/linux/slab.h:604 [inline]
kzalloc_node include/linux/slab.h:726 [inline]
qdisc_alloc+0xac/0xa10 net/sched/sch_generic.c:941
qdisc_create.constprop.0+0xce/0x10f0 net/sched/sch_api.c:1211
tc_modify_qdisc+0x4c5/0x1980 net/sched/sch_api.c:1660
rtnetlink_rcv_msg+0x413/0xb80 net/core/rtnetlink.c:5592
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Freed by task 3609:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
kasan_set_track+0x21/0x30 mm/kasan/common.c:45
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free+0x130/0x160 mm/kasan/common.c:328
kasan_slab_free include/linux/kasan.h:236 [inline]
slab_free_hook mm/slub.c:1728 [inline]
slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1754
slab_free mm/slub.c:3509 [inline]
kfree+0xcb/0x280 mm/slub.c:4562
rcu_do_batch kernel/rcu/tree.c:2527 [inline]
rcu_core+0x7b8/0x1540 kernel/rcu/tree.c:2778
__do_softirq+0x29b/0x9c2 kernel/softirq.c:558
Last potentially related work creation:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
__kasan_record_aux_stack+0xbe/0xd0 mm/kasan/generic.c:348
__call_rcu kernel/rcu/tree.c:3026 [inline]
call_rcu+0xb1/0x740 kernel/rcu/tree.c:3106
qdisc_put_unlocked+0x6f/0x90 net/sched/sch_generic.c:1109
tcf_block_release+0x86/0x90 net/sched/cls_api.c:1238
tc_new_tfilter+0xc0d/0x2350 net/sched/cls_api.c:2148
rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
The buggy address belongs to the object at ffff8880985c4800
which belongs to the cache kmalloc-1k of size 1024
The buggy address is located 776 bytes inside of
1024-byte region [ffff8880985c4800, ffff8880985c4c00)
The buggy address belongs to the page:
page:ffffea0002617000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x985c0
head:ffffea0002617000 order:3 compound_mapcount:0 compound_pincount:0
flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff)
raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888010c41dc0
raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 3, migratetype Unmovable, gfp_mask 0x1d20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC|__GFP_HARDWALL), pid 1941, ts 1038999441284, free_ts 1033444432829
prep_new_page mm/page_alloc.c:2434 [inline]
get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165
__alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389
alloc_pages+0x1aa/0x310 mm/mempolicy.c:2271
alloc_slab_page mm/slub.c:1799 [inline]
allocate_slab mm/slub.c:1944 [inline]
new_slab+0x28a/0x3b0 mm/slub.c:2004
___slab_alloc+0x87c/0xe90 mm/slub.c:3018
__slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3105
slab_alloc_node mm/slub.c:3196 [inline]
slab_alloc mm/slub.c:3238 [inline]
__kmalloc+0x2fb/0x340 mm/slub.c:4420
kmalloc include/linux/slab.h:586 [inline]
kzalloc include/linux/slab.h:715 [inline]
__register_sysctl_table+0x112/0x1090 fs/proc/proc_sysctl.c:1335
neigh_sysctl_register+0x2c8/0x5e0 net/core/neighbour.c:3787
devinet_sysctl_register+0xb1/0x230 net/ipv4/devinet.c:2618
inetdev_init+0x286/0x580 net/ipv4/devinet.c:278
inetdev_event+0xa8a/0x15d0 net/ipv4/devinet.c:1532
notifier_call_chain+0xb5/0x200 kernel/notifier.c:84
call_netdevice_notifiers_info+0xb5/0x130 net/core/dev.c:1919
call_netdevice_notifiers_extack net/core/dev.c:1931 [inline]
call_netdevice_notifiers net/core/dev.c:1945 [inline]
register_netdevice+0x1073/0x1500 net/core/dev.c:9698
veth_newlink+0x59c/0xa90 drivers/net/veth.c:1722
page last free stack trace:
reset_page_owner include/linux/page_owner.h:24 [inline]
free_pages_prepare mm/page_alloc.c:1352 [inline]
free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404
free_unref_page_prepare mm/page_alloc.c:3325 [inline]
free_unref_page+0x19/0x690 mm/page_alloc.c:3404
release_pages+0x748/0x1220 mm/swap.c:956
tlb_batch_pages_flush mm/mmu_gather.c:50 [inline]
tlb_flush_mmu_free mm/mmu_gather.c:243 [inline]
tlb_flush_mmu+0xe9/0x6b0 mm/mmu_gather.c:250
zap_pte_range mm/memory.c:1441 [inline]
zap_pmd_range mm/memory.c:1490 [inline]
zap_pud_range mm/memory.c:1519 [inline]
zap_p4d_range mm/memory.c:1540 [inline]
unmap_page_range+0x1d1d/0x2a30 mm/memory.c:1561
unmap_single_vma+0x198/0x310 mm/memory.c:1606
unmap_vmas+0x16b/0x2f0 mm/memory.c:1638
exit_mmap+0x201/0x670 mm/mmap.c:3178
__mmput+0x122/0x4b0 kernel/fork.c:1114
mmput+0x56/0x60 kernel/fork.c:1135
exit_mm kernel/exit.c:507 [inline]
do_exit+0xa3c/0x2a30 kernel/exit.c:793
do_group_exit+0xd2/0x2f0 kernel/exit.c:935
__do_sys_exit_group kernel/exit.c:946 [inline]
__se_sys_exit_group kernel/exit.c:944 [inline]
__x64_sys_exit_group+0x3a/0x50 kernel/exit.c:944
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Memory state around the buggy address:
ffff8880985c4a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8880985c4a80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8880985c4b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff8880985c4b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8880985c4c00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
Fixes: 470502de5bdb ("net: sched: unlock rules update API")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Vlad Buslov <vladbu@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Link: https://lore.kernel.org/r/20220131172018.3704490-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-01-31 20:20:18 +03:00
q = NULL ;
chain = NULL ;
2021-08-10 06:43:05 +03:00
flags = 0 ;
2005-04-17 02:20:36 +04:00
if ( prio = = 0 ) {
2018-05-31 09:52:53 +03:00
/* If no priority is provided by the user,
* we allocate one .
*/
if ( n - > nlmsg_flags & NLM_F_CREATE ) {
prio = TC_H_MAKE ( 0x80000000U , 0U ) ;
prio_allocate = true ;
} else {
2018-01-18 19:20:50 +03:00
NL_SET_ERR_MSG ( extack , " Invalid filter command with priority of zero " ) ;
2005-04-17 02:20:36 +04:00
return - ENOENT ;
net, cls: allow for deleting all filters for given parent
Add a possibility where the user can just specify the parent and
all filters under that parent are then being purged. Currently,
for example for scripting, one needs to specify pref/prio to have
a well-defined number for 'tc filter del' command for addressing
the previously created instance or additionally filter handle in
case of priorities being the same. Improve usage by allowing the
option for tc to specify the parent and removing the whole chain
for that given parent.
Example usage after patch, no tc changes required:
# tc qdisc replace dev foo clsact
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter show dev foo egress
filter protocol all pref 49151 bpf
filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action
filter protocol all pref 49152 bpf
filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action
# tc filter del dev foo egress
# tc filter show dev foo egress
#
Previously, RTM_DELTFILTER requests with invalid prio of 0 were
rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE
flag were allowed where the kernel would auto-generate a pref/prio.
We can piggyback on that and use prio of 0 as a wildcard for
requests of RTM_DELTFILTER.
For notifying tc netlink monitoring users (e.g. libnl uses this
for caching), there are two options, that is, sending individual
tfilter_notify() notifications for each tcf_proto, or sending a
single one indicating wildcard removal. I tried both and there
are pros and cons for each; eventually I decided on sending
individual tfilter_notify(), so that user space can support this
seamlessly and there won't be a mess of changing each and every
application to make sure expectations from the kernel won't break
when they don't understand a single notification. Since linear chains
don't really scale, I expect only a handful of classifiers to be
attached at max for a given parent anyway.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 00:10:22 +03:00
}
2005-04-17 02:20:36 +04:00
}
/* Find head of filter chain. */
2019-02-11 11:55:48 +03:00
err = __tcf_qdisc_find ( net , & q , & parent , t - > tcm_ifindex , false , extack ) ;
if ( err )
return err ;
2019-10-07 23:26:28 +03:00
if ( tcf_proto_check_kind ( tca [ TCA_KIND ] , name ) ) {
NL_SET_ERR_MSG ( extack , " Specified TC filter name too long " ) ;
err = - EINVAL ;
goto errout ;
}
2019-02-11 11:55:48 +03:00
/* Take rtnl mutex if rtnl_held was set to true on previous iteration,
* block is shared ( no qdisc found ) , qdisc is not unlocked , classifier
* type is not specified , classifier is not unlocked .
*/
if ( rtnl_held | |
( q & & ! ( q - > ops - > cl_ops - > flags & QDISC_CLASS_OPS_DOIT_UNLOCKED ) ) | |
2019-10-07 23:26:28 +03:00
! tcf_proto_is_unlocked ( name ) ) {
2019-02-11 11:55:48 +03:00
rtnl_held = true ;
rtnl_lock ( ) ;
}
err = __tcf_qdisc_cl_find ( q , parent , & cl , t - > tcm_ifindex , extack ) ;
if ( err )
goto errout ;
block = __tcf_block_find ( net , q , cl , t - > tcm_ifindex , t - > tcm_block_index ,
extack ) ;
2018-05-31 09:52:53 +03:00
if ( IS_ERR ( block ) ) {
err = PTR_ERR ( block ) ;
goto errout ;
2017-02-09 16:38:58 +03:00
}
net_sched: fix tcm_parent in tc filter dump
When we tell the kernel to dump filters from root (ffff:ffff),
those filters on ingress (ffff:0000) are matched, but their
true parents must be dumped as they are. However, the kernel
dumps just whatever we tell it, that is either ffff:ffff
or ffff:0000:
$ nl-cls-list --dev=dummy0 --parent=root
cls basic dev dummy0 id none parent root prio 49152 protocol ip match-all
cls basic dev dummy0 id :1 parent root prio 49152 protocol ip match-all
$ nl-cls-list --dev=dummy0 --parent=ffff:
cls basic dev dummy0 id none parent ffff: prio 49152 protocol ip match-all
cls basic dev dummy0 id :1 parent ffff: prio 49152 protocol ip match-all
This is confusing and misleading, more importantly this is
a regression since 4.15, so the old behavior must be restored.
And, when tc filters are installed on a tc class, the parent
should be the classid, rather than the qdisc handle. Commit
edf6711c9840 ("net: sched: remove classid and q fields from tcf_proto")
removed the classid we save for filters, we can just restore
this classid in tcf_block.
Steps to reproduce this:
ip li set dev dummy0 up
tc qd add dev dummy0 ingress
tc filter add dev dummy0 parent ffff: protocol arp basic action pass
tc filter show dev dummy0 root
Before this patch:
filter protocol arp pref 49152 basic
filter protocol arp pref 49152 basic handle 0x1
action order 1: gact action pass
random type none pass val 0
index 1 ref 1 bind 1
After this patch:
filter parent ffff: protocol arp pref 49152 basic
filter parent ffff: protocol arp pref 49152 basic handle 0x1
action order 1: gact action pass
random type none pass val 0
index 1 ref 1 bind 1
Fixes: a10fa20101ae ("net: sched: propagate q and parent from caller down to tcf_fill_node")
Fixes: edf6711c9840 ("net: sched: remove classid and q fields from tcf_proto")
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-05-01 06:53:49 +03:00
block - > classid = parent ;
2017-05-17 12:08:01 +03:00
chain_index = tca [ TCA_CHAIN ] ? nla_get_u32 ( tca [ TCA_CHAIN ] ) : 0 ;
if ( chain_index > TC_ACT_EXT_VAL_MASK ) {
2018-01-18 19:20:50 +03:00
NL_SET_ERR_MSG ( extack , " Specified chain index exceeds upper limit " ) ;
2017-05-17 12:08:01 +03:00
err = - EINVAL ;
goto errout ;
}
2018-05-31 09:52:53 +03:00
chain = tcf_chain_get ( block , chain_index , true ) ;
2017-05-17 12:08:01 +03:00
if ( ! chain ) {
2018-08-27 21:58:43 +03:00
NL_SET_ERR_MSG ( extack , " Cannot create specified filter chain " ) ;
2018-05-31 09:52:53 +03:00
err = - ENOMEM ;
net, cls: allow for deleting all filters for given parent
Add a possibility where the user can just specify the parent and
all filters under that parent are then being purged. Currently,
for example for scripting, one needs to specify pref/prio to have
a well-defined number for 'tc filter del' command for addressing
the previously created instance or additionally filter handle in
case of priorities being the same. Improve usage by allowing the
option for tc to specify the parent and removing the whole chain
for that given parent.
Example usage after patch, no tc changes required:
# tc qdisc replace dev foo clsact
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter add dev foo egress bpf da obj ./bpf.o
# tc filter show dev foo egress
filter protocol all pref 49151 bpf
filter protocol all pref 49151 bpf handle 0x1 bpf.o:[classifier] direct-action
filter protocol all pref 49152 bpf
filter protocol all pref 49152 bpf handle 0x1 bpf.o:[classifier] direct-action
# tc filter del dev foo egress
# tc filter show dev foo egress
#
Previously, RTM_DELTFILTER requests with invalid prio of 0 were
rejected, so only netlink requests with RTM_NEWTFILTER and NLM_F_CREATE
flag were allowed where the kernel would auto-generate a pref/prio.
We can piggyback on that and use prio of 0 as a wildcard for
requests of RTM_DELTFILTER.
For notifying tc netlink monitoring users (e.g. libnl uses this
for caching), there are two options, that is, sending individual
tfilter_notify() notifications for each tcf_proto, or sending a
single one indicating wildcard removal. I tried both and there
are pros and cons for each; eventually I decided on sending
individual tfilter_notify(), so that user space can support this
seamlessly and there won't be a mess of changing each and every
application to make sure expectations from the kernel won't break
when they don't understand a single notification. Since linear chains
don't really scale, I expect only a handful of classifiers to be
attached at max for a given parent anyway.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-11 00:10:22 +03:00
goto errout ;
}
2005-04-17 02:20:36 +04:00
2019-02-11 11:55:38 +03:00
mutex_lock ( & chain - > filter_chain_lock ) ;
2017-05-17 12:07:59 +03:00
tp = tcf_chain_tp_find ( chain , & chain_info , protocol ,
prio , prio_allocate ) ;
if ( IS_ERR ( tp ) ) {
2018-01-18 19:20:50 +03:00
NL_SET_ERR_MSG ( extack , " Filter with specified priority/protocol not found " ) ;
2017-05-17 12:07:59 +03:00
err = PTR_ERR ( tp ) ;
2019-02-11 11:55:38 +03:00
goto errout_locked ;
2005-04-17 02:20:36 +04:00
}
if ( tp = = NULL ) {
2019-02-11 11:55:41 +03:00
struct tcf_proto * tp_new = NULL ;
2019-02-11 11:55:42 +03:00
if ( chain - > flushing ) {
err = - EAGAIN ;
goto errout_locked ;
}
2005-04-17 02:20:36 +04:00
/* Proto-tcf does not exist, create new one */
2017-02-09 16:38:58 +03:00
if ( tca [ TCA_KIND ] = = NULL | | ! protocol ) {
2018-01-18 19:20:50 +03:00
NL_SET_ERR_MSG ( extack , " Filter kind and protocol must be specified " ) ;
2017-02-09 16:38:58 +03:00
err = - EINVAL ;
2019-02-11 11:55:38 +03:00
goto errout_locked ;
2017-02-09 16:38:58 +03:00
}
2005-04-17 02:20:36 +04:00
2018-05-31 09:52:53 +03:00
if ( ! ( n - > nlmsg_flags & NLM_F_CREATE ) ) {
2018-01-18 19:20:50 +03:00
NL_SET_ERR_MSG ( extack , " Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter " ) ;
2017-02-09 16:38:58 +03:00
err = - ENOENT ;
2019-02-11 11:55:38 +03:00
goto errout_locked ;
2017-02-09 16:38:58 +03:00
}
2005-04-17 02:20:36 +04:00
2017-05-17 12:07:57 +03:00
if ( prio_allocate )
2019-02-11 11:55:38 +03:00
prio = tcf_auto_prio ( tcf_chain_tp_prev ( chain ,
& chain_info ) ) ;
2005-04-17 02:20:36 +04:00
2019-02-11 11:55:38 +03:00
mutex_unlock ( & chain - > filter_chain_lock ) ;
2020-01-21 22:02:20 +03:00
tp_new = tcf_proto_create ( name , protocol , prio , chain ,
rtnl_held , extack ) ;
2019-02-11 11:55:41 +03:00
if ( IS_ERR ( tp_new ) ) {
err = PTR_ERR ( tp_new ) ;
2019-02-11 11:55:42 +03:00
goto errout_tp ;
2005-04-17 02:20:36 +04:00
}
2019-02-11 11:55:38 +03:00
2009-06-02 13:17:34 +04:00
tp_created = 1 ;
2019-02-11 11:55:45 +03:00
tp = tcf_chain_tp_insert_unique ( chain , tp_new , protocol , prio ,
rtnl_held ) ;
2019-02-11 11:55:42 +03:00
if ( IS_ERR ( tp ) ) {
err = PTR_ERR ( tp ) ;
goto errout_tp ;
}
2019-02-11 11:55:38 +03:00
} else {
mutex_unlock ( & chain - > filter_chain_lock ) ;
2017-02-09 16:38:58 +03:00
}
2005-04-17 02:20:36 +04:00
2019-02-11 11:55:41 +03:00
if ( tca [ TCA_KIND ] & & nla_strcmp ( tca [ TCA_KIND ] , tp - > ops - > kind ) ) {
NL_SET_ERR_MSG ( extack , " Specified filter kind does not match existing one " ) ;
err = - EINVAL ;
goto errout ;
}
2005-04-17 02:20:36 +04:00
fh = tp - > ops - > get ( tp , t - > tcm_handle ) ;
2017-08-05 07:31:43 +03:00
if ( ! fh ) {
2018-05-31 09:52:53 +03:00
if ( ! ( n - > nlmsg_flags & NLM_F_CREATE ) ) {
2018-01-18 19:20:50 +03:00
NL_SET_ERR_MSG ( extack , " Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter " ) ;
2017-02-09 16:38:58 +03:00
err = - ENOENT ;
2005-04-17 02:20:36 +04:00
goto errout ;
2017-02-09 16:38:58 +03:00
}
2018-05-31 09:52:53 +03:00
} else if ( n - > nlmsg_flags & NLM_F_EXCL ) {
2019-02-11 11:55:44 +03:00
tfilter_put ( tp , fh ) ;
2018-05-31 09:52:53 +03:00
NL_SET_ERR_MSG ( extack , " Filter already exists " ) ;
err = - EEXIST ;
goto errout ;
2005-04-17 02:20:36 +04:00
}
2018-07-23 10:23:07 +03:00
if ( chain - > tmplt_ops & & chain - > tmplt_ops ! = tp - > ops ) {
2022-09-21 12:27:34 +03:00
tfilter_put ( tp , fh ) ;
2018-07-23 10:23:07 +03:00
NL_SET_ERR_MSG ( extack , " Chain template is set to a different filter kind " ) ;
err = - EINVAL ;
goto errout ;
}
2021-07-30 02:12:14 +03:00
if ( ! ( n - > nlmsg_flags & NLM_F_CREATE ) )
flags | = TCA_ACT_FLAGS_REPLACE ;
if ( ! rtnl_held )
flags | = TCA_ACT_FLAGS_NO_RTNL ;
2022-10-21 10:58:39 +03:00
if ( is_qdisc_ingress ( parent ) )
flags | = TCA_ACT_FLAGS_AT_INGRESS ;
2014-04-26 00:54:06 +04:00
err = tp - > ops - > change ( net , skb , tp , cl , t - > tcm_handle , tca , & fh ,
2021-07-30 02:12:14 +03:00
flags , extack ) ;
2019-02-11 11:55:44 +03:00
if ( err = = 0 ) {
2024-03-25 23:47:35 +03:00
tcf_block_filter_cnt_update ( block , & tp - > counted , true ) ;
2018-01-17 13:46:51 +03:00
tfilter_notify ( net , skb , n , tp , block , q , parent , fh ,
2023-01-13 06:43:53 +03:00
RTM_NEWTFILTER , false , rtnl_held , extack ) ;
2019-02-11 11:55:44 +03:00
tfilter_put ( tp , fh ) ;
2019-07-21 17:44:12 +03:00
/* q pointer is NULL for shared blocks */
if ( q )
q - > flags & = ~ TCQ_F_CAN_BYPASS ;
2019-02-11 11:55:44 +03:00
}
2005-04-17 02:20:36 +04:00
errout :
2019-02-11 11:55:41 +03:00
if ( err & & tp_created )
2019-02-11 11:55:45 +03:00
tcf_chain_tp_delete_empty ( chain , tp , rtnl_held , NULL ) ;
2019-02-11 11:55:42 +03:00
errout_tp :
2019-02-11 11:55:39 +03:00
if ( chain ) {
if ( tp & & ! IS_ERR ( tp ) )
2019-02-11 11:55:45 +03:00
tcf_proto_put ( tp , rtnl_held , NULL ) ;
2019-02-11 11:55:39 +03:00
if ( ! tp_created )
tcf_chain_put ( chain ) ;
}
2019-02-11 11:55:45 +03:00
tcf_block_release ( q , block , rtnl_held ) ;
2019-02-11 11:55:48 +03:00
if ( rtnl_held )
rtnl_unlock ( ) ;
if ( err = = - EAGAIN ) {
/* Take rtnl lock in case EAGAIN is caused by concurrent flush
* of target chain .
*/
rtnl_held = true ;
2005-04-17 02:20:36 +04:00
/* Replay the request. */
goto replay ;
2019-02-11 11:55:48 +03:00
}
2005-04-17 02:20:36 +04:00
return err ;
2019-02-11 11:55:38 +03:00
errout_locked :
mutex_unlock ( & chain - > filter_chain_lock ) ;
goto errout ;
2005-04-17 02:20:36 +04:00
}
2018-05-31 09:52:53 +03:00
static int tc_del_tfilter ( struct sk_buff * skb , struct nlmsghdr * n ,
struct netlink_ext_ack * extack )
{
struct net * net = sock_net ( skb - > sk ) ;
struct nlattr * tca [ TCA_MAX + 1 ] ;
2019-10-07 23:26:28 +03:00
char name [ IFNAMSIZ ] ;
2018-05-31 09:52:53 +03:00
struct tcmsg * t ;
u32 protocol ;
u32 prio ;
u32 parent ;
u32 chain_index ;
struct Qdisc * q = NULL ;
struct tcf_chain_info chain_info ;
struct tcf_chain * chain = NULL ;
2019-02-11 11:55:48 +03:00
struct tcf_block * block = NULL ;
2018-05-31 09:52:53 +03:00
struct tcf_proto * tp = NULL ;
unsigned long cl = 0 ;
void * fh = NULL ;
int err ;
2019-02-11 11:55:48 +03:00
bool rtnl_held = false ;
2018-05-31 09:52:53 +03:00
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 15:07:28 +03:00
err = nlmsg_parse_deprecated ( n , sizeof ( * t ) , tca , TCA_MAX ,
rtm_tca_policy , extack ) ;
2018-05-31 09:52:53 +03:00
if ( err < 0 )
return err ;
t = nlmsg_data ( n ) ;
protocol = TC_H_MIN ( t - > tcm_info ) ;
prio = TC_H_MAJ ( t - > tcm_info ) ;
parent = t - > tcm_parent ;
if ( prio = = 0 & & ( protocol | | t - > tcm_handle | | tca [ TCA_KIND ] ) ) {
NL_SET_ERR_MSG ( extack , " Cannot flush filters with protocol, handle or kind set " ) ;
return - ENOENT ;
}
/* Find head of filter chain. */
2019-02-11 11:55:48 +03:00
err = __tcf_qdisc_find ( net , & q , & parent , t - > tcm_ifindex , false , extack ) ;
if ( err )
return err ;
2019-10-07 23:26:28 +03:00
if ( tcf_proto_check_kind ( tca [ TCA_KIND ] , name ) ) {
NL_SET_ERR_MSG ( extack , " Specified TC filter name too long " ) ;
err = - EINVAL ;
goto errout ;
}
2019-02-11 11:55:48 +03:00
/* Take rtnl mutex if flushing whole chain, block is shared (no qdisc
* found ) , qdisc is not unlocked , classifier type is not specified ,
* classifier is not unlocked .
*/
if ( ! prio | |
( q & & ! ( q - > ops - > cl_ops - > flags & QDISC_CLASS_OPS_DOIT_UNLOCKED ) ) | |
2019-10-07 23:26:28 +03:00
! tcf_proto_is_unlocked ( name ) ) {
2019-02-11 11:55:48 +03:00
rtnl_held = true ;
rtnl_lock ( ) ;
}
err = __tcf_qdisc_cl_find ( q , parent , & cl , t - > tcm_ifindex , extack ) ;
if ( err )
goto errout ;
block = __tcf_block_find ( net , q , cl , t - > tcm_ifindex , t - > tcm_block_index ,
extack ) ;
2018-05-31 09:52:53 +03:00
if ( IS_ERR ( block ) ) {
err = PTR_ERR ( block ) ;
goto errout ;
}
chain_index = tca [ TCA_CHAIN ] ? nla_get_u32 ( tca [ TCA_CHAIN ] ) : 0 ;
if ( chain_index > TC_ACT_EXT_VAL_MASK ) {
NL_SET_ERR_MSG ( extack , " Specified chain index exceeds upper limit " ) ;
err = - EINVAL ;
goto errout ;
}
chain = tcf_chain_get ( block , chain_index , false ) ;
if ( ! chain ) {
2018-08-03 12:08:47 +03:00
/* User requested flush on non-existent chain. Nothing to do,
* so just return success .
*/
if ( prio = = 0 ) {
err = 0 ;
goto errout ;
}
2018-05-31 09:52:53 +03:00
NL_SET_ERR_MSG ( extack , " Cannot find specified filter chain " ) ;
2018-08-27 21:58:44 +03:00
err = - ENOENT ;
2018-05-31 09:52:53 +03:00
goto errout ;
}
if ( prio = = 0 ) {
tfilter_notify_chain ( net , skb , block , q , parent , n ,
2023-01-13 06:43:53 +03:00
chain , RTM_DELTFILTER , extack ) ;
2019-02-11 11:55:45 +03:00
tcf_chain_flush ( chain , rtnl_held ) ;
2018-05-31 09:52:53 +03:00
err = 0 ;
goto errout ;
}
2019-02-11 11:55:38 +03:00
mutex_lock ( & chain - > filter_chain_lock ) ;
2018-05-31 09:52:53 +03:00
tp = tcf_chain_tp_find ( chain , & chain_info , protocol ,
prio , false ) ;
if ( ! tp | | IS_ERR ( tp ) ) {
NL_SET_ERR_MSG ( extack , " Filter with specified priority/protocol not found " ) ;
2018-06-04 18:32:23 +03:00
err = tp ? PTR_ERR ( tp ) : - ENOENT ;
2019-02-11 11:55:38 +03:00
goto errout_locked ;
2018-05-31 09:52:53 +03:00
} else if ( tca [ TCA_KIND ] & & nla_strcmp ( tca [ TCA_KIND ] , tp - > ops - > kind ) ) {
NL_SET_ERR_MSG ( extack , " Specified filter kind does not match existing one " ) ;
err = - EINVAL ;
2019-02-11 11:55:38 +03:00
goto errout_locked ;
} else if ( t - > tcm_handle = = 0 ) {
2019-11-02 17:17:47 +03:00
tcf_proto_signal_destroying ( chain , tp ) ;
2019-02-11 11:55:38 +03:00
tcf_chain_tp_remove ( chain , & chain_info , tp ) ;
mutex_unlock ( & chain - > filter_chain_lock ) ;
2019-02-11 11:55:45 +03:00
tcf_proto_put ( tp , rtnl_held , NULL ) ;
2019-02-11 11:55:38 +03:00
tfilter_notify ( net , skb , n , tp , block , q , parent , fh ,
2023-01-13 06:43:53 +03:00
RTM_DELTFILTER , false , rtnl_held , extack ) ;
2019-02-11 11:55:38 +03:00
err = 0 ;
2018-05-31 09:52:53 +03:00
goto errout ;
}
2019-02-11 11:55:38 +03:00
mutex_unlock ( & chain - > filter_chain_lock ) ;
2018-05-31 09:52:53 +03:00
fh = tp - > ops - > get ( tp , t - > tcm_handle ) ;
if ( ! fh ) {
2019-02-11 11:55:38 +03:00
NL_SET_ERR_MSG ( extack , " Specified filter handle not found " ) ;
err = - ENOENT ;
2018-05-31 09:52:53 +03:00
} else {
bool last ;
2023-12-08 22:28:46 +03:00
err = tfilter_del_notify ( net , skb , n , tp , block , q , parent , fh ,
& last , rtnl_held , extack ) ;
2019-02-11 11:55:45 +03:00
2018-05-31 09:52:53 +03:00
if ( err )
goto errout ;
2019-02-11 11:55:41 +03:00
if ( last )
2019-02-11 11:55:45 +03:00
tcf_chain_tp_delete_empty ( chain , tp , rtnl_held , extack ) ;
2018-05-31 09:52:53 +03:00
}
errout :
2019-02-11 11:55:39 +03:00
if ( chain ) {
if ( tp & & ! IS_ERR ( tp ) )
2019-02-11 11:55:45 +03:00
tcf_proto_put ( tp , rtnl_held , NULL ) ;
2018-05-31 09:52:53 +03:00
tcf_chain_put ( chain ) ;
2019-02-11 11:55:39 +03:00
}
2019-02-11 11:55:45 +03:00
tcf_block_release ( q , block , rtnl_held ) ;
2019-02-11 11:55:48 +03:00
if ( rtnl_held )
rtnl_unlock ( ) ;
2018-05-31 09:52:53 +03:00
return err ;
2019-02-11 11:55:38 +03:00
errout_locked :
mutex_unlock ( & chain - > filter_chain_lock ) ;
goto errout ;
2018-05-31 09:52:53 +03:00
}
static int tc_get_tfilter ( struct sk_buff * skb , struct nlmsghdr * n ,
struct netlink_ext_ack * extack )
{
struct net * net = sock_net ( skb - > sk ) ;
struct nlattr * tca [ TCA_MAX + 1 ] ;
2019-10-07 23:26:28 +03:00
char name [ IFNAMSIZ ] ;
2018-05-31 09:52:53 +03:00
struct tcmsg * t ;
u32 protocol ;
u32 prio ;
u32 parent ;
u32 chain_index ;
struct Qdisc * q = NULL ;
struct tcf_chain_info chain_info ;
struct tcf_chain * chain = NULL ;
2019-02-11 11:55:48 +03:00
struct tcf_block * block = NULL ;
2018-05-31 09:52:53 +03:00
struct tcf_proto * tp = NULL ;
unsigned long cl = 0 ;
void * fh = NULL ;
int err ;
2019-02-11 11:55:48 +03:00
bool rtnl_held = false ;
2018-05-31 09:52:53 +03:00
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 15:07:28 +03:00
err = nlmsg_parse_deprecated ( n , sizeof ( * t ) , tca , TCA_MAX ,
rtm_tca_policy , extack ) ;
2018-05-31 09:52:53 +03:00
if ( err < 0 )
return err ;
t = nlmsg_data ( n ) ;
protocol = TC_H_MIN ( t - > tcm_info ) ;
prio = TC_H_MAJ ( t - > tcm_info ) ;
parent = t - > tcm_parent ;
if ( prio = = 0 ) {
NL_SET_ERR_MSG ( extack , " Invalid filter command with priority of zero " ) ;
return - ENOENT ;
}
/* Find head of filter chain. */
2019-02-11 11:55:48 +03:00
err = __tcf_qdisc_find ( net , & q , & parent , t - > tcm_ifindex , false , extack ) ;
if ( err )
return err ;
2019-10-07 23:26:28 +03:00
if ( tcf_proto_check_kind ( tca [ TCA_KIND ] , name ) ) {
NL_SET_ERR_MSG ( extack , " Specified TC filter name too long " ) ;
err = - EINVAL ;
goto errout ;
}
2019-02-11 11:55:48 +03:00
/* Take rtnl mutex if block is shared (no qdisc found), qdisc is not
* unlocked , classifier type is not specified , classifier is not
* unlocked .
*/
if ( ( q & & ! ( q - > ops - > cl_ops - > flags & QDISC_CLASS_OPS_DOIT_UNLOCKED ) ) | |
2019-10-07 23:26:28 +03:00
! tcf_proto_is_unlocked ( name ) ) {
2019-02-11 11:55:48 +03:00
rtnl_held = true ;
rtnl_lock ( ) ;
}
err = __tcf_qdisc_cl_find ( q , parent , & cl , t - > tcm_ifindex , extack ) ;
if ( err )
goto errout ;
block = __tcf_block_find ( net , q , cl , t - > tcm_ifindex , t - > tcm_block_index ,
extack ) ;
2018-05-31 09:52:53 +03:00
if ( IS_ERR ( block ) ) {
err = PTR_ERR ( block ) ;
goto errout ;
}
chain_index = tca [ TCA_CHAIN ] ? nla_get_u32 ( tca [ TCA_CHAIN ] ) : 0 ;
if ( chain_index > TC_ACT_EXT_VAL_MASK ) {
NL_SET_ERR_MSG ( extack , " Specified chain index exceeds upper limit " ) ;
err = - EINVAL ;
goto errout ;
}
chain = tcf_chain_get ( block , chain_index , false ) ;
if ( ! chain ) {
NL_SET_ERR_MSG ( extack , " Cannot find specified filter chain " ) ;
err = - EINVAL ;
goto errout ;
}
2019-02-11 11:55:38 +03:00
mutex_lock ( & chain - > filter_chain_lock ) ;
2018-05-31 09:52:53 +03:00
tp = tcf_chain_tp_find ( chain , & chain_info , protocol ,
prio , false ) ;
2019-02-11 11:55:38 +03:00
mutex_unlock ( & chain - > filter_chain_lock ) ;
2018-05-31 09:52:53 +03:00
if ( ! tp | | IS_ERR ( tp ) ) {
NL_SET_ERR_MSG ( extack , " Filter with specified priority/protocol not found " ) ;
2018-06-04 18:32:23 +03:00
err = tp ? PTR_ERR ( tp ) : - ENOENT ;
2018-05-31 09:52:53 +03:00
goto errout ;
} else if ( tca [ TCA_KIND ] & & nla_strcmp ( tca [ TCA_KIND ] , tp - > ops - > kind ) ) {
NL_SET_ERR_MSG ( extack , " Specified filter kind does not match existing one " ) ;
err = - EINVAL ;
goto errout ;
}
fh = tp - > ops - > get ( tp , t - > tcm_handle ) ;
if ( ! fh ) {
NL_SET_ERR_MSG ( extack , " Specified filter handle not found " ) ;
err = - ENOENT ;
} else {
err = tfilter_notify ( net , skb , n , tp , block , q , parent ,
2023-01-13 06:43:53 +03:00
fh , RTM_NEWTFILTER , true , rtnl_held , NULL ) ;
2018-05-31 09:52:53 +03:00
if ( err < 0 )
NL_SET_ERR_MSG ( extack , " Failed to send filter notify message " ) ;
}
2019-02-11 11:55:44 +03:00
tfilter_put ( tp , fh ) ;
2018-05-31 09:52:53 +03:00
errout :
2019-02-11 11:55:39 +03:00
if ( chain ) {
if ( tp & & ! IS_ERR ( tp ) )
2019-02-11 11:55:45 +03:00
tcf_proto_put ( tp , rtnl_held , NULL ) ;
2018-05-31 09:52:53 +03:00
tcf_chain_put ( chain ) ;
2019-02-11 11:55:39 +03:00
}
2019-02-11 11:55:45 +03:00
tcf_block_release ( q , block , rtnl_held ) ;
2019-02-11 11:55:48 +03:00
if ( rtnl_held )
rtnl_unlock ( ) ;
2018-05-31 09:52:53 +03:00
return err ;
}
2008-01-21 13:26:41 +03:00
struct tcf_dump_args {
2005-04-17 02:20:36 +04:00
struct tcf_walker w ;
struct sk_buff * skb ;
struct netlink_callback * cb ;
2018-01-17 13:46:51 +03:00
struct tcf_block * block ;
2017-10-13 15:01:05 +03:00
struct Qdisc * q ;
u32 parent ;
2020-05-15 14:40:11 +03:00
bool terse_dump ;
2005-04-17 02:20:36 +04:00
} ;
2017-08-05 07:31:43 +03:00
static int tcf_node_dump ( struct tcf_proto * tp , void * n , struct tcf_walker * arg )
2005-04-17 02:20:36 +04:00
{
2008-01-21 13:26:41 +03:00
struct tcf_dump_args * a = ( void * ) arg ;
2014-01-10 04:14:01 +04:00
struct net * net = sock_net ( a - > skb - > sk ) ;
2005-04-17 02:20:36 +04:00
2018-01-17 13:46:51 +03:00
return tcf_fill_node ( net , a - > skb , tp , a - > block , a - > q , a - > parent ,
2017-10-13 15:01:05 +03:00
n , NETLINK_CB ( a - > cb - > skb ) . portid ,
2016-09-18 15:45:33 +03:00
a - > cb - > nlh - > nlmsg_seq , NLM_F_MULTI ,
2023-01-13 06:43:53 +03:00
RTM_NEWTFILTER , a - > terse_dump , true , NULL ) ;
2005-04-17 02:20:36 +04:00
}
2017-10-13 15:01:05 +03:00
/*
 * Dump the classifiers of @chain, and the filters inside each of them, into
 * @skb.
 *
 * @index_start is the classifier index to resume from and *@p_index the
 * running index, advanced once per classifier visited.  cb->args[1] and
 * cb->args[2] carry the position inside the classifier being walked.  The
 * priority/protocol in the request header, when non-zero, restrict which
 * classifiers are dumped.
 *
 * Returns true when the whole chain was processed, false when filling the
 * message failed or the walker asked to stop.
 */
static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
			   struct sk_buff *skb, struct netlink_callback *cb,
			   long index_start, long *p_index, bool terse)
{
	struct tcmsg *req = nlmsg_data(cb->nlh);
	struct tcf_block *block = chain->block;
	struct net *net = sock_net(skb->sk);
	struct tcf_proto *tp, *visited;
	struct tcf_dump_args arg;

	tp = __tcf_get_next_proto(chain, NULL);
	while (tp) {
		u32 want_prio = TC_H_MAJ(req->tcm_info);
		u32 want_proto = TC_H_MIN(req->tcm_info);

		/* Skip what was already dumped or does not match. */
		if (*p_index < index_start ||
		    (want_prio && want_prio != tp->prio) ||
		    (want_proto && want_proto != tp->protocol))
			goto next_proto;

		/* Past the resume point: forget the intra-classifier state. */
		if (*p_index > index_start)
			memset(&cb->args[1], 0,
			       sizeof(cb->args) - sizeof(cb->args[0]));

		if (cb->args[1] == 0) {
			/* First visit: emit the classifier's own entry. */
			if (tcf_fill_node(net, skb, tp, block, q, parent, NULL,
					  NETLINK_CB(cb->skb).portid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI,
					  RTM_NEWTFILTER, false, true,
					  NULL) <= 0)
				goto stop;
			cb->args[1] = 1;
		}

		if (!tp->ops->walk)
			goto next_proto;

		arg.skb = skb;
		arg.cb = cb;
		arg.block = block;
		arg.q = q;
		arg.parent = parent;
		arg.terse_dump = terse;
		arg.w.fn = tcf_node_dump;
		arg.w.stop = 0;
		arg.w.skip = cb->args[1] - 1;
		arg.w.count = 0;
		arg.w.cookie = cb->args[2];

		tp->ops->walk(tp, &arg.w, true);

		cb->args[2] = arg.w.cookie;
		cb->args[1] = arg.w.count + 1;
		if (arg.w.stop)
			goto stop;

next_proto:
		/* Fetch the successor before dropping the current one. */
		visited = tp;
		tp = __tcf_get_next_proto(chain, tp);
		tcf_proto_put(visited, true, NULL);
		(*p_index)++;
	}
	return true;

stop:
	tcf_proto_put(tp, true, NULL);
	return false;
}
2020-05-15 14:40:11 +03:00
static const struct nla_policy tcf_tfilter_dump_policy [ TCA_MAX + 1 ] = {
2023-12-28 09:43:58 +03:00
[ TCA_CHAIN ] = { . type = NLA_U32 } ,
2020-05-15 14:40:11 +03:00
[ TCA_DUMP_FLAGS ] = NLA_POLICY_BITFIELD32 ( TCA_DUMP_FLAGS_TERSE ) ,
} ;
2009-11-06 07:57:26 +03:00
/* called with RTNL */
2005-04-17 02:20:36 +04:00
static int tc_dump_tfilter ( struct sk_buff * skb , struct netlink_callback * cb )
{
2019-02-11 11:55:36 +03:00
struct tcf_chain * chain , * chain_prev ;
2008-03-25 20:26:21 +03:00
struct net * net = sock_net ( skb - > sk ) ;
2017-05-17 12:08:01 +03:00
struct nlattr * tca [ TCA_MAX + 1 ] ;
2018-01-17 13:46:51 +03:00
struct Qdisc * q = NULL ;
2017-05-17 12:07:55 +03:00
struct tcf_block * block ;
2012-06-27 08:48:50 +04:00
struct tcmsg * tcm = nlmsg_data ( cb - > nlh ) ;
2020-05-15 14:40:11 +03:00
bool terse_dump = false ;
2017-05-17 12:08:00 +03:00
long index_start ;
long index ;
2017-10-13 15:01:05 +03:00
u32 parent ;
2017-05-17 12:08:01 +03:00
int err ;
2005-04-17 02:20:36 +04:00
2013-03-27 10:47:04 +04:00
if ( nlmsg_len ( cb - > nlh ) < sizeof ( * tcm ) )
2005-04-17 02:20:36 +04:00
return skb - > len ;
2017-05-17 12:08:01 +03:00
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 15:07:28 +03:00
err = nlmsg_parse_deprecated ( cb - > nlh , sizeof ( * tcm ) , tca , TCA_MAX ,
2020-05-15 14:40:11 +03:00
tcf_tfilter_dump_policy , cb - > extack ) ;
2017-05-17 12:08:01 +03:00
if ( err )
return err ;
2020-05-15 14:40:11 +03:00
if ( tca [ TCA_DUMP_FLAGS ] ) {
struct nla_bitfield32 flags =
nla_get_bitfield32 ( tca [ TCA_DUMP_FLAGS ] ) ;
terse_dump = flags . value & TCA_DUMP_FLAGS_TERSE ;
}
2018-01-17 13:46:51 +03:00
if ( tcm - > tcm_ifindex = = TCM_IFINDEX_MAGIC_BLOCK ) {
2018-09-24 19:22:58 +03:00
block = tcf_block_refcnt_get ( net , tcm - > tcm_block_index ) ;
2018-01-17 13:46:51 +03:00
if ( ! block )
goto out ;
2018-01-18 18:14:49 +03:00
/* If we work with block index, q is NULL and parent value
* will never be used in the following code . The check
* in tcf_fill_node prevents it . However , compiler does not
* see that far , so set parent to zero to silence the warning
* about parent being uninitialized .
*/
parent = 0 ;
2017-10-13 15:01:05 +03:00
} else {
2018-01-17 13:46:51 +03:00
const struct Qdisc_class_ops * cops ;
struct net_device * dev ;
unsigned long cl = 0 ;
dev = __dev_get_by_index ( net , tcm - > tcm_ifindex ) ;
if ( ! dev )
return skb - > len ;
parent = tcm - > tcm_parent ;
net_sched: fix tcm_parent in tc filter dump
When we tell kernel to dump filters from root (ffff:ffff),
those filters on ingress (ffff:0000) are matched, but their
true parents must be dumped as they are. However, kernel
dumps just whatever we tell it, that is either ffff:ffff
or ffff:0000:
$ nl-cls-list --dev=dummy0 --parent=root
cls basic dev dummy0 id none parent root prio 49152 protocol ip match-all
cls basic dev dummy0 id :1 parent root prio 49152 protocol ip match-all
$ nl-cls-list --dev=dummy0 --parent=ffff:
cls basic dev dummy0 id none parent ffff: prio 49152 protocol ip match-all
cls basic dev dummy0 id :1 parent ffff: prio 49152 protocol ip match-all
This is confusing and misleading, more importantly this is
a regression since 4.15, so the old behavior must be restored.
And, when tc filters are installed on a tc class, the parent
should be the classid, rather than the qdisc handle. Commit
edf6711c9840 ("net: sched: remove classid and q fields from tcf_proto")
removed the classid we save for filters, we can just restore
this classid in tcf_block.
Steps to reproduce this:
ip li set dev dummy0 up
tc qd add dev dummy0 ingress
tc filter add dev dummy0 parent ffff: protocol arp basic action pass
tc filter show dev dummy0 root
Before this patch:
filter protocol arp pref 49152 basic
filter protocol arp pref 49152 basic handle 0x1
action order 1: gact action pass
random type none pass val 0
index 1 ref 1 bind 1
After this patch:
filter parent ffff: protocol arp pref 49152 basic
filter parent ffff: protocol arp pref 49152 basic handle 0x1
action order 1: gact action pass
random type none pass val 0
index 1 ref 1 bind 1
Fixes: a10fa20101ae ("net: sched: propagate q and parent from caller down to tcf_fill_node")
Fixes: edf6711c9840 ("net: sched: remove classid and q fields from tcf_proto")
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-05-01 06:53:49 +03:00
if ( ! parent )
2022-02-11 23:06:23 +03:00
q = rtnl_dereference ( dev - > qdisc ) ;
net_sched: fix tcm_parent in tc filter dump
When we tell kernel to dump filters from root (ffff:ffff),
those filters on ingress (ffff:0000) are matched, but their
true parents must be dumped as they are. However, kernel
dumps just whatever we tell it, that is either ffff:ffff
or ffff:0000:
$ nl-cls-list --dev=dummy0 --parent=root
cls basic dev dummy0 id none parent root prio 49152 protocol ip match-all
cls basic dev dummy0 id :1 parent root prio 49152 protocol ip match-all
$ nl-cls-list --dev=dummy0 --parent=ffff:
cls basic dev dummy0 id none parent ffff: prio 49152 protocol ip match-all
cls basic dev dummy0 id :1 parent ffff: prio 49152 protocol ip match-all
This is confusing and misleading, more importantly this is
a regression since 4.15, so the old behavior must be restored.
And, when tc filters are installed on a tc class, the parent
should be the classid, rather than the qdisc handle. Commit
edf6711c9840 ("net: sched: remove classid and q fields from tcf_proto")
removed the classid we save for filters, we can just restore
this classid in tcf_block.
Steps to reproduce this:
ip li set dev dummy0 up
tc qd add dev dummy0 ingress
tc filter add dev dummy0 parent ffff: protocol arp basic action pass
tc filter show dev dummy0 root
Before this patch:
filter protocol arp pref 49152 basic
filter protocol arp pref 49152 basic handle 0x1
action order 1: gact action pass
random type none pass val 0
index 1 ref 1 bind 1
After this patch:
filter parent ffff: protocol arp pref 49152 basic
filter parent ffff: protocol arp pref 49152 basic handle 0x1
action order 1: gact action pass
random type none pass val 0
index 1 ref 1 bind 1
Fixes: a10fa20101ae ("net: sched: propagate q and parent from caller down to tcf_fill_node")
Fixes: edf6711c9840 ("net: sched: remove classid and q fields from tcf_proto")
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-05-01 06:53:49 +03:00
else
2018-01-17 13:46:51 +03:00
q = qdisc_lookup ( dev , TC_H_MAJ ( tcm - > tcm_parent ) ) ;
if ( ! q )
goto out ;
cops = q - > ops - > cl_ops ;
if ( ! cops )
net_sched: remove tc class reference counting
For TC classes, their ->get() and ->put() are always paired, and the
reference counting is completely useless, because:
1) For class modification and dumping paths, we already hold RTNL lock,
so all of these ->get(),->change(),->put() are atomic.
2) For filter bindiing/unbinding, we use other reference counter than
this one, and they should have RTNL lock too.
3) For ->qlen_notify(), it is special because it is called on ->enqueue()
path, but we already hold qdisc tree lock there, and we hold this
tree lock when graft or delete the class too, so it should not be gone
or changed until we release the tree lock.
Therefore, this patch removes ->get() and ->put(), but:
1) Adds a new ->find() to find the pointer to a class by classid, no
refcnt.
2) Move the original class destroy upon the last refcnt into ->delete(),
right after releasing tree lock. This is fine because the class is
already removed from hash when holding the lock.
For those who also use ->put() as ->unbind(), just rename them to reflect
this change.
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-08-25 02:51:29 +03:00
goto out ;
2018-01-17 13:46:51 +03:00
if ( ! cops - > tcf_block )
goto out ;
if ( TC_H_MIN ( tcm - > tcm_parent ) ) {
cl = cops - > find ( q , tcm - > tcm_parent ) ;
if ( cl = = 0 )
goto out ;
}
block = cops - > tcf_block ( q , cl , NULL ) ;
if ( ! block )
goto out ;
net_sched: fix tcm_parent in tc filter dump
When we tell kernel to dump filters from root (ffff:ffff),
those filters on ingress (ffff:0000) are matched, but their
true parents must be dumped as they are. However, kernel
dumps just whatever we tell it, that is either ffff:ffff
or ffff:0000:
$ nl-cls-list --dev=dummy0 --parent=root
cls basic dev dummy0 id none parent root prio 49152 protocol ip match-all
cls basic dev dummy0 id :1 parent root prio 49152 protocol ip match-all
$ nl-cls-list --dev=dummy0 --parent=ffff:
cls basic dev dummy0 id none parent ffff: prio 49152 protocol ip match-all
cls basic dev dummy0 id :1 parent ffff: prio 49152 protocol ip match-all
This is confusing and misleading, more importantly this is
a regression since 4.15, so the old behavior must be restored.
And, when tc filters are installed on a tc class, the parent
should be the classid, rather than the qdisc handle. Commit
edf6711c9840 ("net: sched: remove classid and q fields from tcf_proto")
removed the classid we save for filters, we can just restore
this classid in tcf_block.
Steps to reproduce this:
ip li set dev dummy0 up
tc qd add dev dummy0 ingress
tc filter add dev dummy0 parent ffff: protocol arp basic action pass
tc filter show dev dummy0 root
Before this patch:
filter protocol arp pref 49152 basic
filter protocol arp pref 49152 basic handle 0x1
action order 1: gact action pass
random type none pass val 0
index 1 ref 1 bind 1
After this patch:
filter parent ffff: protocol arp pref 49152 basic
filter parent ffff: protocol arp pref 49152 basic handle 0x1
action order 1: gact action pass
random type none pass val 0
index 1 ref 1 bind 1
Fixes: a10fa20101ae ("net: sched: propagate q and parent from caller down to tcf_fill_node")
Fixes: edf6711c9840 ("net: sched: remove classid and q fields from tcf_proto")
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-05-01 06:53:49 +03:00
parent = block - > classid ;
2018-01-17 13:46:51 +03:00
if ( tcf_block_shared ( block ) )
q = NULL ;
2005-04-17 02:20:36 +04:00
}
2017-05-17 12:08:00 +03:00
index_start = cb - > args [ 0 ] ;
index = 0 ;
2017-05-17 12:08:01 +03:00
2019-02-11 11:55:36 +03:00
for ( chain = __tcf_get_next_chain ( block , NULL ) ;
chain ;
chain_prev = chain ,
chain = __tcf_get_next_chain ( block , chain ) ,
tcf_chain_put ( chain_prev ) ) {
2017-05-17 12:08:01 +03:00
if ( tca [ TCA_CHAIN ] & &
nla_get_u32 ( tca [ TCA_CHAIN ] ) ! = chain - > index )
continue ;
2017-10-13 15:01:05 +03:00
if ( ! tcf_chain_dump ( chain , q , parent , skb , cb ,
2020-05-15 14:40:11 +03:00
index_start , & index , terse_dump ) ) {
2019-02-11 11:55:36 +03:00
tcf_chain_put ( chain ) ;
2018-02-19 23:32:51 +03:00
err = - EMSGSIZE ;
2017-05-17 12:08:01 +03:00
break ;
2018-02-19 23:32:51 +03:00
}
2017-05-17 12:08:01 +03:00
}
2018-09-24 19:22:58 +03:00
if ( tcm - > tcm_ifindex = = TCM_IFINDEX_MAGIC_BLOCK )
2019-02-11 11:55:45 +03:00
tcf_block_refcnt_put ( block , true ) ;
2017-05-17 12:08:00 +03:00
cb - > args [ 0 ] = index ;
2005-04-17 02:20:36 +04:00
out :
2018-02-19 23:32:51 +03:00
/* If we did no progress, the error (EMSGSIZE) is real */
if ( skb - > len = = 0 & & err )
return err ;
2005-04-17 02:20:36 +04:00
return skb - > len ;
}
2019-02-11 11:55:37 +03:00
/*
 * Append one chain netlink message of type @event to @skb.
 *
 * The message consists of a struct tcmsg header followed by a TCA_CHAIN
 * attribute carrying @chain_index. When @tmplt_ops is non-NULL, TCA_KIND and
 * whatever ops->tmplt_dump() emits for @tmplt_priv are added as well. When
 * @extack carries a message, it is attached as TCA_EXT_WARN_MSG.
 *
 * Returns skb->len on success. On any failure the skb is trimmed back to
 * where it ended on entry and -EMSGSIZE is returned.
 */
static int tc_chain_fill_node ( const struct tcf_proto_ops * tmplt_ops ,
void * tmplt_priv , u32 chain_index ,
struct net * net , struct sk_buff * skb ,
struct tcf_block * block ,
2023-01-13 06:43:53 +03:00
u32 portid , u32 seq , u16 flags , int event ,
struct netlink_ext_ack * extack )
2018-07-23 10:23:06 +03:00
{
/* Remember the current tail so a partial message can be rolled back. */
unsigned char * b = skb_tail_pointer ( skb ) ;
2018-07-23 10:23:07 +03:00
const struct tcf_proto_ops * ops ;
2018-07-23 10:23:06 +03:00
struct nlmsghdr * nlh ;
struct tcmsg * tcm ;
2018-07-23 10:23:07 +03:00
void * priv ;
2019-02-11 11:55:37 +03:00
ops = tmplt_ops ;
priv = tmplt_priv ;
2018-07-23 10:23:06 +03:00
nlh = nlmsg_put ( skb , portid , seq , event , sizeof ( * tcm ) , flags ) ;
if ( ! nlh )
goto out_nlmsg_trim ;
tcm = nlmsg_data ( nlh ) ;
tcm - > tcm_family = AF_UNSPEC ;
tcm - > tcm__pad1 = 0 ;
tcm - > tcm__pad2 = 0 ;
tcm - > tcm_handle = 0 ;
/*
 * A block with a qdisc attached is reported by device ifindex and qdisc
 * handle; otherwise the magic ifindex is used and the block index is
 * reported instead.
 */
if ( block - > q ) {
tcm - > tcm_ifindex = qdisc_dev ( block - > q ) - > ifindex ;
tcm - > tcm_parent = block - > q - > handle ;
} else {
tcm - > tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK ;
tcm - > tcm_block_index = block - > index ;
}
2019-02-11 11:55:37 +03:00
if ( nla_put_u32 ( skb , TCA_CHAIN , chain_index ) )
2018-07-23 10:23:06 +03:00
goto nla_put_failure ;
2018-07-23 10:23:07 +03:00
/* Template details are only dumped when template ops were supplied. */
if ( ops ) {
if ( nla_put_string ( skb , TCA_KIND , ops - > kind ) )
goto nla_put_failure ;
if ( ops - > tmplt_dump ( skb , net , priv ) < 0 )
goto nla_put_failure ;
}
2023-01-13 06:43:53 +03:00
if ( extack & & extack - > _msg & &
nla_put_string ( skb , TCA_EXT_WARN_MSG , extack - > _msg ) )
goto out_nlmsg_trim ;
2018-07-23 10:23:06 +03:00
/* Final length covers everything appended since entry. */
nlh - > nlmsg_len = skb_tail_pointer ( skb ) - b ;
2023-01-13 06:43:53 +03:00
2018-07-23 10:23:06 +03:00
return skb - > len ;
/* Both labels share the same rollback path. */
out_nlmsg_trim :
nla_put_failure :
nlmsg_trim ( skb , b ) ;
return - EMSGSIZE ;
}
/*
 * Build a notification of type @event for @chain and send it.
 *
 * With @unicast set the message goes only to the port id taken from @oskb
 * (0 when @oskb is NULL); otherwise it is sent through rtnetlink_send() to
 * RTNLGRP_TC. In the non-unicast case nothing is allocated or sent when
 * rtnl_notify_needed() reports that no notification is needed.
 *
 * Returns 0 when the notification is skipped, -ENOBUFS when the skb cannot
 * be allocated, -EINVAL when the message cannot be filled, otherwise the
 * result of the send call.
 */
static int tc_chain_notify ( struct tcf_chain * chain , struct sk_buff * oskb ,
2023-01-13 06:43:53 +03:00
u32 seq , u16 flags , int event , bool unicast ,
struct netlink_ext_ack * extack )
2018-07-23 10:23:06 +03:00
{
u32 portid = oskb ? NETLINK_CB ( oskb ) . portid : 0 ;
struct tcf_block * block = chain - > block ;
struct net * net = block - > net ;
struct sk_buff * skb ;
2019-03-11 13:15:54 +03:00
int err = 0 ;
2018-07-23 10:23:06 +03:00
2023-12-08 22:28:47 +03:00
/* A unicast reply is always built; other notifications may be skipped. */
if ( ! unicast & & ! rtnl_notify_needed ( net , flags , RTNLGRP_TC ) )
return 0 ;
2018-07-23 10:23:06 +03:00
skb = alloc_skb ( NLMSG_GOODSIZE , GFP_KERNEL ) ;
if ( ! skb )
return - ENOBUFS ;
2019-02-11 11:55:37 +03:00
/* The fill helper's error code is not propagated; -EINVAL is returned. */
if ( tc_chain_fill_node ( chain - > tmplt_ops , chain - > tmplt_priv ,
chain - > index , net , skb , block , portid ,
2023-01-13 06:43:53 +03:00
seq , flags , event , extack ) < = 0 ) {
2018-07-23 10:23:06 +03:00
kfree_skb ( skb ) ;
return - EINVAL ;
}
if ( unicast )
2021-07-15 15:24:24 +03:00
err = rtnl_unicast ( skb , net , portid ) ;
2019-03-11 13:15:54 +03:00
else
err = rtnetlink_send ( skb , net , portid , RTNLGRP_TC ,
flags & NLM_F_ECHO ) ;
2018-07-23 10:23:06 +03:00
2019-03-11 13:15:54 +03:00
return err ;
2018-07-23 10:23:06 +03:00
}
2019-02-11 11:55:37 +03:00
/*
 * Send an RTM_DELCHAIN notification to RTNLGRP_TC.
 *
 * Unlike tc_chain_notify(), this takes the template ops/priv, chain index
 * and block as separate arguments instead of a struct tcf_chain pointer.
 * NOTE(review): presumably so the caller can notify after the chain object
 * is no longer usable - confirm against the call site.
 *
 * Returns 0 when rtnl_notify_needed() says no notification is needed,
 * -ENOBUFS when the skb cannot be allocated, -EINVAL when the message cannot
 * be filled, otherwise the result of rtnetlink_send().
 */
static int tc_chain_notify_delete ( const struct tcf_proto_ops * tmplt_ops ,
void * tmplt_priv , u32 chain_index ,
struct tcf_block * block , struct sk_buff * oskb ,
2023-12-08 22:28:46 +03:00
u32 seq , u16 flags )
2019-02-11 11:55:37 +03:00
{
u32 portid = oskb ? NETLINK_CB ( oskb ) . portid : 0 ;
struct net * net = block - > net ;
struct sk_buff * skb ;
2023-12-08 22:28:47 +03:00
if ( ! rtnl_notify_needed ( net , flags , RTNLGRP_TC ) )
return 0 ;
2019-02-11 11:55:37 +03:00
skb = alloc_skb ( NLMSG_GOODSIZE , GFP_KERNEL ) ;
if ( ! skb )
return - ENOBUFS ;
/* No extack is passed, so no TCA_EXT_WARN_MSG is attached here. */
if ( tc_chain_fill_node ( tmplt_ops , tmplt_priv , chain_index , net , skb ,
2023-01-13 06:43:53 +03:00
block , portid , seq , flags , RTM_DELCHAIN , NULL ) < = 0 ) {
2019-02-11 11:55:37 +03:00
kfree_skb ( skb ) ;
return - EINVAL ;
}
return rtnetlink_send ( skb , net , portid , RTNLGRP_TC , flags & NLM_F_ECHO ) ;
}
2018-07-23 10:23:07 +03:00
/*
 * Create a classifier template for @chain when the request carries TCA_KIND.
 *
 * The kind string is validated and copied into a local buffer, the matching
 * classifier ops are looked up, and the classifier's tmplt_create() callback
 * is invoked. On success the ops and the created private data are stored in
 * chain->tmplt_ops and chain->tmplt_priv.
 *
 * Returns 0 on success or when no TCA_KIND was supplied, -EINVAL when the
 * kind check fails, -EOPNOTSUPP when the classifier lacks any of the four
 * template callbacks, or the error encoded by the ops lookup or by
 * tmplt_create().
 */
static int tc_chain_tmplt_add ( struct tcf_chain * chain , struct net * net ,
struct nlattr * * tca ,
struct netlink_ext_ack * extack )
{
const struct tcf_proto_ops * ops ;
2019-12-07 22:34:45 +03:00
char name [ IFNAMSIZ ] ;
2018-07-23 10:23:07 +03:00
void * tmplt_priv ;
/* If kind is not set, user did not specify template. */
if ( ! tca [ TCA_KIND ] )
return 0 ;
2019-12-07 22:34:45 +03:00
if ( tcf_proto_check_kind ( tca [ TCA_KIND ] , name ) ) {
NL_SET_ERR_MSG ( extack , " Specified TC chain template name too long " ) ;
return - EINVAL ;
}
ops = tcf_proto_lookup_ops ( name , true , extack ) ;
2018-07-23 10:23:07 +03:00
if ( IS_ERR ( ops ) )
return PTR_ERR ( ops ) ;
net/sched: flower: Fix chain template offload
When a qdisc is deleted from a net device the stack instructs the
underlying driver to remove its flow offload callback from the
associated filter block using the 'FLOW_BLOCK_UNBIND' command. The stack
then continues to replay the removal of the filters in the block for
this driver by iterating over the chains in the block and invoking the
'reoffload' operation of the classifier being used. In turn, the
classifier in its 'reoffload' operation prepares and emits a
'FLOW_CLS_DESTROY' command for each filter.
However, the stack does not do the same for chain templates and the
underlying driver never receives a 'FLOW_CLS_TMPLT_DESTROY' command when
a qdisc is deleted. This results in a memory leak [1] which can be
reproduced using [2].
Fix by introducing a 'tmplt_reoffload' operation and have the stack
invoke it with the appropriate arguments as part of the replay.
Implement the operation in the sole classifier that supports chain
templates (flower) by emitting the 'FLOW_CLS_TMPLT_{CREATE,DESTROY}'
command based on whether a flow offload callback is being bound to a
filter block or being unbound from one.
As far as I can tell, the issue happens since cited commit which
reordered tcf_block_offload_unbind() before tcf_block_flush_all_chains()
in __tcf_block_put(). The order cannot be reversed as the filter block
is expected to be freed after flushing all the chains.
[1]
unreferenced object 0xffff888107e28800 (size 2048):
comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s)
hex dump (first 32 bytes):
b1 a6 7c 11 81 88 ff ff e0 5b b3 10 81 88 ff ff ..|......[......
01 00 00 00 00 00 00 00 e0 aa b0 84 ff ff ff ff ................
backtrace:
[<ffffffff81c06a68>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff81ab374e>] __kmalloc+0x4e/0x90
[<ffffffff832aec6d>] mlxsw_sp_acl_ruleset_get+0x34d/0x7a0
[<ffffffff832bc195>] mlxsw_sp_flower_tmplt_create+0x145/0x180
[<ffffffff832b2e1a>] mlxsw_sp_flow_block_cb+0x1ea/0x280
[<ffffffff83a10613>] tc_setup_cb_call+0x183/0x340
[<ffffffff83a9f85a>] fl_tmplt_create+0x3da/0x4c0
[<ffffffff83a22435>] tc_ctl_chain+0xa15/0x1170
[<ffffffff838a863c>] rtnetlink_rcv_msg+0x3cc/0xed0
[<ffffffff83ac87f0>] netlink_rcv_skb+0x170/0x440
[<ffffffff83ac6270>] netlink_unicast+0x540/0x820
[<ffffffff83ac6e28>] netlink_sendmsg+0x8d8/0xda0
[<ffffffff83793def>] ____sys_sendmsg+0x30f/0xa80
[<ffffffff8379d29a>] ___sys_sendmsg+0x13a/0x1e0
[<ffffffff8379d50c>] __sys_sendmsg+0x11c/0x1f0
[<ffffffff843b9ce0>] do_syscall_64+0x40/0xe0
unreferenced object 0xffff88816d2c0400 (size 1024):
comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s)
hex dump (first 32 bytes):
40 00 00 00 00 00 00 00 57 f6 38 be 00 00 00 00 @.......W.8.....
10 04 2c 6d 81 88 ff ff 10 04 2c 6d 81 88 ff ff ..,m......,m....
backtrace:
[<ffffffff81c06a68>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff81ab36c1>] __kmalloc_node+0x51/0x90
[<ffffffff81a8ed96>] kvmalloc_node+0xa6/0x1f0
[<ffffffff82827d03>] bucket_table_alloc.isra.0+0x83/0x460
[<ffffffff82828d2b>] rhashtable_init+0x43b/0x7c0
[<ffffffff832aed48>] mlxsw_sp_acl_ruleset_get+0x428/0x7a0
[<ffffffff832bc195>] mlxsw_sp_flower_tmplt_create+0x145/0x180
[<ffffffff832b2e1a>] mlxsw_sp_flow_block_cb+0x1ea/0x280
[<ffffffff83a10613>] tc_setup_cb_call+0x183/0x340
[<ffffffff83a9f85a>] fl_tmplt_create+0x3da/0x4c0
[<ffffffff83a22435>] tc_ctl_chain+0xa15/0x1170
[<ffffffff838a863c>] rtnetlink_rcv_msg+0x3cc/0xed0
[<ffffffff83ac87f0>] netlink_rcv_skb+0x170/0x440
[<ffffffff83ac6270>] netlink_unicast+0x540/0x820
[<ffffffff83ac6e28>] netlink_sendmsg+0x8d8/0xda0
[<ffffffff83793def>] ____sys_sendmsg+0x30f/0xa80
[2]
# tc qdisc add dev swp1 clsact
# tc chain add dev swp1 ingress proto ip chain 1 flower dst_ip 0.0.0.0/32
# tc qdisc del dev swp1 clsact
# devlink dev reload pci/0000:06:00.0
Fixes: bbf73830cd48 ("net: sched: traverse chains in block with tcf_get_next_chain()")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-01-22 16:28:43 +03:00
/*
 * All four template callbacks must be provided by the classifier;
 * otherwise templates are rejected and the module reference is dropped
 * (presumably the one taken by tcf_proto_lookup_ops() - confirm).
 */
if ( ! ops - > tmplt_create | | ! ops - > tmplt_destroy | | ! ops - > tmplt_dump | |
! ops - > tmplt_reoffload ) {
2018-07-23 10:23:07 +03:00
NL_SET_ERR_MSG ( extack , " Chain templates are not supported with specified classifier " ) ;
2023-06-07 05:23:01 +03:00
module_put ( ops - > owner ) ;
2018-07-23 10:23:07 +03:00
return - EOPNOTSUPP ;
}
tmplt_priv = ops - > tmplt_create ( net , chain , tca , extack ) ;
if ( IS_ERR ( tmplt_priv ) ) {
module_put ( ops - > owner ) ;
return PTR_ERR ( tmplt_priv ) ;
}
/* Success: the chain now holds the template ops and private data. */
chain - > tmplt_ops = ops ;
chain - > tmplt_priv = tmplt_priv ;
return 0 ;
}
2019-02-11 11:55:37 +03:00
/*
 * Destroy a chain template: invoke the classifier's tmplt_destroy() on
 * @tmplt_priv and drop the reference on the classifier module. Does nothing
 * when @tmplt_ops is NULL.
 */
static void tc_chain_tmplt_del ( const struct tcf_proto_ops * tmplt_ops ,
void * tmplt_priv )
2018-07-23 10:23:07 +03:00
{
/* If template ops are not set, there is no template to destroy. */
2019-02-11 11:55:37 +03:00
if ( ! tmplt_ops )
2018-07-23 10:23:07 +03:00
return ;
2019-02-11 11:55:37 +03:00
tmplt_ops - > tmplt_destroy ( tmplt_priv ) ;
module_put ( tmplt_ops - > owner ) ;
2018-07-23 10:23:07 +03:00
}
2018-07-23 10:23:06 +03:00
/* Add/delete/get a chain */
static int tc_ctl_chain ( struct sk_buff * skb , struct nlmsghdr * n ,
struct netlink_ext_ack * extack )
{
struct net * net = sock_net ( skb - > sk ) ;
struct nlattr * tca [ TCA_MAX + 1 ] ;
struct tcmsg * t ;
u32 parent ;
u32 chain_index ;
net: sched: fix use-after-free in tc_new_tfilter()
Whenever tc_new_tfilter() jumps back to replay: label,
we need to make sure @q and @chain local variables are cleared again,
or risk use-after-free as in [1]
For consistency, apply the same fix in tc_ctl_chain()
BUG: KASAN: use-after-free in mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581
Write of size 8 at addr ffff8880985c4b08 by task syz-executor.4/1945
CPU: 0 PID: 1945 Comm: syz-executor.4 Not tainted 5.17.0-rc1-syzkaller-00495-gff58831fa02d #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581
tcf_chain_head_change_item net/sched/cls_api.c:372 [inline]
tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:386
tcf_chain_tp_insert net/sched/cls_api.c:1657 [inline]
tcf_chain_tp_insert_unique net/sched/cls_api.c:1707 [inline]
tc_new_tfilter+0x1e67/0x2350 net/sched/cls_api.c:2086
rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f2647172059
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f2645aa5168 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
RAX: ffffffffffffffda RBX: 00007f2647285100 RCX: 00007f2647172059
RDX: 040000000000009f RSI: 00000000200002c0 RDI: 0000000000000006
RBP: 00007f26471cc08d R08: 0000000000000000 R09: 0000000000000000
R10: 9e00000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007fffb3f7f02f R14: 00007f2645aa5300 R15: 0000000000022000
</TASK>
Allocated by task 1944:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:45 [inline]
set_alloc_info mm/kasan/common.c:436 [inline]
____kasan_kmalloc mm/kasan/common.c:515 [inline]
____kasan_kmalloc mm/kasan/common.c:474 [inline]
__kasan_kmalloc+0xa9/0xd0 mm/kasan/common.c:524
kmalloc_node include/linux/slab.h:604 [inline]
kzalloc_node include/linux/slab.h:726 [inline]
qdisc_alloc+0xac/0xa10 net/sched/sch_generic.c:941
qdisc_create.constprop.0+0xce/0x10f0 net/sched/sch_api.c:1211
tc_modify_qdisc+0x4c5/0x1980 net/sched/sch_api.c:1660
rtnetlink_rcv_msg+0x413/0xb80 net/core/rtnetlink.c:5592
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Freed by task 3609:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
kasan_set_track+0x21/0x30 mm/kasan/common.c:45
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free+0x130/0x160 mm/kasan/common.c:328
kasan_slab_free include/linux/kasan.h:236 [inline]
slab_free_hook mm/slub.c:1728 [inline]
slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1754
slab_free mm/slub.c:3509 [inline]
kfree+0xcb/0x280 mm/slub.c:4562
rcu_do_batch kernel/rcu/tree.c:2527 [inline]
rcu_core+0x7b8/0x1540 kernel/rcu/tree.c:2778
__do_softirq+0x29b/0x9c2 kernel/softirq.c:558
Last potentially related work creation:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
__kasan_record_aux_stack+0xbe/0xd0 mm/kasan/generic.c:348
__call_rcu kernel/rcu/tree.c:3026 [inline]
call_rcu+0xb1/0x740 kernel/rcu/tree.c:3106
qdisc_put_unlocked+0x6f/0x90 net/sched/sch_generic.c:1109
tcf_block_release+0x86/0x90 net/sched/cls_api.c:1238
tc_new_tfilter+0xc0d/0x2350 net/sched/cls_api.c:2148
rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
The buggy address belongs to the object at ffff8880985c4800
which belongs to the cache kmalloc-1k of size 1024
The buggy address is located 776 bytes inside of
1024-byte region [ffff8880985c4800, ffff8880985c4c00)
The buggy address belongs to the page:
page:ffffea0002617000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x985c0
head:ffffea0002617000 order:3 compound_mapcount:0 compound_pincount:0
flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff)
raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888010c41dc0
raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 3, migratetype Unmovable, gfp_mask 0x1d20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC|__GFP_HARDWALL), pid 1941, ts 1038999441284, free_ts 1033444432829
prep_new_page mm/page_alloc.c:2434 [inline]
get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165
__alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389
alloc_pages+0x1aa/0x310 mm/mempolicy.c:2271
alloc_slab_page mm/slub.c:1799 [inline]
allocate_slab mm/slub.c:1944 [inline]
new_slab+0x28a/0x3b0 mm/slub.c:2004
___slab_alloc+0x87c/0xe90 mm/slub.c:3018
__slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3105
slab_alloc_node mm/slub.c:3196 [inline]
slab_alloc mm/slub.c:3238 [inline]
__kmalloc+0x2fb/0x340 mm/slub.c:4420
kmalloc include/linux/slab.h:586 [inline]
kzalloc include/linux/slab.h:715 [inline]
__register_sysctl_table+0x112/0x1090 fs/proc/proc_sysctl.c:1335
neigh_sysctl_register+0x2c8/0x5e0 net/core/neighbour.c:3787
devinet_sysctl_register+0xb1/0x230 net/ipv4/devinet.c:2618
inetdev_init+0x286/0x580 net/ipv4/devinet.c:278
inetdev_event+0xa8a/0x15d0 net/ipv4/devinet.c:1532
notifier_call_chain+0xb5/0x200 kernel/notifier.c:84
call_netdevice_notifiers_info+0xb5/0x130 net/core/dev.c:1919
call_netdevice_notifiers_extack net/core/dev.c:1931 [inline]
call_netdevice_notifiers net/core/dev.c:1945 [inline]
register_netdevice+0x1073/0x1500 net/core/dev.c:9698
veth_newlink+0x59c/0xa90 drivers/net/veth.c:1722
page last free stack trace:
reset_page_owner include/linux/page_owner.h:24 [inline]
free_pages_prepare mm/page_alloc.c:1352 [inline]
free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404
free_unref_page_prepare mm/page_alloc.c:3325 [inline]
free_unref_page+0x19/0x690 mm/page_alloc.c:3404
release_pages+0x748/0x1220 mm/swap.c:956
tlb_batch_pages_flush mm/mmu_gather.c:50 [inline]
tlb_flush_mmu_free mm/mmu_gather.c:243 [inline]
tlb_flush_mmu+0xe9/0x6b0 mm/mmu_gather.c:250
zap_pte_range mm/memory.c:1441 [inline]
zap_pmd_range mm/memory.c:1490 [inline]
zap_pud_range mm/memory.c:1519 [inline]
zap_p4d_range mm/memory.c:1540 [inline]
unmap_page_range+0x1d1d/0x2a30 mm/memory.c:1561
unmap_single_vma+0x198/0x310 mm/memory.c:1606
unmap_vmas+0x16b/0x2f0 mm/memory.c:1638
exit_mmap+0x201/0x670 mm/mmap.c:3178
__mmput+0x122/0x4b0 kernel/fork.c:1114
mmput+0x56/0x60 kernel/fork.c:1135
exit_mm kernel/exit.c:507 [inline]
do_exit+0xa3c/0x2a30 kernel/exit.c:793
do_group_exit+0xd2/0x2f0 kernel/exit.c:935
__do_sys_exit_group kernel/exit.c:946 [inline]
__se_sys_exit_group kernel/exit.c:944 [inline]
__x64_sys_exit_group+0x3a/0x50 kernel/exit.c:944
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Memory state around the buggy address:
ffff8880985c4a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8880985c4a80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8880985c4b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff8880985c4b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8880985c4c00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
Fixes: 470502de5bdb ("net: sched: unlock rules update API")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Vlad Buslov <vladbu@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Link: https://lore.kernel.org/r/20220131172018.3704490-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-01-31 20:20:18 +03:00
struct Qdisc * q ;
struct tcf_chain * chain ;
2018-07-23 10:23:06 +03:00
struct tcf_block * block ;
unsigned long cl ;
int err ;
replay :
net: sched: fix use-after-free in tc_new_tfilter()
Whenever tc_new_tfilter() jumps back to replay: label,
we need to make sure @q and @chain local variables are cleared again,
or risk use-after-free as in [1]
For consistency, apply the same fix in tc_ctl_chain()
BUG: KASAN: use-after-free in mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581
Write of size 8 at addr ffff8880985c4b08 by task syz-executor.4/1945
CPU: 0 PID: 1945 Comm: syz-executor.4 Not tainted 5.17.0-rc1-syzkaller-00495-gff58831fa02d #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
mini_qdisc_pair_swap+0x1b9/0x1f0 net/sched/sch_generic.c:1581
tcf_chain_head_change_item net/sched/cls_api.c:372 [inline]
tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:386
tcf_chain_tp_insert net/sched/cls_api.c:1657 [inline]
tcf_chain_tp_insert_unique net/sched/cls_api.c:1707 [inline]
tc_new_tfilter+0x1e67/0x2350 net/sched/cls_api.c:2086
rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f2647172059
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f2645aa5168 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
RAX: ffffffffffffffda RBX: 00007f2647285100 RCX: 00007f2647172059
RDX: 040000000000009f RSI: 00000000200002c0 RDI: 0000000000000006
RBP: 00007f26471cc08d R08: 0000000000000000 R09: 0000000000000000
R10: 9e00000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007fffb3f7f02f R14: 00007f2645aa5300 R15: 0000000000022000
</TASK>
Allocated by task 1944:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:45 [inline]
set_alloc_info mm/kasan/common.c:436 [inline]
____kasan_kmalloc mm/kasan/common.c:515 [inline]
____kasan_kmalloc mm/kasan/common.c:474 [inline]
__kasan_kmalloc+0xa9/0xd0 mm/kasan/common.c:524
kmalloc_node include/linux/slab.h:604 [inline]
kzalloc_node include/linux/slab.h:726 [inline]
qdisc_alloc+0xac/0xa10 net/sched/sch_generic.c:941
qdisc_create.constprop.0+0xce/0x10f0 net/sched/sch_api.c:1211
tc_modify_qdisc+0x4c5/0x1980 net/sched/sch_api.c:1660
rtnetlink_rcv_msg+0x413/0xb80 net/core/rtnetlink.c:5592
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Freed by task 3609:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
kasan_set_track+0x21/0x30 mm/kasan/common.c:45
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free+0x130/0x160 mm/kasan/common.c:328
kasan_slab_free include/linux/kasan.h:236 [inline]
slab_free_hook mm/slub.c:1728 [inline]
slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1754
slab_free mm/slub.c:3509 [inline]
kfree+0xcb/0x280 mm/slub.c:4562
rcu_do_batch kernel/rcu/tree.c:2527 [inline]
rcu_core+0x7b8/0x1540 kernel/rcu/tree.c:2778
__do_softirq+0x29b/0x9c2 kernel/softirq.c:558
Last potentially related work creation:
kasan_save_stack+0x1e/0x40 mm/kasan/common.c:38
__kasan_record_aux_stack+0xbe/0xd0 mm/kasan/generic.c:348
__call_rcu kernel/rcu/tree.c:3026 [inline]
call_rcu+0xb1/0x740 kernel/rcu/tree.c:3106
qdisc_put_unlocked+0x6f/0x90 net/sched/sch_generic.c:1109
tcf_block_release+0x86/0x90 net/sched/cls_api.c:1238
tc_new_tfilter+0xc0d/0x2350 net/sched/cls_api.c:2148
rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:5583
netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
netlink_unicast+0x539/0x7e0 net/netlink/af_netlink.c:1343
netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1919
sock_sendmsg_nosec net/socket.c:705 [inline]
sock_sendmsg+0xcf/0x120 net/socket.c:725
____sys_sendmsg+0x331/0x810 net/socket.c:2413
___sys_sendmsg+0xf3/0x170 net/socket.c:2467
__sys_sendmmsg+0x195/0x470 net/socket.c:2553
__do_sys_sendmmsg net/socket.c:2582 [inline]
__se_sys_sendmmsg net/socket.c:2579 [inline]
__x64_sys_sendmmsg+0x99/0x100 net/socket.c:2579
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
The buggy address belongs to the object at ffff8880985c4800
which belongs to the cache kmalloc-1k of size 1024
The buggy address is located 776 bytes inside of
1024-byte region [ffff8880985c4800, ffff8880985c4c00)
The buggy address belongs to the page:
page:ffffea0002617000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x985c0
head:ffffea0002617000 order:3 compound_mapcount:0 compound_pincount:0
flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff)
raw: 00fff00000010200 0000000000000000 dead000000000122 ffff888010c41dc0
raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 3, migratetype Unmovable, gfp_mask 0x1d20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC|__GFP_HARDWALL), pid 1941, ts 1038999441284, free_ts 1033444432829
prep_new_page mm/page_alloc.c:2434 [inline]
get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165
__alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389
alloc_pages+0x1aa/0x310 mm/mempolicy.c:2271
alloc_slab_page mm/slub.c:1799 [inline]
allocate_slab mm/slub.c:1944 [inline]
new_slab+0x28a/0x3b0 mm/slub.c:2004
___slab_alloc+0x87c/0xe90 mm/slub.c:3018
__slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3105
slab_alloc_node mm/slub.c:3196 [inline]
slab_alloc mm/slub.c:3238 [inline]
__kmalloc+0x2fb/0x340 mm/slub.c:4420
kmalloc include/linux/slab.h:586 [inline]
kzalloc include/linux/slab.h:715 [inline]
__register_sysctl_table+0x112/0x1090 fs/proc/proc_sysctl.c:1335
neigh_sysctl_register+0x2c8/0x5e0 net/core/neighbour.c:3787
devinet_sysctl_register+0xb1/0x230 net/ipv4/devinet.c:2618
inetdev_init+0x286/0x580 net/ipv4/devinet.c:278
inetdev_event+0xa8a/0x15d0 net/ipv4/devinet.c:1532
notifier_call_chain+0xb5/0x200 kernel/notifier.c:84
call_netdevice_notifiers_info+0xb5/0x130 net/core/dev.c:1919
call_netdevice_notifiers_extack net/core/dev.c:1931 [inline]
call_netdevice_notifiers net/core/dev.c:1945 [inline]
register_netdevice+0x1073/0x1500 net/core/dev.c:9698
veth_newlink+0x59c/0xa90 drivers/net/veth.c:1722
page last free stack trace:
reset_page_owner include/linux/page_owner.h:24 [inline]
free_pages_prepare mm/page_alloc.c:1352 [inline]
free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404
free_unref_page_prepare mm/page_alloc.c:3325 [inline]
free_unref_page+0x19/0x690 mm/page_alloc.c:3404
release_pages+0x748/0x1220 mm/swap.c:956
tlb_batch_pages_flush mm/mmu_gather.c:50 [inline]
tlb_flush_mmu_free mm/mmu_gather.c:243 [inline]
tlb_flush_mmu+0xe9/0x6b0 mm/mmu_gather.c:250
zap_pte_range mm/memory.c:1441 [inline]
zap_pmd_range mm/memory.c:1490 [inline]
zap_pud_range mm/memory.c:1519 [inline]
zap_p4d_range mm/memory.c:1540 [inline]
unmap_page_range+0x1d1d/0x2a30 mm/memory.c:1561
unmap_single_vma+0x198/0x310 mm/memory.c:1606
unmap_vmas+0x16b/0x2f0 mm/memory.c:1638
exit_mmap+0x201/0x670 mm/mmap.c:3178
__mmput+0x122/0x4b0 kernel/fork.c:1114
mmput+0x56/0x60 kernel/fork.c:1135
exit_mm kernel/exit.c:507 [inline]
do_exit+0xa3c/0x2a30 kernel/exit.c:793
do_group_exit+0xd2/0x2f0 kernel/exit.c:935
__do_sys_exit_group kernel/exit.c:946 [inline]
__se_sys_exit_group kernel/exit.c:944 [inline]
__x64_sys_exit_group+0x3a/0x50 kernel/exit.c:944
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Memory state around the buggy address:
ffff8880985c4a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8880985c4a80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8880985c4b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff8880985c4b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8880985c4c00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
Fixes: 470502de5bdb ("net: sched: unlock rules update API")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Vlad Buslov <vladbu@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Link: https://lore.kernel.org/r/20220131172018.3704490-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-01-31 20:20:18 +03:00
q = NULL ;
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 15:07:28 +03:00
err = nlmsg_parse_deprecated ( n , sizeof ( * t ) , tca , TCA_MAX ,
rtm_tca_policy , extack ) ;
2018-07-23 10:23:06 +03:00
if ( err < 0 )
return err ;
t = nlmsg_data ( n ) ;
parent = t - > tcm_parent ;
cl = 0 ;
block = tcf_block_find ( net , & q , & parent , & cl ,
t - > tcm_ifindex , t - > tcm_block_index , extack ) ;
if ( IS_ERR ( block ) )
return PTR_ERR ( block ) ;
chain_index = tca [ TCA_CHAIN ] ? nla_get_u32 ( tca [ TCA_CHAIN ] ) : 0 ;
if ( chain_index > TC_ACT_EXT_VAL_MASK ) {
NL_SET_ERR_MSG ( extack , " Specified chain index exceeds upper limit " ) ;
2018-09-24 19:22:53 +03:00
err = - EINVAL ;
goto errout_block ;
2018-07-23 10:23:06 +03:00
}
2019-02-11 11:55:34 +03:00
mutex_lock ( & block - > lock ) ;
2018-07-23 10:23:06 +03:00
chain = tcf_chain_lookup ( block , chain_index ) ;
if ( n - > nlmsg_type = = RTM_NEWCHAIN ) {
if ( chain ) {
2018-08-01 13:36:55 +03:00
if ( tcf_chain_held_by_acts_only ( chain ) ) {
2018-07-27 10:45:05 +03:00
/* The chain exists only because there is
2018-08-01 13:36:55 +03:00
* some action referencing it .
2018-07-27 10:45:05 +03:00
*/
tcf_chain_hold ( chain ) ;
} else {
NL_SET_ERR_MSG ( extack , " Filter chain already exists " ) ;
2018-09-24 19:22:53 +03:00
err = - EEXIST ;
2019-02-11 11:55:34 +03:00
goto errout_block_locked ;
2018-07-27 10:45:05 +03:00
}
} else {
if ( ! ( n - > nlmsg_flags & NLM_F_CREATE ) ) {
NL_SET_ERR_MSG ( extack , " Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain " ) ;
2018-09-24 19:22:53 +03:00
err = - ENOENT ;
2019-02-11 11:55:34 +03:00
goto errout_block_locked ;
2018-07-27 10:45:05 +03:00
}
chain = tcf_chain_create ( block , chain_index ) ;
if ( ! chain ) {
NL_SET_ERR_MSG ( extack , " Failed to create filter chain " ) ;
2018-09-24 19:22:53 +03:00
err = - ENOMEM ;
2019-02-11 11:55:34 +03:00
goto errout_block_locked ;
2018-07-27 10:45:05 +03:00
}
2018-07-23 10:23:06 +03:00
}
} else {
2018-08-01 13:36:55 +03:00
if ( ! chain | | tcf_chain_held_by_acts_only ( chain ) ) {
2018-07-23 10:23:06 +03:00
NL_SET_ERR_MSG ( extack , " Cannot find specified filter chain " ) ;
2018-09-24 19:22:53 +03:00
err = - EINVAL ;
2019-02-11 11:55:34 +03:00
goto errout_block_locked ;
2018-07-23 10:23:06 +03:00
}
tcf_chain_hold ( chain ) ;
}
2019-02-11 11:55:34 +03:00
if ( n - > nlmsg_type = = RTM_NEWCHAIN ) {
/* Modifying chain requires holding parent block lock. In case
* the chain was successfully added , take a reference to the
* chain . This ensures that an empty chain does not disappear at
* the end of this function .
*/
tcf_chain_hold ( chain ) ;
chain - > explicitly_created = true ;
}
mutex_unlock ( & block - > lock ) ;
2018-07-23 10:23:06 +03:00
switch ( n - > nlmsg_type ) {
case RTM_NEWCHAIN :
2018-07-23 10:23:07 +03:00
err = tc_chain_tmplt_add ( chain , net , tca , extack ) ;
2019-02-11 11:55:34 +03:00
if ( err ) {
tcf_chain_put_explicitly_created ( chain ) ;
2018-07-23 10:23:07 +03:00
goto errout ;
2019-02-11 11:55:34 +03:00
}
2018-07-23 10:23:06 +03:00
tc_chain_notify ( chain , NULL , 0 , NLM_F_CREATE | NLM_F_EXCL ,
2023-01-13 06:43:53 +03:00
RTM_NEWCHAIN , false , extack ) ;
2018-07-23 10:23:06 +03:00
break ;
case RTM_DELCHAIN :
2018-09-12 00:22:23 +03:00
tfilter_notify_chain ( net , skb , block , q , parent , n ,
2023-01-13 06:43:53 +03:00
chain , RTM_DELTFILTER , extack ) ;
2018-07-23 10:23:06 +03:00
/* Flush the chain first as the user requested chain removal. */
2019-02-11 11:55:45 +03:00
tcf_chain_flush ( chain , true ) ;
2018-07-23 10:23:06 +03:00
/* In case the chain was successfully deleted, put a reference
* to the chain previously taken during addition .
*/
tcf_chain_put_explicitly_created ( chain ) ;
break ;
case RTM_GETCHAIN :
err = tc_chain_notify ( chain , skb , n - > nlmsg_seq ,
2023-01-13 06:43:53 +03:00
n - > nlmsg_flags , n - > nlmsg_type , true , extack ) ;
2018-07-23 10:23:06 +03:00
if ( err < 0 )
NL_SET_ERR_MSG ( extack , " Failed to send chain notify message " ) ;
break ;
default :
err = - EOPNOTSUPP ;
NL_SET_ERR_MSG ( extack , " Unsupported message type " ) ;
goto errout ;
}
errout :
tcf_chain_put ( chain ) ;
2018-09-24 19:22:53 +03:00
errout_block :
2019-02-11 11:55:45 +03:00
tcf_block_release ( q , block , true ) ;
2018-07-23 10:23:06 +03:00
if ( err = = - EAGAIN )
/* Replay the request. */
goto replay ;
return err ;
2019-02-11 11:55:34 +03:00
errout_block_locked :
mutex_unlock ( & block - > lock ) ;
goto errout_block ;
2018-07-23 10:23:06 +03:00
}
/* Netlink dump callback: emit one RTM_NEWCHAIN record for each chain of the
 * block addressed by the request's tcmsg header.
 *
 * The block is resolved either by index (tcm_ifindex ==
 * TCM_IFINDEX_MAGIC_BLOCK) or through the device / qdisc / class named in the
 * header. cb->args[0] carries the resume position between invocations.
 *
 * Returns skb->len, or the error code when nothing was written to skb and an
 * error is pending. A request shorter than a tcmsg or naming an unknown
 * device returns skb->len without dumping anything.
 *
 * Called with RTNL.
 */
static int tc_dump_chain ( struct sk_buff * skb , struct netlink_callback * cb )
{
struct net * net = sock_net ( skb - > sk ) ;
struct nlattr * tca [ TCA_MAX + 1 ] ;
struct Qdisc * q = NULL ;
struct tcf_block * block ;
struct tcmsg * tcm = nlmsg_data ( cb - > nlh ) ;
2019-02-25 18:45:44 +03:00
struct tcf_chain * chain ;
2018-07-23 10:23:06 +03:00
long index_start ;
long index ;
int err ;
if ( nlmsg_len ( cb - > nlh ) < sizeof ( * tcm ) )
return skb - > len ;
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 15:07:28 +03:00
err = nlmsg_parse_deprecated ( cb - > nlh , sizeof ( * tcm ) , tca , TCA_MAX ,
rtm_tca_policy , cb - > extack ) ;
2018-07-23 10:23:06 +03:00
if ( err )
return err ;
/* Resolve the target block: by block index, or via device/qdisc/class. */
if ( tcm - > tcm_ifindex = = TCM_IFINDEX_MAGIC_BLOCK ) {
2018-09-24 19:22:58 +03:00
block = tcf_block_refcnt_get ( net , tcm - > tcm_block_index ) ;
2018-07-23 10:23:06 +03:00
if ( ! block )
goto out ;
} else {
const struct Qdisc_class_ops * cops ;
struct net_device * dev ;
unsigned long cl = 0 ;
dev = __dev_get_by_index ( net , tcm - > tcm_ifindex ) ;
if ( ! dev )
return skb - > len ;
2020-10-28 14:35:33 +03:00
/* A zero parent selects the device's own qdisc. */
if ( ! tcm - > tcm_parent )
2022-02-11 23:06:23 +03:00
q = rtnl_dereference ( dev - > qdisc ) ;
2020-10-28 14:35:33 +03:00
else
2018-07-23 10:23:06 +03:00
q = qdisc_lookup ( dev , TC_H_MAJ ( tcm - > tcm_parent ) ) ;
2020-10-28 14:35:33 +03:00
2018-07-23 10:23:06 +03:00
if ( ! q )
goto out ;
cops = q - > ops - > cl_ops ;
if ( ! cops )
goto out ;
if ( ! cops - > tcf_block )
goto out ;
/* A non-zero minor in the parent handle names a class to look up. */
if ( TC_H_MIN ( tcm - > tcm_parent ) ) {
cl = cops - > find ( q , tcm - > tcm_parent ) ;
if ( cl = = 0 )
goto out ;
}
block = cops - > tcf_block ( q , cl , NULL ) ;
if ( ! block )
goto out ;
if ( tcf_block_shared ( block ) )
q = NULL ;
}
/* Resume from the position saved by the previous invocation. */
index_start = cb - > args [ 0 ] ;
index = 0 ;
2019-02-25 18:45:44 +03:00
/* The chain list is walked with block->lock held. */
mutex_lock ( & block - > lock ) ;
list_for_each_entry ( chain , & block - > chain_list , list ) {
2018-07-23 10:23:06 +03:00
/* When TCA_CHAIN was supplied, dump only the chain with that index. */
if ( ( tca [ TCA_CHAIN ] & &
nla_get_u32 ( tca [ TCA_CHAIN ] ) ! = chain - > index ) )
continue ;
if ( index < index_start ) {
index + + ;
continue ;
}
2019-02-25 18:45:44 +03:00
/* Chains held only by actions are skipped and do not advance index. */
if ( tcf_chain_held_by_acts_only ( chain ) )
continue ;
2019-02-11 11:55:37 +03:00
err = tc_chain_fill_node ( chain - > tmplt_ops , chain - > tmplt_priv ,
chain - > index , net , skb , block ,
2018-07-23 10:23:06 +03:00
NETLINK_CB ( cb - > skb ) . portid ,
cb - > nlh - > nlmsg_seq , NLM_F_MULTI ,
2023-01-13 06:43:53 +03:00
RTM_NEWCHAIN , NULL ) ;
2019-02-25 18:45:44 +03:00
/* Stop at the first chain that could not be added to skb. */
if ( err < = 0 )
2018-07-23 10:23:06 +03:00
break ;
index + + ;
}
2019-02-25 18:45:44 +03:00
mutex_unlock ( & block - > lock ) ;
2018-07-23 10:23:06 +03:00
2018-09-24 19:22:58 +03:00
/* Only the block-index path took a reference via tcf_block_refcnt_get(). */
if ( tcm - > tcm_ifindex = = TCM_IFINDEX_MAGIC_BLOCK )
2019-02-11 11:55:45 +03:00
tcf_block_refcnt_put ( block , true ) ;
2018-07-23 10:23:06 +03:00
cb - > args [ 0 ] = index ;
out :
/* If we did no progress, the error (EMSGSIZE) is real */
if ( skb - > len = = 0 & & err )
return err ;
return skb - > len ;
}
2023-02-18 01:36:14 +03:00
/* Initialise @exts for use by a classifier.
 *
 * @exts:            extension container to set up
 * @net:             network namespace stored in @exts (CONFIG_NET_CLS_ACT only)
 * @action:          value stored in exts->action
 * @police:          value stored in exts->police
 * @tp, @handle:     passed to tcf_exts_miss_cookie_base_alloc()
 * @use_action_miss: when false, the miss-cookie allocation is skipped
 *
 * With CONFIG_NET_CLS_ACT the action pointer array (TCA_ACT_MAX_PRIO slots)
 * is allocated zeroed.
 *
 * Returns 0 on success, -ENOMEM if the action array cannot be allocated, or
 * the error from tcf_exts_miss_cookie_base_alloc(). In the latter case @exts
 * is torn down with tcf_exts_destroy() and exts->actions is cleared.
 */
int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action,
		     int police, struct tcf_proto *tp, u32 handle,
		     bool use_action_miss)
{
	int ret;

#ifdef CONFIG_NET_CLS_ACT
	exts->type = 0;
	exts->nr_actions = 0;
	exts->miss_cookie_node = NULL;
	/* No reference on net is owned at this point; one might be taken
	 * later from tcf_exts_get_net().
	 */
	exts->net = net;
	exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *),
				GFP_KERNEL);
	if (!exts->actions)
		return -ENOMEM;
#endif

	exts->action = action;
	exts->police = police;

	if (!use_action_miss)
		return 0;

	ret = tcf_exts_miss_cookie_base_alloc(exts, tp, handle);
	if (!ret)
		return 0;

	/* Cookie allocation failed: undo the setup done above. */
	tcf_exts_destroy(exts);
#ifdef CONFIG_NET_CLS_ACT
	/* tcf_exts_destroy() freed the array; do not leave it dangling. */
	exts->actions = NULL;
#endif
	return ret;
}
EXPORT_SYMBOL ( tcf_exts_init_ex ) ;
2014-09-25 21:26:37 +04:00
/* Tear down @exts: destroy its miss-cookie base and, with
 * CONFIG_NET_CLS_ACT, unbind and destroy the attached actions, free the
 * action array and reset the action count.
 *
 * exts->actions is not cleared here; a caller that keeps using @exts must
 * reset the pointer itself.
 */
void tcf_exts_destroy(struct tcf_exts *exts)
{
	tcf_exts_miss_cookie_base_destroy(exts);

#ifdef CONFIG_NET_CLS_ACT
	if (exts->actions != NULL) {
		tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
		kfree(exts->actions);
	}
	exts->nr_actions = 0;
#endif
}
2008-01-21 13:26:41 +03:00
EXPORT_SYMBOL ( tcf_exts_destroy ) ;
2005-04-17 02:20:36 +04:00
2021-12-17 21:16:28 +03:00
/* Parse the action attributes of a filter request into @exts.
 *
 * With CONFIG_NET_CLS_ACT:
 *  - if exts->police is set and tb[exts->police] is present, a single action
 *    is created through tc_action_load_ops() + tcf_action_init_1() with
 *    TCA_ACT_FLAGS_POLICE | TCA_ACT_FLAGS_BIND added to @flags; it becomes
 *    exts->actions[0] and exts->type is set to TCA_OLD_COMPAT;
 *  - otherwise, if exts->action is set and tb[exts->action] is present,
 *    tcf_action_init() is called with TCA_ACT_FLAGS_BIND added to @flags and
 *    its non-negative return value becomes exts->nr_actions.
 * Without CONFIG_NET_CLS_ACT the presence of either attribute is rejected.
 *
 * Returns 0 on success (including when neither attribute is present), a
 * negative error from the action helpers, or -EOPNOTSUPP when actions are
 * compiled out.
 */
int tcf_exts_validate_ex ( struct net * net , struct tcf_proto * tp , struct nlattr * * tb ,
struct nlattr * rate_tlv , struct tcf_exts * exts ,
u32 flags , u32 fl_flags , struct netlink_ext_ack * extack )
2005-04-17 02:20:36 +04:00
{
# ifdef CONFIG_NET_CLS_ACT
{
2021-04-07 18:36:03 +03:00
int init_res [ TCA_ACT_MAX_PRIO ] = { } ;
2005-04-17 02:20:36 +04:00
struct tc_action * act ;
2018-03-09 00:59:17 +03:00
size_t attr_size = 0 ;
2005-04-17 02:20:36 +04:00
2013-12-16 08:15:07 +04:00
/* The police attribute is checked first; the action list is the fallback. */
if ( exts - > police & & tb [ exts - > police ] ) {
net_sched: fix RTNL deadlock again caused by request_module()
tcf_action_init_1() loads tc action modules automatically with
request_module() after parsing the tc action names, and it drops RTNL
lock and re-holds it before and after request_module(). This causes a
lot of troubles, as discovered by syzbot, because we can be in the
middle of batch initializations when we create an array of tc actions.
One of the problem is deadlock:
CPU 0 CPU 1
rtnl_lock();
for (...) {
tcf_action_init_1();
-> rtnl_unlock();
-> request_module();
rtnl_lock();
for (...) {
tcf_action_init_1();
-> tcf_idr_check_alloc();
// Insert one action into idr,
// but it is not committed until
// tcf_idr_insert_many(), then drop
// the RTNL lock in the _next_
// iteration
-> rtnl_unlock();
-> rtnl_lock();
-> a_o->init();
-> tcf_idr_check_alloc();
// Now waiting for the same index
// to be committed
-> request_module();
-> rtnl_lock()
// Now waiting for RTNL lock
}
rtnl_unlock();
}
rtnl_unlock();
This is not easy to solve, we can move the request_module() before
this loop and pre-load all the modules we need for this netlink
message and then do the rest initializations. So the loop breaks down
to two now:
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
struct tc_action_ops *a_o;
a_o = tc_action_load_ops(name, tb[i]...);
ops[i - 1] = a_o;
}
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
act = tcf_action_init_1(ops[i - 1]...);
}
Although this looks serious, it only has been reported by syzbot, so it
seems hard to trigger this by humans. And given the size of this patch,
I'd suggest to make it to net-next and not to backport to stable.
This patch has been tested by syzbot and tested with tdc.py by me.
Fixes: 0fedc63fadf0 ("net_sched: commit action insertions together")
Reported-and-tested-by: syzbot+82752bc5331601cf4899@syzkaller.appspotmail.com
Reported-and-tested-by: syzbot+b3b63b6bff456bd95294@syzkaller.appspotmail.com
Reported-by: syzbot+ba67b12b1ca729912834@syzkaller.appspotmail.com
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://lore.kernel.org/r/20210117005657.14810-1-xiyou.wangcong@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-17 03:56:57 +03:00
struct tc_action_ops * a_o ;
2024-01-05 03:38:10 +03:00
flags | = TCA_ACT_FLAGS_POLICE | TCA_ACT_FLAGS_BIND ;
a_o = tc_action_load_ops ( tb [ exts - > police ] , flags ,
2021-07-30 02:12:14 +03:00
extack ) ;
net_sched: fix RTNL deadlock again caused by request_module()
tcf_action_init_1() loads tc action modules automatically with
request_module() after parsing the tc action names, and it drops RTNL
lock and re-holds it before and after request_module(). This causes a
lot of troubles, as discovered by syzbot, because we can be in the
middle of batch initializations when we create an array of tc actions.
One of the problem is deadlock:
CPU 0 CPU 1
rtnl_lock();
for (...) {
tcf_action_init_1();
-> rtnl_unlock();
-> request_module();
rtnl_lock();
for (...) {
tcf_action_init_1();
-> tcf_idr_check_alloc();
// Insert one action into idr,
// but it is not committed until
// tcf_idr_insert_many(), then drop
// the RTNL lock in the _next_
// iteration
-> rtnl_unlock();
-> rtnl_lock();
-> a_o->init();
-> tcf_idr_check_alloc();
// Now waiting for the same index
// to be committed
-> request_module();
-> rtnl_lock()
// Now waiting for RTNL lock
}
rtnl_unlock();
}
rtnl_unlock();
This is not easy to solve, we can move the request_module() before
this loop and pre-load all the modules we need for this netlink
message and then do the rest initializations. So the loop breaks down
to two now:
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
struct tc_action_ops *a_o;
a_o = tc_action_load_ops(name, tb[i]...);
ops[i - 1] = a_o;
}
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
act = tcf_action_init_1(ops[i - 1]...);
}
Although this looks serious, it only has been reported by syzbot, so it
seems hard to trigger this by humans. And given the size of this patch,
I'd suggest to make it to net-next and not to backport to stable.
This patch has been tested by syzbot and tested with tdc.py by me.
Fixes: 0fedc63fadf0 ("net_sched: commit action insertions together")
Reported-and-tested-by: syzbot+82752bc5331601cf4899@syzkaller.appspotmail.com
Reported-and-tested-by: syzbot+b3b63b6bff456bd95294@syzkaller.appspotmail.com
Reported-by: syzbot+ba67b12b1ca729912834@syzkaller.appspotmail.com
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Tested-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://lore.kernel.org/r/20210117005657.14810-1-xiyou.wangcong@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-17 03:56:57 +03:00
if ( IS_ERR ( a_o ) )
return PTR_ERR ( a_o ) ;
2017-05-17 12:08:02 +03:00
act = tcf_action_init_1 ( net , tp , tb [ exts - > police ] ,
2021-07-30 02:12:14 +03:00
rate_tlv , a_o , init_res , flags ,
extack ) ;
2021-04-07 18:36:04 +03:00
/* The module reference is dropped before the result is inspected, so it
 * is released on both the success and the error path. NOTE(review):
 * presumably this is the reference taken by tc_action_load_ops() - confirm.
 */
module_put ( a_o - > owner ) ;
if ( IS_ERR ( act ) )
2008-01-24 07:33:13 +03:00
return PTR_ERR ( act ) ;
2005-04-17 02:20:36 +04:00
2013-12-16 08:15:05 +04:00
/* Tag both the action and the container as old-compat. */
act - > type = exts - > type = TCA_OLD_COMPAT ;
2016-08-14 08:35:00 +03:00
exts - > actions [ 0 ] = act ;
exts - > nr_actions = 1 ;
2023-12-11 21:18:07 +03:00
tcf_idr_insert_many ( exts - > actions , init_res ) ;
2013-12-16 08:15:07 +04:00
} else if ( exts - > action & & tb [ exts - > action ] ) {
2018-07-05 17:24:33 +03:00
int err ;
2016-08-14 08:35:00 +03:00
2021-07-30 02:12:14 +03:00
flags | = TCA_ACT_FLAGS_BIND ;
2017-05-17 12:08:02 +03:00
err = tcf_action_init ( net , tp , tb [ exts - > action ] ,
2021-07-30 02:12:14 +03:00
rate_tlv , exts - > actions , init_res ,
2021-12-17 21:16:28 +03:00
& attr_size , flags , fl_flags ,
extack ) ;
2018-07-05 17:24:33 +03:00
if ( err < 0 )
2013-12-16 08:15:05 +04:00
return err ;
2018-07-05 17:24:33 +03:00
/* A non-negative return value is recorded as the action count. */
exts - > nr_actions = err ;
2005-04-17 02:20:36 +04:00
}
}
# else
2013-12-16 08:15:07 +04:00
/* Actions are compiled out: reject any request that carries them. */
if ( ( exts - > action & & tb [ exts - > action ] ) | |
2018-01-18 19:20:52 +03:00
( exts - > police & & tb [ exts - > police ] ) ) {
NL_SET_ERR_MSG ( extack , " Classifier actions are not supported per compile options (CONFIG_NET_CLS_ACT) " ) ;
2005-04-17 02:20:36 +04:00
return - EOPNOTSUPP ;
2018-01-18 19:20:52 +03:00
}
2005-04-17 02:20:36 +04:00
# endif
return 0 ;
}
2021-12-17 21:16:28 +03:00
EXPORT_SYMBOL ( tcf_exts_validate_ex ) ;
/* Convenience wrapper around tcf_exts_validate_ex() that passes 0 for
 * fl_flags; all other arguments and the return value are forwarded
 * unchanged.
 */
int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
		      struct nlattr *rate_tlv, struct tcf_exts *exts,
		      u32 flags, struct netlink_ext_ack *extack)
{
	const u32 no_fl_flags = 0;

	return tcf_exts_validate_ex(net, tp, tb, rate_tlv, exts, flags,
				    no_fl_flags, extack);
}
2008-01-21 13:26:41 +03:00
EXPORT_SYMBOL ( tcf_exts_validate ) ;
2005-04-17 02:20:36 +04:00
2017-08-04 15:29:15 +03:00
/* Replace the contents of @dst with those of @src (a plain structure copy)
 * and destroy what @dst held before. Does nothing without
 * CONFIG_NET_CLS_ACT.
 */
void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src)
{
#ifdef CONFIG_NET_CLS_ACT
	/* Snapshot the previous contents so they can be released after the swap. */
	struct tcf_exts stale = *dst;

	*dst = *src;
	tcf_exts_destroy(&stale);
#endif
}
2008-01-21 13:26:41 +03:00
EXPORT_SYMBOL ( tcf_exts_change ) ;
2005-04-17 02:20:36 +04:00
2016-08-14 08:35:00 +03:00
# ifdef CONFIG_NET_CLS_ACT
static struct tc_action * tcf_exts_first_act ( struct tcf_exts * exts )
{
if ( exts - > nr_actions = = 0 )
return NULL ;
else
return exts - > actions [ 0 ] ;
}
# endif
2013-12-16 08:15:05 +04:00
2013-12-16 08:15:07 +04:00
/* Dump the actions of @exts into @skb as a nested attribute.
 *
 * Nothing is written when exts->action is unset or no actions are attached.
 * Otherwise the actions are emitted under exts->action, or, for
 * TCA_OLD_COMPAT containers with exts->police set, the first action is
 * emitted in the old format under exts->police.
 *
 * Returns 0 on success (or when there is nothing to dump) and -1 on failure,
 * after cancelling the partially written nest. Always 0 without
 * CONFIG_NET_CLS_ACT.
 */
int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tc_action *first;
	struct nlattr *nest;

	if (!exts->action || !tcf_exts_has_actions(exts))
		return 0;

	/*
	 * again for backward compatible mode - we want
	 * to work with both old and new modes of entering
	 * tc data even if iproute2 was newer - jhs
	 */
	if (exts->type != TCA_OLD_COMPAT) {
		nest = nla_nest_start_noflag(skb, exts->action);
		if (!nest)
			goto nla_put_failure;

		if (tcf_action_dump(skb, exts->actions, 0, 0, false) < 0)
			goto nla_put_failure;
		nla_nest_end(skb, nest);
		return 0;
	}

	if (!exts->police)
		return 0;

	first = tcf_exts_first_act(exts);
	nest = nla_nest_start_noflag(skb, exts->police);
	if (!nest || !first)
		goto nla_put_failure;
	if (tcf_action_dump_old(skb, first, 0, 0) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
#else
	return 0;
#endif
}
2008-01-21 13:26:41 +03:00
EXPORT_SYMBOL ( tcf_exts_dump ) ;
2005-04-17 02:20:36 +04:00
2020-05-15 14:40:12 +03:00
/* Terse variant of the action dump: emit the actions of @exts nested under
 * exts->action, calling tcf_action_dump() with its last argument set to
 * true.
 *
 * Returns 0 on success or when there is nothing to dump, -1 on failure after
 * cancelling the nest. Always 0 without CONFIG_NET_CLS_ACT.
 */
int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
	struct nlattr *nest;

	if (!exts->action || !tcf_exts_has_actions(exts))
		return 0;

	nest = nla_nest_start_noflag(skb, exts->action);
	if (!nest || tcf_action_dump(skb, exts->actions, 0, 0, true) < 0)
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
#else
	return 0;
#endif
}
EXPORT_SYMBOL ( tcf_exts_terse_dump ) ;
2008-01-21 13:26:41 +03:00
2013-12-16 08:15:07 +04:00
/* Copy the statistics of the first action of @exts into @skb.
 *
 * Returns -1 if tcf_action_copy_stats() fails, 0 otherwise (including when
 * there is no action or CONFIG_NET_CLS_ACT is disabled).
 */
int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tc_action *first = tcf_exts_first_act(exts);

	if (first && tcf_action_copy_stats(skb, first, 1) < 0)
		return -1;
#endif
	return 0;
}
2008-01-21 13:26:41 +03:00
EXPORT_SYMBOL ( tcf_exts_dump_stats ) ;
2005-04-17 02:20:36 +04:00
2019-08-26 16:44:59 +03:00
/* Mark a filter as present in hardware and account for it on @block.
 *
 * No-op when TCA_CLS_FLAGS_IN_HW is already set in *@flags. Otherwise the
 * flag is set, block->offloadcnt is incremented, and block->skipswcnt is
 * incremented too when tc_skip_sw() holds for the updated flags.
 */
static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
{
	if (!(*flags & TCA_CLS_FLAGS_IN_HW)) {
		*flags |= TCA_CLS_FLAGS_IN_HW;
		if (tc_skip_sw(*flags))
			atomic_inc(&block->skipswcnt);
		atomic_inc(&block->offloadcnt);
	}
}
/* Inverse of tcf_block_offload_inc(): clear the in-hardware mark of a filter
 * and drop its accounting on @block.
 *
 * No-op when TCA_CLS_FLAGS_IN_HW is not set in *@flags. Otherwise the flag
 * is cleared, block->offloadcnt is decremented, and block->skipswcnt is
 * decremented too when tc_skip_sw() holds for the updated flags.
 */
static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
{
	if (*flags & TCA_CLS_FLAGS_IN_HW) {
		*flags &= ~TCA_CLS_FLAGS_IN_HW;
		if (tc_skip_sw(*flags))
			atomic_dec(&block->skipswcnt);
		atomic_dec(&block->offloadcnt);
	}
}
/* Adjust a filter's in-hardware counter *@cnt by @diff under tp->lock.
 *
 * When adding, the block offload accounting is bumped if the counter was
 * zero before the addition. When removing, it is dropped if the counter
 * reaches zero after the subtraction. The caller must hold block->cb_lock.
 */
static void tc_cls_offload_cnt_update(struct tcf_block *block,
				      struct tcf_proto *tp, u32 *cnt,
				      u32 *flags, u32 diff, bool add)
{
	lockdep_assert_held(&block->cb_lock);

	spin_lock(&tp->lock);
	if (!add) {
		*cnt -= diff;
		if (*cnt == 0)
			tcf_block_offload_dec(block, flags);
	} else {
		if (*cnt == 0)
			tcf_block_offload_inc(block, flags);
		*cnt += diff;
	}
	spin_unlock(&tp->lock);
}
/* Drop a filter's block offload accounting and zero its in-hardware counter
 * *@cnt, all under tp->lock. The caller must hold block->cb_lock.
 */
static void
tc_cls_offload_cnt_reset(struct tcf_block *block, struct tcf_proto *tp,
			 u32 *cnt, u32 *flags)
{
	lockdep_assert_held(&block->cb_lock);

	spin_lock(&tp->lock);
	tcf_block_offload_dec(block, flags);
	*cnt = 0;
	spin_unlock(&tp->lock);
}
/* Invoke every callback on the block's flow_block.cb_list with @type and
 * @type_data.
 *
 * Returns the number of callbacks that returned 0. If @err_stop is set, the
 * walk ends at the first failing callback and its error is returned
 * instead; otherwise failures are ignored.
 */
static int
__tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
		   void *type_data, bool err_stop)
{
	struct flow_block_cb *block_cb;
	int succeeded = 0;
	int err;

	list_for_each_entry(block_cb, &block->flow_block.cb_list, list) {
		err = block_cb->cb(type, type_data, block_cb->cb_priv);
		if (!err) {
			succeeded++;
			continue;
		}
		if (err_stop)
			return err;
	}

	return succeeded;
}
/* Run the block's offload callbacks with block->cb_lock held for reading.
 *
 * RTNL is taken here when the block has lockeddevcnt set and the caller does
 * not already hold it (@rtnl_held false).
 *
 * Returns the result of __tc_setup_cb_call(): the number of successful
 * callbacks, or a callback error when @err_stop is set.
 */
int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
		     void *type_data, bool err_stop, bool rtnl_held)
{
	bool grab_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
	int ret;

retry:
	if (grab_rtnl)
		rtnl_lock();
	down_read(&block->cb_lock);
	/* The block bind path takes cb_lock while holding rtnl, so the same
	 * order must be used here. If a device requiring rtnl showed up after
	 * the initial check, back out and retry with rtnl held.
	 */
	if (!rtnl_held && !grab_rtnl && block->lockeddevcnt) {
		up_read(&block->cb_lock);
		grab_rtnl = true;
		goto retry;
	}

	ret = __tc_setup_cb_call(block, type, type_data, err_stop);

	up_read(&block->cb_lock);
	if (grab_rtnl)
		rtnl_unlock();
	return ret;
}
EXPORT_SYMBOL ( tc_setup_cb_call ) ;
2017-10-11 10:41:08 +03:00
2019-08-26 16:44:59 +03:00
/* Non-destructive filter add.
 *
 * Runs the block's offload callbacks for a new filter. When at least one
 * callback succeeds, *@in_hw_count is increased by that number and the block
 * offload counter is bumped if the filter was not yet in hardware. On
 * failure the counters are left untouched, i.e. a previously offloaded
 * filter is considered intact.
 *
 * Fails with -EOPNOTSUPP when @err_stop is set and the block has
 * nooffloaddevcnt set. tp->ops->hw_add, if provided, is invoked after a
 * non-failing callback run. RTNL is taken internally when the block requires
 * it and @rtnl_held is false.
 *
 * Returns 0 on success or a negative error.
 */
int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp,
		    enum tc_setup_type type, void *type_data, bool err_stop,
		    u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
{
	bool grab_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
	int ret;

retry:
	if (grab_rtnl)
		rtnl_lock();
	down_read(&block->cb_lock);
	/* The block bind path takes cb_lock while holding rtnl, so the same
	 * order must be used here. If a device requiring rtnl showed up after
	 * the initial check, back out and retry with rtnl held.
	 */
	if (!rtnl_held && !grab_rtnl && block->lockeddevcnt) {
		up_read(&block->cb_lock);
		grab_rtnl = true;
		goto retry;
	}

	/* Make sure all netdevs sharing this block are offload-capable. */
	if (block->nooffloaddevcnt && err_stop) {
		ret = -EOPNOTSUPP;
		goto unlock;
	}

	ret = __tc_setup_cb_call(block, type, type_data, err_stop);
	if (ret < 0)
		goto unlock;

	if (tp->ops->hw_add)
		tp->ops->hw_add(tp, type_data);
	if (ret > 0)
		tc_cls_offload_cnt_update(block, tp, in_hw_count, flags,
					  ret, true);
unlock:
	up_read(&block->cb_lock);
	if (grab_rtnl)
		rtnl_unlock();
	/* A positive callback count is reported to the caller as plain success. */
	return min(ret, 0);
}
EXPORT_SYMBOL ( tc_setup_cb_add ) ;
/* Destructive filter replace.
 *
 * The old filter's offload accounting (*@old_in_hw_count, *@old_flags) is
 * reset before the callbacks run, so on failure the previously offloaded
 * filter is considered destroyed. When at least one callback succeeds,
 * *@new_in_hw_count is increased by that number and the block offload
 * counter is bumped for the new filter.
 *
 * Fails with -EOPNOTSUPP, before touching any counter, when @err_stop is set
 * and the block has nooffloaddevcnt set. tp->ops->hw_del and
 * tp->ops->hw_add, if provided, are invoked around the callback run. RTNL is
 * taken internally when the block requires it and @rtnl_held is false.
 *
 * Returns 0 on success or a negative error.
 */
int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp,
			enum tc_setup_type type, void *type_data, bool err_stop,
			u32 *old_flags, unsigned int *old_in_hw_count,
			u32 *new_flags, unsigned int *new_in_hw_count,
			bool rtnl_held)
{
	bool grab_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
	int ret;

retry:
	if (grab_rtnl)
		rtnl_lock();
	down_read(&block->cb_lock);
	/* The block bind path takes cb_lock while holding rtnl, so the same
	 * order must be used here. If a device requiring rtnl showed up after
	 * the initial check, back out and retry with rtnl held.
	 */
	if (!rtnl_held && !grab_rtnl && block->lockeddevcnt) {
		up_read(&block->cb_lock);
		grab_rtnl = true;
		goto retry;
	}

	/* Make sure all netdevs sharing this block are offload-capable. */
	if (block->nooffloaddevcnt && err_stop) {
		ret = -EOPNOTSUPP;
		goto unlock;
	}

	/* Forget the old filter first; from here on it counts as destroyed. */
	tc_cls_offload_cnt_reset(block, tp, old_in_hw_count, old_flags);
	if (tp->ops->hw_del)
		tp->ops->hw_del(tp, type_data);

	ret = __tc_setup_cb_call(block, type, type_data, err_stop);
	if (ret < 0)
		goto unlock;

	if (tp->ops->hw_add)
		tp->ops->hw_add(tp, type_data);
	if (ret > 0)
		tc_cls_offload_cnt_update(block, tp, new_in_hw_count,
					  new_flags, ret, true);
unlock:
	up_read(&block->cb_lock);
	if (grab_rtnl)
		rtnl_unlock();
	/* A positive callback count is reported to the caller as plain success. */
	return min(ret, 0);
}
EXPORT_SYMBOL ( tc_setup_cb_replace ) ;
/* Destroy a filter in hardware.
 *
 * Runs the block's offload callbacks, then unconditionally resets the
 * filter's offload accounting (*@in_hw_count, *@flags), which decrements the
 * block offload counter if the filter was offloaded. tp->ops->hw_del, if
 * provided, is invoked afterwards. RTNL is taken internally when the block
 * requires it and @rtnl_held is false.
 *
 * Returns 0 on success or a negative callback error.
 */
int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp,
			enum tc_setup_type type, void *type_data, bool err_stop,
			u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
{
	bool grab_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
	int ret;

retry:
	if (grab_rtnl)
		rtnl_lock();
	down_read(&block->cb_lock);
	/* The block bind path takes cb_lock while holding rtnl, so the same
	 * order must be used here. If a device requiring rtnl showed up after
	 * the initial check, back out and retry with rtnl held.
	 */
	if (!rtnl_held && !grab_rtnl && block->lockeddevcnt) {
		up_read(&block->cb_lock);
		grab_rtnl = true;
		goto retry;
	}

	ret = __tc_setup_cb_call(block, type, type_data, err_stop);

	/* Accounting is cleared regardless of the callback result. */
	tc_cls_offload_cnt_reset(block, tp, in_hw_count, flags);
	if (tp->ops->hw_del)
		tp->ops->hw_del(tp, type_data);

	up_read(&block->cb_lock);
	if (grab_rtnl)
		rtnl_unlock();
	/* A positive callback count is reported to the caller as plain success. */
	return min(ret, 0);
}
EXPORT_SYMBOL ( tc_setup_cb_destroy ) ;
/* Invoke a single offload callback @cb for a filter and account the result.
 *
 * When @cb succeeds, the offload counters of @tp are updated by one through
 * tc_cls_offload_cnt_update(), with @add passed on as the direction.
 *
 * When @cb fails, its error is propagated only if @add is set and
 * tc_skip_sw(*@flags) is true; in every other case the failure is swallowed
 * and 0 is returned.
 */
int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
			  bool add, flow_setup_cb_t *cb,
			  enum tc_setup_type type, void *type_data,
			  void *cb_priv, u32 *flags, unsigned int *in_hw_count)
{
	int rc = cb(type, type_data, cb_priv);

	if (!rc) {
		tc_cls_offload_cnt_update(block, tp, in_hw_count, flags, 1,
					  add);
		return 0;
	}

	if (add && tc_skip_sw(*flags))
		return rc;

	return 0;
}
EXPORT_SYMBOL(tc_setup_cb_reoffload);
2023-02-18 01:36:13 +03:00
static int tcf_act_get_user_cookie ( struct flow_action_entry * entry ,
const struct tc_action * act )
2020-02-25 13:45:18 +03:00
{
2023-02-18 01:36:13 +03:00
struct tc_cookie * user_cookie ;
2020-02-25 13:45:18 +03:00
int err = 0 ;
rcu_read_lock ( ) ;
2023-02-18 01:36:13 +03:00
user_cookie = rcu_dereference ( act - > user_cookie ) ;
if ( user_cookie ) {
entry - > user_cookie = flow_action_cookie_create ( user_cookie - > data ,
user_cookie - > len ,
GFP_ATOMIC ) ;
if ( ! entry - > user_cookie )
2020-02-25 13:45:18 +03:00
err = - ENOMEM ;
}
rcu_read_unlock ( ) ;
return err ;
}
2023-02-18 01:36:13 +03:00
static void tcf_act_put_user_cookie ( struct flow_action_entry * entry )
2020-02-25 13:45:18 +03:00
{
2023-02-18 01:36:13 +03:00
flow_action_cookie_destroy ( entry - > user_cookie ) ;
2020-02-25 13:45:18 +03:00
}
2021-12-17 21:16:20 +03:00
void tc_cleanup_offload_action ( struct flow_action * flow_action )
2019-08-26 16:45:04 +03:00
{
struct flow_action_entry * entry ;
int i ;
2020-02-25 13:45:18 +03:00
flow_action_for_each ( i , entry , flow_action ) {
2023-02-18 01:36:13 +03:00
tcf_act_put_user_cookie ( entry ) ;
2019-09-13 18:28:39 +03:00
if ( entry - > destructor )
entry - > destructor ( entry - > destructor_priv ) ;
2020-02-25 13:45:18 +03:00
}
2019-08-26 16:45:04 +03:00
}
2021-12-17 21:16:20 +03:00
EXPORT_SYMBOL ( tc_cleanup_offload_action ) ;
2019-08-26 16:45:04 +03:00
2021-12-17 21:16:21 +03:00
/* Ask a single tc action to describe itself as flow_action entries.
 *
 * With CONFIG_NET_CLS_ACT enabled this forwards to the action's
 * offload_act_setup() op (passing true as its fourth argument) and returns
 * whatever that op returns. If the action has no such op, an extack message
 * is set and -EOPNOTSUPP is returned.
 *
 * Without CONFIG_NET_CLS_ACT the function is a no-op that returns 0 and
 * leaves @entry and @index_inc untouched.
 */
static int tc_setup_offload_act ( struct tc_action * act ,
struct flow_action_entry * entry ,
2022-04-07 10:35:22 +03:00
u32 * index_inc ,
struct netlink_ext_ack * extack )
2019-09-13 18:28:39 +03:00
{
2019-09-13 18:28:41 +03:00
# ifdef CONFIG_NET_CLS_ACT
2022-04-07 10:35:31 +03:00
if ( act - > ops - > offload_act_setup ) {
2022-04-07 10:35:22 +03:00
return act - > ops - > offload_act_setup ( act , entry , index_inc , true ,
extack ) ;
2022-04-07 10:35:31 +03:00
} else {
/* The action type provides no offload translation at all. */
NL_SET_ERR_MSG ( extack , " Action does not support offload " ) ;
2021-12-17 21:16:21 +03:00
return - EOPNOTSUPP ;
2022-04-07 10:35:31 +03:00
}
2021-12-17 21:16:21 +03:00
# else
2019-09-13 18:28:39 +03:00
return 0 ;
2019-09-13 18:28:40 +03:00
# endif
}
2021-12-17 21:16:22 +03:00
int tc_setup_action ( struct flow_action * flow_action ,
2022-04-07 10:35:22 +03:00
struct tc_action * actions [ ] ,
2023-02-18 01:36:14 +03:00
u32 miss_cookie_base ,
2022-04-07 10:35:22 +03:00
struct netlink_ext_ack * extack )
2019-02-02 14:50:46 +03:00
{
2022-07-19 15:24:09 +03:00
int i , j , k , index , err = 0 ;
2020-02-17 13:12:09 +03:00
struct tc_action * act ;
2019-02-02 14:50:46 +03:00
2020-03-20 02:26:23 +03:00
BUILD_BUG_ON ( TCA_ACT_HW_STATS_ANY ! = FLOW_ACTION_HW_STATS_ANY ) ;
BUILD_BUG_ON ( TCA_ACT_HW_STATS_IMMEDIATE ! = FLOW_ACTION_HW_STATS_IMMEDIATE ) ;
BUILD_BUG_ON ( TCA_ACT_HW_STATS_DELAYED ! = FLOW_ACTION_HW_STATS_DELAYED ) ;
2020-03-07 14:40:20 +03:00
2021-12-17 21:16:22 +03:00
if ( ! actions )
2019-02-02 14:50:46 +03:00
return 0 ;
j = 0 ;
2021-12-17 21:16:22 +03:00
tcf_act_for_each_action ( i , act , actions ) {
2019-02-02 14:50:46 +03:00
struct flow_action_entry * entry ;
entry = & flow_action - > entries [ j ] ;
2020-02-17 13:12:09 +03:00
spin_lock_bh ( & act - > tcfa_lock ) ;
2023-02-18 01:36:13 +03:00
err = tcf_act_get_user_cookie ( entry , act ) ;
2020-02-25 13:45:18 +03:00
if ( err )
goto err_out_locked ;
2020-03-07 14:40:20 +03:00
2021-12-17 21:16:21 +03:00
index = 0 ;
2022-04-07 10:35:22 +03:00
err = tc_setup_offload_act ( act , entry , & index , extack ) ;
2022-07-19 15:24:09 +03:00
if ( err )
2020-02-17 13:12:09 +03:00
goto err_out_locked ;
2022-07-19 15:24:09 +03:00
for ( k = 0 ; k < index ; k + + ) {
entry [ k ] . hw_stats = tc_act_hw_stats ( act - > hw_stats ) ;
entry [ k ] . hw_index = act - > tcfa_index ;
2023-02-18 01:36:13 +03:00
entry [ k ] . cookie = ( unsigned long ) act ;
2023-02-18 01:36:14 +03:00
entry [ k ] . miss_cookie =
tcf_exts_miss_cookie_get ( miss_cookie_base , i ) ;
2022-07-19 15:24:09 +03:00
}
j + = index ;
2020-02-17 13:12:09 +03:00
spin_unlock_bh ( & act - > tcfa_lock ) ;
2019-02-02 14:50:46 +03:00
}
2019-08-26 16:45:03 +03:00
2019-02-02 14:50:46 +03:00
err_out :
2019-08-26 16:45:04 +03:00
if ( err )
2021-12-17 21:16:20 +03:00
tc_cleanup_offload_action ( flow_action ) ;
2019-08-26 16:45:04 +03:00
2019-08-26 16:45:03 +03:00
return err ;
2020-02-17 13:12:09 +03:00
err_out_locked :
spin_unlock_bh ( & act - > tcfa_lock ) ;
goto err_out ;
2019-02-02 14:50:46 +03:00
}
2021-12-17 21:16:22 +03:00
/* Build @flow_action from the actions attached to @exts.
 *
 * Thin front end for tc_setup_action(): the miss cookie base is taken from
 * @exts->miss_cookie_node when that node exists and is 0 otherwise.
 *
 * Returns 0 when @exts is NULL or when CONFIG_NET_CLS_ACT is disabled,
 * otherwise the result of tc_setup_action().
 */
int tc_setup_offload_action(struct flow_action *flow_action,
			    const struct tcf_exts *exts,
			    struct netlink_ext_ack *extack)
{
#ifdef CONFIG_NET_CLS_ACT
	u32 miss_cookie_base = 0;

	if (!exts)
		return 0;

	if (exts->miss_cookie_node)
		miss_cookie_base = exts->miss_cookie_node->miss_cookie_base;

	return tc_setup_action(flow_action, exts->actions, miss_cookie_base,
			       extack);
#else
	return 0;
#endif
}
EXPORT_SYMBOL(tc_setup_offload_action);
2019-02-02 14:50:46 +03:00
2019-02-02 14:50:45 +03:00
/* Count the entries needed to represent the actions of @exts.
 *
 * A pedit action contributes tcf_pedit_nkeys() to the total; every other
 * action contributes exactly one.
 */
unsigned int tcf_exts_num_actions(struct tcf_exts *exts)
{
	unsigned int total = 0;
	struct tc_action *act;
	int i;

	tcf_exts_for_each_action(i, act, exts) {
		if (!is_tcf_pedit(act)) {
			total++;
			continue;
		}
		total += tcf_pedit_nkeys(act);
	}

	return total;
}
EXPORT_SYMBOL(tcf_exts_num_actions);
2020-06-27 01:45:26 +03:00
# ifdef CONFIG_NET_CLS_ACT
/* Read a block index from the netlink attribute @block_index_attr.
 *
 * The u32 payload is stored in *@p_block_index unconditionally; a value of
 * zero is then rejected with an extack message and -EINVAL.
 *
 * Returns 0 when the index is non-zero.
 */
static int tcf_qevent_parse_block_index ( struct nlattr * block_index_attr ,
u32 * p_block_index ,
struct netlink_ext_ack * extack )
{
* p_block_index = nla_get_u32 ( block_index_attr ) ;
if ( ! * p_block_index ) {
NL_SET_ERR_MSG ( extack , " Block number may not be zero " ) ;
return - EINVAL ;
}
return 0 ;
}
int tcf_qevent_init ( struct tcf_qevent * qe , struct Qdisc * sch ,
enum flow_block_binder_type binder_type ,
struct nlattr * block_index_attr ,
struct netlink_ext_ack * extack )
{
u32 block_index ;
int err ;
if ( ! block_index_attr )
return 0 ;
err = tcf_qevent_parse_block_index ( block_index_attr , & block_index , extack ) ;
if ( err )
return err ;
qe - > info . binder_type = binder_type ;
qe - > info . chain_head_change = tcf_chain_head_change_dflt ;
qe - > info . chain_head_change_priv = & qe - > filter_chain ;
qe - > info . block_index = block_index ;
return tcf_block_get_ext ( & qe - > block , sch , & qe - > info , extack ) ;
}
EXPORT_SYMBOL ( tcf_qevent_init ) ;
void tcf_qevent_destroy ( struct tcf_qevent * qe , struct Qdisc * sch )
{
if ( qe - > info . block_index )
tcf_block_put_ext ( qe - > block , sch , & qe - > info ) ;
}
EXPORT_SYMBOL ( tcf_qevent_destroy ) ;
/* Check that a change request does not try to alter the block of @qe.
 *
 * Without @block_index_attr there is nothing to validate and 0 is returned.
 * Otherwise the attribute is parsed (a zero index is rejected by the parser)
 * and compared with the index currently stored in @qe->info; any difference,
 * including binding a block where none was configured before, is refused
 * with -EINVAL and an extack message.
 */
int tcf_qevent_validate_change ( struct tcf_qevent * qe , struct nlattr * block_index_attr ,
struct netlink_ext_ack * extack )
{
u32 block_index ;
int err ;
if ( ! block_index_attr )
return 0 ;
err = tcf_qevent_parse_block_index ( block_index_attr , & block_index , extack ) ;
if ( err )
return err ;
/* Bounce newly-configured block or change in block. */
if ( block_index ! = qe - > info . block_index ) {
NL_SET_ERR_MSG ( extack , " Change of blocks is not supported " ) ;
return - EINVAL ;
}
return 0 ;
}
EXPORT_SYMBOL ( tcf_qevent_validate_change ) ;
struct sk_buff * tcf_qevent_handle ( struct tcf_qevent * qe , struct Qdisc * sch , struct sk_buff * skb ,
2020-07-14 20:03:07 +03:00
struct sk_buff * * to_free , int * ret )
2020-06-27 01:45:26 +03:00
{
struct tcf_result cl_res ;
struct tcf_proto * fl ;
if ( ! qe - > info . block_index )
return skb ;
fl = rcu_dereference_bh ( qe - > filter_chain ) ;
2021-07-28 21:08:00 +03:00
switch ( tcf_classify ( skb , NULL , fl , & cl_res , false ) ) {
2020-06-27 01:45:26 +03:00
case TC_ACT_SHOT :
qdisc_qstats_drop ( sch ) ;
__qdisc_drop ( skb , to_free ) ;
* ret = __NET_XMIT_BYPASS ;
return NULL ;
case TC_ACT_STOLEN :
case TC_ACT_QUEUED :
case TC_ACT_TRAP :
__qdisc_drop ( skb , to_free ) ;
* ret = __NET_XMIT_STOLEN ;
return NULL ;
case TC_ACT_REDIRECT :
skb_do_redirect ( skb ) ;
* ret = __NET_XMIT_STOLEN ;
return NULL ;
}
return skb ;
}
EXPORT_SYMBOL ( tcf_qevent_handle ) ;
int tcf_qevent_dump ( struct sk_buff * skb , int attr_name , struct tcf_qevent * qe )
{
if ( ! qe - > info . block_index )
return 0 ;
return nla_put_u32 ( skb , attr_name , qe - > info . block_index ) ;
}
EXPORT_SYMBOL ( tcf_qevent_dump ) ;
# endif
2018-01-17 13:46:46 +03:00
static __net_init int tcf_net_init ( struct net * net )
{
struct tcf_net * tn = net_generic ( net , tcf_net_id ) ;
2018-09-24 19:22:56 +03:00
spin_lock_init ( & tn - > idr_lock ) ;
2018-01-17 13:46:46 +03:00
idr_init ( & tn - > idr ) ;
return 0 ;
}
/* Per-network-namespace destructor: tear down the IDR that tcf_net_init()
 * set up for this namespace.
 */
static void __net_exit tcf_net_exit(struct net *net)
{
	struct tcf_net *pernet = net_generic(net, tcf_net_id);

	idr_destroy(&pernet->idr);
}
/* Per-namespace hooks for the classifier core: each namespace gets a
 * struct tcf_net, reachable through tcf_net_id.
 */
static struct pernet_operations tcf_net_ops = {
	.init	= tcf_net_init,
	.exit	= tcf_net_exit,
	.id	= &tcf_net_id,
	.size	= sizeof(struct tcf_net),
};
2005-04-17 02:20:36 +04:00
/* Subsystem initialisation for the packet classifier API.
 *
 * Steps, in order:
 *  1. allocate the ordered workqueue stored in tc_filter_wq,
 *  2. register the per-namespace operations (tcf_net_ops),
 *  3. initialise the miss-cookie xarray,
 *  4. register the rtnetlink handlers for filter and chain messages.
 *
 * Returns 0 on success, -ENOMEM if the workqueue cannot be allocated, or the
 * error from register_pernet_subsys() (in which case the workqueue is
 * destroyed again).
 */
static int __init tc_filter_init ( void )
{
2018-01-17 13:46:46 +03:00
int err ;
2017-10-27 04:24:28 +03:00
tc_filter_wq = alloc_ordered_workqueue ( " tc_filter_workqueue " , 0 ) ;
if ( ! tc_filter_wq )
return - ENOMEM ;
2018-01-17 13:46:46 +03:00
err = register_pernet_subsys ( & tcf_net_ops ) ;
if ( err )
goto err_register_pernet_subsys ;
2023-02-18 01:36:14 +03:00
xa_init_flags ( & tcf_exts_miss_cookies_xa , XA_FLAGS_ALLOC1 ) ;
2019-02-11 11:55:48 +03:00
/* Filter add/delete/get are registered with RTNL_FLAG_DOIT_UNLOCKED.
 * NOTE(review): the return values of rtnl_register() are not checked
 * anywhere in this function.
 */
rtnl_register ( PF_UNSPEC , RTM_NEWTFILTER , tc_new_tfilter , NULL ,
RTNL_FLAG_DOIT_UNLOCKED ) ;
rtnl_register ( PF_UNSPEC , RTM_DELTFILTER , tc_del_tfilter , NULL ,
RTNL_FLAG_DOIT_UNLOCKED ) ;
2018-05-31 09:52:53 +03:00
rtnl_register ( PF_UNSPEC , RTM_GETTFILTER , tc_get_tfilter ,
2019-02-11 11:55:48 +03:00
tc_dump_tfilter , RTNL_FLAG_DOIT_UNLOCKED ) ;
2018-07-23 10:23:06 +03:00
/* Chain operations share one doit handler and are registered with
 * flags 0.
 */
rtnl_register ( PF_UNSPEC , RTM_NEWCHAIN , tc_ctl_chain , NULL , 0 ) ;
rtnl_register ( PF_UNSPEC , RTM_DELCHAIN , tc_ctl_chain , NULL , 0 ) ;
rtnl_register ( PF_UNSPEC , RTM_GETCHAIN , tc_ctl_chain ,
tc_dump_chain , 0 ) ;
2005-04-17 02:20:36 +04:00
return 0 ;
2018-01-17 13:46:46 +03:00
err_register_pernet_subsys :
/* Undo step 1; nothing else has been set up at this point. */
destroy_workqueue ( tc_filter_wq ) ;
return err ;
2005-04-17 02:20:36 +04:00
}
subsys_initcall ( tc_filter_init ) ;