2019-12-18 17:55:13 +03:00
// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/sched/sch_ets.c         Enhanced Transmission Selection scheduler
 *
 * Description
 * -----------
 *
 * The Enhanced Transmission Selection scheduler is a classful queuing
 * discipline that merges functionality of PRIO and DRR qdiscs in one scheduler.
 * ETS makes it easy to configure a set of strict and bandwidth-sharing bands to
 * implement the transmission selection described in 802.1Qaz.
 *
 * Although ETS is technically classful, it's not possible to add and remove
 * classes at will. Instead one specifies number of classes, how many are
 * PRIO-like and how many DRR-like, and quanta for the latter.
 *
 * Algorithm
 * ---------
 *
 * The strict classes, if any, are tried for traffic first: first band 0, if it
 * has no traffic then band 1, etc.
 *
 * When there is no traffic in any of the strict queues, the bandwidth-sharing
 * ones are tried next. Each band is assigned a deficit counter, initialized to
 * "quantum" of that band. ETS maintains a list of active bandwidth-sharing
 * bands whose qdiscs are non-empty. A packet is dequeued from the band at the
 * head of the list if the packet size is smaller or equal to the deficit
 * counter. If the counter is too small, it is increased by "quantum" and the
 * scheduler moves on to the next band in the active list.
 */
#include <linux/math64.h>
#include <linux/module.h>
#include <net/gen_stats.h>
#include <net/netlink.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
#include <net/sch_generic.h>
struct ets_class {
struct list_head alist ; /* In struct ets_sched.active. */
struct Qdisc * qdisc ;
u32 quantum ;
u32 deficit ;
2021-10-16 11:49:09 +03:00
struct gnet_stats_basic_sync bstats ;
2019-12-18 17:55:13 +03:00
struct gnet_stats_queue qstats ;
} ;
/* Private data of one ETS qdisc instance (see qdisc_priv() users below). */
struct ets_sched {
	/* Bandwidth-sharing classes linked through ets_class.alist. */
	struct list_head active;
	struct tcf_proto __rcu *filter_list;
	struct tcf_block *block;
	/* Number of configured bands, and how many of the first ones are strict. */
	unsigned int nbands;
	unsigned int nstrict;
	/* Maps a priority (0..TC_PRIO_MAX) to an index into classes[]. */
	u8 prio2band[TC_PRIO_MAX + 1];
	struct ets_class classes[TCQ_ETS_MAX_BANDS];
};
/* Policy for the top-level qdisc options parsed in ets_qdisc_change(). */
static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = {
	/* Scalar band counts. */
	[TCA_ETS_NSTRICT]	= { .type = NLA_U8 },
	[TCA_ETS_NBANDS]	= { .type = NLA_U8 },
	/* Nested lists, validated separately by their own policies. */
	[TCA_ETS_PRIOMAP]	= { .type = NLA_NESTED },
	[TCA_ETS_QUANTA]	= { .type = NLA_NESTED },
};
/* Policy for the entries nested inside TCA_ETS_PRIOMAP: one u8 band each. */
static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = {
	[TCA_ETS_PRIOMAP_BAND]	= { .type = NLA_U8 },
};
/* Policy for the entries nested inside TCA_ETS_QUANTA: one u32 quantum each. */
static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = {
	[TCA_ETS_QUANTA_BAND]	= { .type = NLA_U32 },
};
/* Policy for per-class options parsed in ets_class_change(): a u32 quantum. */
static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = {
	[TCA_ETS_QUANTA_BAND]	= { .type = NLA_U32 },
};
/*
 * Read a quantum from a u32 netlink attribute and store it in *quantum.
 *
 * Returns 0 on success. A zero quantum is rejected with -EINVAL and an extack
 * message; *quantum has still been written in that case.
 */
static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr,
			     unsigned int *quantum,
			     struct netlink_ext_ack *extack)
{
	unsigned int value = nla_get_u32(attr);

	*quantum = value;
	if (value)
		return 0;

	NL_SET_ERR_MSG(extack, " ETS quantum cannot be zero ");
	return -EINVAL;
}
static struct ets_class *
ets_class_from_arg ( struct Qdisc * sch , unsigned long arg )
{
struct ets_sched * q = qdisc_priv ( sch ) ;
return & q - > classes [ arg - 1 ] ;
}
static u32 ets_class_id ( struct Qdisc * sch , const struct ets_class * cl )
{
struct ets_sched * q = qdisc_priv ( sch ) ;
int band = cl - q - > classes ;
return TC_H_MAKE ( sch - > handle , band + 1 ) ;
}
2019-12-18 17:55:15 +03:00
static void ets_offload_change ( struct Qdisc * sch )
{
struct net_device * dev = qdisc_dev ( sch ) ;
struct ets_sched * q = qdisc_priv ( sch ) ;
struct tc_ets_qopt_offload qopt ;
unsigned int w_psum_prev = 0 ;
unsigned int q_psum = 0 ;
unsigned int q_sum = 0 ;
unsigned int quantum ;
unsigned int w_psum ;
unsigned int weight ;
unsigned int i ;
if ( ! tc_can_offload ( dev ) | | ! dev - > netdev_ops - > ndo_setup_tc )
return ;
qopt . command = TC_ETS_REPLACE ;
qopt . handle = sch - > handle ;
qopt . parent = sch - > parent ;
qopt . replace_params . bands = q - > nbands ;
qopt . replace_params . qstats = & sch - > qstats ;
memcpy ( & qopt . replace_params . priomap ,
q - > prio2band , sizeof ( q - > prio2band ) ) ;
for ( i = 0 ; i < q - > nbands ; i + + )
q_sum + = q - > classes [ i ] . quantum ;
for ( i = 0 ; i < q - > nbands ; i + + ) {
quantum = q - > classes [ i ] . quantum ;
q_psum + = quantum ;
w_psum = quantum ? q_psum * 100 / q_sum : 0 ;
weight = w_psum - w_psum_prev ;
w_psum_prev = w_psum ;
qopt . replace_params . quanta [ i ] = quantum ;
qopt . replace_params . weights [ i ] = weight ;
}
dev - > netdev_ops - > ndo_setup_tc ( dev , TC_SETUP_QDISC_ETS , & qopt ) ;
}
static void ets_offload_destroy ( struct Qdisc * sch )
{
struct net_device * dev = qdisc_dev ( sch ) ;
struct tc_ets_qopt_offload qopt ;
if ( ! tc_can_offload ( dev ) | | ! dev - > netdev_ops - > ndo_setup_tc )
return ;
qopt . command = TC_ETS_DESTROY ;
qopt . handle = sch - > handle ;
qopt . parent = sch - > parent ;
dev - > netdev_ops - > ndo_setup_tc ( dev , TC_SETUP_QDISC_ETS , & qopt ) ;
}
static void ets_offload_graft ( struct Qdisc * sch , struct Qdisc * new ,
struct Qdisc * old , unsigned long arg ,
struct netlink_ext_ack * extack )
{
struct net_device * dev = qdisc_dev ( sch ) ;
struct tc_ets_qopt_offload qopt ;
qopt . command = TC_ETS_GRAFT ;
qopt . handle = sch - > handle ;
qopt . parent = sch - > parent ;
qopt . graft_params . band = arg - 1 ;
qopt . graft_params . child_handle = new - > handle ;
qdisc_offload_graft_helper ( dev , sch , new , old , TC_SETUP_QDISC_ETS ,
& qopt , extack ) ;
}
static int ets_offload_dump ( struct Qdisc * sch )
{
struct tc_ets_qopt_offload qopt ;
qopt . command = TC_ETS_STATS ;
qopt . handle = sch - > handle ;
qopt . parent = sch - > parent ;
qopt . stats . bstats = & sch - > bstats ;
qopt . stats . qstats = & sch - > qstats ;
return qdisc_offload_dump_helper ( sch , TC_SETUP_QDISC_ETS , & qopt ) ;
}
2019-12-18 17:55:13 +03:00
static bool ets_class_is_strict ( struct ets_sched * q , const struct ets_class * cl )
{
unsigned int band = cl - q - > classes ;
return band < q - > nstrict ;
}
/*
 * Qdisc_class_ops.change handler: update the quantum of an existing class.
 *
 * *arg is the 1-based band number of the class. A value that does not name a
 * configured band is rejected with -EOPNOTSUPP.
 *
 * Returns 0 when the quantum was updated or when no quantum attribute was
 * supplied, -EINVAL when options are missing, the band is strict or the
 * quantum is zero, or the error reported by attribute parsing.
 */
static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct nlattr **tca, unsigned long *arg,
			    struct netlink_ext_ack *extack)
{
	struct ets_sched *q = qdisc_priv(sch);
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_ETS_MAX + 1];
	struct ets_class *cl;
	unsigned int quantum;
	int err;

	/* Classes can be added and removed only through Qdisc_ops.change
	 * interface. Validate *arg itself: the address of an array element is
	 * never NULL, so testing the class pointer alone would not catch an
	 * argument of zero.
	 */
	if (!*arg || *arg > q->nbands) {
		NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported");
		return -EOPNOTSUPP;
	}
	cl = ets_class_from_arg(sch, *arg);

	if (!opt) {
		NL_SET_ERR_MSG(extack, "ETS options are required for this operation");
		return -EINVAL;
	}

	err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack);
	if (err < 0)
		return err;

	if (!tb[TCA_ETS_QUANTA_BAND])
		/* Nothing to configure. */
		return 0;

	if (ets_class_is_strict(q, cl)) {
		NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum");
		return -EINVAL;
	}

	err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum,
				extack);
	if (err)
		return err;

	sch_tree_lock(sch);
	cl->quantum = quantum;
	sch_tree_unlock(sch);

	/* Let the device know about the new quantum. */
	ets_offload_change(sch);
	return 0;
}
static int ets_class_graft ( struct Qdisc * sch , unsigned long arg ,
struct Qdisc * new , struct Qdisc * * old ,
struct netlink_ext_ack * extack )
{
struct ets_class * cl = ets_class_from_arg ( sch , arg ) ;
if ( ! new ) {
new = qdisc_create_dflt ( sch - > dev_queue , & pfifo_qdisc_ops ,
ets_class_id ( sch , cl ) , NULL ) ;
if ( ! new )
new = & noop_qdisc ;
else
qdisc_hash_add ( new , true ) ;
}
* old = qdisc_replace ( sch , new , & cl - > qdisc ) ;
2019-12-18 17:55:15 +03:00
ets_offload_graft ( sch , new , * old , arg , extack ) ;
2019-12-18 17:55:13 +03:00
return 0 ;
}
static struct Qdisc * ets_class_leaf ( struct Qdisc * sch , unsigned long arg )
{
struct ets_class * cl = ets_class_from_arg ( sch , arg ) ;
return cl - > qdisc ;
}
/*
 * Qdisc_class_ops.find handler: map a class ID to a class argument.
 *
 * The argument is the minor part of @classid, which is the 1-based band
 * number. Returns 0 when the minor is zero or beyond the configured bands.
 */
static unsigned long ets_class_find(struct Qdisc *sch, u32 classid)
{
	struct ets_sched *q = qdisc_priv(sch);
	unsigned long minor = TC_H_MIN(classid);

	/* A minor of zero wraps around and fails the range check too. */
	return (minor - 1 < q->nbands) ? minor : 0;
}
static void ets_class_qlen_notify ( struct Qdisc * sch , unsigned long arg )
{
struct ets_class * cl = ets_class_from_arg ( sch , arg ) ;
struct ets_sched * q = qdisc_priv ( sch ) ;
/* We get notified about zero-length child Qdiscs as well if they are
* offloaded . Those aren ' t on the active list though , so don ' t attempt
* to remove them .
*/
if ( ! ets_class_is_strict ( q , cl ) & & sch - > q . qlen )
list_del ( & cl - > alist ) ;
}
static int ets_class_dump ( struct Qdisc * sch , unsigned long arg ,
struct sk_buff * skb , struct tcmsg * tcm )
{
struct ets_class * cl = ets_class_from_arg ( sch , arg ) ;
struct ets_sched * q = qdisc_priv ( sch ) ;
struct nlattr * nest ;
tcm - > tcm_parent = TC_H_ROOT ;
tcm - > tcm_handle = ets_class_id ( sch , cl ) ;
tcm - > tcm_info = cl - > qdisc - > handle ;
nest = nla_nest_start_noflag ( skb , TCA_OPTIONS ) ;
if ( ! nest )
goto nla_put_failure ;
if ( ! ets_class_is_strict ( q , cl ) ) {
if ( nla_put_u32 ( skb , TCA_ETS_QUANTA_BAND , cl - > quantum ) )
goto nla_put_failure ;
}
return nla_nest_end ( skb , nest ) ;
nla_put_failure :
nla_nest_cancel ( skb , nest ) ;
return - EMSGSIZE ;
}
static int ets_class_dump_stats ( struct Qdisc * sch , unsigned long arg ,
struct gnet_dump * d )
{
struct ets_class * cl = ets_class_from_arg ( sch , arg ) ;
struct Qdisc * cl_q = cl - > qdisc ;
net: sched: Remove Qdisc::running sequence counter
The Qdisc::running sequence counter has two uses:
1. Reliably reading qdisc's tc statistics while the qdisc is running
(a seqcount read/retry loop at gnet_stats_add_basic()).
2. As a flag, indicating whether the qdisc in question is running
(without any retry loops).
For the first usage, the Qdisc::running sequence counter write section,
qdisc_run_begin() => qdisc_run_end(), covers a much wider area than what
is actually needed: the raw qdisc's bstats update. A u64_stats sync
point was thus introduced (in previous commits) inside the bstats
structure itself. A local u64_stats write section is then started and
stopped for the bstats updates.
Use that u64_stats sync point mechanism for the bstats read/retry loop
at gnet_stats_add_basic().
For the second qdisc->running usage, a __QDISC_STATE_RUNNING bit flag,
accessed with atomic bitops, is sufficient. Using a bit flag instead of
a sequence counter at qdisc_run_begin/end() and qdisc_is_running() leads
to the SMP barriers implicitly added through raw_read_seqcount() and
write_seqcount_begin/end() getting removed. All call sites have been
surveyed though, and no required ordering was identified.
Now that the qdisc->running sequence counter is no longer used, remove
it.
Note, using u64_stats implies no sequence counter protection for 64-bit
architectures. This can lead to the qdisc tc statistics "packets" vs.
"bytes" values getting out of sync on rare occasions. The individual
values will still be valid.
Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-16 11:49:10 +03:00
if ( gnet_stats_copy_basic ( d , NULL , & cl_q - > bstats , true ) < 0 | |
2019-12-18 17:55:13 +03:00
qdisc_qstats_copy ( d , cl_q ) < 0 )
return - 1 ;
return 0 ;
}
static void ets_qdisc_walk ( struct Qdisc * sch , struct qdisc_walker * arg )
{
struct ets_sched * q = qdisc_priv ( sch ) ;
int i ;
if ( arg - > stop )
return ;
for ( i = 0 ; i < q - > nbands ; i + + ) {
2022-09-21 05:41:18 +03:00
if ( ! tc_qdisc_stats_dump ( sch , i + 1 , arg ) )
2019-12-18 17:55:13 +03:00
break ;
}
}
static struct tcf_block *
ets_qdisc_tcf_block ( struct Qdisc * sch , unsigned long cl ,
struct netlink_ext_ack * extack )
{
struct ets_sched * q = qdisc_priv ( sch ) ;
if ( cl ) {
NL_SET_ERR_MSG ( extack , " ETS classid must be zero " ) ;
return NULL ;
}
return q - > block ;
}
/*
 * Qdisc_class_ops.bind_tcf handler: resolve @classid exactly like
 * ets_class_find() does; @parent is not used.
 */
static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent,
					u32 classid)
{
	unsigned long cl_arg = ets_class_find(sch, classid);

	return cl_arg;
}
/* Qdisc_class_ops.unbind_tcf handler: binding keeps no state, so this is a no-op. */
static void ets_qdisc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
	/* Nothing to release. */
}
static struct ets_class * ets_classify ( struct sk_buff * skb , struct Qdisc * sch ,
int * qerr )
{
struct ets_sched * q = qdisc_priv ( sch ) ;
u32 band = skb - > priority ;
struct tcf_result res ;
struct tcf_proto * fl ;
int err ;
* qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS ;
if ( TC_H_MAJ ( skb - > priority ) ! = sch - > handle ) {
fl = rcu_dereference_bh ( q - > filter_list ) ;
2021-07-28 21:08:00 +03:00
err = tcf_classify ( skb , NULL , fl , & res , false ) ;
2019-12-18 17:55:13 +03:00
# ifdef CONFIG_NET_CLS_ACT
switch ( err ) {
case TC_ACT_STOLEN :
case TC_ACT_QUEUED :
case TC_ACT_TRAP :
* qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN ;
2020-07-07 20:21:38 +03:00
fallthrough ;
2019-12-18 17:55:13 +03:00
case TC_ACT_SHOT :
return NULL ;
}
# endif
if ( ! fl | | err < 0 ) {
if ( TC_H_MAJ ( band ) )
band = 0 ;
return & q - > classes [ q - > prio2band [ band & TC_PRIO_MAX ] ] ;
}
band = res . classid ;
}
band = TC_H_MIN ( band ) - 1 ;
if ( band > = q - > nbands )
return & q - > classes [ q - > prio2band [ 0 ] ] ;
return & q - > classes [ band ] ;
}
2020-07-14 20:03:08 +03:00
static int ets_qdisc_enqueue ( struct sk_buff * skb , struct Qdisc * sch ,
2019-12-18 17:55:13 +03:00
struct sk_buff * * to_free )
{
unsigned int len = qdisc_pkt_len ( skb ) ;
struct ets_sched * q = qdisc_priv ( sch ) ;
struct ets_class * cl ;
int err = 0 ;
bool first ;
cl = ets_classify ( skb , sch , & err ) ;
if ( ! cl ) {
if ( err & __NET_XMIT_BYPASS )
qdisc_qstats_drop ( sch ) ;
__qdisc_drop ( skb , to_free ) ;
return err ;
}
first = ! cl - > qdisc - > q . qlen ;
2020-07-14 20:03:08 +03:00
err = qdisc_enqueue ( skb , cl - > qdisc , to_free ) ;
2019-12-18 17:55:13 +03:00
if ( unlikely ( err ! = NET_XMIT_SUCCESS ) ) {
if ( net_xmit_drop_count ( err ) ) {
cl - > qstats . drops + + ;
qdisc_qstats_drop ( sch ) ;
}
return err ;
}
if ( first & & ! ets_class_is_strict ( q , cl ) ) {
list_add_tail ( & cl - > alist , & q - > active ) ;
cl - > deficit = cl - > quantum ;
}
sch - > qstats . backlog + = len ;
sch - > q . qlen + + ;
return err ;
}
static struct sk_buff *
ets_qdisc_dequeue_skb ( struct Qdisc * sch , struct sk_buff * skb )
{
qdisc_bstats_update ( sch , skb ) ;
qdisc_qstats_backlog_dec ( sch , skb ) ;
sch - > q . qlen - - ;
return skb ;
}
static struct sk_buff * ets_qdisc_dequeue ( struct Qdisc * sch )
{
struct ets_sched * q = qdisc_priv ( sch ) ;
struct ets_class * cl ;
struct sk_buff * skb ;
unsigned int band ;
unsigned int len ;
while ( 1 ) {
for ( band = 0 ; band < q - > nstrict ; band + + ) {
cl = & q - > classes [ band ] ;
skb = qdisc_dequeue_peeked ( cl - > qdisc ) ;
if ( skb )
return ets_qdisc_dequeue_skb ( sch , skb ) ;
}
if ( list_empty ( & q - > active ) )
goto out ;
cl = list_first_entry ( & q - > active , struct ets_class , alist ) ;
skb = cl - > qdisc - > ops - > peek ( cl - > qdisc ) ;
if ( ! skb ) {
qdisc_warn_nonwc ( __func__ , cl - > qdisc ) ;
goto out ;
}
len = qdisc_pkt_len ( skb ) ;
if ( len < = cl - > deficit ) {
cl - > deficit - = len ;
skb = qdisc_dequeue_peeked ( cl - > qdisc ) ;
if ( unlikely ( ! skb ) )
goto out ;
if ( cl - > qdisc - > q . qlen = = 0 )
list_del ( & cl - > alist ) ;
return ets_qdisc_dequeue_skb ( sch , skb ) ;
}
cl - > deficit + = cl - > quantum ;
list_move_tail ( & cl - > alist , & q - > active ) ;
}
out :
return NULL ;
}
/*
 * Parse the nested TCA_ETS_PRIOMAP attribute into @priomap.
 *
 * Entries are assigned to priorities 0, 1, 2, ... in the order they appear.
 * Returns 0 on success, the validation error, or -EINVAL when there are more
 * entries than priorities or an entry names a band >= @nbands.
 */
static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr,
				   unsigned int nbands, u8 *priomap,
				   struct netlink_ext_ack *extack)
{
	const struct nlattr *entry;
	int next_prio = 0;
	int remaining;
	int err;

	err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX,
				    ets_priomap_policy, NL_VALIDATE_STRICT,
				    extack);
	if (err)
		return err;

	nla_for_each_nested(entry, priomap_attr, remaining) {
		u8 band;

		if (nla_type(entry) != TCA_ETS_PRIOMAP_BAND) {
			WARN_ON_ONCE(1); /* Validate should have caught this. */
			return -EINVAL;
		}

		if (next_prio > TC_PRIO_MAX) {
			NL_SET_ERR_MSG_MOD(extack, " Too many priorities in ETS priomap ");
			return -EINVAL;
		}

		band = nla_get_u8(entry);
		if (band >= nbands) {
			NL_SET_ERR_MSG_MOD(extack, " Invalid band number in ETS priomap ");
			return -EINVAL;
		}

		priomap[next_prio] = band;
		next_prio++;
	}

	return 0;
}
/*
 * Parse the nested TCA_ETS_QUANTA attribute into @quanta.
 *
 * Entries fill quanta[] starting at index @nstrict, i.e. at the first
 * bandwidth-sharing band. Returns 0 on success, the validation error, or
 * -EINVAL when there are more entries than remaining bands or a quantum is
 * zero.
 */
static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr,
				  unsigned int nbands, unsigned int nstrict,
				  unsigned int *quanta,
				  struct netlink_ext_ack *extack)
{
	const struct nlattr *entry;
	int band = nstrict;
	int remaining;
	int err;

	err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX,
				    ets_quanta_policy, NL_VALIDATE_STRICT,
				    extack);
	if (err < 0)
		return err;

	nla_for_each_nested(entry, quanta_attr, remaining) {
		if (nla_type(entry) != TCA_ETS_QUANTA_BAND) {
			WARN_ON_ONCE(1); /* Validate should have caught this. */
			return -EINVAL;
		}

		if (band >= nbands) {
			NL_SET_ERR_MSG_MOD(extack, " ETS quanta has more values than bands ");
			return -EINVAL;
		}

		err = ets_quantum_parse(sch, entry, &quanta[band], extack);
		if (err)
			return err;
		band++;
	}

	return 0;
}
static int ets_qdisc_change ( struct Qdisc * sch , struct nlattr * opt ,
struct netlink_ext_ack * extack )
{
unsigned int quanta [ TCQ_ETS_MAX_BANDS ] = { 0 } ;
struct Qdisc * queues [ TCQ_ETS_MAX_BANDS ] ;
struct ets_sched * q = qdisc_priv ( sch ) ;
struct nlattr * tb [ TCA_ETS_MAX + 1 ] ;
unsigned int oldbands = q - > nbands ;
u8 priomap [ TC_PRIO_MAX + 1 ] ;
unsigned int nstrict = 0 ;
unsigned int nbands ;
unsigned int i ;
int err ;
err = nla_parse_nested ( tb , TCA_ETS_MAX , opt , ets_policy , extack ) ;
if ( err < 0 )
return err ;
if ( ! tb [ TCA_ETS_NBANDS ] ) {
NL_SET_ERR_MSG_MOD ( extack , " Number of bands is a required argument " ) ;
return - EINVAL ;
}
nbands = nla_get_u8 ( tb [ TCA_ETS_NBANDS ] ) ;
if ( nbands < 1 | | nbands > TCQ_ETS_MAX_BANDS ) {
NL_SET_ERR_MSG_MOD ( extack , " Invalid number of bands " ) ;
return - EINVAL ;
}
/* Unless overridden, traffic goes to the last band. */
memset ( priomap , nbands - 1 , sizeof ( priomap ) ) ;
if ( tb [ TCA_ETS_NSTRICT ] ) {
nstrict = nla_get_u8 ( tb [ TCA_ETS_NSTRICT ] ) ;
if ( nstrict > nbands ) {
NL_SET_ERR_MSG_MOD ( extack , " Invalid number of strict bands " ) ;
return - EINVAL ;
}
}
if ( tb [ TCA_ETS_PRIOMAP ] ) {
err = ets_qdisc_priomap_parse ( tb [ TCA_ETS_PRIOMAP ] ,
nbands , priomap , extack ) ;
if ( err )
return err ;
}
if ( tb [ TCA_ETS_QUANTA ] ) {
err = ets_qdisc_quanta_parse ( sch , tb [ TCA_ETS_QUANTA ] ,
nbands , nstrict , quanta , extack ) ;
if ( err )
return err ;
}
/* If there are more bands than strict + quanta provided, the remaining
* ones are ETS with quantum of MTU . Initialize the missing values here .
*/
for ( i = nstrict ; i < nbands ; i + + ) {
if ( ! quanta [ i ] )
quanta [ i ] = psched_mtu ( qdisc_dev ( sch ) ) ;
}
/* Before commit, make sure we can allocate all new qdiscs */
for ( i = oldbands ; i < nbands ; i + + ) {
queues [ i ] = qdisc_create_dflt ( sch - > dev_queue , & pfifo_qdisc_ops ,
ets_class_id ( sch , & q - > classes [ i ] ) ,
extack ) ;
if ( ! queues [ i ] ) {
while ( i > oldbands )
qdisc_put ( queues [ - - i ] ) ;
return - ENOMEM ;
}
}
sch_tree_lock ( sch ) ;
q - > nbands = nbands ;
net/sched: ets: fix crash when flipping from 'strict' to 'quantum'
While running kselftests, Hangbin observed that sch_ets.sh often crashes,
and splats like the following one are seen in the output of 'dmesg':
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 159f12067 P4D 159f12067 PUD 159f13067 PMD 0
Oops: 0000 [#1] SMP NOPTI
CPU: 2 PID: 921 Comm: tc Not tainted 5.14.0-rc6+ #458
Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014
RIP: 0010:__list_del_entry_valid+0x2d/0x50
Code: 48 8b 57 08 48 b9 00 01 00 00 00 00 ad de 48 39 c8 0f 84 ac 6e 5b 00 48 b9 22 01 00 00 00 00 ad de 48 39 ca 0f 84 cf 6e 5b 00 <48> 8b 32 48 39 fe 0f 85 af 6e 5b 00 48 8b 50 08 48 39 f2 0f 85 94
RSP: 0018:ffffb2da005c3890 EFLAGS: 00010217
RAX: 0000000000000000 RBX: ffff9073ba23f800 RCX: dead000000000122
RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff9073ba23fbc8
RBP: ffff9073ba23f890 R08: 0000000000000001 R09: 0000000000000001
R10: 0000000000000001 R11: 0000000000000001 R12: dead000000000100
R13: ffff9073ba23fb00 R14: 0000000000000002 R15: 0000000000000002
FS: 00007f93e5564e40(0000) GS:ffff9073bba00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 000000014ad34000 CR4: 0000000000350ee0
Call Trace:
ets_qdisc_reset+0x6e/0x100 [sch_ets]
qdisc_reset+0x49/0x1d0
tbf_reset+0x15/0x60 [sch_tbf]
qdisc_reset+0x49/0x1d0
dev_reset_queue.constprop.42+0x2f/0x90
dev_deactivate_many+0x1d3/0x3d0
dev_deactivate+0x56/0x90
qdisc_graft+0x47e/0x5a0
tc_get_qdisc+0x1db/0x3e0
rtnetlink_rcv_msg+0x164/0x4c0
netlink_rcv_skb+0x50/0x100
netlink_unicast+0x1a5/0x280
netlink_sendmsg+0x242/0x480
sock_sendmsg+0x5b/0x60
____sys_sendmsg+0x1f2/0x260
___sys_sendmsg+0x7c/0xc0
__sys_sendmsg+0x57/0xa0
do_syscall_64+0x3a/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f93e44b8338
Code: 89 02 48 c7 c0 ff ff ff ff eb b5 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 43 2c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 41 89 d4 55
RSP: 002b:00007ffc0db737a8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 0000000061255c06 RCX: 00007f93e44b8338
RDX: 0000000000000000 RSI: 00007ffc0db73810 RDI: 0000000000000003
RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000001
R13: 0000000000687880 R14: 0000000000000000 R15: 0000000000000000
Modules linked in: sch_ets sch_tbf dummy rfkill iTCO_wdt iTCO_vendor_support intel_rapl_msr intel_rapl_common joydev i2c_i801 pcspkr i2c_smbus lpc_ich virtio_balloon ip_tables xfs libcrc32c crct10dif_pclmul crc32_pclmul crc32c_intel ahci libahci ghash_clmulni_intel libata serio_raw virtio_blk virtio_console virtio_net net_failover failover sunrpc dm_mirror dm_region_hash dm_log dm_mod
CR2: 0000000000000000
When the change() function decreases the value of 'nstrict', we must take
into account that packets might be already enqueued on a class that flips
from 'strict' to 'quantum': otherwise that class will not be added to the
bandwidth-sharing list. Then, a call to ets_qdisc_reset() will attempt to
do list_del(&alist) with 'alist' filled with zero, hence the NULL pointer
dereference.
For classes flipping from 'strict' to 'quantum', initialize an empty list
and eventually add it to the bandwidth-sharing list, if there are packets
already enqueued. In this way, the kernel will:
a) prevent crashing as described above.
b) avoid retaining the backlog packets (for an arbitrarily long time) in
case no packet is enqueued after a change from 'strict' to 'quantum'.
Reported-by: Hangbin Liu <liuhangbin@gmail.com>
Fixes: dcc68b4d8084 ("net: sch_ets: Add a new Qdisc")
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-25 01:33:48 +03:00
for ( i = nstrict ; i < q - > nstrict ; i + + ) {
if ( q - > classes [ i ] . qdisc - > q . qlen ) {
list_add_tail ( & q - > classes [ i ] . alist , & q - > active ) ;
q - > classes [ i ] . deficit = quanta [ i ] ;
}
}
2021-11-24 19:14:40 +03:00
for ( i = q - > nbands ; i < oldbands ; i + + ) {
2021-12-10 19:42:47 +03:00
if ( i > = q - > nstrict & & q - > classes [ i ] . qdisc - > q . qlen )
2021-11-24 19:14:40 +03:00
list_del ( & q - > classes [ i ] . alist ) ;
2021-12-10 19:42:47 +03:00
qdisc_tree_flush_backlog ( q - > classes [ i ] . qdisc ) ;
2021-11-24 19:14:40 +03:00
}
2019-12-18 17:55:13 +03:00
q - > nstrict = nstrict ;
memcpy ( q - > prio2band , priomap , sizeof ( priomap ) ) ;
for ( i = 0 ; i < q - > nbands ; i + + )
q - > classes [ i ] . quantum = quanta [ i ] ;
for ( i = oldbands ; i < q - > nbands ; i + + ) {
q - > classes [ i ] . qdisc = queues [ i ] ;
if ( q - > classes [ i ] . qdisc ! = & noop_qdisc )
qdisc_hash_add ( q - > classes [ i ] . qdisc , true ) ;
}
sch_tree_unlock ( sch ) ;
2019-12-18 17:55:15 +03:00
ets_offload_change ( sch ) ;
2019-12-18 17:55:13 +03:00
for ( i = q - > nbands ; i < oldbands ; i + + ) {
qdisc_put ( q - > classes [ i ] . qdisc ) ;
2021-10-07 16:05:02 +03:00
q - > classes [ i ] . qdisc = NULL ;
q - > classes [ i ] . quantum = 0 ;
q - > classes [ i ] . deficit = 0 ;
2021-10-16 11:49:09 +03:00
gnet_stats_basic_sync_init ( & q - > classes [ i ] . bstats ) ;
2021-10-07 16:05:02 +03:00
memset ( & q - > classes [ i ] . qstats , 0 , sizeof ( q - > classes [ i ] . qstats ) ) ;
2019-12-18 17:55:13 +03:00
}
return 0 ;
}
static int ets_qdisc_init ( struct Qdisc * sch , struct nlattr * opt ,
struct netlink_ext_ack * extack )
{
struct ets_sched * q = qdisc_priv ( sch ) ;
2021-10-07 16:05:02 +03:00
int err , i ;
2019-12-18 17:55:13 +03:00
if ( ! opt )
return - EINVAL ;
err = tcf_block_get ( & q - > block , & q - > filter_list , sch , extack ) ;
if ( err )
return err ;
INIT_LIST_HEAD ( & q - > active ) ;
2021-10-07 16:05:02 +03:00
for ( i = 0 ; i < TCQ_ETS_MAX_BANDS ; i + + )
INIT_LIST_HEAD ( & q - > classes [ i ] . alist ) ;
2019-12-18 17:55:13 +03:00
return ets_qdisc_change ( sch , opt , extack ) ;
}
static void ets_qdisc_reset ( struct Qdisc * sch )
{
struct ets_sched * q = qdisc_priv ( sch ) ;
int band ;
for ( band = q - > nstrict ; band < q - > nbands ; band + + ) {
if ( q - > classes [ band ] . qdisc - > q . qlen )
list_del ( & q - > classes [ band ] . alist ) ;
}
for ( band = 0 ; band < q - > nbands ; band + + )
qdisc_reset ( q - > classes [ band ] . qdisc ) ;
}
static void ets_qdisc_destroy ( struct Qdisc * sch )
{
struct ets_sched * q = qdisc_priv ( sch ) ;
int band ;
2019-12-18 17:55:15 +03:00
ets_offload_destroy ( sch ) ;
2019-12-18 17:55:13 +03:00
tcf_block_put ( q - > block ) ;
for ( band = 0 ; band < q - > nbands ; band + + )
qdisc_put ( q - > classes [ band ] . qdisc ) ;
}
static int ets_qdisc_dump ( struct Qdisc * sch , struct sk_buff * skb )
{
struct ets_sched * q = qdisc_priv ( sch ) ;
struct nlattr * opts ;
struct nlattr * nest ;
int band ;
int prio ;
2019-12-18 17:55:15 +03:00
int err ;
err = ets_offload_dump ( sch ) ;
if ( err )
return err ;
2019-12-18 17:55:13 +03:00
opts = nla_nest_start_noflag ( skb , TCA_OPTIONS ) ;
if ( ! opts )
goto nla_err ;
if ( nla_put_u8 ( skb , TCA_ETS_NBANDS , q - > nbands ) )
goto nla_err ;
if ( q - > nstrict & &
nla_put_u8 ( skb , TCA_ETS_NSTRICT , q - > nstrict ) )
goto nla_err ;
if ( q - > nbands > q - > nstrict ) {
nest = nla_nest_start ( skb , TCA_ETS_QUANTA ) ;
if ( ! nest )
goto nla_err ;
for ( band = q - > nstrict ; band < q - > nbands ; band + + ) {
if ( nla_put_u32 ( skb , TCA_ETS_QUANTA_BAND ,
q - > classes [ band ] . quantum ) )
goto nla_err ;
}
nla_nest_end ( skb , nest ) ;
}
nest = nla_nest_start ( skb , TCA_ETS_PRIOMAP ) ;
if ( ! nest )
goto nla_err ;
for ( prio = 0 ; prio < = TC_PRIO_MAX ; prio + + ) {
if ( nla_put_u8 ( skb , TCA_ETS_PRIOMAP_BAND , q - > prio2band [ prio ] ) )
goto nla_err ;
}
nla_nest_end ( skb , nest ) ;
return nla_nest_end ( skb , opts ) ;
nla_err :
nla_nest_cancel ( skb , opts ) ;
return - EMSGSIZE ;
}
/* Class-level operations of the ETS qdisc. */
static const struct Qdisc_class_ops ets_class_ops = {
	/* Lookup and iteration. */
	.find		= ets_class_find,
	.walk		= ets_qdisc_walk,
	.leaf		= ets_class_leaf,

	/* Configuration. */
	.change		= ets_class_change,
	.graft		= ets_class_graft,
	.qlen_notify	= ets_class_qlen_notify,

	/* Filter attachment. */
	.tcf_block	= ets_qdisc_tcf_block,
	.bind_tcf	= ets_qdisc_bind_tcf,
	.unbind_tcf	= ets_qdisc_unbind_tcf,

	/* Reporting. */
	.dump		= ets_class_dump,
	.dump_stats	= ets_class_dump_stats,
};
/* Qdisc-level operations; registered in ets_init(). */
static struct Qdisc_ops ets_qdisc_ops __read_mostly = {
	.cl_ops		= &ets_class_ops,
	/* The kind string must be exactly "ets", without surrounding blanks. */
	.id		= "ets",
	.priv_size	= sizeof(struct ets_sched),
	.enqueue	= ets_qdisc_enqueue,
	.dequeue	= ets_qdisc_dequeue,
	.peek		= qdisc_peek_dequeued,
	.change		= ets_qdisc_change,
	.init		= ets_qdisc_init,
	.reset		= ets_qdisc_reset,
	.destroy	= ets_qdisc_destroy,
	.dump		= ets_qdisc_dump,
	.owner		= THIS_MODULE,
};
/* Module entry point: register the ETS qdisc and return the result. */
static int __init ets_init(void)
{
	int rc = register_qdisc(&ets_qdisc_ops);

	return rc;
}
/* Module exit point: undo the registration done in ets_init(). */
static void __exit ets_exit(void)
{
	struct Qdisc_ops *ops = &ets_qdisc_ops;

	unregister_qdisc(ops);
}
module_init(ets_init);
module_exit(ets_exit);
/* The license ident must be exactly "GPL"; padded variants are not recognised. */
MODULE_LICENSE("GPL");