linux/net/sched/sch_ets.c

829 lines
20 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_ets.c Enhanced Transmission Selection scheduler
*
* Description
* -----------
*
* The Enhanced Transmission Selection scheduler is a classful queuing
* discipline that merges functionality of PRIO and DRR qdiscs in one scheduler.
* ETS makes it easy to configure a set of strict and bandwidth-sharing bands to
* implement the transmission selection described in 802.1Qaz.
*
* Although ETS is technically classful, it's not possible to add and remove
* classes at will. Instead one specifies number of classes, how many are
* PRIO-like and how many DRR-like, and quanta for the latter.
*
* Algorithm
* ---------
*
* The strict classes, if any, are tried for traffic first: first band 0, if it
* has no traffic then band 1, etc.
*
* When there is no traffic in any of the strict queues, the bandwidth-sharing
* ones are tried next. Each band is assigned a deficit counter, initialized to
* "quantum" of that band. ETS maintains a list of active bandwidth-sharing
* bands whose qdiscs are non-empty. A packet is dequeued from the band at the
* head of the list if the packet size is smaller or equal to the deficit
* counter. If the counter is too small, it is increased by "quantum" and the
* scheduler moves on to the next band in the active list.
*/
#include <linux/module.h>
#include <net/gen_stats.h>
#include <net/netlink.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
#include <net/sch_generic.h>
struct ets_class {
struct list_head alist; /* In struct ets_sched.active. */
struct Qdisc *qdisc;
u32 quantum;
u32 deficit;
struct gnet_stats_basic_sync bstats;
struct gnet_stats_queue qstats;
};
struct ets_sched {
struct list_head active;
struct tcf_proto __rcu *filter_list;
struct tcf_block *block;
unsigned int nbands;
unsigned int nstrict;
u8 prio2band[TC_PRIO_MAX + 1];
struct ets_class classes[TCQ_ETS_MAX_BANDS];
};
static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = {
[TCA_ETS_NBANDS] = { .type = NLA_U8 },
[TCA_ETS_NSTRICT] = { .type = NLA_U8 },
[TCA_ETS_QUANTA] = { .type = NLA_NESTED },
[TCA_ETS_PRIOMAP] = { .type = NLA_NESTED },
};
static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = {
[TCA_ETS_PRIOMAP_BAND] = { .type = NLA_U8 },
};
static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = {
[TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
};
static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = {
[TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
};
static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr,
unsigned int *quantum,
struct netlink_ext_ack *extack)
{
*quantum = nla_get_u32(attr);
if (!*quantum) {
NL_SET_ERR_MSG(extack, "ETS quantum cannot be zero");
return -EINVAL;
}
return 0;
}
static struct ets_class *
ets_class_from_arg(struct Qdisc *sch, unsigned long arg)
{
struct ets_sched *q = qdisc_priv(sch);
return &q->classes[arg - 1];
}
static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl)
{
struct ets_sched *q = qdisc_priv(sch);
int band = cl - q->classes;
return TC_H_MAKE(sch->handle, band + 1);
}
static void ets_offload_change(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct ets_sched *q = qdisc_priv(sch);
struct tc_ets_qopt_offload qopt;
unsigned int w_psum_prev = 0;
unsigned int q_psum = 0;
unsigned int q_sum = 0;
unsigned int quantum;
unsigned int w_psum;
unsigned int weight;
unsigned int i;
if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
return;
qopt.command = TC_ETS_REPLACE;
qopt.handle = sch->handle;
qopt.parent = sch->parent;
qopt.replace_params.bands = q->nbands;
qopt.replace_params.qstats = &sch->qstats;
memcpy(&qopt.replace_params.priomap,
q->prio2band, sizeof(q->prio2band));
for (i = 0; i < q->nbands; i++)
q_sum += q->classes[i].quantum;
for (i = 0; i < q->nbands; i++) {
quantum = q->classes[i].quantum;
q_psum += quantum;
w_psum = quantum ? q_psum * 100 / q_sum : 0;
weight = w_psum - w_psum_prev;
w_psum_prev = w_psum;
qopt.replace_params.quanta[i] = quantum;
qopt.replace_params.weights[i] = weight;
}
dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt);
}
static void ets_offload_destroy(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct tc_ets_qopt_offload qopt;
if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
return;
qopt.command = TC_ETS_DESTROY;
qopt.handle = sch->handle;
qopt.parent = sch->parent;
dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt);
}
static void ets_offload_graft(struct Qdisc *sch, struct Qdisc *new,
struct Qdisc *old, unsigned long arg,
struct netlink_ext_ack *extack)
{
struct net_device *dev = qdisc_dev(sch);
struct tc_ets_qopt_offload qopt;
qopt.command = TC_ETS_GRAFT;
qopt.handle = sch->handle;
qopt.parent = sch->parent;
qopt.graft_params.band = arg - 1;
qopt.graft_params.child_handle = new->handle;
qdisc_offload_graft_helper(dev, sch, new, old, TC_SETUP_QDISC_ETS,
&qopt, extack);
}
static int ets_offload_dump(struct Qdisc *sch)
{
struct tc_ets_qopt_offload qopt;
qopt.command = TC_ETS_STATS;
qopt.handle = sch->handle;
qopt.parent = sch->parent;
qopt.stats.bstats = &sch->bstats;
qopt.stats.qstats = &sch->qstats;
return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_ETS, &qopt);
}
static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl)
{
unsigned int band = cl - q->classes;
return band < q->nstrict;
}
static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid,
struct nlattr **tca, unsigned long *arg,
struct netlink_ext_ack *extack)
{
struct ets_class *cl = ets_class_from_arg(sch, *arg);
struct ets_sched *q = qdisc_priv(sch);
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_ETS_MAX + 1];
unsigned int quantum;
int err;
/* Classes can be added and removed only through Qdisc_ops.change
* interface.
*/
if (!cl) {
NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported");
return -EOPNOTSUPP;
}
if (!opt) {
NL_SET_ERR_MSG(extack, "ETS options are required for this operation");
return -EINVAL;
}
err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack);
if (err < 0)
return err;
if (!tb[TCA_ETS_QUANTA_BAND])
/* Nothing to configure. */
return 0;
if (ets_class_is_strict(q, cl)) {
NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum");
return -EINVAL;
}
err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum,
extack);
if (err)
return err;
sch_tree_lock(sch);
cl->quantum = quantum;
sch_tree_unlock(sch);
ets_offload_change(sch);
return 0;
}
static int ets_class_graft(struct Qdisc *sch, unsigned long arg,
struct Qdisc *new, struct Qdisc **old,
struct netlink_ext_ack *extack)
{
struct ets_class *cl = ets_class_from_arg(sch, arg);
if (!new) {
new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
ets_class_id(sch, cl), NULL);
if (!new)
new = &noop_qdisc;
else
qdisc_hash_add(new, true);
}
*old = qdisc_replace(sch, new, &cl->qdisc);
ets_offload_graft(sch, new, *old, arg, extack);
return 0;
}
static struct Qdisc *ets_class_leaf(struct Qdisc *sch, unsigned long arg)
{
struct ets_class *cl = ets_class_from_arg(sch, arg);
return cl->qdisc;
}
static unsigned long ets_class_find(struct Qdisc *sch, u32 classid)
{
unsigned long band = TC_H_MIN(classid);
struct ets_sched *q = qdisc_priv(sch);
if (band - 1 >= q->nbands)
return 0;
return band;
}
static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
struct ets_class *cl = ets_class_from_arg(sch, arg);
struct ets_sched *q = qdisc_priv(sch);
/* We get notified about zero-length child Qdiscs as well if they are
* offloaded. Those aren't on the active list though, so don't attempt
* to remove them.
*/
if (!ets_class_is_strict(q, cl) && sch->q.qlen)
list_del(&cl->alist);
}
static int ets_class_dump(struct Qdisc *sch, unsigned long arg,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct ets_class *cl = ets_class_from_arg(sch, arg);
struct ets_sched *q = qdisc_priv(sch);
struct nlattr *nest;
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle = ets_class_id(sch, cl);
tcm->tcm_info = cl->qdisc->handle;
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
if (!ets_class_is_strict(q, cl)) {
if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, cl->quantum))
goto nla_put_failure;
}
return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
return -EMSGSIZE;
}
static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg,
struct gnet_dump *d)
{
struct ets_class *cl = ets_class_from_arg(sch, arg);
struct Qdisc *cl_q = cl->qdisc;
net: sched: Remove Qdisc::running sequence counter The Qdisc::running sequence counter has two uses: 1. Reliably reading qdisc's tc statistics while the qdisc is running (a seqcount read/retry loop at gnet_stats_add_basic()). 2. As a flag, indicating whether the qdisc in question is running (without any retry loops). For the first usage, the Qdisc::running sequence counter write section, qdisc_run_begin() => qdisc_run_end(), covers a much wider area than what is actually needed: the raw qdisc's bstats update. A u64_stats sync point was thus introduced (in previous commits) inside the bstats structure itself. A local u64_stats write section is then started and stopped for the bstats updates. Use that u64_stats sync point mechanism for the bstats read/retry loop at gnet_stats_add_basic(). For the second qdisc->running usage, a __QDISC_STATE_RUNNING bit flag, accessed with atomic bitops, is sufficient. Using a bit flag instead of a sequence counter at qdisc_run_begin/end() and qdisc_is_running() leads to the SMP barriers implicitly added through raw_read_seqcount() and write_seqcount_begin/end() getting removed. All call sites have been surveyed though, and no required ordering was identified. Now that the qdisc->running sequence counter is no longer used, remove it. Note, using u64_stats implies no sequence counter protection for 64-bit architectures. This can lead to the qdisc tc statistics "packets" vs. "bytes" values getting out of sync on rare occasions. The individual values will still be valid. Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-16 11:49:10 +03:00
if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats, true) < 0 ||
qdisc_qstats_copy(d, cl_q) < 0)
return -1;
return 0;
}
static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct ets_sched *q = qdisc_priv(sch);
int i;
if (arg->stop)
return;
for (i = 0; i < q->nbands; i++) {
if (!tc_qdisc_stats_dump(sch, i + 1, arg))
break;
}
}
static struct tcf_block *
ets_qdisc_tcf_block(struct Qdisc *sch, unsigned long cl,
struct netlink_ext_ack *extack)
{
struct ets_sched *q = qdisc_priv(sch);
if (cl) {
NL_SET_ERR_MSG(extack, "ETS classid must be zero");
return NULL;
}
return q->block;
}
static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
return ets_class_find(sch, classid);
}
static void ets_qdisc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
}
static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch,
int *qerr)
{
struct ets_sched *q = qdisc_priv(sch);
u32 band = skb->priority;
struct tcf_result res;
struct tcf_proto *fl;
int err;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
if (TC_H_MAJ(skb->priority) != sch->handle) {
fl = rcu_dereference_bh(q->filter_list);
err = tcf_classify(skb, NULL, fl, &res, false);
#ifdef CONFIG_NET_CLS_ACT
switch (err) {
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
fallthrough;
case TC_ACT_SHOT:
return NULL;
}
#endif
if (!fl || err < 0) {
if (TC_H_MAJ(band))
band = 0;
return &q->classes[q->prio2band[band & TC_PRIO_MAX]];
}
band = res.classid;
}
band = TC_H_MIN(band) - 1;
if (band >= q->nbands)
return &q->classes[q->prio2band[0]];
return &q->classes[band];
}
static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
unsigned int len = qdisc_pkt_len(skb);
struct ets_sched *q = qdisc_priv(sch);
struct ets_class *cl;
int err = 0;
bool first;
cl = ets_classify(skb, sch, &err);
if (!cl) {
if (err & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
__qdisc_drop(skb, to_free);
return err;
}
first = !cl->qdisc->q.qlen;
err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
cl->qstats.drops++;
qdisc_qstats_drop(sch);
}
return err;
}
if (first && !ets_class_is_strict(q, cl)) {
list_add_tail(&cl->alist, &q->active);
cl->deficit = cl->quantum;
}
sch->qstats.backlog += len;
sch->q.qlen++;
return err;
}
static struct sk_buff *
ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb)
{
qdisc_bstats_update(sch, skb);
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
return skb;
}
static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch)
{
struct ets_sched *q = qdisc_priv(sch);
struct ets_class *cl;
struct sk_buff *skb;
unsigned int band;
unsigned int len;
while (1) {
for (band = 0; band < q->nstrict; band++) {
cl = &q->classes[band];
skb = qdisc_dequeue_peeked(cl->qdisc);
if (skb)
return ets_qdisc_dequeue_skb(sch, skb);
}
if (list_empty(&q->active))
goto out;
cl = list_first_entry(&q->active, struct ets_class, alist);
skb = cl->qdisc->ops->peek(cl->qdisc);
if (!skb) {
qdisc_warn_nonwc(__func__, cl->qdisc);
goto out;
}
len = qdisc_pkt_len(skb);
if (len <= cl->deficit) {
cl->deficit -= len;
skb = qdisc_dequeue_peeked(cl->qdisc);
if (unlikely(!skb))
goto out;
if (cl->qdisc->q.qlen == 0)
list_del(&cl->alist);
return ets_qdisc_dequeue_skb(sch, skb);
}
cl->deficit += cl->quantum;
list_move_tail(&cl->alist, &q->active);
}
out:
return NULL;
}
static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr,
unsigned int nbands, u8 *priomap,
struct netlink_ext_ack *extack)
{
const struct nlattr *attr;
int prio = 0;
u8 band;
int rem;
int err;
err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX,
ets_priomap_policy, NL_VALIDATE_STRICT,
extack);
if (err)
return err;
nla_for_each_nested(attr, priomap_attr, rem) {
switch (nla_type(attr)) {
case TCA_ETS_PRIOMAP_BAND:
if (prio > TC_PRIO_MAX) {
NL_SET_ERR_MSG_MOD(extack, "Too many priorities in ETS priomap");
return -EINVAL;
}
band = nla_get_u8(attr);
if (band >= nbands) {
NL_SET_ERR_MSG_MOD(extack, "Invalid band number in ETS priomap");
return -EINVAL;
}
priomap[prio++] = band;
break;
default:
WARN_ON_ONCE(1); /* Validate should have caught this. */
return -EINVAL;
}
}
return 0;
}
static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr,
unsigned int nbands, unsigned int nstrict,
unsigned int *quanta,
struct netlink_ext_ack *extack)
{
const struct nlattr *attr;
int band = nstrict;
int rem;
int err;
err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX,
ets_quanta_policy, NL_VALIDATE_STRICT,
extack);
if (err < 0)
return err;
nla_for_each_nested(attr, quanta_attr, rem) {
switch (nla_type(attr)) {
case TCA_ETS_QUANTA_BAND:
if (band >= nbands) {
NL_SET_ERR_MSG_MOD(extack, "ETS quanta has more values than bands");
return -EINVAL;
}
err = ets_quantum_parse(sch, attr, &quanta[band++],
extack);
if (err)
return err;
break;
default:
WARN_ON_ONCE(1); /* Validate should have caught this. */
return -EINVAL;
}
}
return 0;
}
static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
unsigned int quanta[TCQ_ETS_MAX_BANDS] = {0};
struct Qdisc *queues[TCQ_ETS_MAX_BANDS];
struct ets_sched *q = qdisc_priv(sch);
struct nlattr *tb[TCA_ETS_MAX + 1];
unsigned int oldbands = q->nbands;
u8 priomap[TC_PRIO_MAX + 1];
unsigned int nstrict = 0;
unsigned int nbands;
unsigned int i;
int err;
err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_policy, extack);
if (err < 0)
return err;
if (!tb[TCA_ETS_NBANDS]) {
NL_SET_ERR_MSG_MOD(extack, "Number of bands is a required argument");
return -EINVAL;
}
nbands = nla_get_u8(tb[TCA_ETS_NBANDS]);
if (nbands < 1 || nbands > TCQ_ETS_MAX_BANDS) {
NL_SET_ERR_MSG_MOD(extack, "Invalid number of bands");
return -EINVAL;
}
/* Unless overridden, traffic goes to the last band. */
memset(priomap, nbands - 1, sizeof(priomap));
if (tb[TCA_ETS_NSTRICT]) {
nstrict = nla_get_u8(tb[TCA_ETS_NSTRICT]);
if (nstrict > nbands) {
NL_SET_ERR_MSG_MOD(extack, "Invalid number of strict bands");
return -EINVAL;
}
}
if (tb[TCA_ETS_PRIOMAP]) {
err = ets_qdisc_priomap_parse(tb[TCA_ETS_PRIOMAP],
nbands, priomap, extack);
if (err)
return err;
}
if (tb[TCA_ETS_QUANTA]) {
err = ets_qdisc_quanta_parse(sch, tb[TCA_ETS_QUANTA],
nbands, nstrict, quanta, extack);
if (err)
return err;
}
/* If there are more bands than strict + quanta provided, the remaining
* ones are ETS with quantum of MTU. Initialize the missing values here.
*/
for (i = nstrict; i < nbands; i++) {
if (!quanta[i])
quanta[i] = psched_mtu(qdisc_dev(sch));
}
/* Before commit, make sure we can allocate all new qdiscs */
for (i = oldbands; i < nbands; i++) {
queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
ets_class_id(sch, &q->classes[i]),
extack);
if (!queues[i]) {
while (i > oldbands)
qdisc_put(queues[--i]);
return -ENOMEM;
}
}
sch_tree_lock(sch);
q->nbands = nbands;
net/sched: ets: fix crash when flipping from 'strict' to 'quantum' While running kselftests, Hangbin observed that sch_ets.sh often crashes, and splats like the following one are seen in the output of 'dmesg': BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 159f12067 P4D 159f12067 PUD 159f13067 PMD 0 Oops: 0000 [#1] SMP NOPTI CPU: 2 PID: 921 Comm: tc Not tainted 5.14.0-rc6+ #458 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:__list_del_entry_valid+0x2d/0x50 Code: 48 8b 57 08 48 b9 00 01 00 00 00 00 ad de 48 39 c8 0f 84 ac 6e 5b 00 48 b9 22 01 00 00 00 00 ad de 48 39 ca 0f 84 cf 6e 5b 00 <48> 8b 32 48 39 fe 0f 85 af 6e 5b 00 48 8b 50 08 48 39 f2 0f 85 94 RSP: 0018:ffffb2da005c3890 EFLAGS: 00010217 RAX: 0000000000000000 RBX: ffff9073ba23f800 RCX: dead000000000122 RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff9073ba23fbc8 RBP: ffff9073ba23f890 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000001 R12: dead000000000100 R13: ffff9073ba23fb00 R14: 0000000000000002 R15: 0000000000000002 FS: 00007f93e5564e40(0000) GS:ffff9073bba00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000014ad34000 CR4: 0000000000350ee0 Call Trace: ets_qdisc_reset+0x6e/0x100 [sch_ets] qdisc_reset+0x49/0x1d0 tbf_reset+0x15/0x60 [sch_tbf] qdisc_reset+0x49/0x1d0 dev_reset_queue.constprop.42+0x2f/0x90 dev_deactivate_many+0x1d3/0x3d0 dev_deactivate+0x56/0x90 qdisc_graft+0x47e/0x5a0 tc_get_qdisc+0x1db/0x3e0 rtnetlink_rcv_msg+0x164/0x4c0 netlink_rcv_skb+0x50/0x100 netlink_unicast+0x1a5/0x280 netlink_sendmsg+0x242/0x480 sock_sendmsg+0x5b/0x60 ____sys_sendmsg+0x1f2/0x260 ___sys_sendmsg+0x7c/0xc0 __sys_sendmsg+0x57/0xa0 do_syscall_64+0x3a/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f93e44b8338 Code: 89 02 48 c7 c0 ff ff ff ff eb b5 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 43 2c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 41 89 d4 55 RSP: 002b:00007ffc0db737a8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000061255c06 RCX: 00007f93e44b8338 RDX: 0000000000000000 RSI: 00007ffc0db73810 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000001 R13: 0000000000687880 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: sch_ets sch_tbf dummy rfkill iTCO_wdt iTCO_vendor_support intel_rapl_msr intel_rapl_common joydev i2c_i801 pcspkr i2c_smbus lpc_ich virtio_balloon ip_tables xfs libcrc32c crct10dif_pclmul crc32_pclmul crc32c_intel ahci libahci ghash_clmulni_intel libata serio_raw virtio_blk virtio_console virtio_net net_failover failover sunrpc dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000000 When the change() function decreases the value of 'nstrict', we must take into account that packets might be already enqueued on a class that flips from 'strict' to 'quantum': otherwise that class will not be added to the bandwidth-sharing list. Then, a call to ets_qdisc_reset() will attempt to do list_del(&alist) with 'alist' filled with zero, hence the NULL pointer dereference. For classes flipping from 'strict' to 'quantum', initialize an empty list and eventually add it to the bandwidth-sharing list, if there are packets already enqueued. In this way, the kernel will: a) prevent crashing as described above. b) avoid retaining the backlog packets (for an arbitrarily long time) in case no packet is enqueued after a change from 'strict' to 'quantum'. Reported-by: Hangbin Liu <liuhangbin@gmail.com> Fixes: dcc68b4d8084 ("net: sch_ets: Add a new Qdisc") Signed-off-by: Davide Caratti <dcaratti@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-25 01:33:48 +03:00
for (i = nstrict; i < q->nstrict; i++) {
if (q->classes[i].qdisc->q.qlen) {
list_add_tail(&q->classes[i].alist, &q->active);
q->classes[i].deficit = quanta[i];
}
}
net/sched: sch_ets: don't peek at classes beyond 'nbands' when the number of DRR classes decreases, the round-robin active list can contain elements that have already been freed in ets_qdisc_change(). As a consequence, it's possible to see a NULL dereference crash, caused by the attempt to call cl->qdisc->ops->peek(cl->qdisc) when cl->qdisc is NULL: BUG: kernel NULL pointer dereference, address: 0000000000000018 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 910 Comm: mausezahn Not tainted 5.16.0-rc1+ #475 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:ets_qdisc_dequeue+0x129/0x2c0 [sch_ets] Code: c5 01 41 39 ad e4 02 00 00 0f 87 18 ff ff ff 49 8b 85 c0 02 00 00 49 39 c4 0f 84 ba 00 00 00 49 8b ad c0 02 00 00 48 8b 7d 10 <48> 8b 47 18 48 8b 40 38 0f ae e8 ff d0 48 89 c3 48 85 c0 0f 84 9d RSP: 0000:ffffbb36c0b5fdd8 EFLAGS: 00010287 RAX: ffff956678efed30 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000002 RSI: ffffffff9b938dc9 RDI: 0000000000000000 RBP: ffff956678efed30 R08: e2f3207fe360129c R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: ffff956678efeac0 R13: ffff956678efe800 R14: ffff956611545000 R15: ffff95667ac8f100 FS: 00007f2aa9120740(0000) GS:ffff95667b800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000018 CR3: 000000011070c000 CR4: 0000000000350ee0 Call Trace: <TASK> qdisc_peek_dequeued+0x29/0x70 [sch_ets] tbf_dequeue+0x22/0x260 [sch_tbf] __qdisc_run+0x7f/0x630 net_tx_action+0x290/0x4c0 __do_softirq+0xee/0x4f8 irq_exit_rcu+0xf4/0x130 sysvec_apic_timer_interrupt+0x52/0xc0 asm_sysvec_apic_timer_interrupt+0x12/0x20 RIP: 0033:0x7f2aa7fc9ad4 Code: b9 ff ff 48 8b 54 24 18 48 83 c4 08 48 89 ee 48 89 df 5b 5d e9 ed fc ff ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa <53> 48 83 ec 10 48 8b 05 10 64 33 00 48 8b 00 48 85 c0 0f 85 84 00 RSP: 002b:00007ffe5d33fab8 EFLAGS: 00000202 RAX: 0000000000000002 RBX: 0000561f72c31460 RCX: 0000561f72c31720 RDX: 0000000000000002 RSI: 0000561f72c31722 RDI: 0000561f72c31720 RBP: 000000000000002a R08: 00007ffe5d33fa40 R09: 0000000000000014 R10: 0000000000000000 R11: 0000000000000246 R12: 0000561f7187e380 R13: 0000000000000000 R14: 0000000000000000 R15: 0000561f72c31460 </TASK> Modules linked in: sch_ets sch_tbf dummy rfkill iTCO_wdt intel_rapl_msr iTCO_vendor_support intel_rapl_common joydev virtio_balloon lpc_ich i2c_i801 i2c_smbus pcspkr ip_tables xfs libcrc32c crct10dif_pclmul crc32_pclmul crc32c_intel ahci libahci ghash_clmulni_intel serio_raw libata virtio_blk virtio_console virtio_net net_failover failover sunrpc dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000018 Ensuring that 'alist' was never zeroed [1] was not sufficient, we need to remove from the active list those elements that are no more SP nor DRR. [1] https://lore.kernel.org/netdev/60d274838bf09777f0371253416e8af71360bc08.1633609148.git.dcaratti@redhat.com/ v3: fix race between ets_qdisc_change() and ets_qdisc_dequeue() delisting DRR classes beyond 'nbands' in ets_qdisc_change() with the qdisc lock acquired, thanks to Cong Wang. v2: when a NULL qdisc is found in the DRR active list, try to dequeue skb from the next list item. Reported-by: Hangbin Liu <liuhangbin@gmail.com> Fixes: dcc68b4d8084 ("net: sch_ets: Add a new Qdisc") Signed-off-by: Davide Caratti <dcaratti@redhat.com> Link: https://lore.kernel.org/r/7a5c496eed2d62241620bdbb83eb03fb9d571c99.1637762721.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-24 19:14:40 +03:00
for (i = q->nbands; i < oldbands; i++) {
net/sched: sch_ets: don't remove idle classes from the round-robin list Shuang reported that the following script: 1) tc qdisc add dev ddd0 handle 10: parent 1: ets bands 8 strict 4 priomap 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 2) mausezahn ddd0 -A 10.10.10.1 -B 10.10.10.2 -c 0 -a own -b 00:c1:a0:c1:a0:00 -t udp & 3) tc qdisc change dev ddd0 handle 10: ets bands 4 strict 2 quanta 2500 2500 priomap 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 crashes systematically when line 2) is commented: list_del corruption, ffff8e028404bd30->next is LIST_POISON1 (dead000000000100) ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:47! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 954 Comm: tc Not tainted 5.16.0-rc4+ #478 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:__list_del_entry_valid.cold.1+0x12/0x47 Code: fe ff 0f 0b 48 89 c1 4c 89 c6 48 c7 c7 08 42 1b 87 e8 1d c5 fe ff 0f 0b 48 89 fe 48 89 c2 48 c7 c7 98 42 1b 87 e8 09 c5 fe ff <0f> 0b 48 c7 c7 48 43 1b 87 e8 fb c4 fe ff 0f 0b 48 89 f2 48 89 fe RSP: 0018:ffffae46807a3888 EFLAGS: 00010246 RAX: 000000000000004e RBX: 0000000000000007 RCX: 0000000000000202 RDX: 0000000000000000 RSI: ffffffff871ac536 RDI: 00000000ffffffff RBP: ffffae46807a3a10 R08: 0000000000000000 R09: c0000000ffff7fff R10: 0000000000000001 R11: ffffae46807a36a8 R12: ffff8e028404b800 R13: ffff8e028404bd30 R14: dead000000000100 R15: ffff8e02fafa2400 FS: 00007efdc92e4480(0000) GS:ffff8e02fb600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000682f48 CR3: 00000001058be000 CR4: 0000000000350ef0 Call Trace: <TASK> ets_qdisc_change+0x58b/0xa70 [sch_ets] tc_modify_qdisc+0x323/0x880 rtnetlink_rcv_msg+0x169/0x4a0 netlink_rcv_skb+0x50/0x100 netlink_unicast+0x1a5/0x280 netlink_sendmsg+0x257/0x4d0 sock_sendmsg+0x5b/0x60 ____sys_sendmsg+0x1f2/0x260 ___sys_sendmsg+0x7c/0xc0 __sys_sendmsg+0x57/0xa0 do_syscall_64+0x3a/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7efdc8031338 Code: 89 02 48 c7 c0 ff ff ff ff eb b5 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 43 2c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 41 89 d4 55 RSP: 002b:00007ffdf1ce9828 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000061b37a97 RCX: 00007efdc8031338 RDX: 0000000000000000 RSI: 00007ffdf1ce9890 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000078a940 R10: 000000000000000c R11: 0000000000000246 R12: 0000000000000001 R13: 0000000000688880 R14: 0000000000000000 R15: 0000000000000000 </TASK> Modules linked in: sch_ets sch_tbf dummy rfkill iTCO_wdt iTCO_vendor_support intel_rapl_msr intel_rapl_common joydev pcspkr i2c_i801 virtio_balloon i2c_smbus lpc_ich ip_tables xfs libcrc32c crct10dif_pclmul crc32_pclmul crc32c_intel serio_raw ghash_clmulni_intel ahci libahci libata virtio_blk virtio_console virtio_net net_failover failover sunrpc dm_mirror dm_region_hash dm_log dm_mod [last unloaded: sch_ets] ---[ end trace f35878d1912655c2 ]--- RIP: 0010:__list_del_entry_valid.cold.1+0x12/0x47 Code: fe ff 0f 0b 48 89 c1 4c 89 c6 48 c7 c7 08 42 1b 87 e8 1d c5 fe ff 0f 0b 48 89 fe 48 89 c2 48 c7 c7 98 42 1b 87 e8 09 c5 fe ff <0f> 0b 48 c7 c7 48 43 1b 87 e8 fb c4 fe ff 0f 0b 48 89 f2 48 89 fe RSP: 0018:ffffae46807a3888 EFLAGS: 00010246 RAX: 000000000000004e RBX: 0000000000000007 RCX: 0000000000000202 RDX: 0000000000000000 RSI: ffffffff871ac536 RDI: 00000000ffffffff RBP: ffffae46807a3a10 R08: 0000000000000000 R09: c0000000ffff7fff R10: 0000000000000001 R11: ffffae46807a36a8 R12: ffff8e028404b800 R13: ffff8e028404bd30 R14: dead000000000100 R15: ffff8e02fafa2400 FS: 00007efdc92e4480(0000) GS:ffff8e02fb600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000682f48 CR3: 00000001058be000 CR4: 0000000000350ef0 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: 0x4e00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- we can remove 'q->classes[i].alist' only if DRR class 'i' was part of the active list. In the ETS scheduler DRR classes belong to that list only if the queue length is greater than zero: we need to test for non-zero value of 'q->classes[i].qdisc->q.qlen' before removing from the list, similarly to what has been done elsewhere in the ETS code. Fixes: de6d25924c2a ("net/sched: sch_ets: don't peek at classes beyond 'nbands'") Reported-by: Shuang Li <shuali@redhat.com> Signed-off-by: Davide Caratti <dcaratti@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-10 19:42:47 +03:00
if (i >= q->nstrict && q->classes[i].qdisc->q.qlen)
net/sched: sch_ets: don't peek at classes beyond 'nbands' when the number of DRR classes decreases, the round-robin active list can contain elements that have already been freed in ets_qdisc_change(). As a consequence, it's possible to see a NULL dereference crash, caused by the attempt to call cl->qdisc->ops->peek(cl->qdisc) when cl->qdisc is NULL: BUG: kernel NULL pointer dereference, address: 0000000000000018 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 910 Comm: mausezahn Not tainted 5.16.0-rc1+ #475 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:ets_qdisc_dequeue+0x129/0x2c0 [sch_ets] Code: c5 01 41 39 ad e4 02 00 00 0f 87 18 ff ff ff 49 8b 85 c0 02 00 00 49 39 c4 0f 84 ba 00 00 00 49 8b ad c0 02 00 00 48 8b 7d 10 <48> 8b 47 18 48 8b 40 38 0f ae e8 ff d0 48 89 c3 48 85 c0 0f 84 9d RSP: 0000:ffffbb36c0b5fdd8 EFLAGS: 00010287 RAX: ffff956678efed30 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000002 RSI: ffffffff9b938dc9 RDI: 0000000000000000 RBP: ffff956678efed30 R08: e2f3207fe360129c R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: ffff956678efeac0 R13: ffff956678efe800 R14: ffff956611545000 R15: ffff95667ac8f100 FS: 00007f2aa9120740(0000) GS:ffff95667b800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000018 CR3: 000000011070c000 CR4: 0000000000350ee0 Call Trace: <TASK> qdisc_peek_dequeued+0x29/0x70 [sch_ets] tbf_dequeue+0x22/0x260 [sch_tbf] __qdisc_run+0x7f/0x630 net_tx_action+0x290/0x4c0 __do_softirq+0xee/0x4f8 irq_exit_rcu+0xf4/0x130 sysvec_apic_timer_interrupt+0x52/0xc0 asm_sysvec_apic_timer_interrupt+0x12/0x20 RIP: 0033:0x7f2aa7fc9ad4 Code: b9 ff ff 48 8b 54 24 18 48 83 c4 08 48 89 ee 48 89 df 5b 5d e9 ed fc ff ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa <53> 48 83 ec 10 48 8b 05 10 64 33 00 48 8b 00 48 85 c0 0f 85 84 00 RSP: 002b:00007ffe5d33fab8 EFLAGS: 00000202 RAX: 0000000000000002 RBX: 0000561f72c31460 RCX: 0000561f72c31720 RDX: 0000000000000002 RSI: 0000561f72c31722 RDI: 0000561f72c31720 RBP: 000000000000002a R08: 00007ffe5d33fa40 R09: 0000000000000014 R10: 0000000000000000 R11: 0000000000000246 R12: 0000561f7187e380 R13: 0000000000000000 R14: 0000000000000000 R15: 0000561f72c31460 </TASK> Modules linked in: sch_ets sch_tbf dummy rfkill iTCO_wdt intel_rapl_msr iTCO_vendor_support intel_rapl_common joydev virtio_balloon lpc_ich i2c_i801 i2c_smbus pcspkr ip_tables xfs libcrc32c crct10dif_pclmul crc32_pclmul crc32c_intel ahci libahci ghash_clmulni_intel serio_raw libata virtio_blk virtio_console virtio_net net_failover failover sunrpc dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000018 Ensuring that 'alist' was never zeroed [1] was not sufficient, we need to remove from the active list those elements that are no more SP nor DRR. [1] https://lore.kernel.org/netdev/60d274838bf09777f0371253416e8af71360bc08.1633609148.git.dcaratti@redhat.com/ v3: fix race between ets_qdisc_change() and ets_qdisc_dequeue() delisting DRR classes beyond 'nbands' in ets_qdisc_change() with the qdisc lock acquired, thanks to Cong Wang. v2: when a NULL qdisc is found in the DRR active list, try to dequeue skb from the next list item. Reported-by: Hangbin Liu <liuhangbin@gmail.com> Fixes: dcc68b4d8084 ("net: sch_ets: Add a new Qdisc") Signed-off-by: Davide Caratti <dcaratti@redhat.com> Link: https://lore.kernel.org/r/7a5c496eed2d62241620bdbb83eb03fb9d571c99.1637762721.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-24 19:14:40 +03:00
list_del(&q->classes[i].alist);
net/sched: sch_ets: don't remove idle classes from the round-robin list Shuang reported that the following script: 1) tc qdisc add dev ddd0 handle 10: parent 1: ets bands 8 strict 4 priomap 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 2) mausezahn ddd0 -A 10.10.10.1 -B 10.10.10.2 -c 0 -a own -b 00:c1:a0:c1:a0:00 -t udp & 3) tc qdisc change dev ddd0 handle 10: ets bands 4 strict 2 quanta 2500 2500 priomap 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 crashes systematically when line 2) is commented: list_del corruption, ffff8e028404bd30->next is LIST_POISON1 (dead000000000100) ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:47! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 954 Comm: tc Not tainted 5.16.0-rc4+ #478 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:__list_del_entry_valid.cold.1+0x12/0x47 Code: fe ff 0f 0b 48 89 c1 4c 89 c6 48 c7 c7 08 42 1b 87 e8 1d c5 fe ff 0f 0b 48 89 fe 48 89 c2 48 c7 c7 98 42 1b 87 e8 09 c5 fe ff <0f> 0b 48 c7 c7 48 43 1b 87 e8 fb c4 fe ff 0f 0b 48 89 f2 48 89 fe RSP: 0018:ffffae46807a3888 EFLAGS: 00010246 RAX: 000000000000004e RBX: 0000000000000007 RCX: 0000000000000202 RDX: 0000000000000000 RSI: ffffffff871ac536 RDI: 00000000ffffffff RBP: ffffae46807a3a10 R08: 0000000000000000 R09: c0000000ffff7fff R10: 0000000000000001 R11: ffffae46807a36a8 R12: ffff8e028404b800 R13: ffff8e028404bd30 R14: dead000000000100 R15: ffff8e02fafa2400 FS: 00007efdc92e4480(0000) GS:ffff8e02fb600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000682f48 CR3: 00000001058be000 CR4: 0000000000350ef0 Call Trace: <TASK> ets_qdisc_change+0x58b/0xa70 [sch_ets] tc_modify_qdisc+0x323/0x880 rtnetlink_rcv_msg+0x169/0x4a0 netlink_rcv_skb+0x50/0x100 netlink_unicast+0x1a5/0x280 netlink_sendmsg+0x257/0x4d0 sock_sendmsg+0x5b/0x60 ____sys_sendmsg+0x1f2/0x260 ___sys_sendmsg+0x7c/0xc0 __sys_sendmsg+0x57/0xa0 do_syscall_64+0x3a/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7efdc8031338 Code: 89 02 48 c7 c0 ff ff ff ff eb b5 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 43 2c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 41 89 d4 55 RSP: 002b:00007ffdf1ce9828 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000061b37a97 RCX: 00007efdc8031338 RDX: 0000000000000000 RSI: 00007ffdf1ce9890 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000078a940 R10: 000000000000000c R11: 0000000000000246 R12: 0000000000000001 R13: 0000000000688880 R14: 0000000000000000 R15: 0000000000000000 </TASK> Modules linked in: sch_ets sch_tbf dummy rfkill iTCO_wdt iTCO_vendor_support intel_rapl_msr intel_rapl_common joydev pcspkr i2c_i801 virtio_balloon i2c_smbus lpc_ich ip_tables xfs libcrc32c crct10dif_pclmul crc32_pclmul crc32c_intel serio_raw ghash_clmulni_intel ahci libahci libata virtio_blk virtio_console virtio_net net_failover failover sunrpc dm_mirror dm_region_hash dm_log dm_mod [last unloaded: sch_ets] ---[ end trace f35878d1912655c2 ]--- RIP: 0010:__list_del_entry_valid.cold.1+0x12/0x47 Code: fe ff 0f 0b 48 89 c1 4c 89 c6 48 c7 c7 08 42 1b 87 e8 1d c5 fe ff 0f 0b 48 89 fe 48 89 c2 48 c7 c7 98 42 1b 87 e8 09 c5 fe ff <0f> 0b 48 c7 c7 48 43 1b 87 e8 fb c4 fe ff 0f 0b 48 89 f2 48 89 fe RSP: 0018:ffffae46807a3888 EFLAGS: 00010246 RAX: 000000000000004e RBX: 0000000000000007 RCX: 0000000000000202 RDX: 0000000000000000 RSI: ffffffff871ac536 RDI: 00000000ffffffff RBP: ffffae46807a3a10 R08: 0000000000000000 R09: c0000000ffff7fff R10: 0000000000000001 R11: ffffae46807a36a8 R12: ffff8e028404b800 R13: ffff8e028404bd30 R14: dead000000000100 R15: ffff8e02fafa2400 FS: 00007efdc92e4480(0000) GS:ffff8e02fb600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000682f48 CR3: 00000001058be000 CR4: 0000000000350ef0 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: 0x4e00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- we can remove 'q->classes[i].alist' only if DRR class 'i' was part of the active list. In the ETS scheduler DRR classes belong to that list only if the queue length is greater than zero: we need to test for non-zero value of 'q->classes[i].qdisc->q.qlen' before removing from the list, similarly to what has been done elsewhere in the ETS code. Fixes: de6d25924c2a ("net/sched: sch_ets: don't peek at classes beyond 'nbands'") Reported-by: Shuang Li <shuali@redhat.com> Signed-off-by: Davide Caratti <dcaratti@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-10 19:42:47 +03:00
qdisc_tree_flush_backlog(q->classes[i].qdisc);
net/sched: sch_ets: don't peek at classes beyond 'nbands' when the number of DRR classes decreases, the round-robin active list can contain elements that have already been freed in ets_qdisc_change(). As a consequence, it's possible to see a NULL dereference crash, caused by the attempt to call cl->qdisc->ops->peek(cl->qdisc) when cl->qdisc is NULL: BUG: kernel NULL pointer dereference, address: 0000000000000018 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 910 Comm: mausezahn Not tainted 5.16.0-rc1+ #475 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: 0010:ets_qdisc_dequeue+0x129/0x2c0 [sch_ets] Code: c5 01 41 39 ad e4 02 00 00 0f 87 18 ff ff ff 49 8b 85 c0 02 00 00 49 39 c4 0f 84 ba 00 00 00 49 8b ad c0 02 00 00 48 8b 7d 10 <48> 8b 47 18 48 8b 40 38 0f ae e8 ff d0 48 89 c3 48 85 c0 0f 84 9d RSP: 0000:ffffbb36c0b5fdd8 EFLAGS: 00010287 RAX: ffff956678efed30 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000002 RSI: ffffffff9b938dc9 RDI: 0000000000000000 RBP: ffff956678efed30 R08: e2f3207fe360129c R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: ffff956678efeac0 R13: ffff956678efe800 R14: ffff956611545000 R15: ffff95667ac8f100 FS: 00007f2aa9120740(0000) GS:ffff95667b800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000018 CR3: 000000011070c000 CR4: 0000000000350ee0 Call Trace: <TASK> qdisc_peek_dequeued+0x29/0x70 [sch_ets] tbf_dequeue+0x22/0x260 [sch_tbf] __qdisc_run+0x7f/0x630 net_tx_action+0x290/0x4c0 __do_softirq+0xee/0x4f8 irq_exit_rcu+0xf4/0x130 sysvec_apic_timer_interrupt+0x52/0xc0 asm_sysvec_apic_timer_interrupt+0x12/0x20 RIP: 0033:0x7f2aa7fc9ad4 Code: b9 ff ff 48 8b 54 24 18 48 83 c4 08 48 89 ee 48 89 df 5b 5d e9 ed fc ff ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa <53> 48 83 ec 10 48 8b 05 10 64 33 00 48 8b 00 48 85 c0 0f 85 84 00 RSP: 002b:00007ffe5d33fab8 EFLAGS: 00000202 RAX: 0000000000000002 RBX: 0000561f72c31460 RCX: 0000561f72c31720 RDX: 0000000000000002 RSI: 0000561f72c31722 RDI: 0000561f72c31720 RBP: 000000000000002a R08: 00007ffe5d33fa40 R09: 0000000000000014 R10: 0000000000000000 R11: 0000000000000246 R12: 0000561f7187e380 R13: 0000000000000000 R14: 0000000000000000 R15: 0000561f72c31460 </TASK> Modules linked in: sch_ets sch_tbf dummy rfkill iTCO_wdt intel_rapl_msr iTCO_vendor_support intel_rapl_common joydev virtio_balloon lpc_ich i2c_i801 i2c_smbus pcspkr ip_tables xfs libcrc32c crct10dif_pclmul crc32_pclmul crc32c_intel ahci libahci ghash_clmulni_intel serio_raw libata virtio_blk virtio_console virtio_net net_failover failover sunrpc dm_mirror dm_region_hash dm_log dm_mod CR2: 0000000000000018 Ensuring that 'alist' was never zeroed [1] was not sufficient, we need to remove from the active list those elements that are no more SP nor DRR. [1] https://lore.kernel.org/netdev/60d274838bf09777f0371253416e8af71360bc08.1633609148.git.dcaratti@redhat.com/ v3: fix race between ets_qdisc_change() and ets_qdisc_dequeue() delisting DRR classes beyond 'nbands' in ets_qdisc_change() with the qdisc lock acquired, thanks to Cong Wang. v2: when a NULL qdisc is found in the DRR active list, try to dequeue skb from the next list item. Reported-by: Hangbin Liu <liuhangbin@gmail.com> Fixes: dcc68b4d8084 ("net: sch_ets: Add a new Qdisc") Signed-off-by: Davide Caratti <dcaratti@redhat.com> Link: https://lore.kernel.org/r/7a5c496eed2d62241620bdbb83eb03fb9d571c99.1637762721.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-24 19:14:40 +03:00
}
q->nstrict = nstrict;
memcpy(q->prio2band, priomap, sizeof(priomap));
for (i = 0; i < q->nbands; i++)
q->classes[i].quantum = quanta[i];
for (i = oldbands; i < q->nbands; i++) {
q->classes[i].qdisc = queues[i];
if (q->classes[i].qdisc != &noop_qdisc)
qdisc_hash_add(q->classes[i].qdisc, true);
}
sch_tree_unlock(sch);
ets_offload_change(sch);
for (i = q->nbands; i < oldbands; i++) {
qdisc_put(q->classes[i].qdisc);
q->classes[i].qdisc = NULL;
q->classes[i].quantum = 0;
q->classes[i].deficit = 0;
gnet_stats_basic_sync_init(&q->classes[i].bstats);
memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats));
}
return 0;
}
static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct ets_sched *q = qdisc_priv(sch);
int err, i;
if (!opt)
return -EINVAL;
err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
if (err)
return err;
INIT_LIST_HEAD(&q->active);
for (i = 0; i < TCQ_ETS_MAX_BANDS; i++)
INIT_LIST_HEAD(&q->classes[i].alist);
return ets_qdisc_change(sch, opt, extack);
}
static void ets_qdisc_reset(struct Qdisc *sch)
{
struct ets_sched *q = qdisc_priv(sch);
int band;
for (band = q->nstrict; band < q->nbands; band++) {
if (q->classes[band].qdisc->q.qlen)
list_del(&q->classes[band].alist);
}
for (band = 0; band < q->nbands; band++)
qdisc_reset(q->classes[band].qdisc);
}
static void ets_qdisc_destroy(struct Qdisc *sch)
{
struct ets_sched *q = qdisc_priv(sch);
int band;
ets_offload_destroy(sch);
tcf_block_put(q->block);
for (band = 0; band < q->nbands; band++)
qdisc_put(q->classes[band].qdisc);
}
static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct ets_sched *q = qdisc_priv(sch);
struct nlattr *opts;
struct nlattr *nest;
int band;
int prio;
int err;
err = ets_offload_dump(sch);
if (err)
return err;
opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!opts)
goto nla_err;
if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands))
goto nla_err;
if (q->nstrict &&
nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict))
goto nla_err;
if (q->nbands > q->nstrict) {
nest = nla_nest_start(skb, TCA_ETS_QUANTA);
if (!nest)
goto nla_err;
for (band = q->nstrict; band < q->nbands; band++) {
if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND,
q->classes[band].quantum))
goto nla_err;
}
nla_nest_end(skb, nest);
}
nest = nla_nest_start(skb, TCA_ETS_PRIOMAP);
if (!nest)
goto nla_err;
for (prio = 0; prio <= TC_PRIO_MAX; prio++) {
if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio]))
goto nla_err;
}
nla_nest_end(skb, nest);
return nla_nest_end(skb, opts);
nla_err:
nla_nest_cancel(skb, opts);
return -EMSGSIZE;
}
static const struct Qdisc_class_ops ets_class_ops = {
.change = ets_class_change,
.graft = ets_class_graft,
.leaf = ets_class_leaf,
.find = ets_class_find,
.qlen_notify = ets_class_qlen_notify,
.dump = ets_class_dump,
.dump_stats = ets_class_dump_stats,
.walk = ets_qdisc_walk,
.tcf_block = ets_qdisc_tcf_block,
.bind_tcf = ets_qdisc_bind_tcf,
.unbind_tcf = ets_qdisc_unbind_tcf,
};
static struct Qdisc_ops ets_qdisc_ops __read_mostly = {
.cl_ops = &ets_class_ops,
.id = "ets",
.priv_size = sizeof(struct ets_sched),
.enqueue = ets_qdisc_enqueue,
.dequeue = ets_qdisc_dequeue,
.peek = qdisc_peek_dequeued,
.change = ets_qdisc_change,
.init = ets_qdisc_init,
.reset = ets_qdisc_reset,
.destroy = ets_qdisc_destroy,
.dump = ets_qdisc_dump,
.owner = THIS_MODULE,
};
static int __init ets_init(void)
{
return register_qdisc(&ets_qdisc_ops);
}
static void __exit ets_exit(void)
{
unregister_qdisc(&ets_qdisc_ops);
}
module_init(ets_init);
module_exit(ets_exit);
MODULE_LICENSE("GPL");