aa1b46dcdc
a647a524a467 ("block: don't call rq_qos_ops->done_bio if the bio isn't tracked") made bio_endio() skip rq_qos_done_bio() if BIO_TRACKED is not set. While this fixed a potential oops, it also broke blk-iocost by skipping the done_bio callback for merged bios. Before, whether a bio goes through rq_qos_throttle() or rq_qos_merge(), rq_qos_done_bio() would be called on the bio on completion with BIO_TRACKED distinguishing the former from the latter. After that commit, rq_qos_done_bio() is not called for bios which went through rq_qos_merge(). This royally confuses blk-iocost as the merged bios never finish and are considered perpetually in-flight. One reliably reproducible failure mode is an intermediate cgroup getting stuck active preventing its children from being activated due to the leaf-only rule, leading to loss of control. The following is from resctl-bench protection scenario which emulates isolating a web server like workload from a memory bomb run on an iocost configuration which should yield a reasonable level of protection. # cat /sys/block/nvme2n1/device/model Samsung SSD 970 PRO 512GB # cat /sys/fs/cgroup/io.cost.model 259:0 ctrl=user model=linear rbps=834913556 rseqiops=93622 rrandiops=102913 wbps=618985353 wseqiops=72325 wrandiops=71025 # cat /sys/fs/cgroup/io.cost.qos 259:0 enable=1 ctrl=user rpct=95.00 rlat=18776 wpct=95.00 wlat=8897 min=60.00 max=100.00 # resctl-bench -m 29.6G -r out.json run protection::scenario=mem-hog,loops=1 ... 
Memory Hog Summary ================== IO Latency: R p50=242u:336u/2.5m p90=794u:1.4m/7.5m p99=2.7m:8.0m/62.5m max=8.0m:36.4m/350m W p50=221u:323u/1.5m p90=709u:1.2m/5.5m p99=1.5m:2.5m/9.5m max=6.9m:35.9m/350m Isolation and Request Latency Impact Distributions: min p01 p05 p10 p25 p50 p75 p90 p95 p99 max mean stdev isol% 15.90 15.90 15.90 40.05 57.24 59.07 60.01 74.63 74.63 90.35 90.35 58.12 15.82 lat-imp% 0 0 0 0 0 4.55 14.68 15.54 233.5 548.1 548.1 53.88 143.6 Result: isol=58.12:15.82% lat_imp=53.88%:143.6 work_csv=100.0% missing=3.96% The isolation result of 58.12% is close to what this device would show without any IO control. Fix it by introducing a new flag BIO_QOS_MERGED to mark merged bios and calling rq_qos_done_bio() on them too. For consistency and clarity, rename BIO_TRACKED to BIO_QOS_THROTTLED. The flag checks are moved into rq_qos_done_bio() so that it's next to the code paths that set the flags. With the patch applied, the above same benchmark shows: # resctl-bench -m 29.6G -r out.json run protection::scenario=mem-hog,loops=1 ... Memory Hog Summary ================== IO Latency: R p50=123u:84.4u/985u p90=322u:256u/2.5m p99=1.6m:1.4m/9.5m max=11.1m:36.0m/350m W p50=429u:274u/995u p90=1.7m:1.3m/4.5m p99=3.4m:2.7m/11.5m max=7.9m:5.9m/26.5m Isolation and Request Latency Impact Distributions: min p01 p05 p10 p25 p50 p75 p90 p95 p99 max mean stdev isol% 84.91 84.91 89.51 90.73 92.31 94.49 96.36 98.04 98.71 100.0 100.0 94.42 2.81 lat-imp% 0 0 0 0 0 2.81 5.73 11.11 13.92 17.53 22.61 4.10 4.68 Result: isol=94.42:2.81% lat_imp=4.10%:4.68 work_csv=58.34% missing=0% Signed-off-by: Tejun Heo <tj@kernel.org> Fixes: a647a524a467 ("block: don't call rq_qos_ops->done_bio if the bio isn't tracked") Cc: stable@vger.kernel.org # v5.15+ Cc: Ming Lei <ming.lei@redhat.com> Cc: Yu Kuai <yukuai3@huawei.com> Reviewed-by: Ming Lei <ming.lei@redhat.com> Link: https://lore.kernel.org/r/Yi7rdrzQEHjJLGKB@slm.duckdns.org Signed-off-by: Jens Axboe <axboe@kernel.dk>
223 lines
5.4 KiB
C
223 lines
5.4 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef RQ_QOS_H
|
|
#define RQ_QOS_H
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/blk_types.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/wait.h>
|
|
#include <linux/blk-mq.h>
|
|
|
|
#include "blk-mq-debugfs.h"
|
|
|
|
struct blk_mq_debugfs_attr;
|
|
|
|
/*
 * Identifier stored in struct rq_qos::id and used by rq_qos_id() to find a
 * particular policy on a queue.
 */
enum rq_qos_id {
	RQ_QOS_WBT,	/* looked up by wbt_rq_qos() */
	RQ_QOS_LATENCY,	/* looked up by blkcg_rq_qos() */
	RQ_QOS_COST,	/* NOTE(review): presumably blk-iocost -- confirm */
	RQ_QOS_IOPRIO,	/* NOTE(review): presumably blk-ioprio -- confirm */
};
|
|
|
|
struct rq_wait {
|
|
wait_queue_head_t wait;
|
|
atomic_t inflight;
|
|
};
|
|
|
|
struct rq_qos {
|
|
struct rq_qos_ops *ops;
|
|
struct request_queue *q;
|
|
enum rq_qos_id id;
|
|
struct rq_qos *next;
|
|
#ifdef CONFIG_BLK_DEBUG_FS
|
|
struct dentry *debugfs_dir;
|
|
#endif
|
|
};
|
|
|
|
/*
 * Callback table supplied by each rq_qos policy.
 *
 * NOTE(review): the hooks are presumably invoked by the matching
 * __rq_qos_<name>() helpers declared in this header; those helpers are
 * defined elsewhere -- confirm there which hooks may be left NULL.
 */
struct rq_qos_ops {
	/* Per-bio and per-request hooks. */
	void (*throttle)(struct rq_qos *, struct bio *);
	void (*track)(struct rq_qos *, struct request *, struct bio *);
	void (*merge)(struct rq_qos *, struct request *, struct bio *);
	void (*issue)(struct rq_qos *, struct request *);
	void (*requeue)(struct rq_qos *, struct request *);
	void (*done)(struct rq_qos *, struct request *);
	void (*done_bio)(struct rq_qos *, struct bio *);
	void (*cleanup)(struct rq_qos *, struct bio *);

	/* Queue-level hooks. */
	void (*queue_depth_changed)(struct rq_qos *);
	void (*exit)(struct rq_qos *);

	/* When non-NULL, rq_qos_add() registers the policy with debugfs. */
	const struct blk_mq_debugfs_attr *debugfs_attrs;
};
|
|
|
|
/*
 * Depth-scaling state operated on by rq_depth_scale_up(),
 * rq_depth_scale_down() and rq_depth_calc_max_depth().
 *
 * NOTE(review): the exact meaning of each field is defined by those
 * out-of-line helpers -- confirm there before relying on it.
 */
struct rq_depth {
	unsigned int max_depth;

	int scale_step;
	bool scaled_max;

	unsigned int queue_depth;
	unsigned int default_depth;
};
|
|
|
|
static inline struct rq_qos *rq_qos_id(struct request_queue *q,
|
|
enum rq_qos_id id)
|
|
{
|
|
struct rq_qos *rqos;
|
|
for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
|
|
if (rqos->id == id)
|
|
break;
|
|
}
|
|
return rqos;
|
|
}
|
|
|
|
static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
|
|
{
|
|
return rq_qos_id(q, RQ_QOS_WBT);
|
|
}
|
|
|
|
static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
|
|
{
|
|
return rq_qos_id(q, RQ_QOS_LATENCY);
|
|
}
|
|
|
|
static inline void rq_wait_init(struct rq_wait *rq_wait)
|
|
{
|
|
atomic_set(&rq_wait->inflight, 0);
|
|
init_waitqueue_head(&rq_wait->wait);
|
|
}
|
|
|
|
static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
|
|
{
|
|
/*
|
|
* No IO can be in-flight when adding rqos, so freeze queue, which
|
|
* is fine since we only support rq_qos for blk-mq queue.
|
|
*
|
|
* Reuse ->queue_lock for protecting against other concurrent
|
|
* rq_qos adding/deleting
|
|
*/
|
|
blk_mq_freeze_queue(q);
|
|
|
|
spin_lock_irq(&q->queue_lock);
|
|
rqos->next = q->rq_qos;
|
|
q->rq_qos = rqos;
|
|
spin_unlock_irq(&q->queue_lock);
|
|
|
|
blk_mq_unfreeze_queue(q);
|
|
|
|
if (rqos->ops->debugfs_attrs)
|
|
blk_mq_debugfs_register_rqos(rqos);
|
|
}
|
|
|
|
static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
|
|
{
|
|
struct rq_qos **cur;
|
|
|
|
/*
|
|
* See comment in rq_qos_add() about freezing queue & using
|
|
* ->queue_lock.
|
|
*/
|
|
blk_mq_freeze_queue(q);
|
|
|
|
spin_lock_irq(&q->queue_lock);
|
|
for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
|
|
if (*cur == rqos) {
|
|
*cur = rqos->next;
|
|
break;
|
|
}
|
|
}
|
|
spin_unlock_irq(&q->queue_lock);
|
|
|
|
blk_mq_unfreeze_queue(q);
|
|
|
|
blk_mq_debugfs_unregister_rqos(rqos);
|
|
}
|
|
|
|
/*
 * Callback types taken by rq_qos_wait().  Both receive the rq_wait and an
 * opaque private_data pointer; the first one reports a boolean result.
 */
typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data);

/*
 * Defined out of line.
 * NOTE(review): presumably @private_data is handed unchanged to both
 * callbacks -- confirm against the definition.
 */
void rq_qos_wait(struct rq_wait *rqw, void *private_data,
		 acquire_inflight_cb_t *acquire_inflight_cb,
		 cleanup_cb_t *cleanup_cb);
|
|
/*
 * rq_wait / rq_depth helpers, defined out of line.
 * NOTE(review): the meaning of each boolean return value is set by the
 * definitions -- confirm there.
 */
bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
bool rq_depth_scale_up(struct rq_depth *rqd);
bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
bool rq_depth_calc_max_depth(struct rq_depth *rqd);
|
|
|
|
/*
 * Out-of-line backends for the rq_qos_*() inline wrappers in this header.
 * Each wrapper passes the head of the queue's rq_qos list as @rqos.
 */
void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_done(struct rq_qos *rqos, struct request *rq);
void __rq_qos_issue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio);
void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio);
void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
|
|
|
|
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
|
|
{
|
|
if (q->rq_qos)
|
|
__rq_qos_cleanup(q->rq_qos, bio);
|
|
}
|
|
|
|
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
|
|
{
|
|
if (q->rq_qos)
|
|
__rq_qos_done(q->rq_qos, rq);
|
|
}
|
|
|
|
static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
|
|
{
|
|
if (q->rq_qos)
|
|
__rq_qos_issue(q->rq_qos, rq);
|
|
}
|
|
|
|
static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
|
|
{
|
|
if (q->rq_qos)
|
|
__rq_qos_requeue(q->rq_qos, rq);
|
|
}
|
|
|
|
static inline void rq_qos_done_bio(struct bio *bio)
|
|
{
|
|
if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
|
|
bio_flagged(bio, BIO_QOS_MERGED))) {
|
|
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
|
|
if (q->rq_qos)
|
|
__rq_qos_done_bio(q->rq_qos, bio);
|
|
}
|
|
}
|
|
|
|
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
|
|
{
|
|
if (q->rq_qos) {
|
|
bio_set_flag(bio, BIO_QOS_THROTTLED);
|
|
__rq_qos_throttle(q->rq_qos, bio);
|
|
}
|
|
}
|
|
|
|
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
|
|
struct bio *bio)
|
|
{
|
|
if (q->rq_qos)
|
|
__rq_qos_track(q->rq_qos, rq, bio);
|
|
}
|
|
|
|
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
|
|
struct bio *bio)
|
|
{
|
|
if (q->rq_qos) {
|
|
bio_set_flag(bio, BIO_QOS_MERGED);
|
|
__rq_qos_merge(q->rq_qos, rq, bio);
|
|
}
|
|
}
|
|
|
|
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
|
|
{
|
|
if (q->rq_qos)
|
|
__rq_qos_queue_depth_changed(q->rq_qos);
|
|
}
|
|
|
|
/*
 * Defined out of line.
 * NOTE(review): presumably tears down every rq_qos policy attached to
 * @q -- confirm against the definition.
 */
void rq_qos_exit(struct request_queue *q);
|
|
|
|
#endif
|