2019-04-30 14:42:40 -04:00
/* SPDX-License-Identifier: GPL-2.0-or-later */
2017-04-19 08:48:24 -06:00
/*
 * Header file for the BFQ I/O scheduler: data structures and
 * prototypes of interface functions among BFQ components.
 */
# ifndef _BFQ_H
# define _BFQ_H
# include <linux/blktrace_api.h>
# include <linux/hrtimer.h>
2019-11-07 11:18:04 -08:00
# include "blk-cgroup-rwstat.h"
2017-04-19 08:48:24 -06:00
/* Number of ioprio classes; one service tree is kept per class. */
#define BFQ_IOPRIO_CLASSES		3
/* NOTE(review): presumably a timeout in jiffies (HZ / 5, i.e. 1/5 s) — confirm against its users. */
#define BFQ_CL_IDLE_TIMEOUT		(HZ / 5)

/* NOTE(review): presumably the inclusive bounds for entity weights — confirm. */
#define BFQ_MIN_WEIGHT			1
#define BFQ_MAX_WEIGHT			1000
/* NOTE(review): presumably the scaling factor used when deriving a weight from an ioprio — confirm. */
#define BFQ_WEIGHT_CONVERSION_COEFF	10

/* Defaults; NOTE(review): exact use sites are not visible in this header chunk. */
#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_WEIGHT_LEGACY_DFL		100
#define BFQ_DEFAULT_GRP_IOPRIO		0
#define BFQ_DEFAULT_GRP_CLASS		IOPRIO_CLASS_BE

/* NOTE(review): presumably the buffer size for a printable queue name — confirm. */
#define MAX_BFQQ_NAME_LENGTH		16

/*
 * Soft real-time applications are far more latency sensitive
 * than interactive ones. Over-raise the weight of the former to
 * privilege them against the latter.
 */
#define BFQ_SOFTRT_WEIGHT_FACTOR	100

struct bfq_entity;
/**
* struct bfq_service_tree - per ioprio_class service tree .
*
* Each service tree represents a B - WF2Q + scheduler on its own . Each
* ioprio_class has its own independent scheduler , and so its own
* bfq_service_tree . All the fields are protected by the queue lock
* of the containing bfqd .
*/
struct bfq_service_tree {
/* tree for active entities (i.e., those backlogged) */
struct rb_root active ;
2017-07-12 15:25:01 +08:00
/* tree for idle entities (i.e., not backlogged, with V < F_i)*/
2017-04-19 08:48:24 -06:00
struct rb_root idle ;
/* idle entity with minimum F_i */
struct bfq_entity * first_idle ;
/* idle entity with maximum F_i */
struct bfq_entity * last_idle ;
/* scheduler virtual time */
u64 vtime ;
/* scheduler weight sum; active and idle entities contribute to it */
unsigned long wsum ;
} ;
/**
 * struct bfq_sched_data - multi-class scheduler.
 *
 * bfq_sched_data is the basic scheduler queue. It supports three
 * ioprio_classes, and can be used either as a toplevel queue or as an
 * intermediate queue in a hierarchical setup.
 *
 * The supported ioprio_classes are the same as in CFQ; in descending
 * priority order: IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
 * Requests from higher priority queues are served before all the
 * requests from lower priority queues; among requests of the same
 * queue, requests are served according to B-WF2Q+.
 *
 * The schedule is implemented by the service trees, plus the field
 * @next_in_service, which points to the entity on the active trees
 * that will be served next, if 1) no change in the schedule occurs
 * before the current in-service entity is expired, 2) the in-service
 * queue becomes idle when it expires, and 3) if the entity pointed by
 * in_service_entity is not a queue, then the in-service child entity
 * of the entity pointed by in_service_entity becomes idle on
 * expiration. This peculiar definition allows for the following
 * optimization, not yet exploited: while a given entity is still in
 * service, we already know which is the best candidate for next
 * service among the other active entities in the same parent
 * entity. We can then quickly compare the timestamps of the
 * in-service entity with those of such best candidate.
 *
 * As a consequence of this definition, @next_in_service may be NULL
 * while @in_service_entity is not (e.g., when the in-service entity
 * is the only active entity of the parent): both fields have to be
 * checked to state whether the parent entity is still active.
 *
 * All fields are protected by the lock of the containing bfqd.
 */
struct bfq_sched_data {
	/* the entity currently being served */
	struct bfq_entity *in_service_entity;
	/* candidate for next service, as defined in the comments above */
	struct bfq_entity *next_in_service;
	/* one service tree for each ioprio_class */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
	/* when CLASS_IDLE was last served */
	unsigned long bfq_class_idle_last_service;
};
/**
block, bfq: improve asymmetric scenarios detection
bfq defines as asymmetric a scenario where an active entity, say E
(representing either a single bfq_queue or a group of other entities),
has a higher weight than some other entities. If the entity E does sync
I/O in such a scenario, then bfq plugs the dispatch of the I/O of the
other entities in the following situation: E is in service but
temporarily has no pending I/O request. In fact, without this plugging,
all the times that E stops being temporarily idle, it may find the
internal queues of the storage device already filled with an
out-of-control number of extra requests, from other entities. So E may
have to wait for the service of these extra requests, before finally
having its own requests served. This may easily break service
guarantees, with E getting less than its fair share of the device
throughput. Usually, the end result is that E gets the same fraction of
the throughput as the other entities, instead of getting more, according
to its higher weight.
Yet there are two other more subtle cases where E, even if its weight is
actually equal to or even lower than the weight of any other active
entities, may get less than its fair share of the throughput in case the
above I/O plugging is not performed:
1. other entities issue larger requests than E;
2. other entities contain more active child entities than E (or in
general tend to have more backlog than E).
In the first case, other entities may get more service than E because
they get larger requests, than those of E, served during the temporary
idle periods of E. In the second case, other entities get more service
because, by having many child entities, they have many requests ready
for dispatching while E is temporarily idle.
This commit addresses this issue by extending the definition of
asymmetric scenario: a scenario is asymmetric when
- active entities representing bfq_queues have differentiated weights,
as in the original definition
or (inclusive)
- one or more entities representing groups of entities are active.
This broader definition makes sure that I/O plugging will be performed
in all the above cases, provided that there is at least one active
group. Of course, this definition is very coarse, so it will trigger
I/O plugging also in cases where it is not needed, such as, e.g.,
multiple active entities with just one child each, and all with the same
I/O-request size. The reason for this coarse definition is just that a
finer-grained definition would be rather heavy to compute.
On the opposite end, even this new definition does not trigger I/O
plugging in all cases where there is no active group, and all bfq_queues
have the same weight. So, in these cases some unfairness may occur if
there are asymmetries in I/O-request sizes. We made this choice because
I/O plugging may lower throughput, and probably a user that has not
created any group cares more about throughput than about perfect
fairness. At any rate, as for possible applications that may care about
service guarantees, bfq already guarantees a high responsiveness and a
low latency to soft real-time applications automatically.
Signed-off-by: Federico Motta <federico@willer.it>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-10-12 11:55:57 +02:00
* struct bfq_weight_counter - counter of the number of all active queues
2017-04-19 08:48:24 -06:00
* with a given weight .
*/
struct bfq_weight_counter {
block, bfq: improve asymmetric scenarios detection
bfq defines as asymmetric a scenario where an active entity, say E
(representing either a single bfq_queue or a group of other entities),
has a higher weight than some other entities. If the entity E does sync
I/O in such a scenario, then bfq plugs the dispatch of the I/O of the
other entities in the following situation: E is in service but
temporarily has no pending I/O request. In fact, without this plugging,
all the times that E stops being temporarily idle, it may find the
internal queues of the storage device already filled with an
out-of-control number of extra requests, from other entities. So E may
have to wait for the service of these extra requests, before finally
having its own requests served. This may easily break service
guarantees, with E getting less than its fair share of the device
throughput. Usually, the end result is that E gets the same fraction of
the throughput as the other entities, instead of getting more, according
to its higher weight.
Yet there are two other more subtle cases where E, even if its weight is
actually equal to or even lower than the weight of any other active
entities, may get less than its fair share of the throughput in case the
above I/O plugging is not performed:
1. other entities issue larger requests than E;
2. other entities contain more active child entities than E (or in
general tend to have more backlog than E).
In the first case, other entities may get more service than E because
they get larger requests, than those of E, served during the temporary
idle periods of E. In the second case, other entities get more service
because, by having many child entities, they have many requests ready
for dispatching while E is temporarily idle.
This commit addresses this issue by extending the definition of
asymmetric scenario: a scenario is asymmetric when
- active entities representing bfq_queues have differentiated weights,
as in the original definition
or (inclusive)
- one or more entities representing groups of entities are active.
This broader definition makes sure that I/O plugging will be performed
in all the above cases, provided that there is at least one active
group. Of course, this definition is very coarse, so it will trigger
I/O plugging also in cases where it is not needed, such as, e.g.,
multiple active entities with just one child each, and all with the same
I/O-request size. The reason for this coarse definition is just that a
finer-grained definition would be rather heavy to compute.
On the opposite end, even this new definition does not trigger I/O
plugging in all cases where there is no active group, and all bfq_queues
have the same weight. So, in these cases some unfairness may occur if
there are asymmetries in I/O-request sizes. We made this choice because
I/O plugging may lower throughput, and probably a user that has not
created any group cares more about throughput than about perfect
fairness. At any rate, as for possible applications that may care about
service guarantees, bfq already guarantees a high responsiveness and a
low latency to soft real-time applications automatically.
Signed-off-by: Federico Motta <federico@willer.it>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-10-12 11:55:57 +02:00
unsigned int weight ; /* weight of the queues this counter refers to */
unsigned int num_active ; /* nr of active queues with this weight */
2017-04-19 08:48:24 -06:00
/*
block, bfq: improve asymmetric scenarios detection
bfq defines as asymmetric a scenario where an active entity, say E
(representing either a single bfq_queue or a group of other entities),
has a higher weight than some other entities. If the entity E does sync
I/O in such a scenario, then bfq plugs the dispatch of the I/O of the
other entities in the following situation: E is in service but
temporarily has no pending I/O request. In fact, without this plugging,
all the times that E stops being temporarily idle, it may find the
internal queues of the storage device already filled with an
out-of-control number of extra requests, from other entities. So E may
have to wait for the service of these extra requests, before finally
having its own requests served. This may easily break service
guarantees, with E getting less than its fair share of the device
throughput. Usually, the end result is that E gets the same fraction of
the throughput as the other entities, instead of getting more, according
to its higher weight.
Yet there are two other more subtle cases where E, even if its weight is
actually equal to or even lower than the weight of any other active
entities, may get less than its fair share of the throughput in case the
above I/O plugging is not performed:
1. other entities issue larger requests than E;
2. other entities contain more active child entities than E (or in
general tend to have more backlog than E).
In the first case, other entities may get more service than E because
they get larger requests, than those of E, served during the temporary
idle periods of E. In the second case, other entities get more service
because, by having many child entities, they have many requests ready
for dispatching while E is temporarily idle.
This commit addresses this issue by extending the definition of
asymmetric scenario: a scenario is asymmetric when
- active entities representing bfq_queues have differentiated weights,
as in the original definition
or (inclusive)
- one or more entities representing groups of entities are active.
This broader definition makes sure that I/O plugging will be performed
in all the above cases, provided that there is at least one active
group. Of course, this definition is very coarse, so it will trigger
I/O plugging also in cases where it is not needed, such as, e.g.,
multiple active entities with just one child each, and all with the same
I/O-request size. The reason for this coarse definition is just that a
finer-grained definition would be rather heavy to compute.
On the opposite end, even this new definition does not trigger I/O
plugging in all cases where there is no active group, and all bfq_queues
have the same weight. So, in these cases some unfairness may occur if
there are asymmetries in I/O-request sizes. We made this choice because
I/O plugging may lower throughput, and probably a user that has not
created any group cares more about throughput than about perfect
fairness. At any rate, as for possible applications that may care about
service guarantees, bfq already guarantees a high responsiveness and a
low latency to soft real-time applications automatically.
Signed-off-by: Federico Motta <federico@willer.it>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-10-12 11:55:57 +02:00
* Weights tree member ( see bfq_data ' s @ queue_weights_tree )
2017-04-19 08:48:24 -06:00
*/
struct rb_node weights_node ;
} ;
/**
* struct bfq_entity - schedulable entity .
*
* A bfq_entity is used to represent either a bfq_queue ( leaf node in the
* cgroup hierarchy ) or a bfq_group into the upper level scheduler . Each
* entity belongs to the sched_data of the parent group in the cgroup
* hierarchy . Non - leaf entities have also their own sched_data , stored
* in @ my_sched_data .
*
* Each entity stores independently its priority values ; this would
* allow different weights on different devices , but this
* functionality is not exported to userspace by now . Priorities and
* weights are updated lazily , first storing the new values into the
* new_ * fields , then setting the @ prio_changed flag . As soon as
* there is a transition in the entity state that allows the priority
* update to take place the effective and the requested priority
* values are synchronized .
*
* Unless cgroups are used , the weight value is calculated from the
* ioprio to export the same interface as CFQ . When dealing with
2019-04-08 17:35:34 +02:00
* " well-behaved " queues ( i . e . , queues that do not spend too much
2017-04-19 08:48:24 -06:00
* time to consume their budget and have true sequential behavior , and
* when there are no external factors breaking anticipation ) the
* relative weights at each level of the cgroups hierarchy should be
* guaranteed . All the fields are protected by the queue lock of the
* containing bfqd .
*/
struct bfq_entity {
/* service_tree member */
struct rb_node rb_node ;
/*
* Flag , true if the entity is on a tree ( either the active or
* the idle one of its service_tree ) or is in service .
*/
2020-02-03 11:40:57 +01:00
bool on_st_or_in_serv ;
2017-04-19 08:48:24 -06:00
/* B-WF2Q+ start and finish timestamps [sectors/weight] */
u64 start , finish ;
/* tree the entity is enqueued into; %NULL if not on a tree */
struct rb_root * tree ;
/*
* minimum start time of the ( active ) subtree rooted at this
* entity ; used for O ( log N ) lookups into active trees
*/
u64 min_start ;
/* amount of service received during the last service slot */
int service ;
/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
int budget ;
2021-11-25 14:36:35 +01:00
/* Number of requests allocated in the subtree of this entity */
int allocated ;
2019-08-28 11:54:53 +08:00
/* device weight, if non-zero, it overrides the default weight of
* bfq_group_data */
int dev_weight ;
2017-04-19 08:48:24 -06:00
/* weight of the queue */
int weight ;
/* next weight if a change is in progress */
int new_weight ;
/* original weight, used to implement weight boosting */
int orig_weight ;
/* parent entity, for hierarchical scheduling */
struct bfq_entity * parent ;
/*
* For non - leaf nodes in the hierarchy , the associated
* scheduler queue , % NULL on leaf nodes .
*/
struct bfq_sched_data * my_sched_data ;
/* the scheduler queue this entity belongs to */
struct bfq_sched_data * sched_data ;
/* flag, set to request a weight, ioprio or ioprio_class change */
int prio_changed ;
block, bfq: fix decrement of num_active_groups
Since commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios
detection")', if there are process groups with I/O requests waiting for
completion, then BFQ tags the scenario as 'asymmetric'. This detection
is needed for preserving service guarantees (for details, see comments
on the computation * of the variable asymmetric_scenario in the
function bfq_better_to_idle).
Unfortunately, commit '2d29c9f89fcd ("block, bfq: improve asymmetric
scenarios detection")' contains an error exactly in the updating of
the number of groups with I/O requests waiting for completion: if a
group has more than one descendant process, then the above number of
groups, which is renamed from num_active_groups to a more appropriate
num_groups_with_pending_reqs by this commit, may happen to be wrongly
decremented multiple times, namely every time one of the descendant
processes gets all its pending I/O requests completed.
A correct, complete solution should work as follows. Consider a group
that is inactive, i.e., that has no descendant process with pending
I/O inside BFQ queues. Then suppose that num_groups_with_pending_reqs
is still accounting for this group, because the group still has some
descendant process with some I/O request still in
flight. num_groups_with_pending_reqs should be decremented when the
in-flight request of the last descendant process is finally completed
(assuming that nothing else has changed for the group in the meantime,
in terms of composition of the group and active/inactive state of
child groups and processes). To accomplish this, an additional
pending-request counter must be added to entities, and must be
updated correctly.
To avoid this additional field and operations, this commit resorts to
the following tradeoff between simplicity and accuracy: for an
inactive group that is still counted in num_groups_with_pending_reqs,
this commit decrements num_groups_with_pending_reqs when the first
descendant process of the group remains with no request waiting for
completion.
This simplified scheme provides a fix to the unbalanced decrements
introduced by 2d29c9f89fcd. Since this error was also caused by lack
of comments on this non-trivial issue, this commit also adds related
comments.
Fixes: 2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection")
Reported-by: Steven Barrett <steven@liquorix.net>
Tested-by: Steven Barrett <steven@liquorix.net>
Tested-by: Lucjan Lucjanov <lucjan.lucjanov@gmail.com>
Reviewed-by: Federico Motta <federico@willer.it>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-12-06 19:18:18 +01:00
/* flag, set if the entity is counted in groups_with_pending_reqs */
bool in_groups_with_pending_reqs ;
block, bfq: merge bursts of newly-created queues
Many throughput-sensitive workloads are made of several parallel I/O
flows, with all flows generated by the same application, or more
generically by the same task (e.g., system boot). The most
counterproductive action with these workloads is plugging I/O dispatch
when one of the bfq_queues associated with these flows remains
temporarily empty.
To avoid this plugging, BFQ has been using a burst-handling mechanism
for years now. This mechanism has proven effective for throughput, and
not detrimental for service guarantees. This commit pushes this
mechanism a little bit further, basing on the following two facts.
First, all the I/O flows of a the same application or task contribute
to the execution/completion of that common application or task. So the
performance figures that matter are total throughput of the flows and
task-wide I/O latency. In particular, these flows do not need to be
protected from each other, in terms of individual bandwidth or
latency.
Second, the above fact holds regardless of the number of flows.
Putting these two facts together, this commits merges stably the
bfq_queues associated with these I/O flows, i.e., with the processes
that generate these IO/ flows, regardless of how many the involved
processes are.
To decide whether a set of bfq_queues is actually associated with the
I/O flows of a common application or task, and to merge these queues
stably, this commit operates as follows: given a bfq_queue, say Q2,
currently being created, and the last bfq_queue, say Q1, created
before Q2, Q2 is merged stably with Q1 if
- very little time has elapsed since when Q1 was created
- Q2 has the same ioprio as Q1
- Q2 belongs to the same group as Q1
Merging bfq_queues also reduces scheduling overhead. A fio test with
ten random readers on /dev/nullb shows a throughput boost of 40%, with
a quadcore. Since BFQ's execution time amounts to ~50% of the total
per-request processing time, the above throughput boost implies that
BFQ's overhead is reduced by more than 50%.
Tested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Link: https://lore.kernel.org/r/20210304174627.161-7-paolo.valente@linaro.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2021-03-04 18:46:27 +01:00
/* last child queue of entity created (for non-leaf entities) */
struct bfq_queue * last_bfqq_created ;
2017-04-19 08:48:24 -06:00
} ;
struct bfq_group;

/**
 * struct bfq_ttime - per process thinktime stats.
 */
struct bfq_ttime {
	u64 last_end_request;		/* when the last request completed */
	u64 ttime_total;		/* total thinktime of the process */
	unsigned long ttime_samples;	/* how many thinktime samples */
	u64 ttime_mean;			/* average thinktime of the process */
};
/**
 * struct bfq_queue - leaf schedulable entity.
 *
 * A bfq_queue is a leaf request queue; it can be associated with one
 * or more io_contexts, if it is async or shared between cooperating
 * processes. @cgroup holds a reference to the cgroup, to be sure that it
 * does not disappear while a bfqq still references it (mostly to avoid
 * races between request issuing and task migration followed by cgroup
 * destruction).
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_queue {
/* reference counter */
int ref ;
block, bfq: merge bursts of newly-created queues
Many throughput-sensitive workloads are made of several parallel I/O
flows, with all flows generated by the same application, or more
generically by the same task (e.g., system boot). The most
counterproductive action with these workloads is plugging I/O dispatch
when one of the bfq_queues associated with these flows remains
temporarily empty.
To avoid this plugging, BFQ has been using a burst-handling mechanism
for years now. This mechanism has proven effective for throughput, and
not detrimental for service guarantees. This commit pushes this
mechanism a little bit further, basing on the following two facts.
First, all the I/O flows of a the same application or task contribute
to the execution/completion of that common application or task. So the
performance figures that matter are total throughput of the flows and
task-wide I/O latency. In particular, these flows do not need to be
protected from each other, in terms of individual bandwidth or
latency.
Second, the above fact holds regardless of the number of flows.
Putting these two facts together, this commit stably merges the
bfq_queues associated with these I/O flows, i.e., with the processes
that generate these I/O flows, regardless of how many processes are
involved.
To decide whether a set of bfq_queues is actually associated with the
I/O flows of a common application or task, and to merge these queues
stably, this commit operates as follows: given a bfq_queue, say Q2,
currently being created, and the last bfq_queue, say Q1, created
before Q2, Q2 is merged stably with Q1 if
- very little time has elapsed since when Q1 was created
- Q2 has the same ioprio as Q1
- Q2 belongs to the same group as Q1
Merging bfq_queues also reduces scheduling overhead. A fio test with
ten random readers on /dev/nullb shows a throughput boost of 40%, with
a quadcore. Since BFQ's execution time amounts to ~50% of the total
per-request processing time, the above throughput boost implies that
BFQ's overhead is reduced by more than 50%.
Tested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Link: https://lore.kernel.org/r/20210304174627.161-7-paolo.valente@linaro.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2021-03-04 18:46:27 +01:00
/* counter of references from other queues for delayed stable merge */
int stable_ref ;
2017-04-19 08:48:24 -06:00
/* parent bfq_data */
struct bfq_data * bfqd ;
/* current ioprio and ioprio class */
unsigned short ioprio , ioprio_class ;
/* next ioprio and ioprio class if a change is in progress */
unsigned short new_ioprio , new_ioprio_class ;
block, bfq: tune service injection basing on request service times
The processes associated with a bfq_queue, say Q, may happen to
generate their cumulative I/O at a lower rate than the rate at which
the device could serve the same I/O. This is rather probable, e.g., if
only one process is associated with Q and the device is an SSD. It
results in Q becoming often empty while in service. If BFQ is not
allowed to switch to another queue when Q becomes empty, then, during
the service of Q, there will be frequent "service holes", i.e., time
intervals during which Q gets empty and the device can only consume
the I/O already queued in its hardware queues. This easily causes
considerable losses of throughput.
To counter this problem, BFQ implements a request injection mechanism,
which tries to fill the above service holes with I/O requests taken
from other bfq_queues. The hard part in this mechanism is finding the
right amount of I/O to inject, so as to both boost throughput and not
break Q's bandwidth and latency guarantees. To this goal, the current
version of this mechanism measures the bandwidth enjoyed by Q while it
is being served, and tries to inject the maximum possible amount of
extra service that does not cause Q's bandwidth to decrease too
much.
This solution has an important shortcoming. For bandwidth measurements
to be stable and reliable, Q must remain in service for a much longer
time than that needed to serve a single I/O request. Unfortunately,
this does not hold with many workloads. This commit addresses this
issue by changing the way the amount of injection allowed is
dynamically computed. It tunes injection as a function of the service
times of single I/O requests of Q, instead of Q's
bandwidth. Single-request service times are evidently meaningful even
if Q gets very few I/O requests completed while it is in service.
As a testbed for this new solution, we measured the throughput reached
by BFQ for one of the nastiest workloads and configurations for this
scheduler: the workload generated by the dbench test (in the Phoronix
suite), with 6 clients, on a filesystem with journaling, and with the
journaling daemon enjoying a higher weight than normal processes.
With this commit, the throughput grows from ~100 MB/s to ~150 MB/s on
a PLEXTOR PX-256M5.
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Tested-by: Francesco Pollicino <fra.fra.800@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-03-12 09:59:29 +01:00
/* last total-service-time sample, see bfq_update_inject_limit() */
u64 last_serv_time_ns ;
/* limit for request injection */
unsigned int inject_limit ;
/* last time the inject limit has been decreased, in jiffies */
unsigned long decrease_time_jif ;
2017-04-19 08:48:24 -06:00
/*
* Shared bfq_queue if queue is cooperating with one or more
* other queues .
*/
struct bfq_queue * new_bfqq ;
/* request-position tree member (see bfq_group's @rq_pos_tree) */
struct rb_node pos_node ;
/* request-position tree root (see bfq_group's @rq_pos_tree) */
struct rb_root * pos_root ;
/* sorted list of pending requests */
struct rb_root sort_list ;
/* if fifo isn't expired, next request to serve */
struct request * next_rq ;
/* number of sync and async requests queued */
int queued [ 2 ] ;
/* number of pending metadata requests */
int meta_pending ;
/* fifo list of requests in sort_list */
struct list_head fifo ;
/* entity representing this queue in the scheduler */
struct bfq_entity entity ;
block, bfq: improve asymmetric scenarios detection
bfq defines as asymmetric a scenario where an active entity, say E
(representing either a single bfq_queue or a group of other entities),
has a higher weight than some other entities. If the entity E does sync
I/O in such a scenario, then bfq plugs the dispatch of the I/O of the
other entities in the following situation: E is in service but
temporarily has no pending I/O request. In fact, without this plugging,
all the times that E stops being temporarily idle, it may find the
internal queues of the storage device already filled with an
out-of-control number of extra requests, from other entities. So E may
have to wait for the service of these extra requests, before finally
having its own requests served. This may easily break service
guarantees, with E getting less than its fair share of the device
throughput. Usually, the end result is that E gets the same fraction of
the throughput as the other entities, instead of getting more, according
to its higher weight.
Yet there are two other more subtle cases where E, even if its weight is
actually equal to or even lower than the weight of any other active
entities, may get less than its fair share of the throughput in case the
above I/O plugging is not performed:
1. other entities issue larger requests than E;
2. other entities contain more active child entities than E (or in
general tend to have more backlog than E).
In the first case, other entities may get more service than E because
they get larger requests, than those of E, served during the temporary
idle periods of E. In the second case, other entities get more service
because, by having many child entities, they have many requests ready
for dispatching while E is temporarily idle.
This commit addresses this issue by extending the definition of
asymmetric scenario: a scenario is asymmetric when
- active entities representing bfq_queues have differentiated weights,
as in the original definition
or (inclusive)
- one or more entities representing groups of entities are active.
This broader definition makes sure that I/O plugging will be performed
in all the above cases, provided that there is at least one active
group. Of course, this definition is very coarse, so it will trigger
I/O plugging also in cases where it is not needed, such as, e.g.,
multiple active entities with just one child each, and all with the same
I/O-request size. The reason for this coarse definition is just that a
finer-grained definition would be rather heavy to compute.
On the opposite end, even this new definition does not trigger I/O
plugging in all cases where there is no active group, and all bfq_queues
have the same weight. So, in these cases some unfairness may occur if
there are asymmetries in I/O-request sizes. We made this choice because
I/O plugging may lower throughput, and probably a user that has not
created any group cares more about throughput than about perfect
fairness. At any rate, as for possible applications that may care about
service guarantees, bfq already guarantees a high responsiveness and a
low latency to soft real-time applications automatically.
Signed-off-by: Federico Motta <federico@willer.it>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-10-12 11:55:57 +02:00
/* pointer to the weight counter associated with this entity */
struct bfq_weight_counter * weight_counter ;
2017-04-19 08:48:24 -06:00
/* maximum budget allowed from the feedback mechanism */
int max_budget ;
/* budget expiration (in jiffies) */
unsigned long budget_timeout ;
/* number of requests on the dispatch list or inside driver */
int dispatched ;
/* status flags */
unsigned long flags ;
/* node for active/idle bfqq list inside parent bfqd */
struct list_head bfqq_list ;
/* associated @bfq_ttime struct */
struct bfq_ttime ttime ;
2021-01-25 20:02:43 +01:00
/* when bfqq started to do I/O within the last observation window */
u64 io_start_time ;
/* how long bfqq has remained empty during the last observ. window */
u64 tot_idle_time ;
2017-04-19 08:48:24 -06:00
/* bit vector: a 1 for each seeky request in history */
u32 seek_history ;
/* node for the device's burst list */
struct hlist_node burst_list_node ;
/* position of the last request enqueued */
sector_t last_request_pos ;
/* Number of consecutive pairs of request completion and
* arrival , such that the queue becomes idle after the
* completion , but the next request arrives within an idle
* time slice ; used only if the queue ' s IO_bound flag has been
* cleared .
*/
unsigned int requests_within_timer ;
/* pid of the process owning the queue, used for logging purposes */
pid_t pid ;
/*
* Pointer to the bfq_io_cq owning the bfq_queue , set to % NULL
* if the queue is shared .
*/
struct bfq_io_cq * bic ;
/* current maximum weight-raising time for this queue */
unsigned long wr_cur_max_time ;
/*
* Minimum time instant such that , only if a new request is
* enqueued after this time instant in an idle @ bfq_queue with
* no outstanding requests , then the task associated with the
* queue is deemed as soft real - time ( see the comments on
* the function bfq_bfqq_softrt_next_start ( ) )
*/
unsigned long soft_rt_next_start ;
/*
* Start time of the current weight - raising period if
* the @ bfq_queue is being weight - raised , otherwise
* finish time of the last weight - raising period .
*/
unsigned long last_wr_start_finish ;
/* factor by which the weight of this queue is multiplied */
unsigned int wr_coeff ;
/*
* Time of the last transition of the @ bfq_queue from idle to
* backlogged .
*/
unsigned long last_idle_bklogged ;
/*
* Cumulative service received by the @ bfq_queue since the
* last transition from idle to backlogged .
*/
unsigned long service_from_backlogged ;
block, bfq: limit sectors served with interactive weight raising
To maximise responsiveness, BFQ raises the weight, and performs device
idling, for bfq_queues associated with processes deemed as
interactive. In particular, weight raising has a maximum duration,
equal to the time needed to start a large application. If a
weight-raised process goes on doing I/O beyond this maximum duration,
it loses weight-raising.
This mechanism is evidently vulnerable to the following false
positives: I/O-bound applications that will go on doing I/O for much
longer than the duration of weight-raising. These applications have
basically no benefit from being weight-raised at the beginning of
their I/O. On the opposite end, while being weight-raised, these
applications
a) unjustly steal throughput from applications that may truly need
low latency;
b) make BFQ uselessly perform device idling; device idling results
in loss of device throughput with most flash-based storage, and may
increase latencies when used purposelessly.
This commit adds a countermeasure to reduce both the above
problems. To introduce this countermeasure, we provide the following
extra piece of information (full details in the comments added by this
commit). During the start-up of the large application used as a
reference to set the duration of weight-raising, involved processes
transfer at most ~110K sectors each. Accordingly, a process initially
deemed as interactive has no right to be weight-raised any longer,
once transferred 110K sectors or more.
Basing on this consideration, this commit early-ends weight-raising
for a bfq_queue if the latter happens to have received an amount of
service at least equal to 110K sectors (actually, a little bit more,
to keep a safety margin). I/O-bound applications that reach a high
throughput, such as file copy, get to this threshold much before the
allowed weight-raising period finishes. Thus this early ending of
weight-raising reduces the amount of time during which these
applications cause the problems described above.
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-01-13 12:05:18 +01:00
/*
* Cumulative service received by the @ bfq_queue since its
* last transition to weight - raised state .
*/
unsigned long service_from_wr ;
2017-04-19 08:48:24 -06:00
/*
* Value of wr start time when switching to soft rt
*/
unsigned long wr_start_at_switch_to_srt ;
unsigned long split_time ; /* time of last split */
block, bfq: let a queue be merged only shortly after starting I/O
In BFQ and CFQ, two processes are said to be cooperating if they do
I/O in such a way that the union of their I/O requests yields a
sequential I/O pattern. To get such a sequential I/O pattern out of
the non-sequential pattern of each cooperating process, BFQ and CFQ
merge the queues associated with these processes. In more detail,
cooperating processes, and thus their associated queues, usually
start, or restart, to do I/O shortly after each other. This is the
case, e.g., for the I/O threads of KVM/QEMU and of the dump
utility. Basing on this assumption, this commit allows a bfq_queue to
be merged only during a short time interval (100ms) after it starts,
or re-starts, to do I/O. This filtering provides two important
benefits.
First, it greatly reduces the probability that two non-cooperating
processes have their queues merged by mistake, if they just happen to
do I/O close to each other for a short time interval. These spurious
merges cause loss of service guarantees. A low-weight bfq_queue may
unjustly get more than its expected share of the throughput: if such a
low-weight queue is merged with a high-weight queue, then the I/O for
the low-weight queue is served as if the queue had a high weight. This
may damage other high-weight queues unexpectedly. For instance,
because of this issue, lxterminal occasionally took 7.5 seconds to
start, instead of 6.5 seconds, when some sequential readers and
writers did I/O in the background on a FUJITSU MHX2300BT HDD. The
reason is that the bfq_queues associated with some of the readers or
the writers were merged with the high-weight queues of some processes
that had to do some urgent but little I/O. The readers then exploited
the inherited high weight for all or most of their I/O, during the
start-up of terminal. The filtering introduced by this commit
eliminated any outlier caused by spurious queue merges in our start-up
time tests.
This filtering also provides a little boost of the throughput
sustainable by BFQ: 3-4%, depending on the CPU. The reason is that,
once a bfq_queue cannot be merged any longer, this commit makes BFQ
stop updating the data needed to handle merging for the queue.
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Angelo Ruocco <angeloruocco90@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-12-20 12:38:33 +01:00
unsigned long first_IO_time ; /* time of first I/O for this queue */
block, bfq: inject other-queue I/O into seeky idle queues on NCQ flash
The Achilles' heel of BFQ is its failing to reach a high throughput
with sync random I/O on flash storage with internal queueing, in case
the processes doing I/O have differentiated weights.
The cause of this failure is as follows. If at least two processes do
sync I/O, and have a different weight from each other, then BFQ plugs
I/O dispatching every time one of these processes, while it is being
served, remains temporarily without pending I/O requests. This
plugging is necessary to guarantee that every process enjoys a
bandwidth proportional to its weight; but it empties the internal
queue(s) of the drive. And this kills throughput with random I/O. So,
if some processes have differentiated weights and do both sync and
random I/O, the end result is a throughput collapse.
This commit tries to counter this problem by injecting the service of
other processes, in a controlled way, while the process in service
happens to have no I/O. This injection is performed only if the medium
is non rotational and performs internal queueing, and the process in
service does random I/O (service injection might be beneficial for
sequential I/O too, we'll work on that).
As an example of the benefits of this commit, on a PLEXTOR PX-256M5S
SSD, and with five processes having differentiated weights and doing
sync random 4KB I/O, this commit makes the throughput with bfq grow by
400%, from 25 to 100MB/s. This higher throughput is 10MB/s lower than
that reached with none. As some less random I/O is added to the mix,
the throughput becomes equal to or higher than that with none.
This commit is a very first attempt to recover throughput without
losing control, and certainly has many limitations. One is, e.g., that
the processes whose service is injected are not chosen so as to
distribute the extra bandwidth they receive in accordance to their
weights. Thus there might be loss of weighted fairness in some
cases. Anyway, this loss concerns extra service, which would not have
been received at all without this commit. Other limitations and issues
will probably show up with usage.
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-09-14 16:23:08 +02:00
block, bfq: merge bursts of newly-created queues
Many throughput-sensitive workloads are made of several parallel I/O
flows, with all flows generated by the same application, or more
generically by the same task (e.g., system boot). The most
counterproductive action with these workloads is plugging I/O dispatch
when one of the bfq_queues associated with these flows remains
temporarily empty.
To avoid this plugging, BFQ has been using a burst-handling mechanism
for years now. This mechanism has proven effective for throughput, and
not detrimental for service guarantees. This commit pushes this
mechanism a little bit further, basing on the following two facts.
First, all the I/O flows of the same application or task contribute
to the execution/completion of that common application or task. So the
performance figures that matter are total throughput of the flows and
task-wide I/O latency. In particular, these flows do not need to be
protected from each other, in terms of individual bandwidth or
latency.
Second, the above fact holds regardless of the number of flows.
Putting these two facts together, this commit stably merges the
bfq_queues associated with these I/O flows, i.e., with the processes
that generate these I/O flows, regardless of how many processes are
involved.
To decide whether a set of bfq_queues is actually associated with the
I/O flows of a common application or task, and to merge these queues
stably, this commit operates as follows: given a bfq_queue, say Q2,
currently being created, and the last bfq_queue, say Q1, created
before Q2, Q2 is merged stably with Q1 if
- very little time has elapsed since when Q1 was created
- Q2 has the same ioprio as Q1
- Q2 belongs to the same group as Q1
Merging bfq_queues also reduces scheduling overhead. A fio test with
ten random readers on /dev/nullb shows a throughput boost of 40%, with
a quadcore. Since BFQ's execution time amounts to ~50% of the total
per-request processing time, the above throughput boost implies that
BFQ's overhead is reduced by more than 50%.
Tested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Link: https://lore.kernel.org/r/20210304174627.161-7-paolo.valente@linaro.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2021-03-04 18:46:27 +01:00
unsigned long creation_time ; /* when this queue is created */
block, bfq: inject other-queue I/O into seeky idle queues on NCQ flash
The Achilles' heel of BFQ is its failing to reach a high throughput
with sync random I/O on flash storage with internal queueing, in case
the processes doing I/O have differentiated weights.
The cause of this failure is as follows. If at least two processes do
sync I/O, and have a different weight from each other, then BFQ plugs
I/O dispatching every time one of these processes, while it is being
served, remains temporarily without pending I/O requests. This
plugging is necessary to guarantee that every process enjoys a
bandwidth proportional to its weight; but it empties the internal
queue(s) of the drive. And this kills throughput with random I/O. So,
if some processes have differentiated weights and do both sync and
random I/O, the end result is a throughput collapse.
This commit tries to counter this problem by injecting the service of
other processes, in a controlled way, while the process in service
happens to have no I/O. This injection is performed only if the medium
is non rotational and performs internal queueing, and the process in
service does random I/O (service injection might be beneficial for
sequential I/O too, we'll work on that).
As an example of the benefits of this commit, on a PLEXTOR PX-256M5S
SSD, and with five processes having differentiated weights and doing
sync random 4KB I/O, this commit makes the throughput with bfq grow by
400%, from 25 to 100MB/s. This higher throughput is 10MB/s lower than
that reached with none. As some less random I/O is added to the mix,
the throughput becomes equal to or higher than that with none.
This commit is a very first attempt to recover throughput without
losing control, and certainly has many limitations. One is, e.g., that
the processes whose service is injected are not chosen so as to
distribute the extra bandwidth they receive in accordance to their
weights. Thus there might be loss of weighted fairness in some
cases. Anyway, this loss concerns extra service, which would not have
been received at all without this commit. Other limitations and issues
will probably show up with usage.
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-09-14 16:23:08 +02:00
/* max service rate measured so far */
u32 max_service_rate ;
block, bfq: detect wakers and unconditionally inject their I/O
A bfq_queue Q may happen to be synchronized with another
bfq_queue Q2, i.e., the I/O of Q2 may need to be completed for Q to
receive new I/O. We call Q2 "waker queue".
If I/O plugging is being performed for Q, and Q is not receiving any
more I/O because of the above synchronization, then, thanks to BFQ's
injection mechanism, the waker queue is likely to get served before
the I/O-plugging timeout fires.
Unfortunately, this fact may not be sufficient to guarantee a high
throughput during the I/O plugging, because the inject limit for Q may
be too low to guarantee a lot of injected I/O. In addition, the
duration of the plugging, i.e., the time before Q finally receives new
I/O, may not be minimized, because the waker queue may happen to be
served only after other queues.
To address these issues, this commit introduces the explicit detection
of the waker queue, and the unconditional injection of a pending I/O
request of the waker queue on each invocation of
bfq_dispatch_request().
One may be concerned that this systematic injection of I/O from the
waker queue delays the service of Q's I/O. Fortunately, it doesn't. On
the contrary, next Q's I/O is brought forward dramatically, for it is
not blocked for milliseconds.
Reported-by: Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu>
Tested-by: Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-25 07:12:47 +02:00
/*
* Pointer to the waker queue for this queue , i . e . , to the
* queue Q such that this queue happens to get new I / O right
* after some I / O request of Q is completed . For details , see
* the comments on the choice of the queue for injection in
* bfq_select_queue ( ) .
*/
struct bfq_queue * waker_bfqq ;
2021-01-25 20:02:48 +01:00
/* pointer to the curr. tentative waker queue, see bfq_check_waker() */
struct bfq_queue * tentative_waker_bfqq ;
/* number of times the same tentative waker has been detected */
unsigned int num_waker_detections ;
2021-11-25 14:36:38 +01:00
/* time when we started considering this waker */
u64 waker_detection_started ;
2021-01-25 20:02:48 +01:00
block, bfq: detect wakers and unconditionally inject their I/O
A bfq_queue Q may happen to be synchronized with another
bfq_queue Q2, i.e., the I/O of Q2 may need to be completed for Q to
receive new I/O. We call Q2 "waker queue".
If I/O plugging is being performed for Q, and Q is not receiving any
more I/O because of the above synchronization, then, thanks to BFQ's
injection mechanism, the waker queue is likely to get served before
the I/O-plugging timeout fires.
Unfortunately, this fact may not be sufficient to guarantee a high
throughput during the I/O plugging, because the inject limit for Q may
be too low to guarantee a lot of injected I/O. In addition, the
duration of the plugging, i.e., the time before Q finally receives new
I/O, may not be minimized, because the waker queue may happen to be
served only after other queues.
To address these issues, this commit introduces the explicit detection
of the waker queue, and the unconditional injection of a pending I/O
request of the waker queue on each invocation of
bfq_dispatch_request().
One may be concerned that this systematic injection of I/O from the
waker queue delays the service of Q's I/O. Fortunately, it doesn't. On
the contrary, next Q's I/O is brought forward dramatically, for it is
not blocked for milliseconds.
Reported-by: Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu>
Tested-by: Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-25 07:12:47 +02:00
/* node for woken_list, see below */
struct hlist_node woken_list_node ;
/*
* Head of the list of the woken queues for this queue , i . e . ,
* of the list of the queues for which this queue is a waker
* queue . This list is used to reset the waker_bfqq pointer in
* the woken queues when this queue exits .
*/
struct hlist_head woken_list ;
2017-04-19 08:48:24 -06:00
} ;
/**
* struct bfq_io_cq - per ( request_queue , io_context ) structure .
*/
struct bfq_io_cq {
/* associated io_cq structure */
struct io_cq icq ; /* must be the first member */
/* array of two process queues, the sync and the async */
struct bfq_queue * bfqq [ 2 ] ;
/* per (request_queue, blkcg) ioprio */
int ioprio ;
# ifdef CONFIG_BFQ_GROUP_IOSCHED
uint64_t blkcg_serial_nr ; /* the current blkcg serial */
# endif
/*
2017-08-04 07:35:10 +02:00
* Snapshot of the has_short_ttime flag before merging ; taken
* to remember its value while the queue is merged , so as to
* be able to restore it in case of split .
2017-04-19 08:48:24 -06:00
*/
2017-08-04 07:35:10 +02:00
bool saved_has_short_ttime ;
2017-04-19 08:48:24 -06:00
/*
* Same purpose as the previous field , for the I / O bound
* classification of a queue .
*/
bool saved_IO_bound ;
2021-01-25 20:02:43 +01:00
u64 saved_io_start_time ;
u64 saved_tot_idle_time ;
2017-04-19 08:48:24 -06:00
/*
* Same purpose as the previous fields , for the indication of
* whether the queue belongs to a large burst .
*/
bool saved_in_large_burst ;
/*
* True if the queue belonged to a burst list before its merge
* with another cooperating queue .
*/
bool was_in_burst_list ;
2019-03-12 09:59:34 +01:00
/*
* Save the weight when a merge occurs , to be able
* to restore it in case of split . If the weight is not
* correctly resumed when the queue is recycled ,
* then the weight of the recycled queue could differ
* from the weight of the original queue .
*/
unsigned int saved_weight ;
2017-04-19 08:48:24 -06:00
/*
* Similar to previous fields : save wr information .
*/
unsigned long saved_wr_coeff ;
unsigned long saved_last_wr_start_finish ;
2021-01-25 20:02:46 +01:00
unsigned long saved_service_from_wr ;
2017-04-19 08:48:24 -06:00
unsigned long saved_wr_start_at_switch_to_srt ;
unsigned int saved_wr_cur_max_time ;
struct bfq_ttime saved_ttime ;
2021-01-25 20:02:47 +01:00
/* Save also injection state */
u64 saved_last_serv_time_ns ;
unsigned int saved_inject_limit ;
unsigned long saved_decrease_time_jif ;
block, bfq: merge bursts of newly-created queues
Many throughput-sensitive workloads are made of several parallel I/O
flows, with all flows generated by the same application, or more
generically by the same task (e.g., system boot). The most
counterproductive action with these workloads is plugging I/O dispatch
when one of the bfq_queues associated with these flows remains
temporarily empty.
To avoid this plugging, BFQ has been using a burst-handling mechanism
for years now. This mechanism has proven effective for throughput, and
not detrimental for service guarantees. This commit pushes this
mechanism a little bit further, basing on the following two facts.
First, all the I/O flows of the same application or task contribute
to the execution/completion of that common application or task. So the
performance figures that matter are total throughput of the flows and
task-wide I/O latency. In particular, these flows do not need to be
protected from each other, in terms of individual bandwidth or
latency.
Second, the above fact holds regardless of the number of flows.
Putting these two facts together, this commit stably merges the
bfq_queues associated with these I/O flows, i.e., with the processes
that generate these I/O flows, regardless of how many processes are
involved.
To decide whether a set of bfq_queues is actually associated with the
I/O flows of a common application or task, and to merge these queues
stably, this commit operates as follows: given a bfq_queue, say Q2,
currently being created, and the last bfq_queue, say Q1, created
before Q2, Q2 is merged stably with Q1 if
- very little time has elapsed since when Q1 was created
- Q2 has the same ioprio as Q1
- Q2 belongs to the same group as Q1
Merging bfq_queues also reduces scheduling overhead. A fio test with
ten random readers on /dev/nullb shows a throughput boost of 40%, with
a quadcore. Since BFQ's execution time amounts to ~50% of the total
per-request processing time, the above throughput boost implies that
BFQ's overhead is reduced by more than 50%.
Tested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Link: https://lore.kernel.org/r/20210304174627.161-7-paolo.valente@linaro.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2021-03-04 18:46:27 +01:00
/* candidate queue for a stable merge (due to close creation time) */
struct bfq_queue * stable_merge_bfqq ;
bool stably_merged ; /* non splittable if true */
2022-05-19 12:52:29 +02:00
unsigned int requests ; /* Number of requests this process has in flight */
2017-04-19 08:48:24 -06:00
} ;
/**
* struct bfq_data - per - device data structure .
*
* All the fields are protected by @ lock .
*/
struct bfq_data {
/* device request queue */
struct request_queue * queue ;
/* dispatch queue */
struct list_head dispatch ;
/* root bfq_group for the device */
struct bfq_group * root_group ;
/*
* rbtree of weight counters of @ bfq_queues , sorted by
* weight . Used to keep track of whether all @ bfq_queues have
* the same weight . The tree contains one counter for each
* distinct weight associated to some active and not
* weight - raised @ bfq_queue ( see the comments to the functions
* bfq_weights_tree_ [ add | remove ] for further details ) .
*/
block, bfq: do not idle for lowest-weight queues
In most cases, it is detrimental for throughput to plug I/O dispatch
when the in-service bfq_queue becomes temporarily empty (plugging is
performed to wait for the possible arrival, soon, of new I/O from the
in-service queue). There is however a case where plugging is needed
for service guarantees. If a bfq_queue, say Q, has a higher weight
than some other active bfq_queue, and is sync, i.e., contains sync
I/O, then, to guarantee that Q does receive a higher share of the
throughput than other lower-weight queues, it is necessary to plug I/O
dispatch when Q remains temporarily empty while being served.
For this reason, BFQ performs I/O plugging when some active bfq_queue
has a higher weight than some other active bfq_queue. But this is
unnecessarily overkill. In fact, if the in-service bfq_queue actually
has a weight lower than or equal to the other queues, then the queue
*must not* be guaranteed a higher share of the throughput than the
other queues. So, not plugging I/O cannot cause any harm to the
queue. And can boost throughput.
Taking advantage of this fact, this commit does not plug I/O for sync
bfq_queues with a weight lower than or equal to the weights of the
other queues. Here is an example of the resulting throughput boost
with the dbench workload, which is particularly nasty for BFQ. With
the dbench test in the Phoronix suite, BFQ reaches its lowest total
throughput with 6 clients on a filesystem with journaling, in case the
journaling daemon has a higher weight than normal processes. Before
this commit, the total throughput was ~80 MB/sec on a PLEXTOR PX-256M5,
after this commit it is ~100 MB/sec.
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-03-12 09:59:28 +01:00
struct rb_root_cached queue_weights_tree ;
block, bfq: fix decrement of num_active_groups
Since commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios
detection")', if there are process groups with I/O requests waiting for
completion, then BFQ tags the scenario as 'asymmetric'. This detection
is needed for preserving service guarantees (for details, see comments
on the computation of the variable asymmetric_scenario in the
function bfq_better_to_idle).
Unfortunately, commit '2d29c9f89fcd ("block, bfq: improve asymmetric
scenarios detection")' contains an error exactly in the updating of
the number of groups with I/O requests waiting for completion: if a
group has more than one descendant process, then the above number of
groups, which is renamed from num_active_groups to a more appropriate
num_groups_with_pending_reqs by this commit, may happen to be wrongly
decremented multiple times, namely every time one of the descendant
processes gets all its pending I/O requests completed.
A correct, complete solution should work as follows. Consider a group
that is inactive, i.e., that has no descendant process with pending
I/O inside BFQ queues. Then suppose that num_groups_with_pending_reqs
is still accounting for this group, because the group still has some
descendant process with some I/O request still in
flight. num_groups_with_pending_reqs should be decremented when the
in-flight request of the last descendant process is finally completed
(assuming that nothing else has changed for the group in the meantime,
in terms of composition of the group and active/inactive state of
child groups and processes). To accomplish this, an additional
pending-request counter must be added to entities, and must be
updated correctly.
To avoid this additional field and operations, this commit resorts to
the following tradeoff between simplicity and accuracy: for an
inactive group that is still counted in num_groups_with_pending_reqs,
this commit decrements num_groups_with_pending_reqs when the first
descendant process of the group remains with no request waiting for
completion.
This simplified scheme provides a fix to the unbalanced decrements
introduced by 2d29c9f89fcd. Since this error was also caused by lack
of comments on this non-trivial issue, this commit also adds related
comments.
Fixes: 2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection")
Reported-by: Steven Barrett <steven@liquorix.net>
Tested-by: Steven Barrett <steven@liquorix.net>
Tested-by: Lucjan Lucjanov <lucjan.lucjanov@gmail.com>
Reviewed-by: Federico Motta <federico@willer.it>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-12-06 19:18:18 +01:00
2017-04-19 08:48:24 -06:00
/*
block, bfq: fix decrement of num_active_groups
Since commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios
detection")', if there are process groups with I/O requests waiting for
completion, then BFQ tags the scenario as 'asymmetric'. This detection
is needed for preserving service guarantees (for details, see comments
on the computation of the variable asymmetric_scenario in the
function bfq_better_to_idle).
Unfortunately, commit '2d29c9f89fcd ("block, bfq: improve asymmetric
scenarios detection")' contains an error exactly in the updating of
the number of groups with I/O requests waiting for completion: if a
group has more than one descendant process, then the above number of
groups, which is renamed from num_active_groups to a more appropriate
num_groups_with_pending_reqs by this commit, may happen to be wrongly
decremented multiple times, namely every time one of the descendant
processes gets all its pending I/O requests completed.
A correct, complete solution should work as follows. Consider a group
that is inactive, i.e., that has no descendant process with pending
I/O inside BFQ queues. Then suppose that num_groups_with_pending_reqs
is still accounting for this group, because the group still has some
descendant process with some I/O request still in
flight. num_groups_with_pending_reqs should be decremented when the
in-flight request of the last descendant process is finally completed
(assuming that nothing else has changed for the group in the meantime,
in terms of composition of the group and active/inactive state of
child groups and processes). To accomplish this, an additional
pending-request counter must be added to entities, and must be
updated correctly.
To avoid this additional field and operations, this commit resorts to
the following tradeoff between simplicity and accuracy: for an
inactive group that is still counted in num_groups_with_pending_reqs,
this commit decrements num_groups_with_pending_reqs when the first
descendant process of the group remains with no request waiting for
completion.
This simplified scheme provides a fix to the unbalanced decrements
introduced by 2d29c9f89fcd. Since this error was also caused by lack
of comments on this non-trivial issue, this commit also adds related
comments.
Fixes: 2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection")
Reported-by: Steven Barrett <steven@liquorix.net>
Tested-by: Steven Barrett <steven@liquorix.net>
Tested-by: Lucjan Lucjanov <lucjan.lucjanov@gmail.com>
Reviewed-by: Federico Motta <federico@willer.it>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-12-06 19:18:18 +01:00
* Number of groups with at least one descendant process that
* has at least one request waiting for completion . Note that
* this also accounts for requests already dispatched , but not
* yet completed . Therefore this number of groups may differ
* from ( be larger than ) the number of active groups , as a group is
* considered active only if its corresponding entity has
* descendant queues with at least one request queued . This
* number is used to decide whether a scenario is symmetric .
* For a detailed explanation see comments on the computation
* of the variable asymmetric_scenario in the function
* bfq_better_to_idle ( ) .
*
* However , it is hard to compute this number exactly , for
* groups with multiple descendant processes . Consider a group
* that is inactive , i . e . , that has no descendant process with
* pending I / O inside BFQ queues . Then suppose that
* num_groups_with_pending_reqs is still accounting for this
* group , because the group has descendant processes with some
* I / O request still in flight . num_groups_with_pending_reqs
* should be decremented when the in - flight request of the
* last descendant process is finally completed ( assuming that
* nothing else has changed for the group in the meantime , in
* terms of composition of the group and active / inactive state of child
* groups and processes ) . To accomplish this , an additional
* pending - request counter must be added to entities , and must
* be updated correctly . To avoid this additional field and operations ,
* we resort to the following tradeoff between simplicity and
* accuracy : for an inactive group that is still counted in
* num_groups_with_pending_reqs , we decrement
* num_groups_with_pending_reqs when the first descendant
* process of the group remains with no request waiting for
* completion .
*
* Even this simpler decrement strategy requires a little
* carefulness : to avoid multiple decrements , we flag a group ,
* more precisely an entity representing a group , as still
* counted in num_groups_with_pending_reqs when it becomes
* inactive . Then , when the first descendant queue of the
* entity remains with no request waiting for completion ,
* num_groups_with_pending_reqs is decremented , and this flag
* is reset . After this flag is reset for the entity ,
* num_groups_with_pending_reqs won ' t be decremented any
* longer in case a new descendant queue of the entity remains
* with no request waiting for completion .
2017-04-19 08:48:24 -06:00
*/
block, bfq: fix decrement of num_active_groups
Since commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios
detection")', if there are process groups with I/O requests waiting for
completion, then BFQ tags the scenario as 'asymmetric'. This detection
is needed for preserving service guarantees (for details, see comments
on the computation of the variable asymmetric_scenario in the
function bfq_better_to_idle).
Unfortunately, commit '2d29c9f89fcd ("block, bfq: improve asymmetric
scenarios detection")' contains an error exactly in the updating of
the number of groups with I/O requests waiting for completion: if a
group has more than one descendant process, then the above number of
groups, which is renamed from num_active_groups to a more appropriate
num_groups_with_pending_reqs by this commit, may happen to be wrongly
decremented multiple times, namely every time one of the descendant
processes gets all its pending I/O requests completed.
A correct, complete solution should work as follows. Consider a group
that is inactive, i.e., that has no descendant process with pending
I/O inside BFQ queues. Then suppose that num_groups_with_pending_reqs
is still accounting for this group, because the group still has some
descendant process with some I/O request still in
flight. num_groups_with_pending_reqs should be decremented when the
in-flight request of the last descendant process is finally completed
(assuming that nothing else has changed for the group in the meantime,
in terms of composition of the group and active/inactive state of
child groups and processes). To accomplish this, an additional
pending-request counter must be added to entities, and must be
updated correctly.
To avoid this additional field and operations, this commit resorts to
the following tradeoff between simplicity and accuracy: for an
inactive group that is still counted in num_groups_with_pending_reqs,
this commit decrements num_groups_with_pending_reqs when the first
descendant process of the group remains with no request waiting for
completion.
This simplified scheme provides a fix to the unbalanced decrements
introduced by 2d29c9f89fcd. Since this error was also caused by lack
of comments on this non-trivial issue, this commit also adds related
comments.
Fixes: 2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection")
Reported-by: Steven Barrett <steven@liquorix.net>
Tested-by: Steven Barrett <steven@liquorix.net>
Tested-by: Lucjan Lucjanov <lucjan.lucjanov@gmail.com>
Reviewed-by: Federico Motta <federico@willer.it>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-12-06 19:18:18 +01:00
unsigned int num_groups_with_pending_reqs ;
2017-04-19 08:48:24 -06:00
/*
2019-01-29 12:06:29 +01:00
* Per - class ( RT , BE , IDLE ) number of bfq_queues containing
* requests ( including the queue in service , even if it is
* idling ) .
2017-04-19 08:48:24 -06:00
*/
2019-01-29 12:06:29 +01:00
unsigned int busy_queues [ 3 ] ;
2017-04-19 08:48:24 -06:00
/* number of weight-raised busy @bfq_queues */
int wr_busy_queues ;
/* number of queued requests */
int queued ;
/* number of requests dispatched and waiting for completion */
int rq_in_driver ;
block, bfq: do not merge queues on flash storage with queueing
To boost throughput with a set of processes doing interleaved I/O
(i.e., a set of processes whose individual I/O is random, but whose
merged cumulative I/O is sequential), BFQ merges the queues associated
with these processes, i.e., redirects the I/O of these processes into a
common, shared queue. In the shared queue, I/O requests are ordered by
their position on the medium, thus sequential I/O gets dispatched to
the device when the shared queue is served.
Queue merging costs execution time, because, to detect which queues to
merge, BFQ must maintain a list of the head I/O requests of active
queues, ordered by request positions. Measurements showed that this
costs about 10% of BFQ's total per-request processing time.
Request processing time becomes more and more critical as the speed of
the underlying storage device grows. Yet, fortunately, queue merging
is basically useless on the very devices that are so fast as to make
request processing time critical. To reach a high throughput, these
devices must have many requests queued at the same time. But, in this
configuration, the internal scheduling algorithms of these devices do
also the job of queue merging: they reorder requests so as to obtain
as much as possible a sequential I/O pattern. As a consequence, with
processes doing interleaved I/O, the throughput reached by one such
device is likely to be the same, with and without queue merging.
In view of this fact, this commit disables queue merging, and all
related housekeeping, for non-rotational devices with internal
queueing. The total, single-lock-protected, per-request processing
time of BFQ drops to, e.g., 1.9 us on an Intel Core i7-2760QM@2.40GHz
(time measured with simple code instrumentation, and using the
throughput-sync.sh script of the S suite [1], in performance-profiling
mode). To put this result into context, the total,
single-lock-protected, per-request execution time of the lightest I/O
scheduler available in blk-mq, mq-deadline, is 0.7 us (mq-deadline is
~800 LOC, against ~10500 LOC for BFQ).
Disabling merging provides a further, remarkable benefit in terms of
throughput. Merging tends to make many workloads artificially more
uneven, mainly because of shared queues remaining non empty for
incomparably more time than normal queues. So, if, e.g., one of the
queues in a set of merged queues has a higher weight than a normal
queue, then the shared queue may inherit such a high weight and, by
staying almost always active, may force BFQ to perform I/O plugging
most of the time. This evidently makes it harder for BFQ to let the
device reach a high throughput.
As a practical example of this problem, and of the benefits of this
commit, we measured again the throughput in the nasty scenario
considered in previous commit messages: dbench test (in the Phoronix
suite), with 6 clients, on a filesystem with journaling, and with the
journaling daemon enjoying a higher weight than normal processes. With
this commit, the throughput grows from ~150 MB/s to ~200 MB/s on a
PLEXTOR PX-256M5 SSD. This is the same peak throughput reached by any
of the other I/O schedulers. As such, this is also likely to be the
maximum possible throughput reachable with this workload on this
device, because I/O is mostly random, and the other schedulers
basically just pass I/O requests to the drive as fast as possible.
[1] https://github.com/Algodev-github/S
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Tested-by: Francesco Pollicino <fra.fra.800@gmail.com>
Signed-off-by: Alessio Masola <alessio.masola@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-03-12 09:59:30 +01:00
/* true if the device is non rotational and performs queueing */
bool nonrot_with_queueing ;
2017-04-19 08:48:24 -06:00
/*
* Maximum number of requests in driver in the last
* @ hw_tag_samples completed requests .
*/
int max_rq_in_driver ;
/* number of samples used to calculate hw_tag */
int hw_tag_samples ;
/* flag set to one if the driver is showing a queueing behavior */
int hw_tag ;
/* number of budgets assigned */
int budgets_assigned ;
/*
* Timer set when idling ( waiting ) for the next request from
* the queue in service .
*/
struct hrtimer idle_slice_timer ;
/* bfq_queue in service */
struct bfq_queue * in_service_queue ;
/* on-disk position of the last served request */
sector_t last_position ;
block, bfq: fix in-service-queue check for queue merging
When a new I/O request arrives for a bfq_queue, say Q, bfq checks
whether that request is close to
(a) the head request of some other queue waiting to be served, or
(b) the last request dispatched for the in-service queue (in case Q
itself is not the in-service queue)
If a queue, say Q2, is found for which the above condition holds, then
bfq merges Q and Q2, to hopefully get a more sequential I/O in the
resulting merged queue, and thus a possibly higher throughput.
Case (b) is checked by comparing the new request for Q with the last
request dispatched, assuming that the latter necessarily belonged to the
in-service queue. Unfortunately, this assumption is no longer always
correct, since commit d0edc2473be9 ("block, bfq: inject other-queue I/O
into seeky idle queues on NCQ flash").
When the assumption does not hold, queues that must not be merged may be
merged, causing unexpected loss of control on per-queue service
guarantees.
This commit solves this problem by adding an extra field, which stores
the actual last request dispatched for the in-service queue, and by
using this new field to correctly check case (b).
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-01-29 12:06:38 +01:00
/* position of the last served request for the in-service queue */
sector_t in_serv_last_pos ;
2017-04-19 08:48:24 -06:00
/* time of last request completion (ns) */
u64 last_completion ;
block, bfq: detect wakers and unconditionally inject their I/O
A bfq_queue Q may happen to be synchronized with another
bfq_queue Q2, i.e., the I/O of Q2 may need to be completed for Q to
receive new I/O. We call Q2 "waker queue".
If I/O plugging is being performed for Q, and Q is not receiving any
more I/O because of the above synchronization, then, thanks to BFQ's
injection mechanism, the waker queue is likely to get served before
the I/O-plugging timeout fires.
Unfortunately, this fact may not be sufficient to guarantee a high
throughput during the I/O plugging, because the inject limit for Q may
be too low to guarantee a lot of injected I/O. In addition, the
duration of the plugging, i.e., the time before Q finally receives new
I/O, may not be minimized, because the waker queue may happen to be
served only after other queues.
To address these issues, this commit introduces the explicit detection
of the waker queue, and the unconditional injection of a pending I/O
request of the waker queue on each invocation of
bfq_dispatch_request().
One may be concerned that this systematic injection of I/O from the
waker queue delays the service of Q's I/O. Fortunately, it doesn't. On
the contrary, Q's next I/O is brought forward dramatically, for it is
not blocked for milliseconds.
Reported-by: Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu>
Tested-by: Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-25 07:12:47 +02:00
/* bfqq owning the last completed rq */
struct bfq_queue * last_completed_rq_bfqq ;
block, bfq: merge bursts of newly-created queues
Many throughput-sensitive workloads are made of several parallel I/O
flows, with all flows generated by the same application, or more
generically by the same task (e.g., system boot). The most
counterproductive action with these workloads is plugging I/O dispatch
when one of the bfq_queues associated with these flows remains
temporarily empty.
To avoid this plugging, BFQ has been using a burst-handling mechanism
for years now. This mechanism has proven effective for throughput, and
not detrimental for service guarantees. This commit pushes this
mechanism a little bit further, based on the following two facts.
First, all the I/O flows of the same application or task contribute
to the execution/completion of that common application or task. So the
performance figures that matter are total throughput of the flows and
task-wide I/O latency. In particular, these flows do not need to be
protected from each other, in terms of individual bandwidth or
latency.
Second, the above fact holds regardless of the number of flows.
Putting these two facts together, this commit stably merges the
bfq_queues associated with these I/O flows, i.e., with the processes
that generate these I/O flows, regardless of how many processes are
involved.
To decide whether a set of bfq_queues is actually associated with the
I/O flows of a common application or task, and to merge these queues
stably, this commit operates as follows: given a bfq_queue, say Q2,
currently being created, and the last bfq_queue, say Q1, created
before Q2, Q2 is merged stably with Q1 if
- very little time has elapsed since when Q1 was created
- Q2 has the same ioprio as Q1
- Q2 belongs to the same group as Q1
Merging bfq_queues also reduces scheduling overhead. A fio test with
ten random readers on /dev/nullb shows a throughput boost of 40%, with
a quadcore. Since BFQ's execution time amounts to ~50% of the total
per-request processing time, the above throughput boost implies that
BFQ's overhead is reduced by more than 50%.
Tested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Link: https://lore.kernel.org/r/20210304174627.161-7-paolo.valente@linaro.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2021-03-04 18:46:27 +01:00
/* last bfqq created, among those in the root group */
struct bfq_queue * last_bfqq_created ;
block, bfq: tune service injection basing on request service times
The processes associated with a bfq_queue, say Q, may happen to
generate their cumulative I/O at a lower rate than the rate at which
the device could serve the same I/O. This is rather probable, e.g., if
only one process is associated with Q and the device is an SSD. It
results in Q becoming often empty while in service. If BFQ is not
allowed to switch to another queue when Q becomes empty, then, during
the service of Q, there will be frequent "service holes", i.e., time
intervals during which Q gets empty and the device can only consume
the I/O already queued in its hardware queues. This easily causes
considerable losses of throughput.
To counter this problem, BFQ implements a request injection mechanism,
which tries to fill the above service holes with I/O requests taken
from other bfq_queues. The hard part in this mechanism is finding the
right amount of I/O to inject, so as to both boost throughput and not
break Q's bandwidth and latency guarantees. To this goal, the current
version of this mechanism measures the bandwidth enjoyed by Q while it
is being served, and tries to inject the maximum possible amount of
extra service that does not cause Q's bandwidth to decrease too
much.
This solution has an important shortcoming. For bandwidth measurements
to be stable and reliable, Q must remain in service for a much longer
time than that needed to serve a single I/O request. Unfortunately,
this does not hold with many workloads. This commit addresses this
issue by changing the way the amount of injection allowed is
dynamically computed. It tunes injection as a function of the service
times of single I/O requests of Q, instead of Q's
bandwidth. Single-request service times are evidently meaningful even
if Q gets very few I/O requests completed while it is in service.
As a testbed for this new solution, we measured the throughput reached
by BFQ for one of the nastiest workloads and configurations for this
scheduler: the workload generated by the dbench test (in the Phoronix
suite), with 6 clients, on a filesystem with journaling, and with the
journaling daemon enjoying a higher weight than normal processes.
With this commit, the throughput grows from ~100 MB/s to ~150 MB/s on
a PLEXTOR PX-256M5.
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Tested-by: Francesco Pollicino <fra.fra.800@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-03-12 09:59:29 +01:00
/* time of last transition from empty to non-empty (ns) */
u64 last_empty_occupied_ns ;
/*
* Flag set to activate the sampling of the total service time
* of a just - arrived first I / O request ( see
* bfq_update_inject_limit ( ) ) . This will cause the setting of
* waited_rq when the request is finally dispatched .
*/
bool wait_dispatch ;
/*
* If set , then bfq_update_inject_limit ( ) is invoked when
* waited_rq is eventually completed .
*/
struct request * waited_rq ;
/*
* True if some request has been injected during the last service hole .
*/
bool rqs_injected ;
2017-04-19 08:48:24 -06:00
/* time of first rq dispatch in current observation interval (ns) */
u64 first_dispatch ;
/* time of last rq dispatch in current observation interval (ns) */
u64 last_dispatch ;
/* beginning of the last budget */
ktime_t last_budget_start ;
/* beginning of the last idle slice */
ktime_t last_idling_start ;
block, bfq: tune service injection basing on request service times
The processes associated with a bfq_queue, say Q, may happen to
generate their cumulative I/O at a lower rate than the rate at which
the device could serve the same I/O. This is rather probable, e.g., if
only one process is associated with Q and the device is an SSD. It
results in Q becoming often empty while in service. If BFQ is not
allowed to switch to another queue when Q becomes empty, then, during
the service of Q, there will be frequent "service holes", i.e., time
intervals during which Q gets empty and the device can only consume
the I/O already queued in its hardware queues. This easily causes
considerable losses of throughput.
To counter this problem, BFQ implements a request injection mechanism,
which tries to fill the above service holes with I/O requests taken
from other bfq_queues. The hard part in this mechanism is finding the
right amount of I/O to inject, so as to both boost throughput and not
break Q's bandwidth and latency guarantees. To this goal, the current
version of this mechanism measures the bandwidth enjoyed by Q while it
is being served, and tries to inject the maximum possible amount of
extra service that does not cause Q's bandwidth to decrease too
much.
This solution has an important shortcoming. For bandwidth measurements
to be stable and reliable, Q must remain in service for a much longer
time than that needed to serve a single I/O request. Unfortunately,
this does not hold with many workloads. This commit addresses this
issue by changing the way the amount of injection allowed is
dynamically computed. It tunes injection as a function of the service
times of single I/O requests of Q, instead of Q's
bandwidth. Single-request service times are evidently meaningful even
if Q gets very few I/O requests completed while it is in service.
As a testbed for this new solution, we measured the throughput reached
by BFQ for one of the nastiest workloads and configurations for this
scheduler: the workload generated by the dbench test (in the Phoronix
suite), with 6 clients, on a filesystem with journaling, and with the
journaling daemon enjoying a higher weight than normal processes.
With this commit, the throughput grows from ~100 MB/s to ~150 MB/s on
a PLEXTOR PX-256M5.
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Tested-by: Francesco Pollicino <fra.fra.800@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-03-12 09:59:29 +01:00
unsigned long last_idling_start_jiffies ;
2017-04-19 08:48:24 -06:00
/* number of samples in current observation interval */
int peak_rate_samples ;
/* num of samples of seq dispatches in current observation interval */
u32 sequential_samples ;
/* total num of sectors transferred in current observation interval */
u64 tot_sectors_dispatched ;
/* max rq size seen during current observation interval (sectors) */
u32 last_rq_max_size ;
/* time elapsed from first dispatch in current observ. interval (us) */
u64 delta_from_first ;
/*
* Current estimate of the device peak rate , measured in
2018-03-26 16:06:24 +02:00
* [ ( sectors / usec ) / 2 ^ BFQ_RATE_SHIFT ] . The left - shift by
2017-04-19 08:48:24 -06:00
* BFQ_RATE_SHIFT is performed to increase precision in
* fixed - point calculations .
*/
u32 peak_rate ;
/* maximum budget allotted to a bfq_queue before rescheduling */
int bfq_max_budget ;
/* list of all the bfq_queues active on the device */
struct list_head active_list ;
/* list of all the bfq_queues idle on the device */
struct list_head idle_list ;
/*
* Timeout for async / sync requests ; when it fires , requests
* are served in fifo order .
*/
u64 bfq_fifo_expire [ 2 ] ;
/* weight of backward seeks wrt forward ones */
unsigned int bfq_back_penalty ;
/* maximum allowed backward seek */
unsigned int bfq_back_max ;
/* maximum idling time */
u32 bfq_slice_idle ;
/* user-configured max budget value (0 for auto-tuning) */
int bfq_user_max_budget ;
/*
* Timeout for bfq_queues to consume their budget ; used to
* prevent seeky queues from imposing long latencies to
* sequential or quasi - sequential ones ( this also implies that
* seeky queues cannot receive guarantees in the service
* domain ; after a timeout they are charged for the time they
* have been in service , to preserve fairness among them , but
* without service - domain guarantees ) .
*/
unsigned int bfq_timeout ;
/*
* Force device idling whenever needed to provide accurate
* service guarantees , without caring about throughput
* issues . CAVEAT : this may even increase latencies , in case
* of useless idling for processes that did stop doing I / O .
*/
bool strict_guarantees ;
/*
* Last time at which a queue entered the current burst of
* queues being activated shortly after each other ; for more
* details about this and the following parameters related to
* a burst of activations , see the comments on the function
* bfq_handle_burst .
*/
unsigned long last_ins_in_burst ;
/*
* Reference time interval used to decide whether a queue has
* been activated shortly after @ last_ins_in_burst .
*/
unsigned long bfq_burst_interval ;
/* number of queues in the current burst of queue activations */
int burst_size ;
/* common parent entity for the queues in the burst */
struct bfq_entity * burst_parent_entity ;
/* Maximum burst size above which the current queue-activation
* burst is deemed as ' large ' .
*/
unsigned long bfq_large_burst_thresh ;
/* true if a large queue-activation burst is in progress */
bool large_burst ;
/*
* Head of the burst list ( as for the above fields , more
* details in the comments on the function bfq_handle_burst ) .
*/
struct hlist_head burst_list ;
/* if set to true, low-latency heuristics are enabled */
bool low_latency ;
/*
* Maximum factor by which the weight of a weight - raised queue
* is multiplied .
*/
unsigned int bfq_wr_coeff ;
/* maximum duration of a weight-raising period (jiffies) */
unsigned int bfq_wr_max_time ;
/* Maximum weight-raising duration for soft real-time processes */
unsigned int bfq_wr_rt_max_time ;
/*
* Minimum idle period after which weight - raising may be
* reactivated for a queue ( in jiffies ) .
*/
unsigned int bfq_wr_min_idle_time ;
/*
* Minimum period between request arrivals after which
* weight - raising may be reactivated for an already busy async
* queue ( in jiffies ) .
*/
unsigned long bfq_wr_min_inter_arr_async ;
/* Max service-rate for a soft real-time queue, in sectors/sec */
unsigned int bfq_wr_max_softrt_rate ;
/*
block, bfq: remove slow-system class
BFQ computes the duration of weight raising for interactive
applications automatically, using some reference parameters. In
particular, BFQ uses the best durations (see comments in the code for
how these durations have been assessed) for two classes of systems:
slow and fast ones. Examples of slow systems are old phones or systems
using micro HDDs. Fast systems are all the remaining ones. Using these
parameters, BFQ computes the actual duration of the weight raising,
for the system at hand, as a function of the relative speed of the
system w.r.t. the speed of a reference system, belonging to the same
class of systems as the system at hand.
This slow vs fast differentiation proved to be useful in the past, but
happens to have little meaning with current hardware. Even worse, it
does cause problems in virtual systems, where the speed of the system
can vary frequently, and so widely as to confuse the class-detection
mechanism, and, as we have verified experimentally, to cause BFQ to
compute non-sensical weight-raising durations.
This commit addresses this issue by removing the slow class and the
class-detection mechanism.
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-31 16:45:06 +02:00
* Cached value of the product ref_rate * ref_wr_duration , used
* for computing the maximum duration of weight raising
* automatically .
2017-04-19 08:48:24 -06:00
*/
block, bfq: remove slow-system class
BFQ computes the duration of weight raising for interactive
applications automatically, using some reference parameters. In
particular, BFQ uses the best durations (see comments in the code for
how these durations have been assessed) for two classes of systems:
slow and fast ones. Examples of slow systems are old phones or systems
using micro HDDs. Fast systems are all the remaining ones. Using these
parameters, BFQ computes the actual duration of the weight raising,
for the system at hand, as a function of the relative speed of the
system w.r.t. the speed of a reference system, belonging to the same
class of systems as the system at hand.
This slow vs fast differentiation proved to be useful in the past, but
happens to have little meaning with current hardware. Even worse, it
does cause problems in virtual systems, where the speed of the system
can vary frequently, and so widely as to confuse the class-detection
mechanism, and, as we have verified experimentally, to cause BFQ to
compute non-sensical weight-raising durations.
This commit addresses this issue by removing the slow class and the
class-detection mechanism.
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-31 16:45:06 +02:00
u64 rate_dur_prod ;
2017-04-19 08:48:24 -06:00
/* fallback dummy bfqq for extreme OOM conditions */
struct bfq_queue oom_bfqq ;
spinlock_t lock ;
/*
* bic associated with the task issuing current bio for
* merging . This and the next field are used as a support to
* be able to perform the bic lookup , needed by bio - merge
* functions , before the scheduler lock is taken , and thus
* avoid taking the request - queue lock while the scheduler
* lock is being held .
*/
struct bfq_io_cq * bio_bic ;
/* bfqq associated with the task issuing current bio for merging */
struct bfq_queue * bio_bfqq ;
block, bfq: limit tags for writes and async I/O
Asynchronous I/O can easily starve synchronous I/O (both sync reads
and sync writes), by consuming all request tags. Similarly, storms of
synchronous writes, such as those that sync(2) may trigger, can starve
synchronous reads. In their turn, these two problems may also cause
BFQ to lose control on latency for interactive and soft real-time
applications. For example, on a PLEXTOR PX-256M5S SSD, LibreOffice
Writer takes 0.6 seconds to start if the device is idle, but it takes
more than 45 seconds (!) if there are sequential writes in the
background.
This commit addresses this issue by limiting the maximum percentage of
tags that asynchronous I/O requests and synchronous write requests can
consume. In particular, this commit grants a higher threshold to
synchronous writes, to prevent the latter from being starved by
asynchronous I/O.
According to the above test, LibreOffice Writer now starts in about
1.2 seconds on average, regardless of the background workload, and
apart from some rare outlier. To check this improvement, run, e.g.,
sudo ./comm_startup_lat.sh bfq 5 5 seq 10 "lowriter --terminate_after_init"
for the comm_startup_lat benchmark in the S suite [1].
[1] https://github.com/Algodev-github/S
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-01-13 12:05:17 +01:00
/*
* Depth limits used in bfq_limit_depth ( see comments on the
* function )
*/
unsigned int word_depths [ 2 ] [ 2 ] ;
2021-11-25 14:36:36 +01:00
unsigned int full_depth_shift ;
2017-04-19 08:48:24 -06:00
} ;
/*
 * State flags of a bfq_queue, one enumerator per flag. Values are
 * assigned consecutively starting from BFQQF_just_created = 0, so
 * inserting or reordering enumerators changes every later value.
 * NOTE(review): presumably each value is used as a bit index in a
 * per-queue flags word -- confirm against the accessor definitions,
 * which are not part of this header.
 */
enum bfqq_state_flags {
BFQQF_just_created = 0 , /* queue just allocated */
BFQQF_busy , /* has requests or is in service */
BFQQF_wait_request , /* waiting for a request */
BFQQF_non_blocking_wait_rq , /*
* waiting for a request
* without idling the device
*/
BFQQF_fifo_expire , /* FIFO checked in this slice */
2017-08-04 07:35:10 +02:00
BFQQF_has_short_ttime , /* queue has a short think time */
2017-04-19 08:48:24 -06:00
BFQQF_sync , /* synchronous queue */
BFQQF_IO_bound , /*
* bfqq has timed - out at least once
* having consumed at most 2 / 10 of
* its budget
*/
BFQQF_in_large_burst , /*
* bfqq activated in a large burst ,
* see comments to bfq_handle_burst .
*/
BFQQF_softrt_update , /*
* may need softrt - next - start
* update
*/
BFQQF_coop , /* bfqq is shared */
block, bfq: detect wakers and unconditionally inject their I/O
A bfq_queue Q may happen to be synchronized with another
bfq_queue Q2, i.e., the I/O of Q2 may need to be completed for Q to
receive new I/O. We call Q2 "waker queue".
If I/O plugging is being performed for Q, and Q is not receiving any
more I/O because of the above synchronization, then, thanks to BFQ's
injection mechanism, the waker queue is likely to get served before
the I/O-plugging timeout fires.
Unfortunately, this fact may not be sufficient to guarantee a high
throughput during the I/O plugging, because the inject limit for Q may
be too low to guarantee a lot of injected I/O. In addition, the
duration of the plugging, i.e., the time before Q finally receives new
I/O, may not be minimized, because the waker queue may happen to be
served only after other queues.
To address these issues, this commit introduces the explicit detection
of the waker queue, and the unconditional injection of a pending I/O
request of the waker queue on each invocation of
bfq_dispatch_request().
One may be concerned that this systematic injection of I/O from the
waker queue delays the service of Q's I/O. Fortunately, it doesn't. On
the contrary, next Q's I/O is brought forward dramatically, for it is
not blocked for milliseconds.
Reported-by: Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu>
Tested-by: Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-25 07:12:47 +02:00
BFQQF_split_coop , /* shared bfqq will be split */
2017-04-19 08:48:24 -06:00
} ;
/*
 * BFQ_BFQQ_FNS(name) declares three functions for the flag <name>:
 *   void bfq_mark_bfqq_<name>(struct bfq_queue *bfqq);
 *   void bfq_clear_bfqq_<name>(struct bfq_queue *bfqq);
 *   int  bfq_bfqq_<name>(const struct bfq_queue *bfqq);
 * NOTE(review): presumably these set, clear and test the matching
 * BFQQF_<name> flag; the definitions are not part of this header.
 *
 * The macro is invoked once per enumerator of bfqq_state_flags (the
 * invocation order differs slightly from the enum order, which is
 * harmless for declarations) and is undefined right after its last
 * use. The macro body already ends with a semicolon, so the semicolon
 * written after each invocation expands to an empty declaration.
 */
# define BFQ_BFQQ_FNS(name) \
void bfq_mark_bfqq_ # # name ( struct bfq_queue * bfqq ) ; \
void bfq_clear_bfqq_ # # name ( struct bfq_queue * bfqq ) ; \
int bfq_bfqq_ # # name ( const struct bfq_queue * bfqq ) ;
BFQ_BFQQ_FNS ( just_created ) ;
BFQ_BFQQ_FNS ( busy ) ;
BFQ_BFQQ_FNS ( wait_request ) ;
BFQ_BFQQ_FNS ( non_blocking_wait_rq ) ;
BFQ_BFQQ_FNS ( fifo_expire ) ;
2017-08-04 07:35:10 +02:00
BFQ_BFQQ_FNS ( has_short_ttime ) ;
2017-04-19 08:48:24 -06:00
BFQ_BFQQ_FNS ( sync ) ;
BFQ_BFQQ_FNS ( IO_bound ) ;
BFQ_BFQQ_FNS ( in_large_burst ) ;
BFQ_BFQQ_FNS ( coop ) ;
BFQ_BFQQ_FNS ( split_coop ) ;
BFQ_BFQQ_FNS ( softrt_update ) ;
# undef BFQ_BFQQ_FNS
/*
 * Expiration reasons: why a queue is being expired. Values are
 * assigned consecutively starting from BFQQE_TOO_IDLE = 0, so the
 * order of the enumerators determines their numeric values.
 */
enum bfqq_expiration {
BFQQE_TOO_IDLE = 0 , /*
* queue has been idling for
* too long
*/
BFQQE_BUDGET_TIMEOUT , /* budget took too long to be used */
BFQQE_BUDGET_EXHAUSTED , /* budget consumed */
BFQQE_NO_MORE_REQUESTS , /* the queue has no more requests */
BFQQE_PREEMPTED /* preemption in progress */
} ;
2019-06-06 12:26:22 +02:00
/*
 * Single statistic made of a per-CPU counter plus a 64-bit atomic
 * auxiliary counter. Used as the type of several bfqg_stats fields.
 * NOTE(review): how aux_cnt is combined with cpu_cnt is not visible
 * in this header -- confirm against the code that updates and reads
 * these counters.
 */
struct bfq_stat {
struct percpu_counter cpu_cnt ;
atomic64_t aux_cnt ;
} ;
2017-04-19 08:48:24 -06:00
/*
 * Per-group statistics. The bytes and ios counters are always
 * present; every other field exists only when
 * CONFIG_BFQ_CGROUP_DEBUG is defined. Field order matters: the
 * in-struct comment below marks the point after which fields must
 * survive a stat reset.
 */
struct bfqg_stats {
2019-11-07 11:18:00 -08:00
/* basic stats */
struct blkg_rwstat bytes ;
struct blkg_rwstat ios ;
2019-06-06 12:26:24 +02:00
# ifdef CONFIG_BFQ_CGROUP_DEBUG
2017-04-19 08:48:24 -06:00
/* number of ios merged */
struct blkg_rwstat merged ;
/* total time spent on device in ns, may not be accurate w/ queueing */
struct blkg_rwstat service_time ;
/* total time spent waiting in scheduler queue in ns */
struct blkg_rwstat wait_time ;
/* number of IOs queued up */
struct blkg_rwstat queued ;
/* total disk time for this group (the struct declares no separate sectors counter) */
2019-06-06 12:26:22 +02:00
struct bfq_stat time ;
2017-04-19 08:48:24 -06:00
/* sum of number of ios queued across all samples */
2019-06-06 12:26:22 +02:00
struct bfq_stat avg_queue_size_sum ;
2017-04-19 08:48:24 -06:00
/* count of samples taken for average */
2019-06-06 12:26:22 +02:00
struct bfq_stat avg_queue_size_samples ;
2017-04-19 08:48:24 -06:00
/* how many times this group has been removed from service tree */
2019-06-06 12:26:22 +02:00
struct bfq_stat dequeue ;
2017-04-19 08:48:24 -06:00
/* total time spent waiting for it to be assigned a timeslice. */
2019-06-06 12:26:22 +02:00
struct bfq_stat group_wait_time ;
2017-04-19 08:48:24 -06:00
/* time spent idling for this blkcg_gq */
2019-06-06 12:26:22 +02:00
struct bfq_stat idle_time ;
2017-04-19 08:48:24 -06:00
/* total time with empty current active q with other requests queued */
2019-06-06 12:26:22 +02:00
struct bfq_stat empty_time ;
2017-04-19 08:48:24 -06:00
/* fields after this shouldn't be cleared on stat reset */
2018-05-09 02:08:51 -07:00
u64 start_group_wait_time ;
u64 start_idle_time ;
u64 start_empty_time ;
2017-04-19 08:48:24 -06:00
uint16_t flags ;
2019-06-06 12:26:24 +02:00
# endif /* CONFIG_BFQ_CGROUP_DEBUG */
2017-04-19 08:48:24 -06:00
} ;
# ifdef CONFIG_BFQ_GROUP_IOSCHED
/*
 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
 *
 * @pd: the struct blkcg_policy_data embedded in this structure; it
 *      must stay the first member
 * @weight: weight of the bfq_group
 */
struct bfq_group_data {
/* must be the first member */
struct blkcg_policy_data pd ;
unsigned int weight ;
} ;
/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @pd: blkg policy data; must be the first member.
 * @blkg_path: cached path for this blkg (see comments in
 *             bfq_bic_update_cgroup).
 * @ref: reference counter (see comments in bfq_bic_update_cgroup).
 * @online: true while the bfq_group is still online.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @bfqd: the bfq_data for the device this group acts upon.
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @stats: stats for this bfqg.
 * @active_entities: number of active entities belonging to the group;
 *                   unused for the root group. Used to know whether there
 *                   are groups with more than one active @bfq_entity
 *                   (see the comments to the function
 *                   bfq_bfqq_may_idle()).
 * @rq_pos_tree: rbtree sorted by next_request position, used when
 *               determining if two or more queues have interleaving
 *               requests (see bfq_find_close_cooperator()).
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 *    o @bfqd is protected by the queue lock, RCU is used to access it
 *      from the readers.
 *    o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
/* must be the first member */
struct blkg_policy_data pd ;
block, bfq: access and cache blkg data only when safe
In blk-cgroup, operations on blkg objects are protected with the
request_queue lock. This is no more the lock that protects
I/O-scheduler operations in blk-mq. In fact, the latter are now
protected with a finer-grained per-scheduler-instance lock. As a
consequence, although blkg lookups are also rcu-protected, blk-mq I/O
schedulers may see inconsistent data when they access blkg and
blkg-related objects. BFQ does access these objects, and does incur
this problem, in the following case.
The blkg_lookup performed in bfq_get_queue, being protected (only)
through rcu, may happen to return the address of a copy of the
original blkg. If this is the case, then the blkg_get performed in
bfq_get_queue, to pin down the blkg, is useless: it does not prevent
blk-cgroup code from destroying both the original blkg and all objects
directly or indirectly referred by the copy of the blkg. BFQ accesses
these objects, which typically causes a crash for NULL-pointer
dereference of memory-protection violation.
Some additional protection mechanism should be added to blk-cgroup to
address this issue. In the meantime, this commit provides a quick
temporary fix for BFQ: cache (when safe) blkg data that might
disappear right after a blkg_lookup.
In particular, this commit exploits the following facts to achieve its
goal without introducing further locks. Destroy operations on a blkg
invoke, as a first step, hooks of the scheduler associated with the
blkg. And these hooks are executed with bfqd->lock held for BFQ. As a
consequence, for any blkg associated with the request queue an
instance of BFQ is attached to, we are guaranteed that such a blkg is
not destroyed, and that all the pointers it contains are consistent,
while that instance is holding its bfqd->lock. A blkg_lookup performed
with bfqd->lock held then returns a fully consistent blkg, which
remains consistent until this lock is held. In more detail, this holds
even if the returned blkg is a copy of the original one.
Finally, also the object describing a group inside BFQ needs to be
protected from destruction on the blkg_free of the original blkg
(which invokes bfq_pd_free). This commit adds private refcounting for
this object, to let it disappear only after no bfq_queue refers to it
any longer.
This commit also removes or updates some stale comments on locking
issues related to blk-cgroup operations.
Reported-by: Tomas Konir <tomas.konir@gmail.com>
Reported-by: Lee Tibbert <lee.tibbert@gmail.com>
Reported-by: Marco Piazza <mpiazza@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Tested-by: Tomas Konir <tomas.konir@gmail.com>
Tested-by: Lee Tibbert <lee.tibbert@gmail.com>
Tested-by: Marco Piazza <mpiazza@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-06-05 10:11:15 +02:00
/* cached path for this blkg (see comments in bfq_bic_update_cgroup) */
char blkg_path [ 128 ] ;
/* reference counter (see comments in bfq_bic_update_cgroup) */
int ref ;
2022-04-01 12:27:48 +02:00
/* Is bfq_group still online? */
bool online ;
block, bfq: access and cache blkg data only when safe
In blk-cgroup, operations on blkg objects are protected with the
request_queue lock. This is no more the lock that protects
I/O-scheduler operations in blk-mq. In fact, the latter are now
protected with a finer-grained per-scheduler-instance lock. As a
consequence, although blkg lookups are also rcu-protected, blk-mq I/O
schedulers may see inconsistent data when they access blkg and
blkg-related objects. BFQ does access these objects, and does incur
this problem, in the following case.
The blkg_lookup performed in bfq_get_queue, being protected (only)
through rcu, may happen to return the address of a copy of the
original blkg. If this is the case, then the blkg_get performed in
bfq_get_queue, to pin down the blkg, is useless: it does not prevent
blk-cgroup code from destroying both the original blkg and all objects
directly or indirectly referred by the copy of the blkg. BFQ accesses
these objects, which typically causes a crash for NULL-pointer
dereference of memory-protection violation.
Some additional protection mechanism should be added to blk-cgroup to
address this issue. In the meantime, this commit provides a quick
temporary fix for BFQ: cache (when safe) blkg data that might
disappear right after a blkg_lookup.
In particular, this commit exploits the following facts to achieve its
goal without introducing further locks. Destroy operations on a blkg
invoke, as a first step, hooks of the scheduler associated with the
blkg. And these hooks are executed with bfqd->lock held for BFQ. As a
consequence, for any blkg associated with the request queue an
instance of BFQ is attached to, we are guaranteed that such a blkg is
not destroyed, and that all the pointers it contains are consistent,
while that instance is holding its bfqd->lock. A blkg_lookup performed
with bfqd->lock held then returns a fully consistent blkg, which
remains consistent until this lock is held. In more detail, this holds
even if the returned blkg is a copy of the original one.
Finally, also the object describing a group inside BFQ needs to be
protected from destruction on the blkg_free of the original blkg
(which invokes bfq_pd_free). This commit adds private refcounting for
this object, to let it disappear only after no bfq_queue refers to it
any longer.
This commit also removes or updates some stale comments on locking
issues related to blk-cgroup operations.
Reported-by: Tomas Konir <tomas.konir@gmail.com>
Reported-by: Lee Tibbert <lee.tibbert@gmail.com>
Reported-by: Marco Piazza <mpiazza@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Tested-by: Tomas Konir <tomas.konir@gmail.com>
Tested-by: Lee Tibbert <lee.tibbert@gmail.com>
Tested-by: Marco Piazza <mpiazza@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-06-05 10:11:15 +02:00
2017-04-19 08:48:24 -06:00
struct bfq_entity entity ;
struct bfq_sched_data sched_data ;
void * bfqd ;
2021-08-11 12:37:01 +09:00
struct bfq_queue * async_bfqq [ 2 ] [ IOPRIO_NR_LEVELS ] ;
2017-04-19 08:48:24 -06:00
struct bfq_queue * async_idle_bfqq ;
struct bfq_entity * my_entity ;
int active_entities ;
struct rb_root rq_pos_tree ;
struct bfqg_stats stats ;
} ;
# else
/*
 * Reduced struct bfq_group used when CONFIG_BFQ_GROUP_IOSCHED is not
 * defined. It keeps only the scheduling entity, its sched_data, the
 * async queues and rq_pos_tree; the blkg-related members (pd,
 * blkg_path, ref, online) as well as bfqd, my_entity, active_entities
 * and stats of the cgroup-enabled variant are absent.
 */
struct bfq_group {
2020-02-03 11:40:58 +01:00
struct bfq_entity entity ;
2017-04-19 08:48:24 -06:00
struct bfq_sched_data sched_data ;
2021-08-11 12:37:01 +09:00
struct bfq_queue * async_bfqq [ 2 ] [ IOPRIO_NR_LEVELS ] ;
2017-04-19 08:48:24 -06:00
struct bfq_queue * async_idle_bfqq ;
struct rb_root rq_pos_tree ;
} ;
# endif
/* --------------- main algorithm interface ----------------- */
/*
 * Initializer for a struct bfq_service_tree, written as a compound
 * literal with positional values in member declaration order: the
 * active and idle trees are empty (RB_ROOT), first_idle and last_idle
 * are NULL, and vtime and wsum are 0.
 */
# define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
{ RB_ROOT , RB_ROOT , NULL , NULL , 0 , 0 } )
/*
 * Defined outside this header. Note that struct bfq_data also has a
 * member named bfq_timeout; the two are distinct objects.
 */
extern const int bfq_timeout ;
/*
 * Helpers operating on a bfq_io_cq; only declared here.
 * NOTE(review): presumably bic_to_bfqq()/bic_set_bfqq() get and set the
 * queue that @bic holds for sync or async I/O as selected by @is_sync,
 * and bic_to_bfqd() returns the bfq_data @bic belongs to -- confirm
 * against the definitions.
 */
struct bfq_queue * bic_to_bfqq ( struct bfq_io_cq * bic , bool is_sync ) ;
void bic_set_bfqq ( struct bfq_io_cq * bic , struct bfq_queue * bfqq , bool is_sync ) ;
struct bfq_data * bic_to_bfqd ( struct bfq_io_cq * bic ) ;
/* NOTE(review): presumably (re)positions @bfqq in a position tree -- confirm. */
void bfq_pos_tree_add_move ( struct bfq_data * bfqd , struct bfq_queue * bfqq ) ;
block, bfq: improve asymmetric scenarios detection
bfq defines as asymmetric a scenario where an active entity, say E
(representing either a single bfq_queue or a group of other entities),
has a higher weight than some other entities. If the entity E does sync
I/O in such a scenario, then bfq plugs the dispatch of the I/O of the
other entities in the following situation: E is in service but
temporarily has no pending I/O request. In fact, without this plugging,
all the times that E stops being temporarily idle, it may find the
internal queues of the storage device already filled with an
out-of-control number of extra requests, from other entities. So E may
have to wait for the service of these extra requests, before finally
having its own requests served. This may easily break service
guarantees, with E getting less than its fair share of the device
throughput. Usually, the end result is that E gets the same fraction of
the throughput as the other entities, instead of getting more, according
to its higher weight.
Yet there are two other more subtle cases where E, even if its weight is
actually equal to or even lower than the weight of any other active
entities, may get less than its fair share of the throughput in case the
above I/O plugging is not performed:
1. other entities issue larger requests than E;
2. other entities contain more active child entities than E (or in
general tend to have more backlog than E).
In the first case, other entities may get more service than E because
they get larger requests, than those of E, served during the temporary
idle periods of E. In the second case, other entities get more service
because, by having many child entities, they have many requests ready
for dispatching while E is temporarily idle.
This commit addresses this issue by extending the definition of
asymmetric scenario: a scenario is asymmetric when
- active entities representing bfq_queues have differentiated weights,
as in the original definition
or (inclusive)
- one or more entities representing groups of entities are active.
This broader definition makes sure that I/O plugging will be performed
in all the above cases, provided that there is at least one active
group. Of course, this definition is very coarse, so it will trigger
I/O plugging also in cases where it is not needed, such as, e.g.,
multiple active entities with just one child each, and all with the same
I/O-request size. The reason for this coarse definition is just that a
finer-grained definition would be rather heavy to compute.
On the opposite end, even this new definition does not trigger I/O
plugging in all cases where there is no active group, and all bfq_queues
have the same weight. So, in these cases some unfairness may occur if
there are asymmetries in I/O-request sizes. We made this choice because
I/O plugging may lower throughput, and probably a user that has not
created any group cares more about throughput than about perfect
fairness. At any rate, as for possible applications that may care about
service guarantees, bfq already guarantees a high responsiveness and a
low latency to soft real-time applications automatically.
Signed-off-by: Federico Motta <federico@willer.it>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-10-12 11:55:57 +02:00
/*
 * Declaration only; the definition is not part of this header. Takes
 * the device data, a queue and the root of a cached rbtree.
 * NOTE(review): presumably accounts the weight of @bfqq in the weights
 * tree rooted at @root -- confirm against the definition.
 */
void bfq_weights_tree_add ( struct bfq_data * bfqd , struct bfq_queue * bfqq ,
block, bfq: do not idle for lowest-weight queues
In most cases, it is detrimental for throughput to plug I/O dispatch
when the in-service bfq_queue becomes temporarily empty (plugging is
performed to wait for the possible arrival, soon, of new I/O from the
in-service queue). There is however a case where plugging is needed
for service guarantees. If a bfq_queue, say Q, has a higher weight
than some other active bfq_queue, and is sync, i.e., contains sync
I/O, then, to guarantee that Q does receive a higher share of the
throughput than other lower-weight queues, it is necessary to plug I/O
dispatch when Q remains temporarily empty while being served.
For this reason, BFQ performs I/O plugging when some active bfq_queue
has a higher weight than some other active bfq_queue. But this is
unnecessarily overkill. In fact, if the in-service bfq_queue actually
has a weight lower than or equal to the other queues, then the queue
*must not* be guaranteed a higher share of the throughput than the
other queues. So, not plugging I/O cannot cause any harm to the
queue. And can boost throughput.
Taking advantage of this fact, this commit does not plug I/O for sync
bfq_queues with a weight lower than or equal to the weights of the
other queues. Here is an example of the resulting throughput boost
with the dbench workload, which is particularly nasty for BFQ. With
the dbench test in the Phoronix suite, BFQ reaches its lowest total
throughput with 6 clients on a filesystem with journaling, in case the
journaling daemon has a higher weight than normal processes. Before
this commit, the total throughput was ~80 MB/sec on a PLEXTOR PX-256M5,
after this commit it is ~100 MB/sec.
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-03-12 09:59:28 +01:00
struct rb_root_cached * root ) ;
block, bfq: add/remove entity weights correctly
To keep I/O throughput high as often as possible, BFQ performs
I/O-dispatch plugging (aka device idling) only when beneficial exactly
for throughput, or when needed for service guarantees (low latency,
fairness). An important case where the latter condition holds is when
the scenario is 'asymmetric' in terms of weights: i.e., when some
bfq_queue or whole group of queues has a higher weight, and thus has
to receive more service, than other queues or groups. Without dispatch
plugging, lower-weight queues/groups may unjustly steal bandwidth from
higher-weight queues/groups.
To detect asymmetric scenarios, BFQ checks some sufficient
conditions. One of these conditions is that active groups have
different weights. BFQ controls this condition by maintaining a
special set of unique weights of active groups
(group_weights_tree). To this purpose, in the function
bfq_active_insert/bfq_active_extract BFQ adds/removes the weight of a
group to/from this set.
Unfortunately, the function bfq_active_extract may happen to be
invoked also for a group that is still active (to preserve the correct
update of the next queue to serve, see comments in function
bfq_no_longer_next_in_service() for details). In this case, removing
the weight of the group makes the set group_weights_tree
inconsistent. Service-guarantee violations follow.
This commit addresses this issue by moving group_weights_tree
insertions from their previous location (in bfq_active_insert) into
the function __bfq_activate_entity, and by moving group_weights_tree
extractions from bfq_active_extract to when the entity that represents
a group remains thoroughly idle, i.e., with no request either enqueued
or dispatched.
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-06-25 21:55:34 +02:00
/*
 * Declaration only; the definition is not part of this header. Takes
 * the same arguments as bfq_weights_tree_add(): the device data, a
 * queue and the root of a cached rbtree.
 * NOTE(review): presumably the removal counterpart of
 * bfq_weights_tree_add() -- confirm against the definition.
 */
void __bfq_weights_tree_remove ( struct bfq_data * bfqd ,
block, bfq: improve asymmetric scenarios detection
bfq defines as asymmetric a scenario where an active entity, say E
(representing either a single bfq_queue or a group of other entities),
has a higher weight than some other entities. If the entity E does sync
I/O in such a scenario, then bfq plugs the dispatch of the I/O of the
other entities in the following situation: E is in service but
temporarily has no pending I/O request. In fact, without this plugging,
all the times that E stops being temporarily idle, it may find the
internal queues of the storage device already filled with an
out-of-control number of extra requests, from other entities. So E may
have to wait for the service of these extra requests, before finally
having its own requests served. This may easily break service
guarantees, with E getting less than its fair share of the device
throughput. Usually, the end result is that E gets the same fraction of
the throughput as the other entities, instead of getting more, according
to its higher weight.
Yet there are two other more subtle cases where E, even if its weight is
actually equal to or even lower than the weight of any other active
entities, may get less than its fair share of the throughput in case the
above I/O plugging is not performed:
1. other entities issue larger requests than E;
2. other entities contain more active child entities than E (or in
general tend to have more backlog than E).
In the first case, other entities may get more service than E because
they get larger requests, than those of E, served during the temporary
idle periods of E. In the second case, other entities get more service
because, by having many child entities, they have many requests ready
for dispatching while E is temporarily idle.
This commit addresses this issue by extending the definition of
asymmetric scenario: a scenario is asymmetric when
- active entities representing bfq_queues have differentiated weights,
as in the original definition
or (inclusive)
- one or more entities representing groups of entities are active.
This broader definition makes sure that I/O plugging will be performed
in all the above cases, provided that there is at least one active
group. Of course, this definition is very coarse, so it will trigger
I/O plugging also in cases where it is not needed, such as, e.g.,
multiple active entities with just one child each, and all with the same
I/O-request size. The reason for this coarse definition is just that a
finer-grained definition would be rather heavy to compute.
On the opposite end, even this new definition does not trigger I/O
plugging in all cases where there is no active group, and all bfq_queues
have the same weight. So, in these cases some unfairness may occur if
there are asymmetries in I/O-request sizes. We made this choice because
I/O plugging may lower throughput, and probably a user that has not
created any group cares more about throughput than about perfect
fairness. At any rate, as for possible applications that may care about
service guarantees, bfq already guarantees a high responsiveness and a
low latency to soft real-time applications automatically.
Signed-off-by: Federico Motta <federico@willer.it>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-10-12 11:55:57 +02:00
struct bfq_queue * bfqq ,
block, bfq: do not idle for lowest-weight queues
In most cases, it is detrimental for throughput to plug I/O dispatch
when the in-service bfq_queue becomes temporarily empty (plugging is
performed to wait for the possible arrival, soon, of new I/O from the
in-service queue). There is however a case where plugging is needed
for service guarantees. If a bfq_queue, say Q, has a higher weight
than some other active bfq_queue, and is sync, i.e., contains sync
I/O, then, to guarantee that Q does receive a higher share of the
throughput than other lower-weight queues, it is necessary to plug I/O
dispatch when Q remains temporarily empty while being served.
For this reason, BFQ performs I/O plugging when some active bfq_queue
has a higher weight than some other active bfq_queue. But this is
unnecessarily overkill. In fact, if the in-service bfq_queue actually
has a weight lower than or equal to the other queues, then the queue
*must not* be guaranteed a higher share of the throughput than the
other queues. So, not plugging I/O cannot cause any harm to the
queue. And can boost throughput.
Taking advantage of this fact, this commit does not plug I/O for sync
bfq_queues with a weight lower than or equal to the weights of the
other queues. Here is an example of the resulting throughput boost
with the dbench workload, which is particularly nasty for BFQ. With
the dbench test in the Phoronix suite, BFQ reaches its lowest total
throughput with 6 clients on a filesystem with journaling, in case the
journaling daemon has a higher weight than normal processes. Before
this commit, the total throughput was ~80 MB/sec on a PLEXTOR PX-256M5,
after this commit it is ~100 MB/sec.
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-03-12 09:59:28 +01:00
struct rb_root_cached * root ) ;
block, bfq: add/remove entity weights correctly
To keep I/O throughput high as often as possible, BFQ performs
I/O-dispatch plugging (aka device idling) only when beneficial exactly
for throughput, or when needed for service guarantees (low latency,
fairness). An important case where the latter condition holds is when
the scenario is 'asymmetric' in terms of weights: i.e., when some
bfq_queue or whole group of queues has a higher weight, and thus has
to receive more service, than other queues or groups. Without dispatch
plugging, lower-weight queues/groups may unjustly steal bandwidth from
higher-weight queues/groups.
To detect asymmetric scenarios, BFQ checks some sufficient
conditions. One of these conditions is that active groups have
different weights. BFQ controls this condition by maintaining a
special set of unique weights of active groups
(group_weights_tree). To this purpose, in the function
bfq_active_insert/bfq_active_extract BFQ adds/removes the weight of a
group to/from this set.
Unfortunately, the function bfq_active_extract may happen to be
invoked also for a group that is still active (to preserve the correct
update of the next queue to serve, see comments in function
bfq_no_longer_next_in_service() for details). In this case, removing
the weight of the group makes the set group_weights_tree
inconsistent. Service-guarantee violations follow.
This commit addresses this issue by moving group_weights_tree
insertions from their previous location (in bfq_active_insert) into
the function __bfq_activate_entity, and by moving group_weights_tree
extractions from bfq_active_extract to when the entity that represents
a group remains thoroughly idle, i.e., with no request either enqueued
or dispatched.
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-06-25 21:55:34 +02:00
void bfq_weights_tree_remove ( struct bfq_data * bfqd ,
struct bfq_queue * bfqq ) ;
2017-04-19 08:48:24 -06:00
void bfq_bfqq_expire ( struct bfq_data * bfqd , struct bfq_queue * bfqq ,
bool compensate , enum bfqq_expiration reason ) ;
void bfq_put_queue ( struct bfq_queue * bfqq ) ;
2022-04-01 12:27:44 +02:00
void bfq_put_cooperator ( struct bfq_queue * bfqq ) ;
2017-04-19 08:48:24 -06:00
void bfq_end_wr_async_queues ( struct bfq_data * bfqd , struct bfq_group * bfqg ) ;
2020-03-21 10:45:19 +01:00
void bfq_release_process_ref ( struct bfq_data * bfqd , struct bfq_queue * bfqq ) ;
2017-04-19 08:48:24 -06:00
void bfq_schedule_dispatch ( struct bfq_data * bfqd ) ;
void bfq_put_async_queues ( struct bfq_data * bfqd , struct bfq_group * bfqg ) ;
/* ------------ end of main algorithm interface -------------- */
/* ---------------- cgroups-support interface ---------------- */
2019-11-07 11:18:00 -08:00
void bfqg_stats_update_legacy_io ( struct request_queue * q , struct request * rq ) ;
2017-04-19 08:48:24 -06:00
void bfqg_stats_update_io_add ( struct bfq_group * bfqg , struct bfq_queue * bfqq ,
unsigned int op ) ;
void bfqg_stats_update_io_remove ( struct bfq_group * bfqg , unsigned int op ) ;
void bfqg_stats_update_io_merged ( struct bfq_group * bfqg , unsigned int op ) ;
2018-05-09 02:08:51 -07:00
void bfqg_stats_update_completion ( struct bfq_group * bfqg , u64 start_time_ns ,
u64 io_start_time_ns , unsigned int op ) ;
2017-04-19 08:48:24 -06:00
void bfqg_stats_update_dequeue ( struct bfq_group * bfqg ) ;
void bfqg_stats_set_start_empty_time ( struct bfq_group * bfqg ) ;
void bfqg_stats_update_idle_time ( struct bfq_group * bfqg ) ;
void bfqg_stats_set_start_idle_time ( struct bfq_group * bfqg ) ;
void bfqg_stats_update_avg_queue_size ( struct bfq_group * bfqg ) ;
void bfq_bfqq_move ( struct bfq_data * bfqd , struct bfq_queue * bfqq ,
struct bfq_group * bfqg ) ;
void bfq_init_entity ( struct bfq_entity * entity , struct bfq_group * bfqg ) ;
void bfq_bic_update_cgroup ( struct bfq_io_cq * bic , struct bio * bio ) ;
void bfq_end_wr_async ( struct bfq_data * bfqd ) ;
2022-04-01 12:27:49 +02:00
struct bfq_group * bfq_bio_bfqg ( struct bfq_data * bfqd , struct bio * bio ) ;
2017-04-19 08:48:24 -06:00
struct blkcg_gq * bfqg_to_blkg ( struct bfq_group * bfqg ) ;
struct bfq_group * bfqq_group ( struct bfq_queue * bfqq ) ;
struct bfq_group * bfq_create_group_hierarchy ( struct bfq_data * bfqd , int node ) ;
block, bfq: access and cache blkg data only when safe
In blk-cgroup, operations on blkg objects are protected with the
request_queue lock. This is no more the lock that protects
I/O-scheduler operations in blk-mq. In fact, the latter are now
protected with a finer-grained per-scheduler-instance lock. As a
consequence, although blkg lookups are also rcu-protected, blk-mq I/O
schedulers may see inconsistent data when they access blkg and
blkg-related objects. BFQ does access these objects, and does incur
this problem, in the following case.
The blkg_lookup performed in bfq_get_queue, being protected (only)
through rcu, may happen to return the address of a copy of the
original blkg. If this is the case, then the blkg_get performed in
bfq_get_queue, to pin down the blkg, is useless: it does not prevent
blk-cgroup code from destroying both the original blkg and all objects
directly or indirectly referred by the copy of the blkg. BFQ accesses
these objects, which typically causes a crash for NULL-pointer
dereference or memory-protection violation.
Some additional protection mechanism should be added to blk-cgroup to
address this issue. In the meantime, this commit provides a quick
temporary fix for BFQ: cache (when safe) blkg data that might
disappear right after a blkg_lookup.
In particular, this commit exploits the following facts to achieve its
goal without introducing further locks. Destroy operations on a blkg
invoke, as a first step, hooks of the scheduler associated with the
blkg. And these hooks are executed with bfqd->lock held for BFQ. As a
consequence, for any blkg associated with the request queue an
instance of BFQ is attached to, we are guaranteed that such a blkg is
not destroyed, and that all the pointers it contains are consistent,
while that instance is holding its bfqd->lock. A blkg_lookup performed
with bfqd->lock held then returns a fully consistent blkg, which
remains consistent as long as this lock is held. In more detail, this holds
even if the returned blkg is a copy of the original one.
Finally, also the object describing a group inside BFQ needs to be
protected from destruction on the blkg_free of the original blkg
(which invokes bfq_pd_free). This commit adds private refcounting for
this object, to let it disappear only after no bfq_queue refers to it
any longer.
This commit also removes or updates some stale comments on locking
issues related to blk-cgroup operations.
Reported-by: Tomas Konir <tomas.konir@gmail.com>
Reported-by: Lee Tibbert <lee.tibbert@gmail.com>
Reported-by: Marco Piazza <mpiazza@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Tested-by: Tomas Konir <tomas.konir@gmail.com>
Tested-by: Lee Tibbert <lee.tibbert@gmail.com>
Tested-by: Marco Piazza <mpiazza@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
2017-06-05 10:11:15 +02:00
void bfqg_and_blkg_put ( struct bfq_group * bfqg ) ;
2017-04-19 08:48:24 -06:00
# ifdef CONFIG_BFQ_GROUP_IOSCHED
2017-04-20 09:37:05 -06:00
extern struct cftype bfq_blkcg_legacy_files [ ] ;
extern struct cftype bfq_blkg_files [ ] ;
2017-04-19 08:48:24 -06:00
extern struct blkcg_policy blkcg_policy_bfq ;
# endif
/* ------------- end of cgroups-support interface ------------- */
/* - interface of the internal hierarchical B-WF2Q+ scheduler - */
# ifdef CONFIG_BFQ_GROUP_IOSCHED
/* both next loops stop at one of the child entities of the root group */
/*
* Loop header that starts from the current value of entity and, after
* each iteration, replaces entity with entity->parent; the loop ends
* when entity becomes NULL. The caller's entity variable is modified
* in place.
*/
# define for_each_entity(entity) \
for ( ; entity ; entity = entity - > parent )
/*
* For each iteration, compute parent in advance, so as to be safe if
* entity is deallocated during the iteration. Such a deallocation may
* happen as a consequence of a bfq_put_queue that frees the bfq_queue
* containing entity.
*
* The ({ ...; 1; }) statement expression in the loop condition always
* evaluates to 1: it is there only to store entity->parent into parent
* before the loop body runs, and it is not evaluated when entity is
* NULL (short-circuit of &&).
*/
# define for_each_entity_safe(entity, parent) \
for ( ; entity & & ( { parent = entity - > parent ; 1 ; } ) ; entity = parent )
# else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
* Next two macros are fake loops when cgroups support is not
* enabled. In fact, in such a case, there is only one level to go up
* (to reach the root group).
*
* Both run the loop body at most once (unless the body itself reassigns
* the loop variables): for_each_entity sets entity to NULL after the
* first iteration, and for_each_entity_safe sets parent to NULL up
* front and then assigns it to entity.
*/
# define for_each_entity(entity) \
for ( ; entity ; entity = NULL )
# define for_each_entity_safe(entity, parent) \
for ( parent = NULL ; entity ; entity = parent )
# endif /* CONFIG_BFQ_GROUP_IOSCHED */
struct bfq_queue * bfq_entity_to_bfqq ( struct bfq_entity * entity ) ;
2019-01-29 12:06:29 +01:00
unsigned int bfq_tot_busy_queues ( struct bfq_data * bfqd ) ;
2017-04-19 08:48:24 -06:00
struct bfq_service_tree * bfq_entity_service_tree ( struct bfq_entity * entity ) ;
struct bfq_entity * bfq_entity_of ( struct rb_node * node ) ;
unsigned short bfq_ioprio_to_weight ( int ioprio ) ;
void bfq_put_idle_entity ( struct bfq_service_tree * st ,
struct bfq_entity * entity ) ;
struct bfq_service_tree *
__bfq_entity_update_weight_prio ( struct bfq_service_tree * old_st ,
block, bfq: don't change ioprio class for a bfq_queue on a service tree
On each deactivation or re-scheduling (after being served) of a
bfq_queue, BFQ invokes the function __bfq_entity_update_weight_prio(),
to perform pending updates of ioprio, weight and ioprio class for the
bfq_queue. BFQ also invokes this function on I/O-request dispatches,
to raise or lower weights more quickly when needed, thereby improving
latency. However, the entity representing the bfq_queue may be on the
active (sub)tree of a service tree when this happens, and, although
with a very low probability, the bfq_queue may happen to also have a
pending change of its ioprio class. If both conditions hold when
__bfq_entity_update_weight_prio() is invoked, then the entity moves to
a sort of hybrid state: the new service tree for the entity, as
returned by bfq_entity_service_tree(), differs from service tree on
which the entity still is. The functions that handle activations and
deactivations of entities do not cope with such a hybrid state (and
would need to become more complex to cope).
This commit addresses this issue by just making
__bfq_entity_update_weight_prio() not perform also a possible pending
change of ioprio class, when invoked on an I/O-request dispatch for a
bfq_queue. Such a change is thus postponed to when
__bfq_entity_update_weight_prio() is invoked on deactivation or
re-scheduling of the bfq_queue.
Reported-by: Marco Piazza <mpiazza@gmail.com>
Reported-by: Laurentiu Nicola <lnicola@dend.ro>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Tested-by: Marco Piazza <mpiazza@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-07-03 10:00:10 +02:00
struct bfq_entity * entity ,
bool update_class_too ) ;
2017-04-19 08:48:24 -06:00
void bfq_bfqq_served ( struct bfq_queue * bfqq , int served ) ;
void bfq_bfqq_charge_time ( struct bfq_data * bfqd , struct bfq_queue * bfqq ,
unsigned long time_ms ) ;
bool __bfq_deactivate_entity ( struct bfq_entity * entity ,
bool ins_into_idle_tree ) ;
bool next_queue_may_preempt ( struct bfq_data * bfqd ) ;
struct bfq_queue * bfq_get_next_queue ( struct bfq_data * bfqd ) ;
block, bfq: fix use after free in bfq_bfqq_expire
The function bfq_bfqq_expire() invokes the function
__bfq_bfqq_expire(), and the latter may free the in-service bfq-queue.
If this happens, then no other instruction of bfq_bfqq_expire() must
be executed, or a use-after-free will occur.
Based on the assumption that __bfq_bfqq_expire() invokes
bfq_put_queue() on the in-service bfq-queue exactly once, the queue is
assumed to be freed if its refcounter is equal to one right before
invoking __bfq_bfqq_expire().
But, since commit 9dee8b3b057e ("block, bfq: fix queue removal from
weights tree") this assumption is false. __bfq_bfqq_expire() may also
invoke bfq_weights_tree_remove() and, since commit 9dee8b3b057e
("block, bfq: fix queue removal from weights tree"), also
the latter function may invoke bfq_put_queue(). So __bfq_bfqq_expire()
may invoke bfq_put_queue() twice, and this is the actual case where
the in-service queue may happen to be freed.
To address this issue, this commit moves the check on the refcounter
of the queue right around the last bfq_put_queue() that may be invoked
on the queue.
Fixes: 9dee8b3b057e ("block, bfq: fix queue removal from weights tree")
Reported-by: Dmitrii Tcvetkov <demfloro@demfloro.ru>
Reported-by: Douglas Anderson <dianders@chromium.org>
Tested-by: Dmitrii Tcvetkov <demfloro@demfloro.ru>
Tested-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-04-10 10:38:33 +02:00
bool __bfq_bfqd_reset_in_service ( struct bfq_data * bfqd ) ;
2017-04-19 08:48:24 -06:00
void bfq_deactivate_bfqq ( struct bfq_data * bfqd , struct bfq_queue * bfqq ,
bool ins_into_idle_tree , bool expiration ) ;
void bfq_activate_bfqq ( struct bfq_data * bfqd , struct bfq_queue * bfqq ) ;
block, bfq: make lookup_next_entity push up vtime on expirations
To provide a very smooth service, bfq starts to serve a bfq_queue
only if the queue is 'eligible', i.e., if the same queue would
have started to be served in the ideal, perfectly fair system that
bfq simulates internally. This is obtained by associating each
queue with a virtual start time, and by computing a special system
virtual time quantity: a queue is eligible only if the system
virtual time has reached the virtual start time of the
queue. Finally, bfq guarantees that, when a new queue must be set
in service, there is always at least one eligible entity for each
active parent entity in the scheduler. To provide this guarantee,
the function __bfq_lookup_next_entity pushes up, for each parent
entity on which it is invoked, the system virtual time to the
minimum among the virtual start times of the entities in the
active tree for the parent entity (more precisely, the push up
occurs if the system virtual time happens to be lower than all
such virtual start times).
There is however a circumstance in which __bfq_lookup_next_entity
cannot push up the system virtual time for a parent entity, even
if the system virtual time is lower than the virtual start times
of all the child entities in the active tree. It happens if one of
the child entities is in service. In fact, in such a case, there
is already an eligible entity, the in-service one, even if it may
not be present in the active tree (because in-service entities
may be removed from the active tree).
Unfortunately, in the last re-design of the
hierarchical-scheduling engine, the reset of the pointer to the
in-service entity for a given parent entity--reset to be done as a
consequence of the expiration of the in-service entity--always
happens after the function __bfq_lookup_next_entity has been
invoked. This causes the function to think that there is still an
entity in service for the parent entity, and then that the system
virtual time cannot be pushed up, even if actually such a
no-more-in-service entity has already been properly reinserted
into the active tree (or in some other tree if no more
active). Yet, the system virtual time *had* to be pushed up, to be
ready to correctly choose the next queue to serve. Because of the
lack of this push up, bfq may wrongly set in service a queue that
had been speculatively pre-computed as the possible
next-in-service queue, but that would no more be the one to serve
after the expiration and the reinsertion into the active trees of
the previously in-service entities.
This commit addresses this issue by making
__bfq_lookup_next_entity properly push up the system virtual time
if an expiration is occurring.
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Tested-by: Lee Tibbert <lee.tibbert@gmail.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-08-31 08:46:29 +02:00
void bfq_requeue_bfqq ( struct bfq_data * bfqd , struct bfq_queue * bfqq ,
bool expiration ) ;
2017-04-19 08:48:24 -06:00
void bfq_del_bfqq_busy ( struct bfq_data * bfqd , struct bfq_queue * bfqq ,
bool expiration ) ;
void bfq_add_bfqq_busy ( struct bfq_data * bfqd , struct bfq_queue * bfqq ) ;
/* --------------- end of interface of B-WF2Q+ ---------------- */
/* Logging facilities. */
2021-11-25 14:36:39 +01:00
/*
* Write a short printable name for bfqq into str, using snprintf with
* len as the buffer size.
*
* The name ends with a type character: 'S' when bfq_bfqq_sync(bfqq) is
* true, 'A' otherwise. If bfqq->pid is not -1 the name is built from
* the "bfq" tag, the pid and the type character; if bfqq->pid is -1
* the "bfqSHARED-" tag is used in place of the pid.
*
* The return value of snprintf is not checked, so a too-small len
* yields a truncated name without any error being reported.
*/
static inline void bfq_bfqq_name ( struct bfq_queue * bfqq , char * str , int len )
2019-03-12 09:59:33 +01:00
{
2021-11-25 14:36:39 +01:00
char type = bfq_bfqq_sync ( bfqq ) ? ' S ' : ' A ' ;
if ( bfqq - > pid ! = - 1 )
snprintf ( str , len , " bfq%d%c " , bfqq - > pid , type ) ;
2019-03-12 09:59:33 +01:00
else
2021-11-25 14:36:39 +01:00
snprintf ( str , len , " bfqSHARED-%c " , type ) ;
2019-03-12 09:59:33 +01:00
}
2017-04-19 08:48:24 -06:00
# ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_group * bfqq_group ( struct bfq_queue * bfqq ) ;
/*
* Log a trace message about bfqq (cgroups-enabled variant).
*
* The message is the name produced by bfq_bfqq_name() followed by the
* caller's fmt/args. It is handed to blk_add_cgroup_trace_msg() for
* bfqd->queue, together with the css of the blkcg reached through
* bfqg_to_blkg(bfqq_group(bfqq)).
*
* When blk_trace_note_message_enabled() returns false for bfqd->queue
* (marked as the expected case with likely()), the macro breaks out of
* its do/while before building the name, so nothing is formatted or
* logged.
*/
# define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
2021-11-25 14:36:39 +01:00
char pid_str [ MAX_BFQQ_NAME_LENGTH ] ; \
2019-11-01 13:11:10 +00:00
if ( likely ( ! blk_trace_note_message_enabled ( ( bfqd ) - > queue ) ) ) \
break ; \
2021-11-25 14:36:39 +01:00
bfq_bfqq_name ( ( bfqq ) , pid_str , MAX_BFQQ_NAME_LENGTH ) ; \
2017-07-12 11:49:56 -07:00
blk_add_cgroup_trace_msg ( ( bfqd ) - > queue , \
2022-04-20 06:27:16 +02:00
& bfqg_to_blkg ( bfqq_group ( bfqq ) ) - > blkcg - > css , \
2021-11-25 14:36:39 +01:00
" %s " fmt , pid_str , # # args ) ; \
2017-04-19 08:48:24 -06:00
} while ( 0 )
2017-07-12 11:49:56 -07:00
/*
* Log a trace message about group bfqg (cgroups-enabled variant): fmt
* and args are passed unchanged to blk_add_cgroup_trace_msg() for
* bfqd->queue, together with the css of the blkcg reached through
* bfqg_to_blkg(bfqg). Unlike bfq_log_bfqq, this macro performs no
* blk_trace_note_message_enabled() check of its own.
*/
# define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
blk_add_cgroup_trace_msg ( ( bfqd ) - > queue , \
2022-04-20 06:27:16 +02:00
& bfqg_to_blkg ( bfqg ) - > blkcg - > css , fmt , # # args ) ; \
2017-07-12 11:49:56 -07:00
} while ( 0 )
2017-04-19 08:48:24 -06:00
# else /* CONFIG_BFQ_GROUP_IOSCHED */
2019-03-12 09:59:33 +01:00
/*
* Log a trace message about bfqq (variant used when
* CONFIG_BFQ_GROUP_IOSCHED is not set).
*
* The message is the name produced by bfq_bfqq_name() followed by the
* caller's fmt/args, and is emitted with blk_add_trace_msg() on
* bfqd->queue. When blk_trace_note_message_enabled() returns false for
* bfqd->queue (marked as the expected case with likely()), the macro
* breaks out of its do/while before building the name, so nothing is
* formatted or logged.
*/
# define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
2021-11-25 14:36:39 +01:00
char pid_str [ MAX_BFQQ_NAME_LENGTH ] ; \
2019-11-01 13:11:10 +00:00
if ( likely ( ! blk_trace_note_message_enabled ( ( bfqd ) - > queue ) ) ) \
break ; \
2021-11-25 14:36:39 +01:00
bfq_bfqq_name ( ( bfqq ) , pid_str , MAX_BFQQ_NAME_LENGTH ) ; \
blk_add_trace_msg ( ( bfqd ) - > queue , " %s " fmt , pid_str , # # args ) ; \
2019-03-12 09:59:33 +01:00
} while ( 0 )
2017-04-19 08:48:24 -06:00
/*
* Group logging is a no-op when CONFIG_BFQ_GROUP_IOSCHED is not set:
* the macro expands to an empty do/while and its arguments are not
* evaluated.
*/
# define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
# endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
* Log a trace message that is not tied to a specific queue or group:
* the caller's fmt is prefixed with the "bfq" tag by string-literal
* concatenation (so fmt must be a string literal) and emitted with
* blk_add_trace_msg() on bfqd->queue.
*/
# define bfq_log(bfqd, fmt, args...) \
blk_add_trace_msg ( ( bfqd ) - > queue , " bfq " fmt , # # args )
# endif /* _BFQ_H */