8dc8146f9c
Introduce zone write locking to avoid write request reordering with zoned block devices. This is achieved using a finer selection of the next request to dispatch: 1) Any non-write request is always allowed to proceed. 2) Any write to a conventional zone is always allowed to proceed. 3) For a write to a sequential zone, the zone lock is first checked. a) If the zone is not locked, the write is allowed to proceed after its target zone is locked. b) If the zone is locked, the write request is skipped and the next request in the dispatch queue tested (back to step 1). For a write request that has locked its target zone, the zone is unlocked either when the request completes and the method deadline_completed_request() is called, or when the request is requeued using the method deadline_add_request(). Requests targeting a locked zone are always left in the scheduler queue to preserve the initial write order. If no write request can be dispatched, allow reads to be dispatched even if the write batch is not done. If the device used is not a zoned block device, or if zoned block device support is disabled, this patch does not modify deadline behavior. Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
562 lines
13 KiB
C
562 lines
13 KiB
C
/*
|
|
* Deadline i/o scheduler.
|
|
*
|
|
* Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
|
|
*/
|
|
#include <linux/kernel.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/elevator.h>
|
|
#include <linux/bio.h>
|
|
#include <linux/module.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/init.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/rbtree.h>
|
|
|
|
/*
|
|
* See Documentation/block/deadline-iosched.txt
|
|
*/
|
|
static const int read_expire = HZ / 2;	/* max time before a read is submitted */
static const int write_expire = 5 * HZ;	/* same for writes; these limits are SOFT! */
static const int writes_starved = 2;	/* max times reads can starve a write */
static const int fifo_batch = 16;	/*
					 * number of sequential requests treated
					 * as one by the above parameters, for
					 * throughput
					 */
|
|
|
|
struct deadline_data {
|
|
/*
|
|
* run time data
|
|
*/
|
|
|
|
/*
|
|
* requests (deadline_rq s) are present on both sort_list and fifo_list
|
|
*/
|
|
struct rb_root sort_list[2];
|
|
struct list_head fifo_list[2];
|
|
|
|
/*
|
|
* next in sort order. read, write or both are NULL
|
|
*/
|
|
struct request *next_rq[2];
|
|
unsigned int batching; /* number of sequential requests made */
|
|
unsigned int starved; /* times reads have starved writes */
|
|
|
|
/*
|
|
* settings that change how the i/o scheduler behaves
|
|
*/
|
|
int fifo_expire[2];
|
|
int fifo_batch;
|
|
int writes_starved;
|
|
int front_merges;
|
|
};
|
|
|
|
static inline struct rb_root *
|
|
deadline_rb_root(struct deadline_data *dd, struct request *rq)
|
|
{
|
|
return &dd->sort_list[rq_data_dir(rq)];
|
|
}
|
|
|
|
/*
|
|
* get the request after `rq' in sector-sorted order
|
|
*/
|
|
static inline struct request *
|
|
deadline_latter_request(struct request *rq)
|
|
{
|
|
struct rb_node *node = rb_next(&rq->rb_node);
|
|
|
|
if (node)
|
|
return rb_entry_rq(node);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * insert rq into the sort tree of its data direction
 */
static void
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
{
	elv_rb_add(deadline_rb_root(dd, rq), rq);
}
|
|
|
|
static inline void
|
|
deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
|
|
{
|
|
const int data_dir = rq_data_dir(rq);
|
|
|
|
if (dd->next_rq[data_dir] == rq)
|
|
dd->next_rq[data_dir] = deadline_latter_request(rq);
|
|
|
|
elv_rb_del(deadline_rb_root(dd, rq), rq);
|
|
}
|
|
|
|
/*
|
|
* add rq to rbtree and fifo
|
|
*/
|
|
static void
|
|
deadline_add_request(struct request_queue *q, struct request *rq)
|
|
{
|
|
struct deadline_data *dd = q->elevator->elevator_data;
|
|
const int data_dir = rq_data_dir(rq);
|
|
|
|
/*
|
|
* This may be a requeue of a write request that has locked its
|
|
* target zone. If it is the case, this releases the zone lock.
|
|
*/
|
|
blk_req_zone_write_unlock(rq);
|
|
|
|
deadline_add_rq_rb(dd, rq);
|
|
|
|
/*
|
|
* set expire time and add to fifo list
|
|
*/
|
|
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
|
|
list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
|
|
}
|
|
|
|
/*
|
|
* remove rq from rbtree and fifo.
|
|
*/
|
|
static void deadline_remove_request(struct request_queue *q, struct request *rq)
|
|
{
|
|
struct deadline_data *dd = q->elevator->elevator_data;
|
|
|
|
rq_fifo_clear(rq);
|
|
deadline_del_rq_rb(dd, rq);
|
|
}
|
|
|
|
static enum elv_merge
|
|
deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
|
|
{
|
|
struct deadline_data *dd = q->elevator->elevator_data;
|
|
struct request *__rq;
|
|
|
|
/*
|
|
* check for front merge
|
|
*/
|
|
if (dd->front_merges) {
|
|
sector_t sector = bio_end_sector(bio);
|
|
|
|
__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
|
|
if (__rq) {
|
|
BUG_ON(sector != blk_rq_pos(__rq));
|
|
|
|
if (elv_bio_merge_ok(__rq, bio)) {
|
|
*req = __rq;
|
|
return ELEVATOR_FRONT_MERGE;
|
|
}
|
|
}
|
|
}
|
|
|
|
return ELEVATOR_NO_MERGE;
|
|
}
|
|
|
|
static void deadline_merged_request(struct request_queue *q,
|
|
struct request *req, enum elv_merge type)
|
|
{
|
|
struct deadline_data *dd = q->elevator->elevator_data;
|
|
|
|
/*
|
|
* if the merge was a front merge, we need to reposition request
|
|
*/
|
|
if (type == ELEVATOR_FRONT_MERGE) {
|
|
elv_rb_del(deadline_rb_root(dd, req), req);
|
|
deadline_add_rq_rb(dd, req);
|
|
}
|
|
}
|
|
|
|
static void
|
|
deadline_merged_requests(struct request_queue *q, struct request *req,
|
|
struct request *next)
|
|
{
|
|
/*
|
|
* if next expires before rq, assign its expire time to rq
|
|
* and move into next position (next will be deleted) in fifo
|
|
*/
|
|
if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
|
|
if (time_before((unsigned long)next->fifo_time,
|
|
(unsigned long)req->fifo_time)) {
|
|
list_move(&req->queuelist, &next->queuelist);
|
|
req->fifo_time = next->fifo_time;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* kill knowledge of next, this one is a goner
|
|
*/
|
|
deadline_remove_request(q, next);
|
|
}
|
|
|
|
/*
|
|
* move request from sort list to dispatch queue.
|
|
*/
|
|
static inline void
|
|
deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
|
|
{
|
|
struct request_queue *q = rq->q;
|
|
|
|
/*
|
|
* For a zoned block device, write requests must write lock their
|
|
* target zone.
|
|
*/
|
|
blk_req_zone_write_lock(rq);
|
|
|
|
deadline_remove_request(q, rq);
|
|
elv_dispatch_add_tail(q, rq);
|
|
}
|
|
|
|
/*
|
|
* move an entry to dispatch queue
|
|
*/
|
|
static void
|
|
deadline_move_request(struct deadline_data *dd, struct request *rq)
|
|
{
|
|
const int data_dir = rq_data_dir(rq);
|
|
|
|
dd->next_rq[READ] = NULL;
|
|
dd->next_rq[WRITE] = NULL;
|
|
dd->next_rq[data_dir] = deadline_latter_request(rq);
|
|
|
|
/*
|
|
* take it off the sort and fifo list, move
|
|
* to dispatch queue
|
|
*/
|
|
deadline_move_to_dispatch(dd, rq);
|
|
}
|
|
|
|
/*
|
|
* deadline_check_fifo returns 0 if there are no expired requests on the fifo,
|
|
* 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
|
|
*/
|
|
static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
|
|
{
|
|
struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
|
|
|
|
/*
|
|
* rq is expired!
|
|
*/
|
|
if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
|
|
return 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* For the specified data direction, return the next request to dispatch using
|
|
* arrival ordered lists.
|
|
*/
|
|
static struct request *
|
|
deadline_fifo_request(struct deadline_data *dd, int data_dir)
|
|
{
|
|
struct request *rq;
|
|
|
|
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
|
|
return NULL;
|
|
|
|
if (list_empty(&dd->fifo_list[data_dir]))
|
|
return NULL;
|
|
|
|
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
|
|
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
|
|
return rq;
|
|
|
|
/*
|
|
* Look for a write request that can be dispatched, that is one with
|
|
* an unlocked target zone.
|
|
*/
|
|
list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
|
|
if (blk_req_can_dispatch_to_zone(rq))
|
|
return rq;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* For the specified data direction, return the next request to dispatch using
|
|
* sector position sorted lists.
|
|
*/
|
|
static struct request *
|
|
deadline_next_request(struct deadline_data *dd, int data_dir)
|
|
{
|
|
struct request *rq;
|
|
|
|
if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
|
|
return NULL;
|
|
|
|
rq = dd->next_rq[data_dir];
|
|
if (!rq)
|
|
return NULL;
|
|
|
|
if (data_dir == READ || !blk_queue_is_zoned(rq->q))
|
|
return rq;
|
|
|
|
/*
|
|
* Look for a write request that can be dispatched, that is one with
|
|
* an unlocked target zone.
|
|
*/
|
|
while (rq) {
|
|
if (blk_req_can_dispatch_to_zone(rq))
|
|
return rq;
|
|
rq = deadline_latter_request(rq);
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* deadline_dispatch_requests selects the best request according to
|
|
* read/write expire, fifo_batch, etc
|
|
*/
|
|
static int deadline_dispatch_requests(struct request_queue *q, int force)
|
|
{
|
|
struct deadline_data *dd = q->elevator->elevator_data;
|
|
const int reads = !list_empty(&dd->fifo_list[READ]);
|
|
const int writes = !list_empty(&dd->fifo_list[WRITE]);
|
|
struct request *rq, *next_rq;
|
|
int data_dir;
|
|
|
|
/*
|
|
* batches are currently reads XOR writes
|
|
*/
|
|
rq = deadline_next_request(dd, WRITE);
|
|
if (!rq)
|
|
rq = deadline_next_request(dd, READ);
|
|
|
|
if (rq && dd->batching < dd->fifo_batch)
|
|
/* we have a next request are still entitled to batch */
|
|
goto dispatch_request;
|
|
|
|
/*
|
|
* at this point we are not running a batch. select the appropriate
|
|
* data direction (read / write)
|
|
*/
|
|
|
|
if (reads) {
|
|
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
|
|
|
|
if (deadline_fifo_request(dd, WRITE) &&
|
|
(dd->starved++ >= dd->writes_starved))
|
|
goto dispatch_writes;
|
|
|
|
data_dir = READ;
|
|
|
|
goto dispatch_find_request;
|
|
}
|
|
|
|
/*
|
|
* there are either no reads or writes have been starved
|
|
*/
|
|
|
|
if (writes) {
|
|
dispatch_writes:
|
|
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
|
|
|
|
dd->starved = 0;
|
|
|
|
data_dir = WRITE;
|
|
|
|
goto dispatch_find_request;
|
|
}
|
|
|
|
return 0;
|
|
|
|
dispatch_find_request:
|
|
/*
|
|
* we are not running a batch, find best request for selected data_dir
|
|
*/
|
|
next_rq = deadline_next_request(dd, data_dir);
|
|
if (deadline_check_fifo(dd, data_dir) || !next_rq) {
|
|
/*
|
|
* A deadline has expired, the last request was in the other
|
|
* direction, or we have run out of higher-sectored requests.
|
|
* Start again from the request with the earliest expiry time.
|
|
*/
|
|
rq = deadline_fifo_request(dd, data_dir);
|
|
} else {
|
|
/*
|
|
* The last req was the same dir and we have a next request in
|
|
* sort order. No expired requests so continue on from here.
|
|
*/
|
|
rq = next_rq;
|
|
}
|
|
|
|
/*
|
|
* For a zoned block device, if we only have writes queued and none of
|
|
* them can be dispatched, rq will be NULL.
|
|
*/
|
|
if (!rq)
|
|
return 0;
|
|
|
|
dd->batching = 0;
|
|
|
|
dispatch_request:
|
|
/*
|
|
* rq is the selected appropriate request.
|
|
*/
|
|
dd->batching++;
|
|
deadline_move_request(dd, rq);
|
|
|
|
return 1;
|
|
}
|
|
|
|
/*
 * Completion hook. For zoned block devices, a completed write request
 * releases the write lock of its target zone here.
 */
static void
deadline_completed_request(struct request_queue *q, struct request *rq)
{
	blk_req_zone_write_unlock(rq);
}
|
|
|
|
static void deadline_exit_queue(struct elevator_queue *e)
|
|
{
|
|
struct deadline_data *dd = e->elevator_data;
|
|
|
|
BUG_ON(!list_empty(&dd->fifo_list[READ]));
|
|
BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
|
|
|
|
kfree(dd);
|
|
}
|
|
|
|
/*
|
|
* initialize elevator private data (deadline_data).
|
|
*/
|
|
static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
|
|
{
|
|
struct deadline_data *dd;
|
|
struct elevator_queue *eq;
|
|
|
|
eq = elevator_alloc(q, e);
|
|
if (!eq)
|
|
return -ENOMEM;
|
|
|
|
dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
|
|
if (!dd) {
|
|
kobject_put(&eq->kobj);
|
|
return -ENOMEM;
|
|
}
|
|
eq->elevator_data = dd;
|
|
|
|
INIT_LIST_HEAD(&dd->fifo_list[READ]);
|
|
INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
|
|
dd->sort_list[READ] = RB_ROOT;
|
|
dd->sort_list[WRITE] = RB_ROOT;
|
|
dd->fifo_expire[READ] = read_expire;
|
|
dd->fifo_expire[WRITE] = write_expire;
|
|
dd->writes_starved = writes_starved;
|
|
dd->front_merges = 1;
|
|
dd->fifo_batch = fifo_batch;
|
|
|
|
spin_lock_irq(q->queue_lock);
|
|
q->elevator = eq;
|
|
spin_unlock_irq(q->queue_lock);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* sysfs parts below
|
|
*/
|
|
|
|
/*
 * Format var as a decimal number followed by a newline into page and return
 * the number of characters written.
 */
static ssize_t
deadline_var_show(int var, char *page)
{
	int len = sprintf(page, "%d\n", var);

	return len;
}
|
|
|
|
static void
|
|
deadline_var_store(int *var, const char *page)
|
|
{
|
|
char *p = (char *) page;
|
|
|
|
*var = simple_strtol(p, &p, 10);
|
|
}
|
|
|
|
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
|
|
static ssize_t __FUNC(struct elevator_queue *e, char *page) \
|
|
{ \
|
|
struct deadline_data *dd = e->elevator_data; \
|
|
int __data = __VAR; \
|
|
if (__CONV) \
|
|
__data = jiffies_to_msecs(__data); \
|
|
return deadline_var_show(__data, (page)); \
|
|
}
|
|
SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
|
|
SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
|
|
SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
|
|
SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
|
|
SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
|
|
#undef SHOW_FUNCTION
|
|
|
|
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
|
|
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
|
|
{ \
|
|
struct deadline_data *dd = e->elevator_data; \
|
|
int __data; \
|
|
deadline_var_store(&__data, (page)); \
|
|
if (__data < (MIN)) \
|
|
__data = (MIN); \
|
|
else if (__data > (MAX)) \
|
|
__data = (MAX); \
|
|
if (__CONV) \
|
|
*(__PTR) = msecs_to_jiffies(__data); \
|
|
else \
|
|
*(__PTR) = __data; \
|
|
return count; \
|
|
}
|
|
STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
|
|
STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
|
|
STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
|
|
STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
|
|
STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
|
|
#undef STORE_FUNCTION
|
|
|
|
/*
 * Build the attribute entry for tunable `name', wired to the
 * deadline_<name>_show and deadline_<name>_store handlers.
 */
#define DD_ATTR(name)							\
	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show,		\
	       deadline_##name##_store)
|
|
|
|
static struct elv_fs_entry deadline_attrs[] = {
|
|
DD_ATTR(read_expire),
|
|
DD_ATTR(write_expire),
|
|
DD_ATTR(writes_starved),
|
|
DD_ATTR(front_merges),
|
|
DD_ATTR(fifo_batch),
|
|
__ATTR_NULL
|
|
};
|
|
|
|
static struct elevator_type iosched_deadline = {
|
|
.ops.sq = {
|
|
.elevator_merge_fn = deadline_merge,
|
|
.elevator_merged_fn = deadline_merged_request,
|
|
.elevator_merge_req_fn = deadline_merged_requests,
|
|
.elevator_dispatch_fn = deadline_dispatch_requests,
|
|
.elevator_completed_req_fn = deadline_completed_request,
|
|
.elevator_add_req_fn = deadline_add_request,
|
|
.elevator_former_req_fn = elv_rb_former_request,
|
|
.elevator_latter_req_fn = elv_rb_latter_request,
|
|
.elevator_init_fn = deadline_init_queue,
|
|
.elevator_exit_fn = deadline_exit_queue,
|
|
},
|
|
|
|
.elevator_attrs = deadline_attrs,
|
|
.elevator_name = "deadline",
|
|
.elevator_owner = THIS_MODULE,
|
|
};
|
|
|
|
/* Module init: pass iosched_deadline to elv_register() and return its result. */
static int __init deadline_init(void)
{
	return elv_register(&iosched_deadline);
}
|
|
|
|
/* Module exit: pass iosched_deadline to elv_unregister(). */
static void __exit deadline_exit(void)
{
	elv_unregister(&iosched_deadline);
}
|
|
|
|
module_init(deadline_init);
|
|
module_exit(deadline_exit);
|
|
|
|
MODULE_AUTHOR("Jens Axboe");
|
|
MODULE_LICENSE("GPL");
|
|
MODULE_DESCRIPTION("deadline IO scheduler");
|