dm: support bio polling
Support bio polling (REQ_POLLED) in the following approach: 1) only support io polling on normal READ/WRITE, and other abnormal IOs still fallback to IRQ mode, so the target io (and DM's clone bio) is exactly inside the dm io. 2) hold one refcnt on io->io_count after submitting this dm bio with REQ_POLLED 3) support dm native bio splitting, any dm io instance associated with current bio will be added into one list which head is bio->bi_private which will be recovered before ending this bio 4) implement .poll_bio() callback, call bio_poll() on the single target bio inside the dm io which is retrieved via bio->bi_bio_drv_data; call dm_io_dec_pending() after the target io is done in .poll_bio() 5) enable QUEUE_FLAG_POLL if all underlying queues enable QUEUE_FLAG_POLL, which is based on Jeffle's previous patch. These changes are good for a 30-35% IOPS improvement for polled IO. For detailed test results please see (Jens, thanks for testing!): https://listman.redhat.com/archives/dm-devel/2022-March/049868.html or https://marc.info/?l=linux-block&m=164684246214700&w=2 Tested-by: Jens Axboe <axboe@kernel.dk> Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
This commit is contained in:
parent
69fe0f2989
commit
b99fdcdc36
@ -235,6 +235,8 @@ struct dm_io {
|
|||||||
bool start_io_acct:1;
|
bool start_io_acct:1;
|
||||||
int was_accounted;
|
int was_accounted;
|
||||||
unsigned long start_time;
|
unsigned long start_time;
|
||||||
|
void *data;
|
||||||
|
struct hlist_node node;
|
||||||
spinlock_t endio_lock;
|
spinlock_t endio_lock;
|
||||||
struct dm_stats_aux stats_aux;
|
struct dm_stats_aux stats_aux;
|
||||||
/* last member of dm_target_io is 'struct bio' */
|
/* last member of dm_target_io is 'struct bio' */
|
||||||
|
@ -1481,6 +1481,14 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
|
|||||||
return &t->targets[(KEYS_PER_NODE * n) + k];
|
return &t->targets[(KEYS_PER_NODE * n) + k];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
|
||||||
|
sector_t start, sector_t len, void *data)
|
||||||
|
{
|
||||||
|
struct request_queue *q = bdev_get_queue(dev->bdev);
|
||||||
|
|
||||||
|
return !test_bit(QUEUE_FLAG_POLL, &q->queue_flags);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* type->iterate_devices() should be called when the sanity check needs to
|
* type->iterate_devices() should be called when the sanity check needs to
|
||||||
* iterate and check all underlying data devices. iterate_devices() will
|
* iterate and check all underlying data devices. iterate_devices() will
|
||||||
@ -1531,6 +1539,11 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int dm_table_supports_poll(struct dm_table *t)
|
||||||
|
{
|
||||||
|
return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check whether a table has no data devices attached using each
|
* Check whether a table has no data devices attached using each
|
||||||
* target's iterate_devices method.
|
* target's iterate_devices method.
|
||||||
@ -2067,6 +2080,20 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
|
|||||||
dm_update_crypto_profile(q, t);
|
dm_update_crypto_profile(q, t);
|
||||||
disk_update_readahead(t->md->disk);
|
disk_update_readahead(t->md->disk);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check for request-based device is left to
|
||||||
|
* dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
|
||||||
|
*
|
||||||
|
* For bio-based device, only set QUEUE_FLAG_POLL when all
|
||||||
|
* underlying devices supporting polling.
|
||||||
|
*/
|
||||||
|
if (__table_type_bio_based(t->type)) {
|
||||||
|
if (dm_table_supports_poll(t))
|
||||||
|
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
|
||||||
|
else
|
||||||
|
blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
143
drivers/md/dm.c
143
drivers/md/dm.c
@ -40,6 +40,13 @@
|
|||||||
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
|
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
|
||||||
#define DM_COOKIE_LENGTH 24
|
#define DM_COOKIE_LENGTH 24
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For REQ_POLLED fs bio, this flag is set if we link mapped underlying
|
||||||
|
* dm_io into one list, and reuse bio->bi_private as the list head. Before
|
||||||
|
* ending this fs bio, we will recover its ->bi_private.
|
||||||
|
*/
|
||||||
|
#define REQ_DM_POLL_LIST REQ_DRV
|
||||||
|
|
||||||
static const char *_name = DM_NAME;
|
static const char *_name = DM_NAME;
|
||||||
|
|
||||||
static unsigned int major = 0;
|
static unsigned int major = 0;
|
||||||
@ -73,6 +80,7 @@ struct clone_info {
|
|||||||
struct dm_io *io;
|
struct dm_io *io;
|
||||||
sector_t sector;
|
sector_t sector;
|
||||||
unsigned sector_count;
|
unsigned sector_count;
|
||||||
|
bool submit_as_polled;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
|
#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
|
||||||
@ -599,6 +607,9 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
|
|||||||
if (!clone)
|
if (!clone)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
/* REQ_DM_POLL_LIST shouldn't be inherited */
|
||||||
|
clone->bi_opf &= ~REQ_DM_POLL_LIST;
|
||||||
|
|
||||||
tio = clone_to_tio(clone);
|
tio = clone_to_tio(clone);
|
||||||
tio->inside_dm_io = false;
|
tio->inside_dm_io = false;
|
||||||
}
|
}
|
||||||
@ -888,8 +899,15 @@ void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
|
|||||||
if (unlikely(wq_has_sleeper(&md->wait)))
|
if (unlikely(wq_has_sleeper(&md->wait)))
|
||||||
wake_up(&md->wait);
|
wake_up(&md->wait);
|
||||||
|
|
||||||
if (io_error == BLK_STS_DM_REQUEUE)
|
if (io_error == BLK_STS_DM_REQUEUE) {
|
||||||
|
/*
|
||||||
|
* Upper layer won't help us poll split bio, io->orig_bio
|
||||||
|
* may only reflect a subset of the pre-split original,
|
||||||
|
* so clear REQ_POLLED in case of requeue
|
||||||
|
*/
|
||||||
|
bio->bi_opf &= ~REQ_POLLED;
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (bio_is_flush_with_data(bio)) {
|
if (bio_is_flush_with_data(bio)) {
|
||||||
/*
|
/*
|
||||||
@ -1440,6 +1458,47 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Reuse ->bi_private as hlist head for storing all dm_io instances
|
||||||
|
* associated with this bio, and this bio's bi_private needs to be
|
||||||
|
* stored in dm_io->data before the reuse.
|
||||||
|
*
|
||||||
|
* bio->bi_private is owned by fs or upper layer, so block layer won't
|
||||||
|
* touch it after splitting. Meantime it won't be changed by anyone after
|
||||||
|
* bio is submitted. So this reuse is safe.
|
||||||
|
*/
|
||||||
|
static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio)
|
||||||
|
{
|
||||||
|
return (struct hlist_head *)&bio->bi_private;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
|
||||||
|
{
|
||||||
|
struct hlist_head *head = dm_get_bio_hlist_head(bio);
|
||||||
|
|
||||||
|
if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
|
||||||
|
bio->bi_opf |= REQ_DM_POLL_LIST;
|
||||||
|
/*
|
||||||
|
* Save .bi_private into dm_io, so that we can reuse
|
||||||
|
* .bi_private as hlist head for storing dm_io list
|
||||||
|
*/
|
||||||
|
io->data = bio->bi_private;
|
||||||
|
|
||||||
|
INIT_HLIST_HEAD(head);
|
||||||
|
|
||||||
|
/* tell block layer to poll for completion */
|
||||||
|
bio->bi_cookie = ~BLK_QC_T_NONE;
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* bio recursed due to split, reuse original poll list,
|
||||||
|
* and save bio->bi_private too.
|
||||||
|
*/
|
||||||
|
io->data = hlist_entry(head->first, struct dm_io, node)->data;
|
||||||
|
}
|
||||||
|
|
||||||
|
hlist_add_head(&io->node, head);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Select the correct strategy for processing a non-flush bio.
|
* Select the correct strategy for processing a non-flush bio.
|
||||||
*/
|
*/
|
||||||
@ -1457,6 +1516,12 @@ static int __split_and_process_bio(struct clone_info *ci)
|
|||||||
if (__process_abnormal_io(ci, ti, &r))
|
if (__process_abnormal_io(ci, ti, &r))
|
||||||
return r;
|
return r;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Only support bio polling for normal IO, and the target io is
|
||||||
|
* exactly inside the dm_io instance (verified in dm_poll_dm_io)
|
||||||
|
*/
|
||||||
|
ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED;
|
||||||
|
|
||||||
len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
|
len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
|
||||||
clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
|
clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
|
||||||
__map_bio(clone);
|
__map_bio(clone);
|
||||||
@ -1473,6 +1538,7 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
|
|||||||
ci->map = map;
|
ci->map = map;
|
||||||
ci->io = alloc_io(md, bio);
|
ci->io = alloc_io(md, bio);
|
||||||
ci->bio = bio;
|
ci->bio = bio;
|
||||||
|
ci->submit_as_polled = false;
|
||||||
ci->sector = bio->bi_iter.bi_sector;
|
ci->sector = bio->bi_iter.bi_sector;
|
||||||
ci->sector_count = bio_sectors(bio);
|
ci->sector_count = bio_sectors(bio);
|
||||||
|
|
||||||
@ -1522,8 +1588,17 @@ out:
|
|||||||
if (ci.io->start_io_acct)
|
if (ci.io->start_io_acct)
|
||||||
dm_start_io_acct(ci.io, NULL);
|
dm_start_io_acct(ci.io, NULL);
|
||||||
|
|
||||||
/* drop the extra reference count */
|
/*
|
||||||
dm_io_dec_pending(ci.io, errno_to_blk_status(error));
|
* Drop the extra reference count for non-POLLED bio, and hold one
|
||||||
|
* reference for POLLED bio, which will be released in dm_poll_bio
|
||||||
|
*
|
||||||
|
* Add every dm_io instance into the hlist_head which is stored in
|
||||||
|
* bio->bi_private, so that dm_poll_bio can poll them all.
|
||||||
|
*/
|
||||||
|
if (error || !ci.submit_as_polled)
|
||||||
|
dm_io_dec_pending(ci.io, errno_to_blk_status(error));
|
||||||
|
else
|
||||||
|
dm_queue_poll_io(bio, ci.io);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dm_submit_bio(struct bio *bio)
|
static void dm_submit_bio(struct bio *bio)
|
||||||
@ -1558,6 +1633,67 @@ out:
|
|||||||
dm_put_live_table(md, srcu_idx);
|
dm_put_live_table(md, srcu_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
|
||||||
|
unsigned int flags)
|
||||||
|
{
|
||||||
|
WARN_ON_ONCE(!io->tio.inside_dm_io);
|
||||||
|
|
||||||
|
/* don't poll if the mapped io is done */
|
||||||
|
if (atomic_read(&io->io_count) > 1)
|
||||||
|
bio_poll(&io->tio.clone, iob, flags);
|
||||||
|
|
||||||
|
/* bio_poll holds the last reference */
|
||||||
|
return atomic_read(&io->io_count) == 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
|
||||||
|
unsigned int flags)
|
||||||
|
{
|
||||||
|
struct hlist_head *head = dm_get_bio_hlist_head(bio);
|
||||||
|
struct hlist_head tmp = HLIST_HEAD_INIT;
|
||||||
|
struct hlist_node *next;
|
||||||
|
struct dm_io *io;
|
||||||
|
|
||||||
|
/* Only poll normal bio which was marked as REQ_DM_POLL_LIST */
|
||||||
|
if (!(bio->bi_opf & REQ_DM_POLL_LIST))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
WARN_ON_ONCE(hlist_empty(head));
|
||||||
|
|
||||||
|
hlist_move_list(head, &tmp);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Restore .bi_private before possibly completing dm_io.
|
||||||
|
*
|
||||||
|
* bio_poll() is only possible once @bio has been completely
|
||||||
|
* submitted via submit_bio_noacct()'s depth-first submission.
|
||||||
|
* So there is no dm_queue_poll_io() race associated with
|
||||||
|
* clearing REQ_DM_POLL_LIST here.
|
||||||
|
*/
|
||||||
|
bio->bi_opf &= ~REQ_DM_POLL_LIST;
|
||||||
|
bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data;
|
||||||
|
|
||||||
|
hlist_for_each_entry_safe(io, next, &tmp, node) {
|
||||||
|
if (dm_poll_dm_io(io, iob, flags)) {
|
||||||
|
hlist_del_init(&io->node);
|
||||||
|
/*
|
||||||
|
* clone_endio() has already occurred, so passing
|
||||||
|
* error as 0 here doesn't override io->status
|
||||||
|
*/
|
||||||
|
dm_io_dec_pending(io, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Not done? */
|
||||||
|
if (!hlist_empty(&tmp)) {
|
||||||
|
bio->bi_opf |= REQ_DM_POLL_LIST;
|
||||||
|
/* Reset bio->bi_private to dm_io list head */
|
||||||
|
hlist_move_list(&tmp, head);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
/*-----------------------------------------------------------------
|
/*-----------------------------------------------------------------
|
||||||
* An IDR is used to keep track of allocated minor numbers.
|
* An IDR is used to keep track of allocated minor numbers.
|
||||||
*---------------------------------------------------------------*/
|
*---------------------------------------------------------------*/
|
||||||
@ -2983,6 +3119,7 @@ static const struct pr_ops dm_pr_ops = {
|
|||||||
|
|
||||||
static const struct block_device_operations dm_blk_dops = {
|
static const struct block_device_operations dm_blk_dops = {
|
||||||
.submit_bio = dm_submit_bio,
|
.submit_bio = dm_submit_bio,
|
||||||
|
.poll_bio = dm_poll_bio,
|
||||||
.open = dm_blk_open,
|
.open = dm_blk_open,
|
||||||
.release = dm_blk_close,
|
.release = dm_blk_close,
|
||||||
.ioctl = dm_blk_ioctl,
|
.ioctl = dm_blk_ioctl,
|
||||||
|
Loading…
Reference in New Issue
Block a user