Revert "MD: fix lock contention for flush bios"
This reverts commit5a409b4f56
. This patch has two problems. 1/ it make multiple calls to submit_bio() from inside a make_request_fn. The bios thus submitted will be queued on current->bio_list and not submitted immediately. As the bios are allocated from a mempool, this can theoretically result in a deadlock - all the pool of requests could be in various ->bio_list queues and a subsequent mempool_alloc could block waiting for one of them to be released. 2/ It aims to handle a case when there are many concurrent flush requests. It handles this by submitting many requests in parallel - all of which are identical and so most of which do nothing useful. It would be more efficient to just send one lower-level request, but allow that to satisfy multiple upper-level requests. Fixes:5a409b4f56
("MD: fix lock contention for flush bios") Cc: <stable@vger.kernel.org> # v4.19+ Tested-by: Xiao Ni <xni@redhat.com> Signed-off-by: NeilBrown <neilb@suse.com> Signed-off-by: Song Liu <songliubraving@fb.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent
4f4fd7c579
commit
4bc034d353
153
drivers/md/md.c
153
drivers/md/md.c
@ -132,24 +132,6 @@ static inline int speed_max(struct mddev *mddev)
|
||||
mddev->sync_speed_max : sysctl_speed_limit_max;
|
||||
}
|
||||
|
||||
static void * flush_info_alloc(gfp_t gfp_flags, void *data)
|
||||
{
|
||||
return kzalloc(sizeof(struct flush_info), gfp_flags);
|
||||
}
|
||||
static void flush_info_free(void *flush_info, void *data)
|
||||
{
|
||||
kfree(flush_info);
|
||||
}
|
||||
|
||||
static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
|
||||
{
|
||||
return kzalloc(sizeof(struct flush_bio), gfp_flags);
|
||||
}
|
||||
static void flush_bio_free(void *flush_bio, void *data)
|
||||
{
|
||||
kfree(flush_bio);
|
||||
}
|
||||
|
||||
static struct ctl_table_header *raid_table_header;
|
||||
|
||||
static struct ctl_table raid_table[] = {
|
||||
@ -423,54 +405,30 @@ static int md_congested(void *data, int bits)
|
||||
/*
|
||||
* Generic flush handling for md
|
||||
*/
|
||||
static void submit_flushes(struct work_struct *ws)
|
||||
|
||||
static void md_end_flush(struct bio *bio)
|
||||
{
|
||||
struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
|
||||
struct mddev *mddev = fi->mddev;
|
||||
struct bio *bio = fi->bio;
|
||||
|
||||
bio->bi_opf &= ~REQ_PREFLUSH;
|
||||
md_handle_request(mddev, bio);
|
||||
|
||||
mempool_free(fi, mddev->flush_pool);
|
||||
}
|
||||
|
||||
static void md_end_flush(struct bio *fbio)
|
||||
{
|
||||
struct flush_bio *fb = fbio->bi_private;
|
||||
struct md_rdev *rdev = fb->rdev;
|
||||
struct flush_info *fi = fb->fi;
|
||||
struct bio *bio = fi->bio;
|
||||
struct mddev *mddev = fi->mddev;
|
||||
struct md_rdev *rdev = bio->bi_private;
|
||||
struct mddev *mddev = rdev->mddev;
|
||||
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
|
||||
if (atomic_dec_and_test(&fi->flush_pending)) {
|
||||
if (bio->bi_iter.bi_size == 0) {
|
||||
/* an empty barrier - all done */
|
||||
bio_endio(bio);
|
||||
mempool_free(fi, mddev->flush_pool);
|
||||
} else {
|
||||
INIT_WORK(&fi->flush_work, submit_flushes);
|
||||
queue_work(md_wq, &fi->flush_work);
|
||||
if (atomic_dec_and_test(&mddev->flush_pending)) {
|
||||
/* The pre-request flush has finished */
|
||||
queue_work(md_wq, &mddev->flush_work);
|
||||
}
|
||||
}
|
||||
|
||||
mempool_free(fb, mddev->flush_bio_pool);
|
||||
bio_put(fbio);
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
void md_flush_request(struct mddev *mddev, struct bio *bio)
|
||||
static void md_submit_flush_data(struct work_struct *ws);
|
||||
|
||||
static void submit_flushes(struct work_struct *ws)
|
||||
{
|
||||
struct mddev *mddev = container_of(ws, struct mddev, flush_work);
|
||||
struct md_rdev *rdev;
|
||||
struct flush_info *fi;
|
||||
|
||||
fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
|
||||
|
||||
fi->bio = bio;
|
||||
fi->mddev = mddev;
|
||||
atomic_set(&fi->flush_pending, 1);
|
||||
|
||||
INIT_WORK(&mddev->flush_work, md_submit_flush_data);
|
||||
atomic_set(&mddev->flush_pending, 1);
|
||||
rcu_read_lock();
|
||||
rdev_for_each_rcu(rdev, mddev)
|
||||
if (rdev->raid_disk >= 0 &&
|
||||
@ -480,40 +438,59 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
|
||||
* we reclaim rcu_read_lock
|
||||
*/
|
||||
struct bio *bi;
|
||||
struct flush_bio *fb;
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
|
||||
fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
|
||||
fb->fi = fi;
|
||||
fb->rdev = rdev;
|
||||
|
||||
bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
|
||||
bio_set_dev(bi, rdev->bdev);
|
||||
bi->bi_end_io = md_end_flush;
|
||||
bi->bi_private = fb;
|
||||
bi->bi_private = rdev;
|
||||
bio_set_dev(bi, rdev->bdev);
|
||||
bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
|
||||
|
||||
atomic_inc(&fi->flush_pending);
|
||||
atomic_inc(&mddev->flush_pending);
|
||||
submit_bio(bi);
|
||||
|
||||
rcu_read_lock();
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (atomic_dec_and_test(&mddev->flush_pending))
|
||||
queue_work(md_wq, &mddev->flush_work);
|
||||
}
|
||||
|
||||
static void md_submit_flush_data(struct work_struct *ws)
|
||||
{
|
||||
struct mddev *mddev = container_of(ws, struct mddev, flush_work);
|
||||
struct bio *bio = mddev->flush_bio;
|
||||
|
||||
/*
|
||||
* must reset flush_bio before calling into md_handle_request to avoid a
|
||||
* deadlock, because other bios passed md_handle_request suspend check
|
||||
* could wait for this and below md_handle_request could wait for those
|
||||
* bios because of suspend check
|
||||
*/
|
||||
mddev->flush_bio = NULL;
|
||||
wake_up(&mddev->sb_wait);
|
||||
|
||||
if (atomic_dec_and_test(&fi->flush_pending)) {
|
||||
if (bio->bi_iter.bi_size == 0) {
|
||||
/* an empty barrier - all done */
|
||||
bio_endio(bio);
|
||||
mempool_free(fi, mddev->flush_pool);
|
||||
} else {
|
||||
INIT_WORK(&fi->flush_work, submit_flushes);
|
||||
queue_work(md_wq, &fi->flush_work);
|
||||
}
|
||||
bio->bi_opf &= ~REQ_PREFLUSH;
|
||||
md_handle_request(mddev, bio);
|
||||
}
|
||||
}
|
||||
|
||||
void md_flush_request(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
spin_lock_irq(&mddev->lock);
|
||||
wait_event_lock_irq(mddev->sb_wait,
|
||||
!mddev->flush_bio,
|
||||
mddev->lock);
|
||||
mddev->flush_bio = bio;
|
||||
spin_unlock_irq(&mddev->lock);
|
||||
|
||||
INIT_WORK(&mddev->flush_work, submit_flushes);
|
||||
queue_work(md_wq, &mddev->flush_work);
|
||||
}
|
||||
EXPORT_SYMBOL(md_flush_request);
|
||||
|
||||
static inline struct mddev *mddev_get(struct mddev *mddev)
|
||||
@ -560,6 +537,7 @@ void mddev_init(struct mddev *mddev)
|
||||
atomic_set(&mddev->openers, 0);
|
||||
atomic_set(&mddev->active_io, 0);
|
||||
spin_lock_init(&mddev->lock);
|
||||
atomic_set(&mddev->flush_pending, 0);
|
||||
init_waitqueue_head(&mddev->sb_wait);
|
||||
init_waitqueue_head(&mddev->recovery_wait);
|
||||
mddev->reshape_position = MaxSector;
|
||||
@ -5511,22 +5489,6 @@ int md_run(struct mddev *mddev)
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
if (mddev->flush_pool == NULL) {
|
||||
mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
|
||||
flush_info_free, mddev);
|
||||
if (!mddev->flush_pool) {
|
||||
err = -ENOMEM;
|
||||
goto abort;
|
||||
}
|
||||
}
|
||||
if (mddev->flush_bio_pool == NULL) {
|
||||
mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
|
||||
flush_bio_free, mddev);
|
||||
if (!mddev->flush_bio_pool) {
|
||||
err = -ENOMEM;
|
||||
goto abort;
|
||||
}
|
||||
}
|
||||
|
||||
spin_lock(&pers_lock);
|
||||
pers = find_pers(mddev->level, mddev->clevel);
|
||||
@ -5686,11 +5648,8 @@ int md_run(struct mddev *mddev)
|
||||
return 0;
|
||||
|
||||
abort:
|
||||
mempool_destroy(mddev->flush_bio_pool);
|
||||
mddev->flush_bio_pool = NULL;
|
||||
mempool_destroy(mddev->flush_pool);
|
||||
mddev->flush_pool = NULL;
|
||||
|
||||
bioset_exit(&mddev->bio_set);
|
||||
bioset_exit(&mddev->sync_set);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(md_run);
|
||||
@ -5894,14 +5853,6 @@ static void __md_stop(struct mddev *mddev)
|
||||
mddev->to_remove = &md_redundancy_group;
|
||||
module_put(pers->owner);
|
||||
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
|
||||
if (mddev->flush_bio_pool) {
|
||||
mempool_destroy(mddev->flush_bio_pool);
|
||||
mddev->flush_bio_pool = NULL;
|
||||
}
|
||||
if (mddev->flush_pool) {
|
||||
mempool_destroy(mddev->flush_pool);
|
||||
mddev->flush_pool = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void md_stop(struct mddev *mddev)
|
||||
|
@ -252,19 +252,6 @@ enum mddev_sb_flags {
|
||||
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
|
||||
};
|
||||
|
||||
#define NR_FLUSH_INFOS 8
|
||||
#define NR_FLUSH_BIOS 64
|
||||
struct flush_info {
|
||||
struct bio *bio;
|
||||
struct mddev *mddev;
|
||||
struct work_struct flush_work;
|
||||
atomic_t flush_pending;
|
||||
};
|
||||
struct flush_bio {
|
||||
struct flush_info *fi;
|
||||
struct md_rdev *rdev;
|
||||
};
|
||||
|
||||
struct mddev {
|
||||
void *private;
|
||||
struct md_personality *pers;
|
||||
@ -470,8 +457,13 @@ struct mddev {
|
||||
* metadata and bitmap writes
|
||||
*/
|
||||
|
||||
mempool_t *flush_pool;
|
||||
mempool_t *flush_bio_pool;
|
||||
/* Generic flush handling.
|
||||
* The last to finish preflush schedules a worker to submit
|
||||
* the rest of the request (without the REQ_PREFLUSH flag).
|
||||
*/
|
||||
struct bio *flush_bio;
|
||||
atomic_t flush_pending;
|
||||
struct work_struct flush_work;
|
||||
struct work_struct event_work; /* used by dm to report failure event */
|
||||
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
|
||||
struct md_cluster_info *cluster_info;
|
||||
|
Loading…
Reference in New Issue
Block a user