From 6024fece739518c4c101c767d527fd624b096a34 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher <agruen@linbit.com> Date: Fri, 28 Jan 2011 15:53:51 +0100 Subject: [PATCH] drbd: Defer new writes when detecting conflicting writes Before submitting a new local write request, wait for any conflicting local or remote requests to complete. We could assume that the new request occurred first and that the conflicting requests overwrote it (and therefore discard the new reques), but we know for sure that the new request occurred after the conflicting requests and so this behavior would we weird. We would also end up with the wrong result if the new request is not fully contained within the conflicting requests. Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com> --- drivers/block/drbd/drbd_req.c | 103 +++++++++++++--------------------- 1 file changed, 39 insertions(+), 64 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index e11ea475a4a3..6bcf4171a76f 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -300,48 +300,6 @@ static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_e _req_may_be_done(req, m); } -/* - * checks whether there was an overlapping request - * or ee already registered. - * - * if so, return 1, in which case this request is completed on the spot, - * without ever being submitted or send. - * - * return 0 if it is ok to submit this request. - * - * NOTE: - * paranoia: assume something above us is broken, and issues different write - * requests for the same block simultaneously... - * - * To ensure these won't be reordered differently on both nodes, resulting in - * diverging data sets, we discard the later one(s). Not that this is supposed - * to happen, but this is the rationale why we also have to check for - * conflicting requests with local origin, and why we have to do so regardless - * of whether we allowed multiple primaries. - */ -static int _req_conflicts(struct drbd_request *req) -{ - struct drbd_conf *mdev = req->mdev; - const sector_t sector = req->i.sector; - const int size = req->i.size; - struct drbd_interval *i; - - D_ASSERT(drbd_interval_empty(&req->i)); - - i = drbd_find_overlap(&mdev->write_requests, sector, size); - if (i) { - dev_alert(DEV, "%s[%u] Concurrent %s write detected! " - "[DISCARD L] new: %llus +%u; " - "pending: %llus +%u\n", - current->comm, current->pid, - i->local ? "local" : "remote", - (unsigned long long)sector, size, - (unsigned long long)i->sector, i->size); - return 1; - } - return 0; -} - /* obviously this could be coded as many single functions * instead of one huge switch, * or by putting the code directly in the respective locations @@ -721,6 +679,34 @@ static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int s return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); } +/* + * complete_conflicting_writes - wait for any conflicting write requests + * + * The write_requests tree contains all active write requests which we + * currently know about. Wait for any requests to complete which conflict with + * the new one. + */ +static int complete_conflicting_writes(struct drbd_conf *mdev, + sector_t sector, int size) +{ + for(;;) { + DEFINE_WAIT(wait); + struct drbd_interval *i; + + i = drbd_find_overlap(&mdev->write_requests, sector, size); + if (!i) + return 0; + i->waiting = true; + prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE); + spin_unlock_irq(&mdev->tconn->req_lock); + schedule(); + finish_wait(&mdev->misc_wait, &wait); + spin_lock_irq(&mdev->tconn->req_lock); + if (signal_pending(current)) + return -ERESTARTSYS; + } +} + static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) { const int rw = bio_rw(bio); @@ -729,7 +715,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns struct drbd_tl_epoch *b = NULL; struct drbd_request *req; int local, remote, send_oos = 0; - int err = -EIO; + int err; int ret = 0; /* allocate outside of all locks; */ @@ -799,6 +785,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns if (!(local || remote) && !is_susp(mdev->state)) { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); + err = -EIO; goto fail_free_complete; } @@ -823,6 +810,14 @@ allocate_barrier: /* GOOD, everything prepared, grab the spin_lock */ spin_lock_irq(&mdev->tconn->req_lock); + if (rw == WRITE) { + err = complete_conflicting_writes(mdev, sector, size); + if (err) { + spin_unlock_irq(&mdev->tconn->req_lock); + goto fail_free_complete; + } + } + if (is_susp(mdev->state)) { /* If we got suspended, use the retry mechanism of generic_make_request() to restart processing of this @@ -843,6 +838,7 @@ allocate_barrier: if (!(local || remote)) { dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); spin_unlock_irq(&mdev->tconn->req_lock); + err = -EIO; goto fail_free_complete; } } @@ -903,12 +899,6 @@ allocate_barrier: if (local) _req_mod(req, TO_BE_SUBMITTED); - /* check this request on the collision detection hash tables. - * if we have a conflict, just complete it here. - * THINK do we want to check reads, too? (I don't think so...) */ - if (rw == WRITE && _req_conflicts(req)) - goto fail_conflicting; - list_add_tail(&req->tl_requests, &mdev->tconn->newest_tle->requests); /* NOTE remote first: to get the concurrent write detection right, @@ -975,21 +965,6 @@ allocate_barrier: return 0; -fail_conflicting: - /* this is a conflicting request. - * even though it may have been only _partially_ - * overlapping with one of the currently pending requests, - * without even submitting or sending it, we will - * pretend that it was successfully served right now. - */ - _drbd_end_io_acct(mdev, req); - spin_unlock_irq(&mdev->tconn->req_lock); - if (remote) - dec_ap_pending(mdev); - /* THINK: do we want to fail it (-EIO), or pretend success? - * this pretends success. */ - err = 0; - fail_free_complete: if (req->rq_state & RQ_IN_ACT_LOG) drbd_al_complete_io(mdev, sector);