mirror of
git://sourceware.org/git/lvm2.git
synced 2025-01-03 05:18:29 +03:00
Fix for bug 596453: multiple mirror image failures cause lvm repair...
The lvm repair issues I believe are the superficial symptoms of this bug - there are worse issues that are not as clearly seen. From my inline comments: * If the mirror was successfully recovered, we want to always * force every machine to write to all devices - otherwise, * corruption will occur. Here's how: * Node1 suffers a failure and marks a region out-of-sync * Node2 attempts a write, gets by is_remote_recovering, * and queries the sync status of the region - finding * it out-of-sync. * Node2 thinks the write should be a nosync write, but it * hasn't suffered the drive failure that Node1 has yet. * It then issues a generic_make_request directly to * the primary image only - which is exactly the device * that has suffered the failure. * Node2 suffers a lost write - which completely bypasses the * mirror layer because it had gone through generic_m_r. * The file system will likely explode at this point due to * I/O errors. If it wasn't the primary that failed, it is * easily possible in this case to issue writes to just one * of the remaining images - also leaving the mirror inconsistent. * * We let in_sync() return 1 in a cluster regardless of what is * in the bitmap once recovery has successfully completed on a * mirror. This ensures the mirroring code will continue to * attempt to write to all mirror images. The worst that can * happen for reads is that additional read attempts may be * taken.
This commit is contained in:
parent
c496bc13e2
commit
53670b18f5
@ -1,5 +1,6 @@
|
|||||||
Version 2.02.73 -
|
Version 2.02.73 -
|
||||||
================================
|
================================
|
||||||
|
Fix potential for corruption during cluster mirror device failure.
|
||||||
Use 'SINGLENODE' instead of 'dead' in clvmd singlenode messages.
|
Use 'SINGLENODE' instead of 'dead' in clvmd singlenode messages.
|
||||||
Ignore snapshots when performing mirror recovery beneath an origin.
|
Ignore snapshots when performing mirror recovery beneath an origin.
|
||||||
Pass LCK_ORIGIN_ONLY flag around cluster.
|
Pass LCK_ORIGIN_ONLY flag around cluster.
|
||||||
|
@ -54,6 +54,7 @@ struct log_c {
|
|||||||
|
|
||||||
time_t delay; /* limits how fast a resume can happen after suspend */
|
time_t delay; /* limits how fast a resume can happen after suspend */
|
||||||
int touched;
|
int touched;
|
||||||
|
int in_sync; /* An in-sync that stays set until suspend/resume */
|
||||||
uint32_t region_size;
|
uint32_t region_size;
|
||||||
uint32_t region_count;
|
uint32_t region_count;
|
||||||
uint64_t sync_count;
|
uint64_t sync_count;
|
||||||
@ -720,6 +721,7 @@ static int clog_resume(struct dm_ulog_request *rq)
|
|||||||
if (!lc)
|
if (!lc)
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
|
lc->in_sync = 0;
|
||||||
switch (lc->resume_override) {
|
switch (lc->resume_override) {
|
||||||
case 1000:
|
case 1000:
|
||||||
LOG_ERROR("[%s] Additional resume issued before suspend",
|
LOG_ERROR("[%s] Additional resume issued before suspend",
|
||||||
@ -963,6 +965,42 @@ static int clog_in_sync(struct dm_ulog_request *rq)
|
|||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
*rtn = log_test_bit(lc->sync_bits, region);
|
*rtn = log_test_bit(lc->sync_bits, region);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the mirror was successfully recovered, we want to always
|
||||||
|
* force every machine to write to all devices - otherwise,
|
||||||
|
* corruption will occur. Here's how:
|
||||||
|
* Node1 suffers a failure and marks a region out-of-sync
|
||||||
|
* Node2 attempts a write, gets by is_remote_recovering,
|
||||||
|
* and queries the sync status of the region - finding
|
||||||
|
* it out-of-sync.
|
||||||
|
* Node2 thinks the write should be a nosync write, but it
|
||||||
|
* hasn't suffered the drive failure that Node1 has yet.
|
||||||
|
* It then issues a generic_make_request directly to
|
||||||
|
* the primary image only - which is exactly the device
|
||||||
|
* that has suffered the failure.
|
||||||
|
* Node2 suffers a lost write - which completely bypasses the
|
||||||
|
* mirror layer because it had gone through generic_m_r.
|
||||||
|
* The file system will likely explode at this point due to
|
||||||
|
* I/O errors. If it wasn't the primary that failed, it is
|
||||||
|
* easily possible in this case to issue writes to just one
|
||||||
|
* of the remaining images - also leaving the mirror inconsistent.
|
||||||
|
*
|
||||||
|
* We let in_sync() return 1 in a cluster regardless of what is
|
||||||
|
* in the bitmap once recovery has successfully completed on a
|
||||||
|
* mirror. This ensures the mirroring code will continue to
|
||||||
|
* attempt to write to all mirror images. The worst that can
|
||||||
|
* happen for reads is that additional read attempts may be
|
||||||
|
* taken.
|
||||||
|
*
|
||||||
|
* Futher investigation may be required to determine if there are
|
||||||
|
* similar possible outcomes when the mirror is in the process of
|
||||||
|
* recovering. In that case, lc->in_sync would not have been set
|
||||||
|
* yet.
|
||||||
|
*/
|
||||||
|
if (!*rtn && lc->in_sync)
|
||||||
|
*rtn = 1;
|
||||||
|
|
||||||
if (*rtn)
|
if (*rtn)
|
||||||
LOG_DBG("[%s] Region is in-sync: %llu",
|
LOG_DBG("[%s] Region is in-sync: %llu",
|
||||||
SHORT_UUID(lc->uuid), (unsigned long long)region);
|
SHORT_UUID(lc->uuid), (unsigned long long)region);
|
||||||
@ -1282,7 +1320,7 @@ static int clog_set_region_sync(struct dm_ulog_request *rq, uint32_t originator)
|
|||||||
lc->skip_bit_warning = lc->region_count;
|
lc->skip_bit_warning = lc->region_count;
|
||||||
|
|
||||||
if (pkg->region > (lc->skip_bit_warning + 5)) {
|
if (pkg->region > (lc->skip_bit_warning + 5)) {
|
||||||
LOG_ERROR("*** Region #%llu skipped during recovery ***",
|
LOG_SPRINT(lc, "*** Region #%llu skipped during recovery ***",
|
||||||
(unsigned long long)lc->skip_bit_warning);
|
(unsigned long long)lc->skip_bit_warning);
|
||||||
lc->skip_bit_warning = lc->region_count;
|
lc->skip_bit_warning = lc->region_count;
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
@ -1324,6 +1362,9 @@ static int clog_set_region_sync(struct dm_ulog_request *rq, uint32_t originator)
|
|||||||
"(lc->sync_count > lc->region_count) - this is bad",
|
"(lc->sync_count > lc->region_count) - this is bad",
|
||||||
rq->seq, SHORT_UUID(lc->uuid), originator);
|
rq->seq, SHORT_UUID(lc->uuid), originator);
|
||||||
|
|
||||||
|
if (lc->sync_count == lc->region_count)
|
||||||
|
lc->in_sync = 1;
|
||||||
|
|
||||||
rq->data_size = 0;
|
rq->data_size = 0;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user