mirror of
git://sourceware.org/git/lvm2.git
synced 2025-03-10 16:58:47 +03:00
RAID (lvconvert/dmeventd): Cleanly handle primary failure during 'recover' op
Add the checks necessary to distinguish the state of a RAID when the primary source for syncing fails during the "recover" process. It has been possible to hit this condition before (like when converting from 2-way RAID1 to 3-way and having the first two devices die during the "recover" process). However, this condition is now more likely since we treat linear -> RAID1 conversions as "recover" now - so it is especially important we cleanly handle this condition.
This commit is contained in:
parent
d34d2068dd
commit
4c0e908b0a
@ -58,6 +58,22 @@ static int _process_raid_event(struct dso_state *state, char *params, const char
|
||||
dead = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* if we are converting from non-RAID to RAID (e.g. linear -> raid1)
|
||||
* and too many original devices die, such that we cannot continue
|
||||
* the "recover" operation, the sync action will go to "idle", the
|
||||
* unsynced devs will remain at 'a', and the original devices will
|
||||
* NOT SWITCH TO 'D', but will remain at 'A' - hoping to be revived.
|
||||
*
|
||||
* This is simply the way the kernel works...
|
||||
*/
|
||||
if (!strcmp(status->sync_action, "idle") &&
|
||||
strchr(status->dev_health, 'a')) {
|
||||
log_error("Primary sources for new RAID, %s, have failed.",
|
||||
device);
|
||||
dead = 1; /* run it through LVM repair */
|
||||
}
|
||||
|
||||
if (dead) {
|
||||
if (status->insync_regions < status->total_regions) {
|
||||
if (!state->warned) {
|
||||
|
@ -6407,6 +6407,39 @@ has_enough_space:
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* _lv_raid_has_primary_failure_on_recover
|
||||
* @lv
|
||||
*
|
||||
* The kernel behaves strangely in the presense of a primary failure
|
||||
* during a "recover" sync operation. It's not technically a bug, I
|
||||
* suppose, but the output of the status line can make it difficult
|
||||
* to determine that we are in this state. The sync ratio will be
|
||||
* 100% and the sync action will be "idle", but the health characters
|
||||
* will be e.g. "Aaa" or "Aa", where the 'A' is the dead
|
||||
* primary source that cannot be marked dead by the kernel b/c
|
||||
* it is the only source for the remainder of data.
|
||||
*
|
||||
* This function helps to detect that condition.
|
||||
*
|
||||
* Returns: 1 if the state is detected, 0 otherwise.
|
||||
* FIXME: would be better to return -1,0,1 to allow error report.
|
||||
*/
|
||||
int _lv_raid_has_primary_failure_on_recover(struct logical_volume *lv)
|
||||
{
|
||||
char *tmp_dev_health;
|
||||
char *tmp_sync_action;
|
||||
|
||||
if (!lv_raid_sync_action(lv, &tmp_sync_action) ||
|
||||
!lv_raid_dev_health(lv, &tmp_dev_health))
|
||||
return_0;
|
||||
|
||||
if (!strcmp(tmp_sync_action, "idle") && strchr(tmp_dev_health, 'a'))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Helper:
|
||||
*
|
||||
@ -6458,11 +6491,38 @@ static int _lv_raid_rebuild_or_replace(struct logical_volume *lv,
|
||||
}
|
||||
|
||||
if (!_raid_in_sync(lv)) {
|
||||
/*
|
||||
* FIXME: There is a bug in the kernel that prevents 'rebuild'
|
||||
* from being specified when the array is not in-sync.
|
||||
* There are conditions where this should be allowed,
|
||||
* but only when we are doing a repair - as indicated by
|
||||
* 'lv->vg->cmd->handles_missing_pvs'. The above
|
||||
* conditional should be:
|
||||
(!lv->vg->cmd->handles_missing_pvs && !_raid_in_sync(lv))
|
||||
*/
|
||||
log_error("Unable to replace devices in %s while it is "
|
||||
"not in-sync.", display_lvname(lv));
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (_lv_raid_has_primary_failure_on_recover(lv)) {
|
||||
/*
|
||||
* I hate having multiple error lines, but this
|
||||
* seems to work best for syslog and CLI.
|
||||
*/
|
||||
log_error("Unable to repair %s/%s. Source devices failed"
|
||||
" before the RAID could synchronize.",
|
||||
lv->vg->name, lv->name);
|
||||
log_error("You should choose one of the following:");
|
||||
log_error(" 1) deactivate %s/%s, revive failed "
|
||||
"device, re-activate LV, and proceed.",
|
||||
lv->vg->name, lv->name);
|
||||
log_error(" 2) remove the LV (all data is lost).");
|
||||
log_error(" 3) Seek expert advice to attempt to salvage any"
|
||||
" data from remaining devices.");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* How many sub-LVs are being removed?
|
||||
*/
|
||||
|
Loading…
x
Reference in New Issue
Block a user