
lvconvert: fix (automatic) raid repair regression

The dm-raid target now rejects device rebuild requests during an ongoing
resynchronization, causing 'lvconvert --repair ...' to fail with a kernel
error message. This also regresses automatic repair via the dmeventd RAID
plugin when raid_fault_policy="allocate" is configured in lvm.conf.
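
For reference, the policy in question is set in the activation section of
lvm.conf; a minimal illustrative excerpt:

    # With "allocate", dmeventd attempts to replace a failed RAID image
    # using free space in the volume group; "warn" merely logs the failure.
    activation {
        raid_fault_policy = "allocate"
    }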

Previously, allowing such a repair request required cancelling the
resynchronization of any still accessible DataLVs, hence risking
potential data loss.

This patch lets the resynchronization of still accessible DataLVs finish
by rejecting any 'lvconvert --repair ...' request until the array is in sync.

It also enhances the dmeventd RAID plugin so that automatic repair still
works: the repair is postponed until synchronization has finished.
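
In practice the manual flow then looks roughly like this; a sketch, where
vg0 and lv0 are hypothetical names:

    # Watch resynchronization progress; repair is rejected until in sync.
    lvs -o name,sync_percent,health_status vg0

    # While still syncing, a repair attempt now fails cleanly:
    lvconvert -y --repair vg0/lv0

    # Once sync_percent reaches 100, the same command (or dmeventd with
    # raid_fault_policy="allocate") carries out the repair:
    lvconvert -y --repair vg0/lv0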

More tests are added to lvconvert-rebuild-raid.sh to cover single
and multiple DataLV failure cases for the different RAID levels.

- resolves: rhbz1371717
Heinz Mauelshagen 2016-09-21 00:38:38 +02:00
parent 78e0fdb618
commit 5d455b28fc
6 changed files with 134 additions and 39 deletions

View File

@@ -1,5 +1,6 @@
 Version 2.02.166 -
 =====================================
+  Fix lvconvert --repair regression
   Fix reported origin lv field for cache volumes. (2.02.133)
   Always specify snapshot cow LV for monitoring not internal LV. (2.02.165)
   Fix lvchange --discard|--zero for active thin-pool.

View File

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2005-2015 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2005-2016 Red Hat, Inc. All rights reserved.
  *
  * This file is part of LVM2.
  *
@@ -13,14 +13,20 @@
  */

 #include "lib.h"
+#include "defaults.h"

 #include "dmeventd_lvm.h"
 #include "libdevmapper-event.h"

+/* Hold enough elements for the maximum number of RAID images */
+#define RAID_DEVS_ELEMS ((DEFAULT_RAID_MAX_IMAGES + 63) / 64)
+
 struct dso_state {
 	struct dm_pool *mem;
 	char cmd_lvscan[512];
 	char cmd_lvconvert[512];
+	uint64_t raid_devs[RAID_DEVS_ELEMS];
 	int failed;
+	int warned;
 };

 DM_EVENT_LOG_FN("raid")
@@ -31,20 +37,39 @@ static int _process_raid_event(struct dso_state *state, char *params, const char
 {
 	struct dm_status_raid *status;
 	const char *d;
+	int dead = 0, r = 1;

 	if (!dm_get_status_raid(state->mem, params, &status)) {
 		log_error("Failed to process status line for %s.", device);
 		return 0;
 	}

-	if ((d = strchr(status->dev_health, 'D'))) {
+	d = status->dev_health;
+	while ((d = strchr(d, 'D'))) {
+		uint32_t dev = (uint32_t)(d - status->dev_health);
+
+		if (!(state->raid_devs[dev / 64] & (UINT64_C(1) << (dev % 64))))
+			log_error("Device #%u of %s array, %s, has failed.",
+				  dev, status->raid_type, device);
+
+		state->raid_devs[dev / 64] |= (UINT64_C(1) << (dev % 64));
+		d++;
+		dead = 1;
+	}
+
+	if (dead) {
+		if (status->insync_regions < status->total_regions) {
+			if (!state->warned)
+				log_warn("WARNING: waiting for resynchronization to finish "
+					 "before initiating repair on RAID device %s", device);
+
+			state->warned = 1;
+			goto out; /* Not yet done syncing with accessible devices */
+		}
+
 		if (state->failed)
 			goto out; /* already reported */

-		log_error("Device #%d of %s array, %s, has failed.",
-			  (int)(d - status->dev_health),
-			  status->raid_type, device);
-
 		state->failed = 1;
 		if (!dmeventd_lvm2_run_with_lock(state->cmd_lvscan))
 			log_warn("WARNING: Re-scan of RAID device %s failed.", device);
@@ -52,8 +77,7 @@ static int _process_raid_event(struct dso_state *state, char *params, const char
 		/* if repair goes OK, report success even if lvscan has failed */
 		if (!dmeventd_lvm2_run_with_lock(state->cmd_lvconvert)) {
 			log_info("Repair of RAID device %s failed.", device);
-			dm_pool_free(state->mem, status);
-			return 0;
+			r = 0;
 		}
 	} else {
 		state->failed = 0;
@@ -64,7 +88,7 @@ static int _process_raid_event(struct dso_state *state, char *params, const char
 out:
 	dm_pool_free(state->mem, status);

-	return 1;
+	return r;
 }

 void process_event(struct dm_task *dmt,
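
The raid_devs words introduced above act as a bitmap with one bit per RAID
image, so a given image's failure is logged only once across repeated status
events. A minimal standalone sketch of that bookkeeping, with illustrative
names (MAX_IMAGES stands in for DEFAULT_RAID_MAX_IMAGES):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Illustrative stand-in for DEFAULT_RAID_MAX_IMAGES. */
    #define MAX_IMAGES 64
    #define DEVS_ELEMS ((MAX_IMAGES + 63) / 64)

    static uint64_t raid_devs[DEVS_ELEMS];

    /* Record image 'dev' as failed; return 1 only on the first sighting. */
    static int report_once(uint32_t dev)
    {
        uint64_t bit = UINT64_C(1) << (dev % 64); /* word dev/64, bit dev%64 */

        if (raid_devs[dev / 64] & bit)
            return 0;                 /* already reported earlier */
        raid_devs[dev / 64] |= bit;
        return 1;
    }

    int main(void)
    {
        /* In dm-raid status, a 'D' at offset N in dev_health marks image N dead. */
        const char dev_health[] = "AADAD";
        const char *d = dev_health;

        while ((d = strchr(d, 'D'))) {
            uint32_t dev = (uint32_t)(d - dev_health);

            if (report_once(dev))
                printf("Device #%u has failed.\n", dev);
            d++;
        }
        return 0;
    }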

View File

@@ -1018,12 +1018,6 @@ int lv_raid_image_in_sync(const struct logical_volume *lv)
 		return 0;
 	}

-	if (!lv_raid_percent(raid_seg->lv, &percent))
-		return_0;
-
-	if (percent == DM_PERCENT_100)
-		return 1;
-
 	/* Find out which sub-LV this is. */
 	for (s = 0; s < raid_seg->area_count; s++)
 		if (seg_lv(raid_seg, s) == lv)

View File

@@ -3658,7 +3658,7 @@ static int _lv_raid_rebuild_or_replace(struct logical_volume *lv,
 		return 0;
 	}

-	if (!mirror_in_sync() && !_raid_in_sync(lv)) {
+	if (!_raid_in_sync(lv)) {
 		log_error("Unable to replace devices in %s/%s while it is"
 			  " not in-sync.", lv->vg->name, lv->name);
 		return 0;

View File

@@ -22,11 +22,52 @@ aux lvmconf 'allocation/maximise_cling = 0' \

 aux prepare_vg 8

+function delay
+{
+	for d in $(< DEVICES)
+	do
+		aux delay_dev "$d" 0 $1 $(get first_extent_sector "$d")
+	done
+}
+
 # It's possible small raid arrays do have problems with reporting in-sync.
 # So try bigger size
 RAID_SIZE=32

+# Fast sync and repair afterwards
+delay 0
+
+# RAID1 dual-leg single replace after initial sync
+lvcreate --type raid1 -m 1 -L $RAID_SIZE -n $lv1 $vg "$dev1" "$dev2"
+aux wait_for_sync $vg $lv1
+aux disable_dev "$dev2"
+lvconvert -y --repair $vg/$lv1
+vgreduce --removemissing $vg
+aux enable_dev "$dev2"
+vgextend $vg "$dev2"
+lvremove -ff $vg/$lv1
+
+# Delayed sync to allow for repair during rebuild
+delay 50
+
+# RAID1 triple-leg single replace during initial sync
+lvcreate --type raid1 -m 2 -L $RAID_SIZE -n $lv1 $vg "$dev1" "$dev2" "$dev3"
+aux disable_dev "$dev2" "$dev3"
+not lvconvert -y --repair $vg/$lv1
+aux wait_for_sync $vg $lv1
+lvconvert -y --repair $vg/$lv1
+vgreduce --removemissing $vg
+aux enable_dev "$dev2" "$dev3"
+vgextend $vg "$dev2" "$dev3"
+lvremove -ff $vg/$lv1
+
+# Larger RAID size possible for striped RAID
+RAID_SIZE=64
+
-# RAID5 single replace
+# Fast sync and repair afterwards
+delay 0
+
+# RAID5 single replace after initial sync
 lvcreate --type raid5 -i 2 -L $RAID_SIZE -n $lv1 $vg "$dev1" "$dev2" "$dev3"
 aux wait_for_sync $vg $lv1
 aux disable_dev "$dev3"
@@ -34,16 +75,69 @@ lvconvert -y --repair $vg/$lv1
 vgreduce --removemissing $vg
 aux enable_dev "$dev3"
 vgextend $vg "$dev3"
-lvremove -ff $vg
+lvremove -ff $vg/$lv1

-# RAID6 double replace
+# Delayed sync to allow for repair during rebuild
+delay 50
+
+# RAID5 single replace during initial sync
+lvcreate --type raid5 -i 2 -L $RAID_SIZE -n $lv1 $vg "$dev1" "$dev2" "$dev3"
+aux disable_dev "$dev3"
+not lvconvert -y --repair $vg/$lv1
+aux wait_for_sync $vg $lv1
+lvconvert -y --repair $vg/$lv1
+vgreduce --removemissing $vg
+aux enable_dev "$dev3"
+vgextend $vg "$dev3"
+lvremove -ff $vg/$lv1
+
+# Fast sync and repair afterwards
+delay 0
+
+# RAID6 double replace after initial sync
 lvcreate --type raid6 -i 3 -L $RAID_SIZE -n $lv1 $vg \
 	"$dev1" "$dev2" "$dev3" "$dev4" "$dev5"
 aux wait_for_sync $vg $lv1
 aux disable_dev "$dev4" "$dev5"
 lvconvert -y --repair $vg/$lv1
 vgreduce --removemissing $vg
-aux enable_dev "$dev4"
-aux enable_dev "$dev5"
+aux enable_dev "$dev4" "$dev5"
 vgextend $vg "$dev4" "$dev5"
 lvremove -ff $vg/$lv1

+# Delayed sync to allow for repair during rebuild
+delay 50
+
+# RAID6 single replace after initial sync
+lvcreate --type raid6 -i 3 -L $RAID_SIZE -n $lv1 $vg \
+	"$dev1" "$dev2" "$dev3" "$dev4" "$dev5"
+aux disable_dev "$dev4"
+not lvconvert -y --repair $vg/$lv1
+delay 0 # Fast sync and repair afterwards
+aux disable_dev "$dev4" # Need to disable again after changing delay
+aux wait_for_sync $vg $lv1
+lvconvert -y --repair $vg/$lv1
+vgreduce --removemissing $vg
+aux enable_dev "$dev4"
+vgextend $vg "$dev4"
+lvremove -ff $vg/$lv1
+
+# Delayed sync to allow for repair during rebuild
+delay 50
+
+# RAID10 single replace after initial sync
+lvcreate --type raid10 -m 1 -i 2 -L $RAID_SIZE -n $lv1 $vg \
+	"$dev1" "$dev2" "$dev3" "$dev4"
+aux disable_dev "$dev4"
+not lvconvert -y --repair $vg/$lv1
+delay 0 # Fast sync and repair afterwards
+aux disable_dev "$dev4" # Need to disable again after changing delay
+aux disable_dev "$dev1"
+aux wait_for_sync $vg $lv1
+lvconvert -y --repair $vg/$lv1
+vgreduce --removemissing $vg
+aux enable_dev "$dev4"
+vgextend $vg "$dev4"
+lvremove -ff $vg/$lv1

 vgremove -ff $vg
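
The new cases can be exercised through the LVM2 test-suite harness; a sketch
assuming an in-tree build, with T= selecting the script named in the commit
message:

    # from the top of the lvm2 source tree
    make check_local T=lvconvert-rebuild-raid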

View File

@@ -1974,24 +1974,6 @@ static int _lvconvert_raid(struct logical_volume *lv, struct lvconvert_params *l
 		return 0;
 	}

-	if (!lv_raid_percent(lv, &sync_percent)) {
-		log_error("Unable to determine sync status of %s.",
-			  display_lvname(lv));
-		return 0;
-	}
-
-	if (sync_percent != DM_PERCENT_100) {
-		log_warn("WARNING: %s is not in-sync.", display_lvname(lv));
-		log_warn("WARNING: Portions of the array may be unrecoverable.");
-
-		/*
-		 * The kernel will not allow a device to be replaced
-		 * in an array that is not in-sync unless we override
-		 * by forcing the array to be considered "in-sync".
-		 */
-		init_mirror_in_sync(1);
-	}
-
 	_lvconvert_raid_repair_ask(cmd, lp, &replace);

 	if (replace) {