
raid: Allow repair to reuse PVs from same image that suffered a PV failure

When repairing RAID LVs that have multiple PVs per image, allow
replacement images to be reallocated from the image's surviving PVs
if they have sufficient space.

This covers scenarios such as a 2-way RAID1 spread across 4 PVs, where
each image lives on two PVs but does not use the entire space on either
of them.  If one PV fails and the remaining PV in that image has enough
free space, the image can be reallocated on that remaining PV alone.
Jonathan Brassow 2014-06-25 22:26:06 -05:00
parent 7028fd31a0
commit ed3c2537b8
3 changed files with 133 additions and 15 deletions
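
In plain lvm2 commands the scenario looks roughly like the sketch below, which mirrors the test added by this commit. The VG name, LV name, and device paths are illustrative placeholders, and the explicit extent ranges simply force each RAID1 image to start out on two PVs:

    # Each image gets its metadata LV plus one data extent on one PV and the
    # remaining two data extents on a second PV, so no single PV holds a
    # complete image.
    lvcreate --type raid1 -m 1 -l 3 -n lv vg \
        /dev/sda:0-1 /dev/sdb:0-1 /dev/sdc:0-1 /dev/sdd:0-1

    # If /dev/sda later fails, repair no longer needs a spare fifth PV: the
    # affected image can be rebuilt entirely on /dev/sdb, provided /dev/sdb
    # still has enough free extents.
    lvconvert --repair vg/lv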

View File

@@ -1,5 +1,6 @@
 Version 2.02.108 -
 =================================
+  Allow RAID repair to reuse PVs from same image that suffered a failure.
   New RAID images now avoid allocation on any PVs in the same parent RAID LV.
   Always reevaluate filters just before creating PV.

View File

@@ -1501,6 +1501,85 @@ int lv_raid_reshape(struct logical_volume *lv,
         return 0;
 }
 
+static int _remove_partial_multi_segment_image(struct logical_volume *lv,
+                                               struct dm_list *remove_pvs)
+{
+        uint32_t s, extents_needed;
+        struct lv_segment *rm_seg, *raid_seg = first_seg(lv);
+        struct logical_volume *rm_image = NULL;
+        struct physical_volume *pv;
+
+        if (!(lv->status & PARTIAL_LV))
+                return_0;
+
+        for (s = 0; s < raid_seg->area_count; s++) {
+                extents_needed = 0;
+                if ((seg_lv(raid_seg, s)->status & PARTIAL_LV) &&
+                    lv_is_on_pvs(seg_lv(raid_seg, s), remove_pvs) &&
+                    (dm_list_size(&(seg_lv(raid_seg, s)->segments)) > 1)) {
+                        rm_image = seg_lv(raid_seg, s);
+
+                        /* First, how many damaged extents are there */
+                        if (seg_metalv(raid_seg, s)->status & PARTIAL_LV)
+                                extents_needed += seg_metalv(raid_seg, s)->le_count;
+                        dm_list_iterate_items(rm_seg, &rm_image->segments) {
+                                /*
+                                 * segment areas are for stripe, mirror, raid,
+                                 * etc.  We only need to check the first area
+                                 * if we are dealing with RAID image LVs.
+                                 */
+                                if (seg_type(rm_seg, 0) != AREA_PV)
+                                        continue;
+                                pv = seg_pv(rm_seg, 0);
+                                if (pv->status & MISSING_PV)
+                                        extents_needed += rm_seg->len;
+                        }
+                        log_debug("%u extents needed to repair %s",
+                                  extents_needed, rm_image->name);
+
+                        /* Second, do the other PVs have the space */
+                        dm_list_iterate_items(rm_seg, &rm_image->segments) {
+                                if (seg_type(rm_seg, 0) != AREA_PV)
+                                        continue;
+                                pv = seg_pv(rm_seg, 0);
+                                if (pv->status & MISSING_PV)
+                                        continue;
+                                if ((pv->pe_count - pv->pe_alloc_count) >
+                                    extents_needed) {
+                                        log_debug("%s has enough space for %s",
+                                                  pv_dev_name(pv),
+                                                  rm_image->name);
+                                        goto has_enough_space;
+                                }
+                                log_debug("Not enough space on %s for %s",
+                                          pv_dev_name(pv), rm_image->name);
+                        }
+                }
+        }
+
+        /*
+         * This is likely to be the normal case - single
+         * segment images.
+         */
+        return_0;
+
+has_enough_space:
+        /*
+         * Now we have a multi-segment, partial image that has enough
+         * space on just one of its PVs for the entire image to be
+         * replaced.  So, we replace the image's space with an error
+         * target so that the allocator can find that space (along with
+         * the remaining free space) in order to allocate the image
+         * anew.
+         */
+        if (!replace_lv_with_error_segment(rm_image))
+                return_0;
+
+        return 1;
+}
+
 /*
  * lv_raid_replace
  * @lv
@@ -1513,6 +1592,7 @@ int lv_raid_replace(struct logical_volume *lv,
                     struct dm_list *remove_pvs,
                     struct dm_list *allocate_pvs)
 {
+        int partial_segment_removed = 0;
         uint32_t s, sd, match_count = 0;
         struct dm_list old_lvs;
         struct dm_list new_meta_lvs, new_data_lvs;
@@ -1605,25 +1685,40 @@ int lv_raid_replace(struct logical_volume *lv,
 try_again:
         if (!_alloc_image_components(lv, allocate_pvs, match_count,
                                      &new_meta_lvs, &new_data_lvs)) {
-                log_error("Failed to allocate replacement images for %s/%s",
-                          lv->vg->name, lv->name);
-
-                /*
-                 * If this is a repair, then try to
-                 * do better than all-or-nothing
-                 */
-                if (match_count > 1) {
-                        log_error("Attempting replacement of %u devices"
-                                  " instead of %u", match_count - 1, match_count);
-                        match_count--;
+                if (!(lv->status & PARTIAL_LV))
+                        return 0;
+
+                /* This is a repair, so try to do better than all-or-nothing */
+                match_count--;
+                if (match_count > 0) {
+                        log_error("Failed to replace %u devices."
+                                  "  Attempting to replace %u instead.",
+                                  match_count, match_count+1);
 
                         /*
                          * Since we are replacing some but not all of the bad
                          * devices, we must set partial_activation
                          */
                         lv->vg->cmd->partial_activation = 1;
                         goto try_again;
+                } else if (!match_count && !partial_segment_removed) {
+                        /*
+                         * We are down to the last straw.  We can only hope
+                         * that a failed PV is just one of several PVs in
+                         * the image; and if we extract the image, there may
+                         * be enough room on the image's other PVs for a
+                         * reallocation of the image.
+                         */
+                        if (!_remove_partial_multi_segment_image(lv, remove_pvs))
+                                return_0;
+
+                        match_count = 1;
+                        partial_segment_removed = 1;
+                        lv->vg->cmd->partial_activation = 1;
+                        goto try_again;
                 }
+                log_error("Failed to allocate replacement images for %s/%s",
+                          lv->vg->name, lv->name);
 
                 return 0;
         }
@@ -1632,9 +1727,17 @@ try_again:
          *   - If we did this before the allocate, we wouldn't have to rename
          *     the allocated images, but it'd be much harder to avoid the right
          *     PVs during allocation.
+         *
+         *   - If this is a repair and we were forced to call
+         *     _remove_partial_multi_segment_image, then the remove_pvs list
+         *     is no longer relevant - _raid_extract_images is forced to replace
+         *     the image with the error target.  Thus, the full set of PVs is
+         *     supplied - knowing that only the image with the error target
+         *     will be affected.
          */
         if (!_raid_extract_images(lv, raid_seg->area_count - match_count,
-                                  remove_pvs, 0,
+                                  partial_segment_removed ?
+                                  &lv->vg->pvs : remove_pvs, 0,
                                   &old_lvs, &old_lvs)) {
                 log_error("Failed to remove the specified images from %s/%s",
                           lv->vg->name, lv->name);
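
The new code path only helps when a partially failed image has segments on more than one PV and a surviving PV of that image has enough free extents to hold the whole image. A quick way to check both conditions from the command line is sketched below; the reporting fields are standard lvm2 ones and the VG name is a placeholder:

    # Show which PVs each RAID sub-LV (image and metadata) occupies
    lvs -a -o lv_name,segtype,seg_size,devices vg

    # Show how much unallocated space each PV still has
    pvs -o pv_name,vg_name,pv_size,pv_free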

View File

@@ -27,7 +27,8 @@ lvconvert -m 0 $vg/$lv1
 # lvconvert --type raid1 -m 1 --alloc anywhere $vg/$lv1 "$dev1" "$dev2"
 lvremove -ff $vg
 
-# Setup 2-way RAID1 LV to spread across 4 devices.
+# Setup 2-way RAID1 LV, spread across 4 devices.
+# For each image:
 #  - metadata LV + 1 image extent (2 total extents) on one PV
 #  - 2 image extents on the other PV
@@ -43,10 +44,8 @@ aux wait_for_sync $vg $lv1
 # Should not be enough non-overlapping space.
 not lvconvert -m +1 $vg/$lv1 \
     "$dev5:0-1" "$dev1" "$dev2" "$dev3" "$dev4"
-
 lvconvert -m +1 $vg/$lv1 "$dev5"
 lvconvert -m 0 $vg/$lv1
-
 # Should work due to '--alloc anywhere'
 # RAID conversion not honoring allocation policy!
 #lvconvert -m +1 --alloc anywhere $vg/$lv1 \
@@ -54,4 +53,19 @@ lvconvert -m 0 $vg/$lv1
 
 lvremove -ff $vg
 
+# Setup 2-way RAID1 LV, spread across 4 devices
+#  - metadata LV + 1 image extent (2 total extents) on one PV
+#  - 2 image extents on the other PV
+# Kill one PV.  There should be enough space on the remaining
+# PV for that image to reallocate the entire image there and
+# still maintain redundancy.
+lvcreate --type raid1 -m 1 -l 3 -n $lv1 $vg \
+    "$dev1:0-1" "$dev2:0-1" "$dev3:0-1" "$dev4:0-1"
+aux wait_for_sync $vg $lv1
+aux disable_dev "$dev1"
+lvconvert --repair -y $vg/$lv1 "$dev1" "$dev2" "$dev3" "$dev4"
+#FIXME: ensure non-overlapping images (they should not share PVs)
+aux enable_dev "$dev1"
+lvremove -ff $vg
+
 vgremove -ff $vg