diff --git a/WHATS_NEW b/WHATS_NEW
index a2d1cc381..8a6f5a79c 100644
--- a/WHATS_NEW
+++ b/WHATS_NEW
@@ -1,5 +1,6 @@
 Version 2.02.87 - 
 ===============================
+  Add ability to down-convert RAID1 arrays.
   Update udev rules to skip DM flags decoding for removed devices.
   Add detect_internal_vg_cache_corruption to lvm.conf, disabled by default.
   Use memory pool locking to check for corruption of internal VG structs.
diff --git a/lib/Makefile.in b/lib/Makefile.in
index 433a61536..313c3b77f 100644
--- a/lib/Makefile.in
+++ b/lib/Makefile.in
@@ -85,6 +85,7 @@ SOURCES =\
 	metadata/pv.c \
 	metadata/pv_manip.c \
 	metadata/pv_map.c \
+	metadata/raid_manip.c \
 	metadata/replicator_manip.c \
 	metadata/segtype.c \
 	metadata/snapshot_manip.c \
diff --git a/lib/activate/activate.c b/lib/activate/activate.c
index d368f38c5..5dcfb79f2 100644
--- a/lib/activate/activate.c
+++ b/lib/activate/activate.c
@@ -611,6 +611,15 @@ int lv_mirror_percent(struct cmd_context *cmd, const struct logical_volume *lv,
 	return r;
 }
 
+/*
+ * Sync percentage of a RAID LV.  Forwards to lv_mirror_percent() with the
+ * LV's own command context, wait == 0 and no event number requested.
+ */
+int lv_raid_percent(const struct logical_volume *lv, percent_t *percent)
+{
+	return lv_mirror_percent(lv->vg->cmd, lv, 0, percent, NULL);
+}
+
 static int _lv_active(struct cmd_context *cmd, struct logical_volume *lv)
 {
 	struct lvinfo info;
diff --git a/lib/activate/activate.h b/lib/activate/activate.h
index f253c1277..7030633c7 100644
--- a/lib/activate/activate.h
+++ b/lib/activate/activate.h
@@ -93,6 +93,7 @@ int lv_check_transient(struct logical_volume *lv);
 int lv_snapshot_percent(const struct logical_volume *lv, percent_t *percent);
 int lv_mirror_percent(struct cmd_context *cmd, const struct logical_volume *lv,
 		      int wait, percent_t *percent, uint32_t *event_nr);
+int lv_raid_percent(const struct logical_volume *lv, percent_t *percent);
 
 /*
  * Return number of LVs in the VG that are active.
diff --git a/lib/metadata/metadata-exported.h b/lib/metadata/metadata-exported.h
index 6b0b2c022..ee9033033 100644
--- a/lib/metadata/metadata-exported.h
+++ b/lib/metadata/metadata-exported.h
@@ -737,6 +737,13 @@ int lv_is_rlog(const struct logical_volume *lv);
 int lv_is_slog(const struct logical_volume *lv);
 struct logical_volume *first_replicator_dev(const struct logical_volume *lv);
 /* -- metadata/replicator_manip.c */
+
+/* ++ metadata/raid_manip.c */
+uint32_t lv_raid_image_count(const struct logical_volume *lv);
+int lv_raid_change_image_count(struct logical_volume *lv,
+			       uint32_t new_count, struct dm_list *pvs);
+/* -- metadata/raid_manip.c */
+
 struct cmd_vg *cmd_vg_add(struct dm_pool *mem, struct dm_list *cmd_vgs,
 			  const char *vg_name, const char *vgid,
 			  uint32_t flags);
diff --git a/lib/metadata/raid_manip.c b/lib/metadata/raid_manip.c
new file mode 100644
index 000000000..59c2e10ee
--- /dev/null
+++ b/lib/metadata/raid_manip.c
@@ -0,0 +1,528 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc. All rights reserved.
+ *
+ * This file is part of LVM2.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU Lesser General Public License v.2.1.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "lib.h"
+#include "metadata.h"
+#include "toolcontext.h"
+#include "segtype.h"
+#include "display.h"
+#include "archiver.h"
+#include "activate.h"
+#include "lv_alloc.h"
+#include "lvm-string.h"
+#include "str_list.h"
+#include "memlock.h"
+
+/*
+ * lv_raid_image_count
+ * @lv
+ *
+ * Returns: the area count of the first segment if it is a RAID segment,
+ * otherwise 1.
+ */
+uint32_t lv_raid_image_count(const struct logical_volume *lv)
+{
+	struct lv_segment *seg = first_seg(lv);
+
+	if (!seg_is_raid(seg))
+		return 1;
+
+	return seg->area_count;
+}
+
+/*
+ * lv_is_on_pv
+ * @lv:
+ * @pv:
+ *
+ * If any of the component devices of the LV are on the given PV, 1
+ * is returned; otherwise 0.  For example if one of the images of a RAID
+ * (or its metadata device) is on the PV, 1 would be returned for the
+ * top-level LV.
+ * If you wish to check the images themselves, you should pass them.
+ *
+ * FIXME: This should be made more generic, possibly use 'for_each_sub_lv',
+ * and be put in lv_manip.c.  'for_each_sub_lv' does not yet allow us to
+ * short-circuit execution or pass back the values we need yet though...
+ */
+static int lv_is_on_pv(struct logical_volume *lv, struct physical_volume *pv)
+{
+	uint32_t s;
+	struct physical_volume *pv2;
+	struct lv_segment *seg;
+
+	if (!lv)
+		return 0;
+
+	seg = first_seg(lv);
+	if (!seg)
+		return 0;
+
+	/* Check mirror log */
+	if (lv_is_on_pv(seg->log_lv, pv))
+		return 1;
+
+	/* Check stack of LVs */
+	dm_list_iterate_items(seg, &lv->segments) {
+		for (s = 0; s < seg->area_count; s++) {
+			if (seg_type(seg, s) == AREA_PV) {
+				pv2 = seg_pv(seg, s);
+				if (id_equal(&pv->id, &pv2->id))
+					return 1;
+				if (pv->dev && pv2->dev &&
+				    (pv->dev->dev == pv2->dev->dev))
+					return 1;
+			}
+
+			if ((seg_type(seg, s) == AREA_LV) &&
+			    lv_is_on_pv(seg_lv(seg, s), pv))
+				return 1;
+
+			if (!seg_is_raid(seg))
+				continue;
+
+			/* This is RAID, so we know the meta_area is AREA_LV */
+			if (lv_is_on_pv(seg_metalv(seg, s), pv))
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * lv_is_on_pvs
+ * @lv
+ * @pvs: List of 'struct pv_list'
+ *
+ * Returns: 1 if lv_is_on_pv() is true for any PV in the list, else 0
+ */
+static int lv_is_on_pvs(struct logical_volume *lv, struct dm_list *pvs)
+{
+	struct pv_list *pvl;
+
+	dm_list_iterate_items(pvl, pvs) {
+		if (lv_is_on_pv(lv, pvl->pv)) {
+			log_debug("%s is on %s", lv->name,
+				  pv_dev_name(pvl->pv));
+			return 1;
+		}
+		log_debug("%s is not on %s", lv->name,
+			  pv_dev_name(pvl->pv));
+	}
+
+	return 0;
+}
+
+/*
+ * raid_in_sync
+ * @lv
+ *
+ * Returns: 1 if the sync percentage is PERCENT_100, 0 if it is not
+ * or cannot be determined
+ */
+static int raid_in_sync(struct logical_volume *lv)
+{
+	percent_t sync_percent;
+
+	if (!lv_raid_percent(lv, &sync_percent)) {
+		log_error("Unable to determine sync status of %s/%s.",
+			  lv->vg->name, lv->name);
+		return 0;
+	}
+
+	return (sync_percent == PERCENT_100) ? 1 : 0;
+}
+
+/*
+ * _shift_and_rename_image_components
+ * @seg: Top-level RAID segment
+ *
+ * Shift all higher indexed segment areas down to fill in gaps where
+ * there are 'AREA_UNASSIGNED' areas and rename data/metadata LVs so
+ * that their names match their new index.  When finished, set
+ * seg->area_count to new reduced total.
+ *
+ * Returns: 1 on success, 0 on failure
+ */
+static int _shift_and_rename_image_components(struct lv_segment *seg)
+{
+	int len;
+	char *shift_name;
+	uint32_t s, missing;
+	struct cmd_context *cmd = seg->lv->vg->cmd;
+
+	/*
+	 * All LVs must be properly named for their index before
+	 * shifting begins.  (e.g.  Index '0' must contain *_rimage_0 and
+	 * *_rmeta_0.  Index 'n' must contain *_rimage_n and *_rmeta_n.)
+	 */
+
+	if (!seg_is_raid(seg))
+		return_0;
+
+	if (seg->area_count > 10) {
+		/*
+		 * FIXME: Handling more would mean I'd have
+		 * to handle double digits
+		 */
+		log_error("Unable to handle arrays with more than 10 devices");
+		return 0;
+	}
+
+	log_very_verbose("Shifting images in %s", seg->lv->name);
+
+	for (s = 0, missing = 0; s < seg->area_count; s++) {
+		if (seg_type(seg, s) == AREA_UNASSIGNED) {
+			if (seg_metatype(seg, s) != AREA_UNASSIGNED) {
+				log_error(INTERNAL_ERROR "Metadata segment area"
+					  " #%u should be AREA_UNASSIGNED", s);
+				return 0;
+			}
+			missing++;
+			continue;
+		}
+		if (!missing)
+			continue;
+
+		log_very_verbose("Shifting %s and %s by %u",
+				 seg_metalv(seg, s)->name,
+				 seg_lv(seg, s)->name, missing);
+
+		/* Alter rmeta name */
+		shift_name = dm_pool_strdup(cmd->mem, seg_metalv(seg, s)->name);
+		if (!shift_name)
+			return_0;
+		len = strlen(shift_name) - 1;
+		shift_name[len] -= missing;
+		seg_metalv(seg, s)->name = shift_name;
+
+		/* Alter rimage name */
+		shift_name = dm_pool_strdup(cmd->mem, seg_lv(seg, s)->name);
+		if (!shift_name)
+			return_0;
+		len = strlen(shift_name) - 1;
+		shift_name[len] -= missing;
+		seg_lv(seg, s)->name = shift_name;
+
+		seg->areas[s - missing] = seg->areas[s];
+		seg->meta_areas[s - missing] = seg->meta_areas[s];
+	}
+
+	seg->area_count -= missing;
+	return 1;
+}
+
+static int raid_add_images(struct logical_volume *lv,
+			   uint32_t new_count, struct dm_list *pvs)
+{
+	/* Not implemented */
+	log_error("Unable to add images to LV, %s/%s",
+		  lv->vg->name, lv->name);
+
+	return 0;
+}
+
+/*
+ * _extract_image_components
+ * @seg
+ * @idx:  The index in the areas array to remove
+ * @extracted_rmeta:  The displaced metadata LV
+ * @extracted_rimage:  The displaced data LV
+ *
+ * This function extracts the image components - setting the respective
+ * 'extracted' pointers.  It appends '_extracted' to the LVs' names, so that
+ * there are not future conflicts.  It does /not/ commit the results.
+ * (IOW, erroring-out requires no unwinding of operations.)
+ *
+ * This function does /not/ attempt to:
+ *    1) shift the 'areas' or 'meta_areas' arrays.
+ *       The '[meta_]areas' are left as AREA_UNASSIGNED.
+ *    2) Adjust the seg->area_count
+ *    3) Name the extracted LVs appropriately (appends '_extracted' to names)
+ * These actions must be performed by the caller.
+ *
+ * Returns: 1 on success, 0 on failure
+ */
+static int _extract_image_components(struct lv_segment *seg, uint32_t idx,
+				     struct logical_volume **extracted_rmeta,
+				     struct logical_volume **extracted_rimage)
+{
+	size_t len;
+	char *meta_name, *data_name;
+	struct cmd_context *cmd = seg->lv->vg->cmd;
+	struct logical_volume *data_lv = seg_lv(seg, idx);
+	struct logical_volume *meta_lv = seg_metalv(seg, idx);
+
+	log_very_verbose("Extracting image components %s and %s from %s",
+			 data_lv->name, meta_lv->name, seg->lv->name);
+
+	/*
+	 * Build both new names before touching the segment, so that an
+	 * allocation failure leaves nothing to unwind.
+	 */
+	len = strlen(meta_lv->name) + strlen("_extracted") + 1;
+	if (!(meta_name = dm_pool_alloc(cmd->mem, len)))
+		return_0;
+	sprintf(meta_name, "%s_extracted", meta_lv->name);
+
+	len = strlen(data_lv->name) + strlen("_extracted") + 1;
+	if (!(data_name = dm_pool_alloc(cmd->mem, len)))
+		return_0;
+	sprintf(data_name, "%s_extracted", data_lv->name);
+
+	data_lv->status &= ~RAID_IMAGE;
+	meta_lv->status &= ~RAID_META;
+	lv_set_visible(data_lv);
+	lv_set_visible(meta_lv);
+
+	/* release removes data and meta areas */
+	remove_seg_from_segs_using_this_lv(data_lv, seg);
+	remove_seg_from_segs_using_this_lv(meta_lv, seg);
+
+	seg_type(seg, idx) = AREA_UNASSIGNED;
+	seg_metatype(seg, idx) = AREA_UNASSIGNED;
+
+	meta_lv->name = meta_name;
+	data_lv->name = data_name;
+
+	*extracted_rmeta = meta_lv;
+	*extracted_rimage = data_lv;
+
+	return 1;
+}
+
+/*
+ * raid_extract_images
+ * @lv
+ * @new_count:  The absolute count of images (e.g. '2' for a 2-way mirror)
+ * @target_pvs:  The list of PVs that are candidates for removal
+ * @shift:  If set, use _shift_and_rename_image_components().
+ *          Otherwise, leave the [meta_]areas as AREA_UNASSIGNED and
+ *          seg->area_count unchanged.
+ * @extracted_[meta|data]_lvs:  The LVs removed from the array.  If 'shift'
+ *                              is set, then there will likely be name conflicts.
+ *
+ * This function extracts _both_ portions of the indexed image.  It
+ * does /not/ commit the results.  (IOW, erroring-out requires no unwinding
+ * of operations.)
+ *
+ * Returns: 1 on success, 0 on failure
+ */
+static int raid_extract_images(struct logical_volume *lv, uint32_t new_count,
+			       struct dm_list *target_pvs, int shift,
+			       struct dm_list *extracted_meta_lvs,
+			       struct dm_list *extracted_data_lvs)
+{
+	int s;
+	uint32_t extract, lvl_idx = 0;
+	const char *image_name;
+	struct lv_list *lvl_array;
+	struct lv_segment *seg = first_seg(lv);
+	struct logical_volume *rmeta_lv, *rimage_lv;
+
+	/* Guard the unsigned subtraction below */
+	if (new_count >= seg->area_count) {
+		log_error(INTERNAL_ERROR "Image count %u is not below the "
+			  "current count of %s/%s", new_count,
+			  lv->vg->name, lv->name);
+		return 0;
+	}
+
+	extract = seg->area_count - new_count;
+	log_verbose("Extracting %u %s from %s/%s", extract,
+		    (extract > 1) ? "images" : "image",
+		    lv->vg->name, lv->name);
+
+	lvl_array = dm_pool_alloc(lv->vg->cmd->mem,
+				  sizeof(*lvl_array) * extract * 2);
+	if (!lvl_array)
+		return_0;
+
+	for (s = seg->area_count - 1; (s >= 0) && extract; s--) {
+		if (!lv_is_on_pvs(seg_lv(seg, s), target_pvs) ||
+		    !lv_is_on_pvs(seg_metalv(seg, s), target_pvs))
+			continue;
+		if (!raid_in_sync(lv) &&
+		    (!seg_is_mirrored(seg) || (s == 0))) {
+			log_error("Unable to extract %sRAID image"
+				  " while RAID array is not in-sync",
+				  seg_is_mirrored(seg) ? "primary " : "");
+			return 0;
+		}
+
+		/* The area is unassigned after extraction: save the name */
+		image_name = seg_lv(seg, s)->name;
+		if (!_extract_image_components(seg, s, &rmeta_lv, &rimage_lv)) {
+			log_error("Failed to extract %s from %s",
+				  image_name, lv->name);
+			return 0;
+		}
+
+		if (shift && !_shift_and_rename_image_components(seg)) {
+			log_error("Failed to shift and rename image components");
+			return 0;
+		}
+
+		lvl_array[lvl_idx].lv = rmeta_lv;
+		lvl_array[lvl_idx + 1].lv = rimage_lv;
+		dm_list_add(extracted_meta_lvs, &(lvl_array[lvl_idx++].list));
+		dm_list_add(extracted_data_lvs, &(lvl_array[lvl_idx++].list));
+
+		extract--;
+	}
+	if (extract) {
+		log_error("Unable to extract enough images to satisfy request");
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * lv_raid_change_image_count
+ * @lv
+ * @new_count: The absolute count of images (e.g. '2' for a 2-way mirror)
+ * @pvs: The list of PVs that are candidates for removal (or empty list)
+ *
+ * RAID arrays have 'images' which are composed of two parts, they are:
+ *    - 'rimage': The data/parity holding portion
+ *    - 'rmeta' : The metadata holding portion (i.e. superblock/bitmap area)
+ * This function adds or removes _both_ portions of the image and commits
+ * the results.
+ *
+ * Returns: 1 on success, 0 on failure
+ */
+int lv_raid_change_image_count(struct logical_volume *lv,
+			       uint32_t new_count, struct dm_list *pvs)
+{
+	int r;
+	uint32_t old_count = lv_raid_image_count(lv);
+	struct lv_segment *seg = first_seg(lv);
+	struct dm_list removal_list;
+	struct lv_list *lvl_array, *lvl;
+
+	dm_list_init(&removal_list);
+
+	if (!seg_is_mirrored(seg)) {
+		log_error("Unable to change image count of non-mirrored RAID.");
+		return 0;
+	}
+
+	if (!new_count) {
+		log_error("Unable to reduce %s/%s to zero images.",
+			  lv->vg->name, lv->name);
+		return 0;
+	}
+
+	if (old_count == new_count) {
+		log_verbose("%s/%s already has image count of %u",
+			    lv->vg->name, lv->name, new_count);
+		return 1;
+	}
+
+	if (old_count > new_count)
+		r = raid_extract_images(lv, new_count, pvs, 1,
+					&removal_list, &removal_list);
+	else
+		r = raid_add_images(lv, new_count, pvs);
+	if (!r)
+		return 0;
+
+	/* Convert to linear? */
+	if (new_count == 1) {
+		/* Add last metadata area to removal_list */
+		lvl_array = dm_pool_alloc(lv->vg->cmd->mem,
+					  2 * sizeof(*lvl_array));
+		if (!lvl_array)
+			return_0;
+		lvl_array[0].lv = seg_metalv(seg, 0);
+		remove_seg_from_segs_using_this_lv(seg_metalv(seg, 0), seg);
+		seg_metatype(seg, 0) = AREA_UNASSIGNED;
+		dm_list_add(&removal_list, &(lvl_array[0].list));
+
+		/* Remove RAID layer */
+		seg_lv(seg, 0)->status &= ~RAID_IMAGE;
+		lv_set_visible(seg_lv(seg, 0));
+		lvl_array[1].lv = seg_lv(seg, 0);
+		dm_list_add(&removal_list, &(lvl_array[1].list));
+
+		if (!remove_layer_from_lv(lv, seg_lv(seg, 0)))
+			return_0;
+		lv->status &= ~(MIRRORED | RAID);
+	}
+
+	if (!vg_write(lv->vg)) {
+		log_error("Failed to write changes to %s in %s",
+			  lv->name, lv->vg->name);
+		return 0;
+	}
+
+	if (!suspend_lv(lv->vg->cmd, lv)) {
+		log_error("Failed to suspend %s/%s before committing changes",
+			  lv->vg->name, lv->name);
+		/* Abandon the uncommitted metadata written by vg_write() */
+		vg_revert(lv->vg);
+		return 0;
+	}
+
+	if (!vg_commit(lv->vg)) {
+		log_error("Failed to commit changes to %s in %s",
+			  lv->name, lv->vg->name);
+		/* Do not leave the LV suspended on failure */
+		if (!resume_lv(lv->vg->cmd, lv))
+			stack;
+		return 0;
+	}
+
+	/*
+	 * Bring extracted LVs into existence, so there are no
+	 * conflicts for the main RAID device's resume
+	 */
+	if (!dm_list_empty(&removal_list)) {
+		dm_list_iterate_items(lvl, &removal_list) {
+			/* If top RAID was EX, use EX */
+			if (lv_is_active_exclusive_locally(lv)) {
+				if (!activate_lv_excl(lv->vg->cmd, lvl->lv))
+					return_0;
+			} else {
+				if (!activate_lv(lv->vg->cmd, lvl->lv))
+					return_0;
+			}
+		}
+	}
+
+	if (!resume_lv(lv->vg->cmd, lv)) {
+		log_error("Failed to resume %s/%s after committing changes",
+			  lv->vg->name, lv->name);
+		return 0;
+	}
+
+	/*
+	 * Eliminate the extracted LVs
+	 */
+	if (!dm_list_empty(&removal_list)) {
+		dm_list_iterate_items(lvl, &removal_list) {
+			if (!deactivate_lv(lv->vg->cmd, lvl->lv))
+				return_0;
+			if (!lv_remove(lvl->lv))
+				return_0;
+		}
+
+		if (!vg_write(lv->vg) || !vg_commit(lv->vg))
+			return_0;
+	}
+
+	return 1;
+}
diff --git a/test/t-lvcreate-raid.sh b/test/t-lvcreate-raid.sh
new file mode 100644
index 000000000..ca51bd6f1
--- /dev/null
+++ b/test/t-lvcreate-raid.sh
@@ -0,0 +1,122 @@
+# Copyright (C) 2011 Red Hat, Inc. All rights reserved.
+#
+# This copyrighted material is made available to anyone wishing to use,
+# modify, copy, or redistribute it subject to the terms and conditions
+# of the GNU General Public License v.2.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+. lib/test
+
+# is_raid_in_sync <VG/LV>
+function is_raid_in_sync()
+{
+	local dm_name
+	local a
+	local b
+	local idx
+
+	dm_name=`echo "$1" | sed s:-:--:g | sed s:/:-:`
+
+	if ! a=(`dmsetup status "$dm_name"`); then
+		echo "Unable to get sync status of $1"
+		exit 1
+	fi
+	idx=$((${#a[@]} - 1))
+	b=(`echo "${a[$idx]}" | sed s:/:' ':`)
+
+	if [ "${b[0]}" != "${b[1]}" ]; then
+		echo "$dm_name (${a[3]}) is not in-sync"
+		return 1
+	fi
+
+	echo "$dm_name (${a[3]}) is in-sync"
+	return 0
+}
+
+# wait_for_raid_sync <VG/LV>
+function wait_for_raid_sync()
+{
+	local i=0
+
+	while ! is_raid_in_sync "$1"; do
+		sleep 2
+		i=$(($i + 1))
+		if [ $i -gt 500 ]; then
+			echo "Sync is taking too long - assume stuck"
+			exit 1
+		fi
+	done
+}
+
+function is_raid_available()
+{
+	local a
+
+	modprobe dm-raid
+	a=(`dmsetup targets | grep raid`)
+	if [ -z "$a" ]; then
+		echo "RAID target not available"
+		return 1
+	fi
+	if [ "${a[1]}" != "v1.1.0" ]; then
+		echo "Bad RAID version"
+		return 1
+	fi
+
+	return 0
+}
+
+########################################################
+# MAIN
+########################################################
+is_raid_available || exit 200
+
+aux prepare_vg 5 80
+
+
+###########################################
+# Create, wait for sync, remove tests
+###########################################
+
+# Create RAID1 (implicit 2-way)
+lvcreate --type raid1 -l 2 -n $lv1 $vg
+wait_for_raid_sync $vg/$lv1
+lvremove -ff $vg
+
+# Create RAID1 (explicit 2-way)
+lvcreate --type raid1 -m 1 -l 2 -n $lv1 $vg
+wait_for_raid_sync $vg/$lv1
+lvremove -ff $vg
+
+# Create RAID1 (explicit 3-way)
+lvcreate --type raid1 -m 2 -l 2 -n $lv1 $vg
+wait_for_raid_sync $vg/$lv1
+lvremove -ff $vg
+
+# Create RAID 4/5/6 (explicit 3-stripe + parity devs)
+for i in raid4 \
+	raid5 raid5_ls raid5_la raid5_rs raid5_ra \
+	raid6 raid6_zr raid6_nr raid6_nc; do
+
+	lvcreate --type $i -l 3 -i 3 -n $lv1 $vg
+	wait_for_raid_sync $vg/$lv1
+	lvremove -ff $vg
+done
+
+###########################################
+# RAID1 down-convert tests
+###########################################
+
+# 3-way to 2-way
+lvcreate --type raid1 -m 2 -l 2 -n $lv1 $vg
+wait_for_raid_sync $vg/$lv1
+lvconvert -m 1 $vg/$lv1
+# FIXME: ensure no residual devices
+
+# 2-way to linear
+lvconvert -m 0 $vg/$lv1
+# FIXME: ensure no residual devices
+lvremove -ff $vg
diff --git a/tools/lvconvert.c b/tools/lvconvert.c
index 20ec617bc..d3d5b73e0 100644
--- a/tools/lvconvert.c
+++ b/tools/lvconvert.c
@@ -39,7 +39,7 @@ struct lvconvert_params {
 	uint32_t stripes;
 	uint32_t stripe_size;
 
-	struct segment_type *segtype;
+	const struct segment_type *segtype;
 
 	alloc_policy_t alloc;
 
@@ -1366,6 +1366,70 @@ static int _lvconvert_mirrors(struct cmd_context *cmd,
 	return 1;
 }
 
+/*
+ * Only a "conversion" that keeps the segment type is accepted so far;
+ * every other combination is rejected.
+ */
+static int is_valid_raid_conversion(const struct segment_type *from_segtype,
+				    const struct segment_type *to_segtype)
+{
+	if (from_segtype == to_segtype)
+		return 1;
+
+	if (!segtype_is_raid(from_segtype) && !segtype_is_raid(to_segtype))
+		return_0;  /* Not converting to or from RAID? */
+
+	return 0;
+}
+
+/*
+ * Handle lvconvert for RAID LVs.  Currently limited to changing the
+ * number of images of a mirrored RAID segment via --mirrors/-m.
+ */
+static int lvconvert_raid(struct logical_volume *lv, struct lvconvert_params *lp)
+{
+	int image_count;
+	struct cmd_context *cmd = lv->vg->cmd;
+	struct lv_segment *seg = first_seg(lv);
+
+	if (!arg_count(cmd, type_ARG))
+		lp->segtype = seg->segtype;
+
+	if (arg_count(cmd, mirrors_ARG) && !seg_is_mirrored(seg)) {
+		log_error("'--mirrors/-m' is not compatible with %s",
+			  seg->segtype->name);
+		return 0;
+	}
+
+	if (!is_valid_raid_conversion(seg->segtype, lp->segtype)) {
+		log_error("Unable to convert %s/%s from %s to %s",
+			  lv->vg->name, lv->name,
+			  seg->segtype->name, lp->segtype->name);
+		return 0;
+	}
+
+	/* Change number of RAID1 images */
+	if (arg_count(cmd, mirrors_ARG)) {
+		image_count = lv_raid_image_count(lv);
+		if (lp->mirrors_sign == SIGN_PLUS)
+			image_count += lp->mirrors;
+		else if (lp->mirrors_sign == SIGN_MINUS)
+			image_count -= lp->mirrors;
+		else
+			image_count = lp->mirrors + 1;
+
+		if (image_count < 1) {
+			log_error("Unable to reduce images by specified amount");
+			return 0;
+		}
+
+		return lv_raid_change_image_count(lv, image_count, lp->pvh);
+	}
+
+	log_error("Conversion operation not yet supported.");
+	return 0;
+}
+
 static int lvconvert_snapshot(struct cmd_context *cmd,
 			      struct logical_volume *lv,
 			      struct lvconvert_params *lp)
@@ -1580,6 +1644,24 @@ static int _lvconvert_single(struct cmd_context *cmd, struct logical_volume *lv,
 			stack;
 			return ECMD_FAILED;
 		}
+	} else if (segtype_is_raid(lp->segtype) || (lv->status & RAID)) {
+		if (!archive(lv->vg)) {
+			stack;
+			return ECMD_FAILED;
+		}
+		if (!lvconvert_raid(lv, lp)) {
+			stack;
+			return ECMD_FAILED;
+		}
+
+		if (!(failed_pvs = _failed_pv_list(lv->vg))) {
+			stack;
+			return ECMD_FAILED;
+		}
+
+		/* If repairing and using policies, remove missing PVs from VG */
+		if (arg_count(cmd, repair_ARG) && arg_count(cmd, use_policies_ARG))
+			_remove_missing_empty_pv(lv->vg, failed_pvs);
 	} else if (arg_count(cmd, mirrors_ARG) ||
 		   arg_count(cmd, splitmirrors_ARG) ||
 		   (lv->status & MIRRORED)) {