/* * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved. * Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. * * This file is part of LVM2. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU Lesser General Public License v.2.1. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software Foundation, * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "lib.h" #include "device.h" #include "metadata.h" #include "toolcontext.h" #include "lvm-string.h" #include "lvm-file.h" #include "lvmcache.h" #include "memlock.h" #include "str_list.h" #include "pv_alloc.h" #include "segtype.h" #include "activate.h" #include "display.h" #include "locking.h" #include "archiver.h" #include "defaults.h" #include "filter-persistent.h" #include #include static struct physical_volume *_pv_read(struct cmd_context *cmd, struct dm_pool *pvmem, const char *pv_name, struct format_instance *fid, int warnings, int scan_label_only); static struct physical_volume *_find_pv_by_name(struct cmd_context *cmd, const char *pv_name); static struct pv_list *_find_pv_in_vg(const struct volume_group *vg, const char *pv_name); static struct pv_list *_find_pv_in_vg_by_uuid(const struct volume_group *vg, const struct id *id); static uint32_t _vg_bad_status_bits(const struct volume_group *vg, uint64_t status); const char _really_init[] = "Really INITIALIZE physical volume \"%s\" of volume group \"%s\" [y/n]? "; static int _alignment_overrides_default(unsigned long data_alignment, unsigned long default_pe_align) { return data_alignment && (default_pe_align % data_alignment); } unsigned long set_pe_align(struct physical_volume *pv, unsigned long data_alignment) { unsigned long default_pe_align, temp_pe_align; if (pv->pe_align) goto out; if (data_alignment) { /* Always use specified data_alignment */ pv->pe_align = data_alignment; goto out; } default_pe_align = find_config_tree_int(pv->fmt->cmd, "devices/default_data_alignment", DEFAULT_DATA_ALIGNMENT); if (default_pe_align) /* align on 1 MiB multiple */ default_pe_align *= DEFAULT_PE_ALIGN; else /* align on 64 KiB multiple (old default) */ default_pe_align = DEFAULT_PE_ALIGN_OLD; pv->pe_align = MAX((default_pe_align << SECTOR_SHIFT), lvm_getpagesize()) >> SECTOR_SHIFT; if (!pv->dev) goto out; /* * Align to stripe-width of underlying md device if present */ if (find_config_tree_bool(pv->fmt->cmd, "devices/md_chunk_alignment", DEFAULT_MD_CHUNK_ALIGNMENT)) { temp_pe_align = dev_md_stripe_width(pv->fmt->cmd->sysfs_dir, pv->dev); if (_alignment_overrides_default(temp_pe_align, default_pe_align)) pv->pe_align = MAX(pv->pe_align, temp_pe_align); } /* * Align to topology's minimum_io_size or optimal_io_size if present * - minimum_io_size - the smallest request the device can perform * w/o incurring a read-modify-write penalty (e.g. MD's chunk size) * - optimal_io_size - the device's preferred unit of receiving I/O * (e.g. MD's stripe width) */ if (find_config_tree_bool(pv->fmt->cmd, "devices/data_alignment_detection", DEFAULT_DATA_ALIGNMENT_DETECTION)) { temp_pe_align = dev_minimum_io_size(pv->fmt->cmd->sysfs_dir, pv->dev); if (_alignment_overrides_default(temp_pe_align, default_pe_align)) pv->pe_align = MAX(pv->pe_align, temp_pe_align); temp_pe_align = dev_optimal_io_size(pv->fmt->cmd->sysfs_dir, pv->dev); if (_alignment_overrides_default(temp_pe_align, default_pe_align)) pv->pe_align = MAX(pv->pe_align, temp_pe_align); } out: log_very_verbose("%s: Setting PE alignment to %lu sectors.", dev_name(pv->dev), pv->pe_align); return pv->pe_align; } unsigned long set_pe_align_offset(struct physical_volume *pv, unsigned long data_alignment_offset) { if (pv->pe_align_offset) goto out; if (data_alignment_offset) { /* Always use specified data_alignment_offset */ pv->pe_align_offset = data_alignment_offset; goto out; } if (!pv->dev) goto out; if (find_config_tree_bool(pv->fmt->cmd, "devices/data_alignment_offset_detection", DEFAULT_DATA_ALIGNMENT_OFFSET_DETECTION)) { int align_offset = dev_alignment_offset(pv->fmt->cmd->sysfs_dir, pv->dev); /* must handle a -1 alignment_offset; means dev is misaligned */ if (align_offset < 0) align_offset = 0; pv->pe_align_offset = MAX(pv->pe_align_offset, align_offset); } out: log_very_verbose("%s: Setting PE alignment offset to %lu sectors.", dev_name(pv->dev), pv->pe_align_offset); return pv->pe_align_offset; } void add_pvl_to_vgs(struct volume_group *vg, struct pv_list *pvl) { dm_list_add(&vg->pvs, &pvl->list); vg->pv_count++; pvl->pv->vg = vg; pv_set_fid(pvl->pv, vg->fid); } void del_pvl_from_vgs(struct volume_group *vg, struct pv_list *pvl) { struct format_instance_ctx fic; struct format_instance *fid; vg->pv_count--; dm_list_del(&pvl->list); pvl->pv->vg = NULL; /* orphan */ /* Use a new PV-based format instance since the PV is orphan now. */ fic.type = FMT_INSTANCE_PV | FMT_INSTANCE_MDAS | FMT_INSTANCE_AUX_MDAS; fic.context.pv_id = (const char *) &pvl->pv->id; fid = pvl->pv->fid->fmt->ops->create_instance(pvl->pv->fid->fmt, &fic); pv_set_fid(pvl->pv, fid); } /** * add_pv_to_vg - Add a physical volume to a volume group * @vg - volume group to add to * @pv_name - name of the pv (to be removed) * @pv - physical volume to add to volume group * @pp - physical volume creation params (OPTIONAL) * * Returns: * 0 - failure * 1 - success * FIXME: remove pv_name - obtain safely from pv */ int add_pv_to_vg(struct volume_group *vg, const char *pv_name, struct physical_volume *pv, struct pvcreate_params *pp) { struct pv_to_create *pvc; struct pv_list *pvl; struct format_instance *fid = vg->fid; struct dm_pool *mem = vg->vgmem; char uuid[64] __attribute__((aligned(8))); log_verbose("Adding physical volume '%s' to volume group '%s'", pv_name, vg->name); if (!(pvl = dm_pool_zalloc(mem, sizeof(*pvl)))) { log_error("pv_list allocation for '%s' failed", pv_name); return 0; } if (!is_orphan_vg(pv->vg_name)) { log_error("Physical volume '%s' is already in volume group " "'%s'", pv_name, pv->vg_name); return 0; } if (pv->fmt != fid->fmt) { log_error("Physical volume %s is of different format type (%s)", pv_name, pv->fmt->name); return 0; } /* Ensure PV doesn't depend on another PV already in the VG */ if (pv_uses_vg(pv, vg)) { log_error("Physical volume %s might be constructed from same " "volume group %s", pv_name, vg->name); return 0; } if (!(pv->vg_name = dm_pool_strdup(mem, vg->name))) { log_error("vg->name allocation failed for '%s'", pv_name); return 0; } memcpy(&pv->vgid, &vg->id, sizeof(vg->id)); /* Units of 512-byte sectors */ pv->pe_size = vg->extent_size; /* * pe_count must always be calculated by pv_setup */ pv->pe_alloc_count = 0; if (!fid->fmt->ops->pv_setup(fid->fmt, pv, vg)) { log_error("Format-specific setup of physical volume '%s' " "failed.", pv_name); return 0; } if (_find_pv_in_vg(vg, pv_name) || _find_pv_in_vg_by_uuid(vg, &pv->id)) { if (!id_write_format(&pv->id, uuid, sizeof(uuid))) { stack; uuid[0] = '\0'; } log_error("Physical volume '%s (%s)' listed more than once.", pv_name, uuid); return 0; } if (vg->pv_count && (vg->pv_count == vg->max_pv)) { log_error("No space for '%s' - volume group '%s' " "holds max %d physical volume(s).", pv_name, vg->name, vg->max_pv); return 0; } if (!alloc_pv_segment_whole_pv(mem, pv)) return_0; if ((uint64_t) vg->extent_count + pv->pe_count > UINT32_MAX) { log_error("Unable to add %s to %s: new extent count (%" PRIu64 ") exceeds limit (%" PRIu32 ").", pv_name, vg->name, (uint64_t) vg->extent_count + pv->pe_count, UINT32_MAX); return 0; } pvl->pv = pv; add_pvl_to_vgs(vg, pvl); vg->extent_count += pv->pe_count; vg->free_count += pv->pe_count; if (pv->status & UNLABELLED_PV) { if (!(pvc = dm_pool_zalloc(mem, sizeof(*pvc)))) { log_error("pv_to_create allocation for '%s' failed", pv_name); return 0; } pvc->pv = pv; pvc->pp = pp; dm_list_add(&vg->pvs_to_create, &pvc->list); } return 1; } static int _copy_pv(struct dm_pool *pvmem, struct physical_volume *pv_to, struct physical_volume *pv_from) { memcpy(pv_to, pv_from, sizeof(*pv_to)); /* We must use pv_set_fid here to update the reference counter! */ pv_to->fid = NULL; pv_set_fid(pv_to, pv_from->fid); if (!(pv_to->vg_name = dm_pool_strdup(pvmem, pv_from->vg_name))) return_0; if (!str_list_dup(pvmem, &pv_to->tags, &pv_from->tags)) return_0; if (!peg_dup(pvmem, &pv_to->segments, &pv_from->segments)) return_0; return 1; } static struct pv_list *_copy_pvl(struct dm_pool *pvmem, struct pv_list *pvl_from) { struct pv_list *pvl_to = NULL; if (!(pvl_to = dm_pool_zalloc(pvmem, sizeof(*pvl_to)))) return_NULL; if (!(pvl_to->pv = dm_pool_alloc(pvmem, sizeof(*pvl_to->pv)))) goto_bad; if(!_copy_pv(pvmem, pvl_to->pv, pvl_from->pv)) goto_bad; return pvl_to; bad: dm_pool_free(pvmem, pvl_to); return NULL; } int get_pv_from_vg_by_id(const struct format_type *fmt, const char *vg_name, const char *vgid, const char *pvid, struct physical_volume *pv) { struct volume_group *vg; struct pv_list *pvl; int r = 0, consistent = 0; if (!(vg = vg_read_internal(fmt->cmd, vg_name, vgid, 1, &consistent))) { log_error("get_pv_from_vg_by_id: vg_read_internal failed to read VG %s", vg_name); return 0; } if (!consistent) log_warn("WARNING: Volume group %s is not consistent", vg_name); dm_list_iterate_items(pvl, &vg->pvs) { if (id_equal(&pvl->pv->id, (const struct id *) pvid)) { if (!_copy_pv(fmt->cmd->mem, pv, pvl->pv)) { log_error("internal PV duplication failed"); r = 0; goto out; } r = 1; goto out; } } out: free_vg(vg); return r; } int move_pv(struct volume_group *vg_from, struct volume_group *vg_to, const char *pv_name) { struct physical_volume *pv; struct pv_list *pvl; /* FIXME: handle tags */ if (!(pvl = find_pv_in_vg(vg_from, pv_name))) { log_error("Physical volume %s not in volume group %s", pv_name, vg_from->name); return 0; } if (_vg_bad_status_bits(vg_from, RESIZEABLE_VG) || _vg_bad_status_bits(vg_to, RESIZEABLE_VG)) return 0; del_pvl_from_vgs(vg_from, pvl); add_pvl_to_vgs(vg_to, pvl); pv = pvl->pv; vg_from->extent_count -= pv_pe_count(pv); vg_to->extent_count += pv_pe_count(pv); vg_from->free_count -= pv_pe_count(pv) - pv_pe_alloc_count(pv); vg_to->free_count += pv_pe_count(pv) - pv_pe_alloc_count(pv); return 1; } int move_pvs_used_by_lv(struct volume_group *vg_from, struct volume_group *vg_to, const char *lv_name) { struct lv_segment *lvseg; unsigned s; struct lv_list *lvl; struct logical_volume *lv; /* FIXME: handle tags */ if (!(lvl = find_lv_in_vg(vg_from, lv_name))) { log_error("Logical volume %s not in volume group %s", lv_name, vg_from->name); return 0; } if (_vg_bad_status_bits(vg_from, RESIZEABLE_VG) || _vg_bad_status_bits(vg_to, RESIZEABLE_VG)) return 0; dm_list_iterate_items(lvseg, &lvl->lv->segments) { if (lvseg->log_lv) if (!move_pvs_used_by_lv(vg_from, vg_to, lvseg->log_lv->name)) return_0; for (s = 0; s < lvseg->area_count; s++) { if (seg_type(lvseg, s) == AREA_PV) { if (!move_pv(vg_from, vg_to, pv_dev_name(seg_pv(lvseg, s)))) return_0; } else if (seg_type(lvseg, s) == AREA_LV) { lv = seg_lv(lvseg, s); if (!move_pvs_used_by_lv(vg_from, vg_to, lv->name)) return_0; } } } return 1; } static int validate_new_vg_name(struct cmd_context *cmd, const char *vg_name) { char vg_path[PATH_MAX]; if (!validate_name(vg_name)) return_0; snprintf(vg_path, PATH_MAX, "%s%s", cmd->dev_dir, vg_name); if (path_exists(vg_path)) { log_error("%s: already exists in filesystem", vg_path); return 0; } return 1; } int validate_vg_rename_params(struct cmd_context *cmd, const char *vg_name_old, const char *vg_name_new) { unsigned length; char *dev_dir; dev_dir = cmd->dev_dir; length = strlen(dev_dir); /* Check sanity of new name */ if (strlen(vg_name_new) > NAME_LEN - length - 2) { log_error("New volume group path exceeds maximum length " "of %d!", NAME_LEN - length - 2); return 0; } if (!validate_new_vg_name(cmd, vg_name_new)) { log_error("New volume group name \"%s\" is invalid", vg_name_new); return 0; } if (!strcmp(vg_name_old, vg_name_new)) { log_error("Old and new volume group names must differ"); return 0; } return 1; } int vg_rename(struct cmd_context *cmd, struct volume_group *vg, const char *new_name) { struct dm_pool *mem = vg->vgmem; struct pv_list *pvl; vg->old_name = vg->name; if (!(vg->name = dm_pool_strdup(mem, new_name))) { log_error("vg->name allocation failed for '%s'", new_name); return 0; } dm_list_iterate_items(pvl, &vg->pvs) { if (!(pvl->pv->vg_name = dm_pool_strdup(mem, new_name))) { log_error("pv->vg_name allocation failed for '%s'", pv_dev_name(pvl->pv)); return 0; } } return 1; } int remove_lvs_in_vg(struct cmd_context *cmd, struct volume_group *vg, force_t force) { struct dm_list *lst; struct lv_list *lvl; while ((lst = dm_list_first(&vg->lvs))) { lvl = dm_list_item(lst, struct lv_list); if (!lv_remove_with_dependencies(cmd, lvl->lv, force, 0)) return 0; } return 1; } int vg_remove_check(struct volume_group *vg) { unsigned lv_count; if (vg_read_error(vg) || vg_missing_pv_count(vg)) { log_error("Volume group \"%s\" not found, is inconsistent " "or has PVs missing.", vg ? vg->name : ""); log_error("Consider vgreduce --removemissing if metadata " "is inconsistent."); return 0; } if (!vg_check_status(vg, EXPORTED_VG)) return 0; lv_count = vg_visible_lvs(vg); if (lv_count) { log_error("Volume group \"%s\" still contains %u " "logical volume(s)", vg->name, lv_count); return 0; } if (!archive(vg)) return 0; return 1; } void vg_remove_pvs(struct volume_group *vg) { struct pv_list *pvl, *tpvl; dm_list_iterate_items_safe(pvl, tpvl, &vg->pvs) { del_pvl_from_vgs(vg, pvl); dm_list_add(&vg->removed_pvs, &pvl->list); } } int vg_remove(struct volume_group *vg) { struct physical_volume *pv; struct pv_list *pvl; int ret = 1; if (!lock_vol(vg->cmd, VG_ORPHANS, LCK_VG_WRITE)) { log_error("Can't get lock for orphan PVs"); return 0; } if (!vg_remove_mdas(vg)) { log_error("vg_remove_mdas %s failed", vg->name); unlock_vg(vg->cmd, VG_ORPHANS); return 0; } /* init physical volumes */ dm_list_iterate_items(pvl, &vg->removed_pvs) { pv = pvl->pv; if (is_missing_pv(pv)) continue; log_verbose("Removing physical volume \"%s\" from " "volume group \"%s\"", pv_dev_name(pv), vg->name); pv->vg_name = vg->fid->fmt->orphan_vg_name; pv->status = ALLOCATABLE_PV; if (!dev_get_size(pv_dev(pv), &pv->size)) { log_error("%s: Couldn't get size.", pv_dev_name(pv)); ret = 0; continue; } /* FIXME Write to same sector label was read from */ if (!pv_write(vg->cmd, pv, 0)) { log_error("Failed to remove physical volume \"%s\"" " from volume group \"%s\"", pv_dev_name(pv), vg->name); ret = 0; } } if (!backup_remove(vg->cmd, vg->name)) stack; if (ret) log_print("Volume group \"%s\" successfully removed", vg->name); else log_error("Volume group \"%s\" not properly removed", vg->name); unlock_vg(vg->cmd, VG_ORPHANS); return ret; } /* * Extend a VG by a single PV / device path * * Parameters: * - vg: handle of volume group to extend by 'pv_name' * - pv_name: device path of PV to add to VG * - pp: parameters to pass to implicit pvcreate; if NULL, do not pvcreate * */ static int vg_extend_single_pv(struct volume_group *vg, char *pv_name, struct pvcreate_params *pp) { struct physical_volume *pv; pv = pv_by_path(vg->fid->fmt->cmd, pv_name); if (!pv && !pp) { log_error("%s not identified as an existing " "physical volume", pv_name); return 0; } else if (!pv && pp) { pv = pvcreate_single(vg->cmd, pv_name, pp, 0); if (!pv) return 0; } if (!add_pv_to_vg(vg, pv_name, pv, pp)) { free_pv_fid(pv); return 0; } return 1; } /* * Extend a VG by a single PV / device path * * Parameters: * - vg: handle of volume group to extend by 'pv_name' * - pv_count: count of device paths of PVs * - pv_names: device paths of PVs to add to VG * - pp: parameters to pass to implicit pvcreate; if NULL, do not pvcreate * */ int vg_extend(struct volume_group *vg, int pv_count, const char *const *pv_names, struct pvcreate_params *pp) { int i; char *pv_name; if (_vg_bad_status_bits(vg, RESIZEABLE_VG)) return 0; /* attach each pv */ for (i = 0; i < pv_count; i++) { if (!(pv_name = dm_strdup(pv_names[i]))) { log_error("Failed to duplicate pv name %s.", pv_names[i]); return 0; } unescape_colons_and_at_signs(pv_name, NULL, NULL); if (!vg_extend_single_pv(vg, pv_name, pp)) { log_error("Unable to add physical volume '%s' to " "volume group '%s'.", pv_name, vg->name); dm_free(pv_name); return 0; } dm_free(pv_name); } /* FIXME Decide whether to initialise and add new mdahs to format instance */ return 1; } /* FIXME: use this inside vgreduce_single? */ int vg_reduce(struct volume_group *vg, const char *pv_name) { struct physical_volume *pv; struct pv_list *pvl; if (_vg_bad_status_bits(vg, RESIZEABLE_VG)) return 0; if (!archive(vg)) goto bad; /* remove each pv */ if (!(pvl = find_pv_in_vg(vg, pv_name))) { log_error("Physical volume %s not in volume group %s.", pv_name, vg->name); goto bad; } pv = pvl->pv; if (pv_pe_alloc_count(pv)) { log_error("Physical volume %s still in use.", pv_name); goto bad; } if (!dev_get_size(pv_dev(pv), &pv->size)) { log_error("%s: Couldn't get size.", pv_name); goto bad; } vg->free_count -= pv_pe_count(pv) - pv_pe_alloc_count(pv); vg->extent_count -= pv_pe_count(pv); del_pvl_from_vgs(vg, pvl); /* add pv to the remove_pvs list */ dm_list_add(&vg->removed_pvs, &pvl->list); return 1; bad: log_error("Unable to remove physical volume '%s' from " "volume group '%s'.", pv_name, vg->name); return 0; } int lv_change_tag(struct logical_volume *lv, const char *tag, int add_tag) { char *tag_new; if (!(lv->vg->fid->fmt->features & FMT_TAGS)) { log_error("Logical volume %s/%s does not support tags", lv->vg->name, lv->name); return 0; } if (add_tag) { if (!(tag_new = dm_pool_strdup(lv->vg->vgmem, tag))) { log_error("Failed to duplicate tag %s from %s/%s", tag, lv->vg->name, lv->name); return 0; } if (!str_list_add(lv->vg->vgmem, &lv->tags, tag_new)) { log_error("Failed to add tag %s to %s/%s", tag, lv->vg->name, lv->name); return 0; } } else { if (!str_list_del(&lv->tags, tag)) { log_error("Failed to remove tag %s from %s/%s", tag, lv->vg->name, lv->name); return 0; } } return 1; } int vg_change_tag(struct volume_group *vg, const char *tag, int add_tag) { char *tag_new; if (!(vg->fid->fmt->features & FMT_TAGS)) { log_error("Volume group %s does not support tags", vg->name); return 0; } if (add_tag) { if (!(tag_new = dm_pool_strdup(vg->vgmem, tag))) { log_error("Failed to duplicate tag %s from %s", tag, vg->name); return 0; } if (!str_list_add(vg->vgmem, &vg->tags, tag_new)) { log_error("Failed to add tag %s to volume group %s", tag, vg->name); return 0; } } else { if (!str_list_del(&vg->tags, tag)) { log_error("Failed to remove tag %s from volume group " "%s", tag, vg->name); return 0; } } return 1; } const char *strip_dir(const char *vg_name, const char *dev_dir) { size_t len = strlen(dev_dir); if (!strncmp(vg_name, dev_dir, len)) vg_name += len; return vg_name; } /* * Validate parameters to vg_create() before calling. * FIXME: Move inside vg_create library function. * FIXME: Change vgcreate_params struct to individual gets/sets */ int vgcreate_params_validate(struct cmd_context *cmd, struct vgcreate_params *vp) { if (!validate_new_vg_name(cmd, vp->vg_name)) { log_error("New volume group name \"%s\" is invalid", vp->vg_name); return 1; } if (vp->alloc == ALLOC_INHERIT) { log_error("Volume Group allocation policy cannot inherit " "from anything"); return 1; } if (!vp->extent_size) { log_error("Physical extent size may not be zero"); return 1; } if (!(cmd->fmt->features & FMT_UNLIMITED_VOLS)) { if (!vp->max_lv) vp->max_lv = 255; if (!vp->max_pv) vp->max_pv = 255; if (vp->max_lv > 255 || vp->max_pv > 255) { log_error("Number of volumes may not exceed 255"); return 1; } } return 0; } /* * Create a (struct volume_group) volume group handle from a struct volume_group pointer and a * possible failure code or zero for success. */ static struct volume_group *_vg_make_handle(struct cmd_context *cmd, struct volume_group *vg, uint32_t failure) { if (!vg && !(vg = alloc_vg("vg_make_handle", cmd, NULL))) return_NULL; if (vg->read_status != failure) vg->read_status = failure; return vg; } int lv_has_unknown_segments(const struct logical_volume *lv) { struct lv_segment *seg; /* foreach segment */ dm_list_iterate_items(seg, &lv->segments) if (seg_unknown(seg)) return 1; return 0; } int vg_has_unknown_segments(const struct volume_group *vg) { struct lv_list *lvl; /* foreach LV */ dm_list_iterate_items(lvl, &vg->lvs) if (lv_has_unknown_segments(lvl->lv)) return 1; return 0; } /* * Create a VG with default parameters. * Returns: * - struct volume_group* with SUCCESS code: VG structure created * - NULL or struct volume_group* with FAILED_* code: error creating VG structure * Use vg_read_error() to determine success or failure. * FIXME: cleanup usage of _vg_make_handle() */ struct volume_group *vg_create(struct cmd_context *cmd, const char *vg_name) { struct volume_group *vg; struct format_instance_ctx fic; struct format_instance *fid; int consistent = 0; uint32_t rc; if (!validate_name(vg_name)) { log_error("Invalid vg name %s", vg_name); /* FIXME: use _vg_make_handle() w/proper error code */ return NULL; } rc = vg_lock_newname(cmd, vg_name); if (rc != SUCCESS) /* NOTE: let caller decide - this may be check for existence */ return _vg_make_handle(cmd, NULL, rc); /* FIXME: Is this vg_read_internal necessary? Move it inside vg_lock_newname? */ /* is this vg name already in use ? */ if ((vg = vg_read_internal(cmd, vg_name, NULL, 1, &consistent))) { log_error("A volume group called '%s' already exists.", vg_name); unlock_and_free_vg(cmd, vg, vg_name); return _vg_make_handle(cmd, NULL, FAILED_EXIST); } /* Strip dev_dir if present */ vg_name = strip_dir(vg_name, cmd->dev_dir); if (!(vg = alloc_vg("vg_create", cmd, vg_name))) goto_bad; if (!id_create(&vg->id)) { log_error("Couldn't create uuid for volume group '%s'.", vg_name); goto bad; } vg->status = (RESIZEABLE_VG | LVM_READ | LVM_WRITE); if (!(vg->system_id = dm_pool_zalloc(vg->vgmem, NAME_LEN + 1))) goto_bad; *vg->system_id = '\0'; vg->extent_size = DEFAULT_EXTENT_SIZE * 2; vg->extent_count = 0; vg->free_count = 0; vg->max_lv = DEFAULT_MAX_LV; vg->max_pv = DEFAULT_MAX_PV; vg->alloc = DEFAULT_ALLOC_POLICY; vg->mda_copies = DEFAULT_VGMETADATACOPIES; vg->pv_count = 0; fic.type = FMT_INSTANCE_VG | FMT_INSTANCE_MDAS | FMT_INSTANCE_AUX_MDAS; fic.context.vg_ref.vg_name = vg_name; fic.context.vg_ref.vg_id = NULL; if (!(fid = cmd->fmt->ops->create_instance(cmd->fmt, &fic))) { log_error("Failed to create format instance"); goto bad; } vg_set_fid(vg, fid); if (vg->fid->fmt->ops->vg_setup && !vg->fid->fmt->ops->vg_setup(vg->fid, vg)) { log_error("Format specific setup of volume group '%s' failed.", vg_name); goto bad; } return _vg_make_handle(cmd, vg, SUCCESS); bad: unlock_and_free_vg(cmd, vg, vg_name); /* FIXME: use _vg_make_handle() w/proper error code */ return NULL; } uint64_t extents_from_size(struct cmd_context *cmd, uint64_t size, uint32_t extent_size) { if (size % extent_size) { size += extent_size - size % extent_size; log_print("Rounding up size to full physical extent %s", display_size(cmd, size)); } if (size > (uint64_t) UINT32_MAX * extent_size) { log_error("Volume too large (%s) for extent size %s. " "Upper limit is %s.", display_size(cmd, size), display_size(cmd, (uint64_t) extent_size), display_size(cmd, (uint64_t) UINT32_MAX * extent_size)); return 0; } return (uint64_t) size / extent_size; } /* * Return random integer in [0,max) interval * * The loop rejects numbers that come from an "incomplete" slice of the * RAND_MAX space (considering the number space [0, RAND_MAX] is divided * into some "max"-sized slices and at most a single smaller slice, * between [n*max, RAND_MAX] for suitable n -- numbers from this last slice * are discarded because they could distort the distribution in favour of * smaller numbers. */ static unsigned _even_rand( unsigned *seed, unsigned max ) { unsigned r, ret; /* make sure distribution is even */ do { r = (unsigned) rand_r( seed ); ret = r % max; } while ( r - ret > RAND_MAX - max ); return ret; } static dm_bitset_t _bitset_with_random_bits(struct dm_pool *mem, uint32_t num_bits, uint32_t num_set_bits, unsigned *seed) { dm_bitset_t bs; unsigned bit_selected; char buf[32]; uint32_t i = num_bits - num_set_bits; if (!(bs = dm_bitset_create(mem, (unsigned) num_bits))) { log_error("Failed to allocate bitset for setting random bits."); return NULL; } if (!dm_pool_begin_object(mem, 512)) { log_error("dm_pool_begin_object failed for random list of bits."); dm_pool_free(mem, bs); return NULL; } /* Perform loop num_set_bits times, selecting one bit each time */ while (i++ < num_bits) { /* Select a random bit between 0 and (i-1) inclusive. */ bit_selected = _even_rand(seed, i); /* * If the bit was already set, set the new bit that became * choosable for the first time during this pass. * This maintains a uniform probability distribution by compensating * for being unable to select it until this pass. */ if (dm_bit(bs, bit_selected)) bit_selected = i - 1; dm_bit_set(bs, bit_selected); if (dm_snprintf(buf, sizeof(buf), "%u ", bit_selected) < 0) { log_error("snprintf random bit failed."); dm_pool_free(mem, bs); return NULL; } if (!dm_pool_grow_object(mem, buf, strlen(buf))) { log_error("Failed to generate list of random bits."); dm_pool_free(mem, bs); return NULL; } } if (!dm_pool_grow_object(mem, "\0", 1)) { log_error("Failed to finish list of random bits."); dm_pool_free(mem, bs); return NULL; } log_debug("Selected %" PRIu32 " random bits from %" PRIu32 ": %s", num_set_bits, num_bits, (char *) dm_pool_end_object(mem)); return bs; } static int _vg_ignore_mdas(struct volume_group *vg, uint32_t num_to_ignore) { struct metadata_area *mda; uint32_t mda_used_count = vg_mda_used_count(vg); dm_bitset_t mda_to_ignore_bs; int r = 1; log_debug("Adjusting ignored mdas for %s: %" PRIu32 " of %" PRIu32 " mdas in use " "but %" PRIu32 " required. Changing %" PRIu32 " mda.", vg->name, mda_used_count, vg_mda_count(vg), vg_mda_copies(vg), num_to_ignore); if (!num_to_ignore) return 1; if (!(mda_to_ignore_bs = _bitset_with_random_bits(vg->vgmem, mda_used_count, num_to_ignore, &vg->cmd->rand_seed))) return_0; dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) if (!mda_is_ignored(mda) && (--mda_used_count, dm_bit(mda_to_ignore_bs, mda_used_count))) { mda_set_ignored(mda, 1); if (!--num_to_ignore) goto out; } log_error(INTERNAL_ERROR "Unable to find %"PRIu32" metadata areas to ignore " "on volume group %s", num_to_ignore, vg->name); r = 0; out: dm_pool_free(vg->vgmem, mda_to_ignore_bs); return r; } static int _vg_unignore_mdas(struct volume_group *vg, uint32_t num_to_unignore) { struct metadata_area *mda, *tmda; uint32_t mda_used_count = vg_mda_used_count(vg); uint32_t mda_count = vg_mda_count(vg); uint32_t mda_free_count = mda_count - mda_used_count; dm_bitset_t mda_to_unignore_bs; int r = 1; if (!num_to_unignore) return 1; log_debug("Adjusting ignored mdas for %s: %" PRIu32 " of %" PRIu32 " mdas in use " "but %" PRIu32 " required. Changing %" PRIu32 " mda.", vg->name, mda_used_count, mda_count, vg_mda_copies(vg), num_to_unignore); if (!(mda_to_unignore_bs = _bitset_with_random_bits(vg->vgmem, mda_free_count, num_to_unignore, &vg->cmd->rand_seed))) return_0; dm_list_iterate_items_safe(mda, tmda, &vg->fid->metadata_areas_ignored) if (mda_is_ignored(mda) && (--mda_free_count, dm_bit(mda_to_unignore_bs, mda_free_count))) { mda_set_ignored(mda, 0); dm_list_move(&vg->fid->metadata_areas_in_use, &mda->list); if (!--num_to_unignore) goto out; } dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) if (mda_is_ignored(mda) && (--mda_free_count, dm_bit(mda_to_unignore_bs, mda_free_count))) { mda_set_ignored(mda, 0); if (!--num_to_unignore) goto out; } log_error(INTERNAL_ERROR "Unable to find %"PRIu32" metadata areas to unignore " "on volume group %s", num_to_unignore, vg->name); r = 0; out: dm_pool_free(vg->vgmem, mda_to_unignore_bs); return r; } static int _vg_adjust_ignored_mdas(struct volume_group *vg) { uint32_t mda_copies_used = vg_mda_used_count(vg); if (vg->mda_copies == VGMETADATACOPIES_UNMANAGED) { /* Ensure at least one mda is in use. */ if (!mda_copies_used && vg_mda_count(vg) && !_vg_unignore_mdas(vg, 1)) return_0; else return 1; } /* Not an error to have vg_mda_count larger than total mdas. */ if (vg->mda_copies == VGMETADATACOPIES_ALL || vg->mda_copies >= vg_mda_count(vg)) { /* Use all */ if (!_vg_unignore_mdas(vg, vg_mda_count(vg) - mda_copies_used)) return_0; } else if (mda_copies_used < vg->mda_copies) { if (!_vg_unignore_mdas(vg, vg->mda_copies - mda_copies_used)) return_0; } else if (mda_copies_used > vg->mda_copies) if (!_vg_ignore_mdas(vg, mda_copies_used - vg->mda_copies)) return_0; /* * The VGMETADATACOPIES_ALL value will never be written disk. * It is a special cmdline value that means 2 things: * 1. clear all ignore bits in all mdas in this vg * 2. set the "unmanaged" policy going forward for metadata balancing */ if (vg->mda_copies == VGMETADATACOPIES_ALL) vg->mda_copies = VGMETADATACOPIES_UNMANAGED; return 1; } uint64_t find_min_mda_size(struct dm_list *mdas) { uint64_t min_mda_size = UINT64_MAX, mda_size; struct metadata_area *mda; dm_list_iterate_items(mda, mdas) { if (!mda->ops->mda_total_sectors) continue; mda_size = mda->ops->mda_total_sectors(mda); if (mda_size < min_mda_size) min_mda_size = mda_size; } if (min_mda_size == UINT64_MAX) min_mda_size = UINT64_C(0); return min_mda_size; } static int _move_mdas(struct volume_group *vg_from, struct volume_group *vg_to, struct dm_list *mdas_from, struct dm_list *mdas_to) { struct metadata_area *mda, *mda2; int common_mda = 0; dm_list_iterate_items_safe(mda, mda2, mdas_from) { if (!mda->ops->mda_in_vg) { common_mda = 1; continue; } if (!mda->ops->mda_in_vg(vg_from->fid, vg_from, mda)) { if (is_orphan_vg(vg_to->name)) dm_list_del(&mda->list); else dm_list_move(mdas_to, &mda->list); } } return common_mda; } /* * Separate metadata areas after splitting a VG. * Also accepts orphan VG as destination (for vgreduce). */ int vg_split_mdas(struct cmd_context *cmd __attribute__((unused)), struct volume_group *vg_from, struct volume_group *vg_to) { struct dm_list *mdas_from_in_use, *mdas_to_in_use; struct dm_list *mdas_from_ignored, *mdas_to_ignored; int common_mda = 0; mdas_from_in_use = &vg_from->fid->metadata_areas_in_use; mdas_from_ignored = &vg_from->fid->metadata_areas_ignored; mdas_to_in_use = &vg_to->fid->metadata_areas_in_use; mdas_to_ignored = &vg_to->fid->metadata_areas_ignored; common_mda = _move_mdas(vg_from, vg_to, mdas_from_in_use, mdas_to_in_use); common_mda = _move_mdas(vg_from, vg_to, mdas_from_ignored, mdas_to_ignored); if ((dm_list_empty(mdas_from_in_use) && dm_list_empty(mdas_from_ignored)) || ((!is_orphan_vg(vg_to->name) && dm_list_empty(mdas_to_in_use) && dm_list_empty(mdas_to_ignored)))) return common_mda; return 1; } static int _wipe_sb(struct device *dev, const char *type, const char *name, int wipe_len, struct pvcreate_params *pp, int (*func)(struct device *dev, uint64_t *signature)) { int wipe; uint64_t superblock; wipe = func(dev, &superblock); if (wipe == -1) { log_error("Fatal error while trying to detect %s on %s.", type, name); return 0; } if (wipe == 0) return 1; /* Specifying --yes => do not ask. */ if (!pp->yes && (pp->force == PROMPT) && yes_no_prompt("WARNING: %s detected on %s. Wipe it? [y/n] ", type, name) != 'y') { log_error("Aborting pvcreate on %s.", name); return 0; } log_print("Wiping %s on %s.", type, name); if (!dev_set(dev, superblock, wipe_len, 0)) { log_error("Failed to wipe %s on %s.", type, name); return 0; } return 1; } /* * See if we may pvcreate on this device. * 0 indicates we may not. */ static int pvcreate_check(struct cmd_context *cmd, const char *name, struct pvcreate_params *pp) { struct physical_volume *pv; struct device *dev; /* FIXME Check partition type is LVM unless --force is given */ /* Is there a pv here already? */ pv = pv_read(cmd, name, 0, 0); /* * If a PV has no MDAs it may appear to be an orphan until the * metadata is read off another PV in the same VG. Detecting * this means checking every VG by scanning every PV on the * system. */ if (pv && is_orphan(pv) && !dm_list_size(&pv->fid->metadata_areas_in_use)) { free_pv_fid(pv); if (!scan_vgs_for_pvs(cmd, 0)) return_0; pv = pv_read(cmd, name, 0, 0); } /* Allow partial & exported VGs to be destroyed. */ /* We must have -ff to overwrite a non orphan */ if (pv && !is_orphan(pv) && pp->force != DONT_PROMPT_OVERRIDE) { log_error("Can't initialize physical volume \"%s\" of " "volume group \"%s\" without -ff", name, pv_vg_name(pv)); goto bad; } /* prompt */ if (pv && !is_orphan(pv) && !pp->yes && yes_no_prompt(_really_init, name, pv_vg_name(pv)) == 'n') { log_error("%s: physical volume not initialized", name); goto bad; } if (sigint_caught()) goto_bad; dev = dev_cache_get(name, cmd->filter); /* Is there an md superblock here? */ /* FIXME: still possible issues here - rescan cache? */ if (!dev && md_filtering()) { refresh_filters(cmd); init_md_filtering(0); dev = dev_cache_get(name, cmd->filter); init_md_filtering(1); } if (!dev) { log_error("Device %s not found (or ignored by filtering).", name); goto bad; } /* * This test will fail if the device belongs to an MD array. */ if (!dev_test_excl(dev)) { /* FIXME Detect whether device-mapper itself is still using it */ log_error("Can't open %s exclusively. Mounted filesystem?", name); goto bad; } if (!_wipe_sb(dev, "software RAID md superblock", name, 4, pp, dev_is_md)) goto_bad; if (!_wipe_sb(dev, "swap signature", name, 10, pp, dev_is_swap)) goto_bad; if (!_wipe_sb(dev, "LUKS signature", name, 8, pp, dev_is_luks)) goto_bad; if (sigint_caught()) goto_bad; if (pv && !is_orphan(pv) && pp->force) { log_warn("WARNING: Forcing physical volume creation on " "%s%s%s%s", name, !is_orphan(pv) ? " of volume group \"" : "", !is_orphan(pv) ? pv_vg_name(pv) : "", !is_orphan(pv) ? "\"" : ""); } free_pv_fid(pv); return 1; bad: free_pv_fid(pv); return 0; } void pvcreate_params_set_defaults(struct pvcreate_params *pp) { memset(pp, 0, sizeof(*pp)); pp->zero = 1; pp->size = 0; pp->data_alignment = UINT64_C(0); pp->data_alignment_offset = UINT64_C(0); pp->pvmetadatacopies = DEFAULT_PVMETADATACOPIES; pp->pvmetadatasize = DEFAULT_PVMETADATASIZE; pp->labelsector = DEFAULT_LABELSECTOR; pp->idp = 0; pp->pe_start = 0; pp->extent_count = 0; pp->extent_size = 0; pp->restorefile = 0; pp->force = PROMPT; pp->yes = 0; pp->metadataignore = DEFAULT_PVMETADATAIGNORE; } static int _pvcreate_write(struct cmd_context *cmd, struct pv_to_create *pvc) { int zero = pvc->pp->zero; struct physical_volume *pv = pvc->pv; struct device *dev = pv->dev; const char *pv_name = dev_name(dev); /* Wipe existing label first */ if (!label_remove(pv_dev(pv))) { log_error("Failed to wipe existing label on %s", pv_name); return 0; } if (zero) { log_verbose("Zeroing start of device %s", pv_name); if (!dev_open_quiet(dev)) { log_error("%s not opened: device not zeroed", pv_name); return 0; } if (!dev_set(dev, UINT64_C(0), (size_t) 2048, 0)) { log_error("%s not wiped: aborting", pv_name); dev_close(dev); return 0; } dev_close(dev); } log_error("Writing physical volume data to disk \"%s\"", pv_name); if (!(pv_write(cmd, pv, 1))) { log_error("Failed to write physical volume \"%s\"", pv_name); return 0; } log_print("Physical volume \"%s\" successfully created", pv_name); return 1; } /* * pvcreate_single() - initialize a device with PV label and metadata area * * Parameters: * - pv_name: device path to initialize * - pp: parameters to pass to pv_create; if NULL, use default values * * Returns: * NULL: error * struct physical_volume * (non-NULL): handle to physical volume created */ struct physical_volume * pvcreate_single(struct cmd_context *cmd, const char *pv_name, struct pvcreate_params *pp, int write_now) { struct physical_volume *pv = NULL; struct device *dev; struct dm_list mdas; struct pvcreate_params default_pp; char buffer[64] __attribute__((aligned(8))); pvcreate_params_set_defaults(&default_pp); if (!pp) pp = &default_pp; if (pp->idp) { if ((dev = device_from_pvid(cmd, pp->idp, NULL, NULL)) && (dev != dev_cache_get(pv_name, cmd->filter))) { if (!id_write_format((const struct id*)&pp->idp->uuid, buffer, sizeof(buffer))) goto_bad; log_error("uuid %s already in use on \"%s\"", buffer, dev_name(dev)); goto bad;; } } if (!pvcreate_check(cmd, pv_name, pp)) goto_bad; if (sigint_caught()) goto_bad; if (!(dev = dev_cache_get(pv_name, cmd->filter))) { log_error("%s: Couldn't find device. Check your filters?", pv_name); goto bad; } dm_list_init(&mdas); if (!(pv = pv_create(cmd, dev, pp->idp, pp->size, pp->data_alignment, pp->data_alignment_offset, pp->pe_start ? pp->pe_start : PV_PE_START_CALC, pp->extent_count, pp->extent_size, pp->labelsector, pp->pvmetadatacopies, pp->pvmetadatasize, pp->metadataignore))) { log_error("Failed to setup physical volume \"%s\"", pv_name); goto bad; } log_verbose("Set up physical volume for \"%s\" with %" PRIu64 " available sectors", pv_name, pv_size(pv)); if (write_now) { struct pv_to_create pvc; pvc.pp = pp; pvc.pv = pv; if (!_pvcreate_write(cmd, &pvc)) goto bad; } else { pv->status |= UNLABELLED_PV; } return pv; bad: free_pv_fid(pv); return NULL; } static struct physical_volume *_alloc_pv(struct dm_pool *mem, struct device *dev) { struct physical_volume *pv = dm_pool_zalloc(mem, sizeof(*pv)); if (!pv) return_NULL; pv_set_fid(pv, NULL); pv->pe_size = 0; pv->pe_start = 0; pv->pe_count = 0; pv->pe_alloc_count = 0; pv->pe_align = 0; pv->pe_align_offset = 0; pv->fmt = NULL; pv->dev = dev; pv->status = ALLOCATABLE_PV; dm_list_init(&pv->tags); dm_list_init(&pv->segments); return pv; } /** * pv_create - initialize a physical volume for use with a volume group * * @fmt: format type * @dev: PV device to initialize * @size: size of the PV in sectors * @data_alignment: requested alignment of data * @data_alignment_offset: requested offset to aligned data * @pe_start: physical extent start * @existing_extent_count * @existing_extent_size * @pvmetadatacopies * @pvmetadatasize * @mdas * * Returns: * PV handle - physical volume initialized successfully * NULL - invalid parameter or problem initializing the physical volume * * Note: * FIXME: shorten argument list and replace with explict 'set' functions */ struct physical_volume *pv_create(const struct cmd_context *cmd, struct device *dev, struct id *id, uint64_t size, unsigned long data_alignment, unsigned long data_alignment_offset, uint64_t pe_start, uint32_t existing_extent_count, uint32_t existing_extent_size, uint64_t label_sector, int pvmetadatacopies, uint64_t pvmetadatasize, unsigned metadataignore) { const struct format_type *fmt = cmd->fmt; struct format_instance_ctx fic; struct format_instance *fid; struct dm_pool *mem = fmt->cmd->mem; struct physical_volume *pv = _alloc_pv(mem, dev); unsigned mda_index; if (!pv) return NULL; if (id) memcpy(&pv->id, id, sizeof(*id)); else if (!id_create(&pv->id)) { log_error("Failed to create random uuid for %s.", dev_name(dev)); goto bad; } if (!dev_get_size(pv->dev, &pv->size)) { log_error("%s: Couldn't get size.", pv_dev_name(pv)); goto bad; } if (size) { if (size > pv->size) log_warn("WARNING: %s: Overriding real size. " "You could lose data.", pv_dev_name(pv)); log_verbose("%s: Pretending size is %" PRIu64 " sectors.", pv_dev_name(pv), size); pv->size = size; } if (pv->size < pv_min_size()) { log_error("%s: Size must exceed minimum of %" PRIu64 " sectors.", pv_dev_name(pv), pv_min_size()); goto bad; } if (pv->size < data_alignment) { log_error("%s: Data alignment must not exceed device size.", pv_dev_name(pv)); goto bad; } fic.type = FMT_INSTANCE_PV; fic.context.pv_id = (const char *) &pv->id; if (!(fid = fmt->ops->create_instance(fmt, &fic))) { log_error("Couldn't create format instance for PV %s.", pv_dev_name(pv)); goto bad; } pv_set_fid(pv, fid); pv->fmt = fmt; pv->vg_name = fmt->orphan_vg_name; if (!fmt->ops->pv_initialise(fmt, label_sector, pe_start, existing_extent_count, existing_extent_size, data_alignment, data_alignment_offset, pv)) { log_error("Format-specific initialisation of physical " "volume %s failed.", pv_dev_name(pv)); goto bad; } for (mda_index = 0; mda_index < pvmetadatacopies; mda_index++) { if (pv->fmt->ops->pv_add_metadata_area && !pv->fmt->ops->pv_add_metadata_area(pv->fmt, pv, pe_start != PV_PE_START_CALC, mda_index, pvmetadatasize, metadataignore)) { log_error("Failed to add metadata area for " "new physical volume %s", pv_dev_name(pv)); goto bad; } } return pv; bad: free_pv_fid(pv); dm_pool_free(mem, pv); return NULL; } /* FIXME: liblvm todo - make into function that returns handle */ struct pv_list *find_pv_in_vg(const struct volume_group *vg, const char *pv_name) { return _find_pv_in_vg(vg, pv_name); } static struct pv_list *_find_pv_in_vg(const struct volume_group *vg, const char *pv_name) { struct pv_list *pvl; dm_list_iterate_items(pvl, &vg->pvs) if (pvl->pv->dev == dev_cache_get(pv_name, vg->cmd->filter)) return pvl; return NULL; } struct pv_list *find_pv_in_pv_list(const struct dm_list *pl, const struct physical_volume *pv) { struct pv_list *pvl; dm_list_iterate_items(pvl, pl) if (pvl->pv == pv) return pvl; return NULL; } int pv_is_in_vg(struct volume_group *vg, struct physical_volume *pv) { struct pv_list *pvl; dm_list_iterate_items(pvl, &vg->pvs) if (pv == pvl->pv) return 1; return 0; } static struct pv_list *_find_pv_in_vg_by_uuid(const struct volume_group *vg, const struct id *id) { struct pv_list *pvl; dm_list_iterate_items(pvl, &vg->pvs) if (id_equal(&pvl->pv->id, id)) return pvl; return NULL; } /** * find_pv_in_vg_by_uuid - Find PV in VG by PV UUID * @vg: volume group to search * @id: UUID of the PV to match * * Returns: * struct pv_list within owning struct volume_group - if UUID of PV found in VG * NULL - invalid parameter or UUID of PV not found in VG * * Note * FIXME - liblvm todo - make into function that takes VG handle */ struct pv_list *find_pv_in_vg_by_uuid(const struct volume_group *vg, const struct id *id) { return _find_pv_in_vg_by_uuid(vg, id); } struct lv_list *find_lv_in_vg(const struct volume_group *vg, const char *lv_name) { struct lv_list *lvl; const char *ptr; /* Use last component */ if ((ptr = strrchr(lv_name, '/'))) ptr++; else ptr = lv_name; dm_list_iterate_items(lvl, &vg->lvs) if (!strcmp(lvl->lv->name, ptr)) return lvl; return NULL; } struct lv_list *find_lv_in_lv_list(const struct dm_list *ll, const struct logical_volume *lv) { struct lv_list *lvl; dm_list_iterate_items(lvl, ll) if (lvl->lv == lv) return lvl; return NULL; } struct lv_list *find_lv_in_vg_by_lvid(struct volume_group *vg, const union lvid *lvid) { struct lv_list *lvl; dm_list_iterate_items(lvl, &vg->lvs) if (!strncmp(lvl->lv->lvid.s, lvid->s, sizeof(*lvid))) return lvl; return NULL; } struct logical_volume *find_lv(const struct volume_group *vg, const char *lv_name) { struct lv_list *lvl = find_lv_in_vg(vg, lv_name); return lvl ? lvl->lv : NULL; } struct physical_volume *find_pv(struct volume_group *vg, struct device *dev) { struct pv_list *pvl; dm_list_iterate_items(pvl, &vg->pvs) if (dev == pvl->pv->dev) return pvl->pv; return NULL; } /* FIXME: liblvm todo - make into function that returns handle */ struct physical_volume *find_pv_by_name(struct cmd_context *cmd, const char *pv_name) { return _find_pv_by_name(cmd, pv_name); } static struct physical_volume *_find_pv_by_name(struct cmd_context *cmd, const char *pv_name) { struct physical_volume *pv; if (!(pv = _pv_read(cmd, cmd->mem, pv_name, NULL, 1, 0))) { log_error("Physical volume %s not found", pv_name); goto bad; } if (is_orphan_vg(pv->vg_name) && !dm_list_size(&pv->fid->metadata_areas_in_use)) { /* If a PV has no MDAs - need to search all VGs for it */ if (!scan_vgs_for_pvs(cmd, 1)) goto_bad; free_pv_fid(pv); if (!(pv = _pv_read(cmd, cmd->mem, pv_name, NULL, 1, 0))) { log_error("Physical volume %s not found", pv_name); goto bad; } } if (is_orphan_vg(pv->vg_name)) { log_error("Physical volume %s not in a volume group", pv_name); goto bad; } return pv; bad: free_pv_fid(pv); return NULL; } /* Find segment at a given logical extent in an LV */ struct lv_segment *find_seg_by_le(const struct logical_volume *lv, uint32_t le) { struct lv_segment *seg; dm_list_iterate_items(seg, &lv->segments) if (le >= seg->le && le < seg->le + seg->len) return seg; return NULL; } struct lv_segment *first_seg(const struct logical_volume *lv) { struct lv_segment *seg; dm_list_iterate_items(seg, &lv->segments) return seg; return NULL; } int vg_remove_mdas(struct volume_group *vg) { struct metadata_area *mda; /* FIXME Improve recovery situation? */ /* Remove each copy of the metadata */ dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) { if (mda->ops->vg_remove && !mda->ops->vg_remove(vg->fid, vg, mda)) return_0; } return 1; } /* * Determine whether two vgs are compatible for merging. */ int vgs_are_compatible(struct cmd_context *cmd __attribute__((unused)), struct volume_group *vg_from, struct volume_group *vg_to) { struct lv_list *lvl1, *lvl2; struct pv_list *pvl; const char *name1, *name2; if (lvs_in_vg_activated(vg_from)) { log_error("Logical volumes in \"%s\" must be inactive", vg_from->name); return 0; } /* Check compatibility */ if (vg_to->extent_size != vg_from->extent_size) { log_error("Extent sizes differ: %d (%s) and %d (%s)", vg_to->extent_size, vg_to->name, vg_from->extent_size, vg_from->name); return 0; } if (vg_to->max_pv && (vg_to->max_pv < vg_to->pv_count + vg_from->pv_count)) { log_error("Maximum number of physical volumes (%d) exceeded " " for \"%s\" and \"%s\"", vg_to->max_pv, vg_to->name, vg_from->name); return 0; } if (vg_to->max_lv && (vg_to->max_lv < vg_visible_lvs(vg_to) + vg_visible_lvs(vg_from))) { log_error("Maximum number of logical volumes (%d) exceeded " " for \"%s\" and \"%s\"", vg_to->max_lv, vg_to->name, vg_from->name); return 0; } /* Metadata types must be the same */ if (vg_to->fid->fmt != vg_from->fid->fmt) { log_error("Metadata types differ for \"%s\" and \"%s\"", vg_to->name, vg_from->name); return 0; } /* Clustering attribute must be the same */ if (vg_is_clustered(vg_to) != vg_is_clustered(vg_from)) { log_error("Clustered attribute differs for \"%s\" and \"%s\"", vg_to->name, vg_from->name); return 0; } /* Check no conflicts with LV names */ dm_list_iterate_items(lvl1, &vg_to->lvs) { name1 = lvl1->lv->name; dm_list_iterate_items(lvl2, &vg_from->lvs) { name2 = lvl2->lv->name; if (!strcmp(name1, name2)) { log_error("Duplicate logical volume " "name \"%s\" " "in \"%s\" and \"%s\"", name1, vg_to->name, vg_from->name); return 0; } } } /* Check no PVs are constructed from either VG */ dm_list_iterate_items(pvl, &vg_to->pvs) { if (pv_uses_vg(pvl->pv, vg_from)) { log_error("Physical volume %s might be constructed " "from same volume group %s.", pv_dev_name(pvl->pv), vg_from->name); return 0; } } dm_list_iterate_items(pvl, &vg_from->pvs) { if (pv_uses_vg(pvl->pv, vg_to)) { log_error("Physical volume %s might be constructed " "from same volume group %s.", pv_dev_name(pvl->pv), vg_to->name); return 0; } } return 1; } struct _lv_postorder_baton { int (*fn)(struct logical_volume *lv, void *data); void *data; }; static int _lv_postorder_visit(struct logical_volume *, int (*fn)(struct logical_volume *lv, void *data), void *data); static int _lv_each_dependency(struct logical_volume *lv, int (*fn)(struct logical_volume *lv, void *data), void *data) { unsigned i, s; struct lv_segment *lvseg; struct logical_volume *deps[] = { (lv->rdevice && lv != lv->rdevice->lv) ? lv->rdevice->lv : 0, (lv->rdevice && lv != lv->rdevice->slog) ? lv->rdevice->slog : 0, lv->snapshot ? lv->snapshot->origin : 0, lv->snapshot ? lv->snapshot->cow : 0 }; for (i = 0; i < sizeof(deps) / sizeof(*deps); ++i) { if (deps[i] && !fn(deps[i], data)) return_0; } dm_list_iterate_items(lvseg, &lv->segments) { if (lvseg->log_lv && !fn(lvseg->log_lv, data)) return_0; if (lvseg->rlog_lv && !fn(lvseg->rlog_lv, data)) return_0; for (s = 0; s < lvseg->area_count; ++s) { if (seg_type(lvseg, s) == AREA_LV && !fn(seg_lv(lvseg,s), data)) return_0; } } return 1; } static int _lv_postorder_cleanup(struct logical_volume *lv, void *data) { if (!(lv->status & POSTORDER_FLAG)) return 1; lv->status &= ~POSTORDER_FLAG; if (!_lv_each_dependency(lv, _lv_postorder_cleanup, data)) return_0; return 1; } static int _lv_postorder_level(struct logical_volume *lv, void *data) { struct _lv_postorder_baton *baton = data; return _lv_postorder_visit(lv, baton->fn, baton->data); }; static int _lv_postorder_visit(struct logical_volume *lv, int (*fn)(struct logical_volume *lv, void *data), void *data) { struct _lv_postorder_baton baton; int r; if (lv->status & POSTORDER_FLAG) return 1; if (lv->status & POSTORDER_OPEN_FLAG) return 1; // a data structure loop has closed... lv->status |= POSTORDER_OPEN_FLAG; baton.fn = fn; baton.data = data; r = _lv_each_dependency(lv, _lv_postorder_level, &baton); if (r) r = fn(lv, data); lv->status &= ~POSTORDER_OPEN_FLAG; lv->status |= POSTORDER_FLAG; return r; } /* * This will walk the LV dependency graph in depth-first order and in the * postorder, call a callback function "fn". The void *data is passed along all * the calls. The callback may return zero to indicate an error and terminate * the depth-first walk. The error is propagated to return value of * _lv_postorder. */ static int _lv_postorder(struct logical_volume *lv, int (*fn)(struct logical_volume *lv, void *data), void *data) { int r; r = _lv_postorder_visit(lv, fn, data); _lv_postorder_cleanup(lv, 0); return r; } /* * Calls _lv_postorder() on each LV from VG. Avoids duplicate transitivity visits. * Clears with _lv_postorder_cleanup() when all LVs were visited by postorder. */ static int _lv_postorder_vg(struct volume_group *vg, int (*fn)(struct logical_volume *lv, void *data), void *data) { struct lv_list *lvl; int r = 1; dm_list_iterate_items(lvl, &vg->lvs) if (!_lv_postorder_visit(lvl->lv, fn, data)) { stack; r = 0; } dm_list_iterate_items(lvl, &vg->lvs) _lv_postorder_cleanup(lvl->lv, 0); return r; } struct _lv_mark_if_partial_baton { int partial; }; static int _lv_mark_if_partial_collect(struct logical_volume *lv, void *data) { struct _lv_mark_if_partial_baton *baton = data; if (lv->status & PARTIAL_LV) baton->partial = 1; return 1; } static int _lv_mark_if_partial_single(struct logical_volume *lv, void *data) { unsigned s; struct _lv_mark_if_partial_baton baton; struct lv_segment *lvseg; dm_list_iterate_items(lvseg, &lv->segments) { for (s = 0; s < lvseg->area_count; ++s) { if (seg_type(lvseg, s) == AREA_PV) { if (is_missing_pv(seg_pv(lvseg, s))) lv->status |= PARTIAL_LV; } } } baton.partial = 0; _lv_each_dependency(lv, _lv_mark_if_partial_collect, &baton); if (baton.partial) lv->status |= PARTIAL_LV; return 1; } /* * Mark LVs with missing PVs using PARTIAL_LV status flag. The flag is * propagated transitively, so LVs referencing other LVs are marked * partial as well, if any of their referenced LVs are marked partial. */ int vg_mark_partial_lvs(struct volume_group *vg, int clear) { struct lv_list *lvl; if (clear) dm_list_iterate_items(lvl, &vg->lvs) lvl->lv->status &= ~PARTIAL_LV; if (!_lv_postorder_vg(vg, _lv_mark_if_partial_single, NULL)) return_0; return 1; } /* * Be sure that all PV devices have cached read ahead in dev-cache * Currently it takes read_ahead from first PV segment only */ static int _lv_read_ahead_single(struct logical_volume *lv, void *data) { struct lv_segment *seg = first_seg(lv); uint32_t seg_read_ahead = 0, *read_ahead = data; if (seg && seg->area_count && seg_type(seg, 0) == AREA_PV) dev_get_read_ahead(seg_pv(seg, 0)->dev, &seg_read_ahead); if (seg_read_ahead > *read_ahead) *read_ahead = seg_read_ahead; return 1; } /* * Calculate readahead for logical volume from underlying PV devices. * If read_ahead is NULL, only ensure that readahead of PVs are preloaded * into PV struct device in dev cache. */ void lv_calculate_readahead(const struct logical_volume *lv, uint32_t *read_ahead) { uint32_t _read_ahead = 0; if (lv->read_ahead == DM_READ_AHEAD_AUTO) _lv_postorder((struct logical_volume *)lv, _lv_read_ahead_single, &_read_ahead); if (read_ahead) { log_debug("Calculated readahead of LV %s is %u", lv->name, _read_ahead); *read_ahead = _read_ahead; } } struct validate_hash { struct dm_hash_table *lvname; struct dm_hash_table *lvid; struct dm_hash_table *pvid; }; /* * Check that an LV and all its PV references are correctly listed in vg->lvs * and vg->pvs, respectively. This only looks at a single LV, but *not* at the * LVs it is using. To do the latter, you should use _lv_postorder with this * function. C.f. vg_validate. */ static int _lv_validate_references_single(struct logical_volume *lv, void *data) { struct volume_group *vg = lv->vg; struct validate_hash *vhash = data; struct lv_segment *lvseg; struct physical_volume *pv; unsigned s; int r = 1; if (lv != dm_hash_lookup_binary(vhash->lvid, &lv->lvid.id[1], sizeof(lv->lvid.id[1]))) { log_error(INTERNAL_ERROR "Referenced LV %s not listed in VG %s.", lv->name, vg->name); r = 0; } dm_list_iterate_items(lvseg, &lv->segments) { for (s = 0; s < lvseg->area_count; ++s) { if (seg_type(lvseg, s) != AREA_PV) continue; pv = seg_pv(lvseg, s); /* look up the reference in vg->pvs */ if (pv != dm_hash_lookup_binary(vhash->pvid, &pv->id, sizeof(pv->id))) { log_error(INTERNAL_ERROR "Referenced PV %s not listed in VG %s.", pv_dev_name(pv), vg->name); r = 0; } } } return r; } int vg_validate(struct volume_group *vg) { struct pv_list *pvl; struct lv_list *lvl; struct lv_segment *seg; char uuid[64] __attribute__((aligned(8))); int r = 1; uint32_t hidden_lv_count = 0, lv_count = 0, lv_visible_count = 0; uint32_t pv_count = 0; uint32_t num_snapshots = 0; struct validate_hash vhash = { NULL }; if (vg->alloc == ALLOC_CLING_BY_TAGS) { log_error(INTERNAL_ERROR "VG %s allocation policy set to invalid cling_by_tags.", vg->name); r = 0; } /* FIXME Also check there's no data/metadata overlap */ if (!(vhash.pvid = dm_hash_create(vg->pv_count))) { log_error("Failed to allocate pvid hash."); return 0; } dm_list_iterate_items(pvl, &vg->pvs) { if (++pv_count > vg->pv_count) { log_error(INTERNAL_ERROR "PV list corruption detected in VG %s.", vg->name); /* FIXME Dump list structure? */ r = 0; } if (pvl->pv->vg != vg) { log_error(INTERNAL_ERROR "VG %s PV list entry points " "to different VG %s.", vg->name, pvl->pv->vg ? pvl->pv->vg->name : "NULL"); r = 0; } if (strcmp(pvl->pv->vg_name, vg->name)) { log_error(INTERNAL_ERROR "VG name for PV %s is corrupted.", pv_dev_name(pvl->pv)); r = 0; } if (dm_hash_lookup_binary(vhash.pvid, &pvl->pv->id, sizeof(pvl->pv->id))) { if (!id_write_format(&pvl->pv->id, uuid, sizeof(uuid))) stack; log_error(INTERNAL_ERROR "Duplicate PV id " "%s detected for %s in %s.", uuid, pv_dev_name(pvl->pv), vg->name); r = 0; } if (!dm_hash_insert_binary(vhash.pvid, &pvl->pv->id, sizeof(pvl->pv->id), pvl->pv)) { log_error("Failed to hash pvid."); r = 0; break; } } if (!check_pv_segments(vg)) { log_error(INTERNAL_ERROR "PV segments corrupted in %s.", vg->name); r = 0; } /* * Count all non-snapshot invisible LVs */ dm_list_iterate_items(lvl, &vg->lvs) { lv_count++; if (lv_is_cow(lvl->lv)) num_snapshots++; if (lv_is_visible(lvl->lv)) lv_visible_count++; if (!check_lv_segments(lvl->lv, 0)) { log_error(INTERNAL_ERROR "LV segments corrupted in %s.", lvl->lv->name); r = 0; } if (lvl->lv->alloc == ALLOC_CLING_BY_TAGS) { log_error(INTERNAL_ERROR "LV %s allocation policy set to invalid cling_by_tags.", lvl->lv->name); r = 0; } if (lvl->lv->status & VISIBLE_LV) continue; /* snapshots */ if (lv_is_cow(lvl->lv)) continue; /* virtual origins are always hidden */ if (lv_is_origin(lvl->lv) && !lv_is_virtual_origin(lvl->lv)) continue; /* count other non-snapshot invisible volumes */ hidden_lv_count++; /* * FIXME: add check for unreferenced invisible LVs * - snapshot cow & origin * - mirror log & images * - mirror conversion volumes (_mimagetmp*) */ } /* * all volumes = visible LVs + snapshot_cows + invisible LVs */ if (lv_count != lv_visible_count + num_snapshots + hidden_lv_count) { log_error(INTERNAL_ERROR "#internal LVs (%u) != #LVs (%" PRIu32 ") + #snapshots (%" PRIu32 ") + #internal LVs (%u) in VG %s", lv_count, lv_visible_count, num_snapshots, hidden_lv_count, vg->name); r = 0; } /* Avoid endless loop if lv->segments list is corrupt */ if (!r) goto out; if (!(vhash.lvname = dm_hash_create(lv_count))) { log_error("Failed to allocate lv_name hash"); r = 0; goto out; } if (!(vhash.lvid = dm_hash_create(lv_count))) { log_error("Failed to allocate uuid hash"); r = 0; goto out; } dm_list_iterate_items(lvl, &vg->lvs) { if (dm_hash_lookup(vhash.lvname, lvl->lv->name)) { log_error(INTERNAL_ERROR "Duplicate LV name %s detected in %s.", lvl->lv->name, vg->name); r = 0; } if (dm_hash_lookup_binary(vhash.lvid, &lvl->lv->lvid.id[1], sizeof(lvl->lv->lvid.id[1]))) { if (!id_write_format(&lvl->lv->lvid.id[1], uuid, sizeof(uuid))) stack; log_error(INTERNAL_ERROR "Duplicate LV id " "%s detected for %s in %s.", uuid, lvl->lv->name, vg->name); r = 0; } if (!check_lv_segments(lvl->lv, 1)) { log_error(INTERNAL_ERROR "LV segments corrupted in %s.", lvl->lv->name); r = 0; } if (!dm_hash_insert(vhash.lvname, lvl->lv->name, lvl)) { log_error("Failed to hash lvname."); r = 0; break; } if (!dm_hash_insert_binary(vhash.lvid, &lvl->lv->lvid.id[1], sizeof(lvl->lv->lvid.id[1]), lvl->lv)) { log_error("Failed to hash lvid."); r = 0; break; } } if (!_lv_postorder_vg(vg, _lv_validate_references_single, &vhash)) { stack; r = 0; } dm_list_iterate_items(lvl, &vg->lvs) { if (!(lvl->lv->status & PVMOVE)) continue; dm_list_iterate_items(seg, &lvl->lv->segments) { if (seg_is_mirrored(seg)) { if (seg->area_count != 2) { log_error(INTERNAL_ERROR "Segment in %s is not 2-way.", lvl->lv->name); r = 0; } } else if (seg->area_count != 1) { log_error(INTERNAL_ERROR "Segment in %s has wrong number of areas: %d.", lvl->lv->name, seg->area_count); r = 0; } } } if (!(vg->fid->fmt->features & FMT_UNLIMITED_VOLS) && (!vg->max_lv || !vg->max_pv)) { log_error(INTERNAL_ERROR "Volume group %s has limited PV/LV count" " but limit is not set.", vg->name); r = 0; } if (vg_max_lv_reached(vg)) stack; out: if (vhash.lvid) dm_hash_destroy(vhash.lvid); if (vhash.lvname) dm_hash_destroy(vhash.lvname); if (vhash.pvid) dm_hash_destroy(vhash.pvid); return r; } /* * After vg_write() returns success, * caller MUST call either vg_commit() or vg_revert() */ int vg_write(struct volume_group *vg) { struct dm_list *mdah; struct pv_to_create *pv_to_create; struct metadata_area *mda; if (!vg_validate(vg)) return_0; if (vg->status & PARTIAL_VG) { log_error("Cannot update partial volume group %s.", vg->name); return 0; } if (vg_missing_pv_count(vg) && !vg->cmd->handles_missing_pvs) { log_error("Cannot update volume group %s while physical " "volumes are missing.", vg->name); return 0; } if (vg_has_unknown_segments(vg) && !vg->cmd->handles_unknown_segments) { log_error("Cannot update volume group %s with unknown segments in it!", vg->name); return 0; } if ((vg->fid->fmt->features & FMT_MDAS) && !_vg_adjust_ignored_mdas(vg)) return_0; if (!vg_mda_used_count(vg)) { log_error("Aborting vg_write: No metadata areas to write to!"); return 0; } if (!drop_cached_metadata(vg)) { log_error("Unable to drop cached metadata for VG %s.", vg->name); return 0; } vg->seqno++; dm_list_iterate_items(pv_to_create, &vg->pvs_to_create) { if (!_pvcreate_write(vg->cmd, pv_to_create)) return 0; pv_to_create->pv->status &= ~UNLABELLED_PV; } /* Write to each copy of the metadata area */ dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) { if (!mda->ops->vg_write) { log_error("Format does not support writing volume" "group metadata areas"); /* Revert */ dm_list_uniterate(mdah, &vg->fid->metadata_areas_in_use, &mda->list) { mda = dm_list_item(mdah, struct metadata_area); if (mda->ops->vg_revert && !mda->ops->vg_revert(vg->fid, vg, mda)) { stack; } } return 0; } if (!mda->ops->vg_write(vg->fid, vg, mda)) { stack; /* Revert */ dm_list_uniterate(mdah, &vg->fid->metadata_areas_in_use, &mda->list) { mda = dm_list_item(mdah, struct metadata_area); if (mda->ops->vg_revert && !mda->ops->vg_revert(vg->fid, vg, mda)) { stack; } } return 0; } } /* Now pre-commit each copy of the new metadata */ dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) { if (mda->ops->vg_precommit && !mda->ops->vg_precommit(vg->fid, vg, mda)) { stack; /* Revert */ dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) { if (mda->ops->vg_revert && !mda->ops->vg_revert(vg->fid, vg, mda)) { stack; } } return 0; } } return 1; } static int _vg_commit_mdas(struct volume_group *vg) { struct metadata_area *mda, *tmda; struct dm_list ignored; int failed = 0; int cache_updated = 0; /* Rearrange the metadata_areas_in_use so ignored mdas come first. */ dm_list_init(&ignored); dm_list_iterate_items_safe(mda, tmda, &vg->fid->metadata_areas_in_use) if (mda_is_ignored(mda)) dm_list_move(&ignored, &mda->list); dm_list_iterate_items_safe(mda, tmda, &ignored) dm_list_move(&vg->fid->metadata_areas_in_use, &mda->list); /* Commit to each copy of the metadata area */ dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) { failed = 0; if (mda->ops->vg_commit && !mda->ops->vg_commit(vg->fid, vg, mda)) { stack; failed = 1; } /* Update cache first time we succeed */ if (!failed && !cache_updated) { lvmcache_update_vg(vg, 0); cache_updated = 1; } } return cache_updated; } /* Commit pending changes */ int vg_commit(struct volume_group *vg) { int cache_updated = 0; if (!vgname_is_locked(vg->name)) { log_error(INTERNAL_ERROR "Attempt to write new VG metadata " "without locking %s", vg->name); return cache_updated; } cache_updated = _vg_commit_mdas(vg); if (cache_updated) { /* Instruct remote nodes to upgrade cached metadata. */ remote_commit_cached_metadata(vg); /* * We need to clear old_name after a successful commit. * The volume_group structure could be reused later. */ vg->old_name = NULL; } /* If update failed, remove any cached precommitted metadata. */ if (!cache_updated && !drop_cached_metadata(vg)) log_error("Attempt to drop cached metadata failed " "after commit for VG %s.", vg->name); /* If at least one mda commit succeeded, it was committed */ return cache_updated; } /* Don't commit any pending changes */ int vg_revert(struct volume_group *vg) { struct metadata_area *mda; dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) { if (mda->ops->vg_revert && !mda->ops->vg_revert(vg->fid, vg, mda)) { stack; } } if (!drop_cached_metadata(vg)) log_error("Attempt to drop cached metadata failed " "after reverted update for VG %s.", vg->name); remote_revert_cached_metadata(vg); return 1; } /* Make orphan PVs look like a VG */ static struct volume_group *_vg_read_orphans(struct cmd_context *cmd, int warnings, const char *orphan_vgname) { struct format_instance_ctx fic; struct format_instance *fid; struct lvmcache_vginfo *vginfo; struct lvmcache_info *info; struct pv_list *pvl; struct volume_group *vg = NULL; struct physical_volume *pv = NULL; lvmcache_label_scan(cmd, 0); if (!(vginfo = vginfo_from_vgname(orphan_vgname, NULL))) return_NULL; if (!(vg = alloc_vg("vg_read_orphans", cmd, orphan_vgname))) return_NULL; /* create format instance with appropriate metadata area */ fic.type = FMT_INSTANCE_VG | FMT_INSTANCE_AUX_MDAS; fic.context.vg_ref.vg_name = orphan_vgname; fic.context.vg_ref.vg_id = NULL; if (!(fid = vginfo->fmt->ops->create_instance(vginfo->fmt, &fic))) { log_error("Failed to create format instance"); goto bad; } vg_set_fid(vg, fid); dm_list_iterate_items(info, &vginfo->infos) { if (!(pv = _pv_read(cmd, vg->vgmem, dev_name(info->dev), vg->fid, warnings, 0))) { continue; } if (!(pvl = dm_pool_zalloc(vg->vgmem, sizeof(*pvl)))) { log_error("pv_list allocation failed"); goto bad; } pvl->pv = pv; add_pvl_to_vgs(vg, pvl); } return vg; bad: free_pv_fid(pv); free_vg(vg); return NULL; } static int _update_pv_list(struct dm_pool *pvmem, struct dm_list *all_pvs, struct volume_group *vg) { struct pv_list *pvl, *pvl2; dm_list_iterate_items(pvl, &vg->pvs) { dm_list_iterate_items(pvl2, all_pvs) { if (pvl->pv->dev == pvl2->pv->dev) goto next_pv; } /* * PV is not on list so add it. */ if (!(pvl2 = _copy_pvl(pvmem, pvl))) { log_error("pv_list allocation for '%s' failed", pv_dev_name(pvl->pv)); return 0; } dm_list_add(all_pvs, &pvl2->list); next_pv: ; } return 1; } static void _free_pv_list(struct dm_list *all_pvs) { struct pv_list *pvl; dm_list_iterate_items(pvl, all_pvs) pvl->pv->fid->fmt->ops->destroy_instance(pvl->pv->fid); } int vg_missing_pv_count(const struct volume_group *vg) { int ret = 0; struct pv_list *pvl; dm_list_iterate_items(pvl, &vg->pvs) { if (is_missing_pv(pvl->pv)) ++ ret; } return ret; } static void check_reappeared_pv(struct volume_group *correct_vg, struct physical_volume *pv) { struct pv_list *pvl; /* * Skip these checks in case the tool is going to deal with missing * PVs, especially since the resulting messages can be pretty * confusing. */ if (correct_vg->cmd->handles_missing_pvs) return; dm_list_iterate_items(pvl, &correct_vg->pvs) if (pv->dev == pvl->pv->dev && is_missing_pv(pvl->pv)) { log_warn("Missing device %s reappeared, updating " "metadata for VG %s to version %u.", pv_dev_name(pvl->pv), pv_vg_name(pvl->pv), correct_vg->seqno); if (pvl->pv->pe_alloc_count == 0) { pv->status &= ~MISSING_PV; pvl->pv->status &= ~MISSING_PV; } else log_warn("Device still marked missing because of allocated data " "on it, remove volumes and consider vgreduce --removemissing."); } } /* Caller sets consistent to 1 if it's safe for vg_read_internal to correct * inconsistent metadata on disk (i.e. the VG write lock is held). * This guarantees only consistent metadata is returned. * If consistent is 0, caller must check whether consistent == 1 on return * and take appropriate action if it isn't (e.g. abort; get write lock * and call vg_read_internal again). * * If precommitted is set, use precommitted metadata if present. * * Either of vgname or vgid may be NULL. * * Note: vginfo structs must not be held or used as parameters * across the call to this function. */ static struct volume_group *_vg_read(struct cmd_context *cmd, const char *vgname, const char *vgid, int warnings, int *consistent, unsigned precommitted) { struct format_instance *fid; struct format_instance_ctx fic; const struct format_type *fmt; struct volume_group *vg, *correct_vg = NULL; struct metadata_area *mda; struct lvmcache_info *info; int inconsistent = 0; int inconsistent_vgid = 0; int inconsistent_pvs = 0; int inconsistent_seqno = 0; int inconsistent_mdas = 0; int inconsistent_mda_count = 0; unsigned use_precommitted = precommitted; unsigned saved_handles_missing_pvs = cmd->handles_missing_pvs; struct dm_list *pvids; struct pv_list *pvl, *pvl2; struct dm_list all_pvs; char uuid[64] __attribute__((aligned(8))); if (is_orphan_vg(vgname)) { if (use_precommitted) { log_error(INTERNAL_ERROR "vg_read_internal requires vgname " "with pre-commit."); return NULL; } *consistent = 1; return _vg_read_orphans(cmd, warnings, vgname); } /* * If cached metadata was inconsistent and *consistent is set * then repair it now. Otherwise just return it. * Also return if use_precommitted is set due to the FIXME in * the missing PV logic below. */ if ((correct_vg = lvmcache_get_vg(vgid, precommitted)) && (use_precommitted || !*consistent || !(correct_vg->status & INCONSISTENT_VG))) { if (!(correct_vg->status & INCONSISTENT_VG)) *consistent = 1; else /* Inconsistent but we can't repair it */ correct_vg->status &= ~INCONSISTENT_VG; return correct_vg; } else { free_vg(correct_vg); correct_vg = NULL; } /* Find the vgname in the cache */ /* If it's not there we must do full scan to be completely sure */ if (!(fmt = fmt_from_vgname(vgname, vgid, 1))) { lvmcache_label_scan(cmd, 0); if (!(fmt = fmt_from_vgname(vgname, vgid, 1))) { /* Independent MDAs aren't supported under low memory */ if (!cmd->independent_metadata_areas && critical_section()) return_NULL; lvmcache_label_scan(cmd, 2); if (!(fmt = fmt_from_vgname(vgname, vgid, 0))) return_NULL; } } /* Now determine the correct vgname if none was supplied */ if (!vgname && !(vgname = vgname_from_vgid(cmd->mem, vgid))) return_NULL; if (use_precommitted && !(fmt->features & FMT_PRECOMMIT)) use_precommitted = 0; /* create format instance with appropriate metadata area */ fic.type = FMT_INSTANCE_VG | FMT_INSTANCE_MDAS | FMT_INSTANCE_AUX_MDAS; fic.context.vg_ref.vg_name = vgname; fic.context.vg_ref.vg_id = vgid; if (!(fid = fmt->ops->create_instance(fmt, &fic))) { log_error("Failed to create format instance"); return NULL; } /* Store pvids for later so we can check if any are missing */ if (!(pvids = lvmcache_get_pvids(cmd, vgname, vgid))) return_NULL; /* Ensure contents of all metadata areas match - else do recovery */ inconsistent_mda_count=0; dm_list_iterate_items(mda, &fid->metadata_areas_in_use) { if ((use_precommitted && !(vg = mda->ops->vg_read_precommit(fid, vgname, mda))) || (!use_precommitted && !(vg = mda->ops->vg_read(fid, vgname, mda)))) { inconsistent = 1; free_vg(vg); continue; } if (!correct_vg) { correct_vg = vg; continue; } /* FIXME Also ensure contents same - checksum compare? */ if (correct_vg->seqno != vg->seqno) { if (cmd->metadata_read_only) log_very_verbose("Not repairing VG %s metadata seqno (%d != %d) " "as global/metadata_read_only is set.", vgname, vg->seqno, correct_vg->seqno); else { inconsistent = 1; inconsistent_seqno = 1; } if (vg->seqno > correct_vg->seqno) { free_vg(correct_vg); correct_vg = vg; } else { mda->status |= MDA_INCONSISTENT; ++inconsistent_mda_count; } } if (vg != correct_vg) free_vg(vg); } /* Ensure every PV in the VG was in the cache */ if (correct_vg) { /* * If the VG has PVs without mdas, or ignored mdas, they may * still be orphans in the cache: update the cache state here, * and update the metadata lists in the vg. */ if (!inconsistent && dm_list_size(&correct_vg->pvs) > dm_list_size(pvids)) { dm_list_iterate_items(pvl, &correct_vg->pvs) { if (!pvl->pv->dev) { inconsistent_pvs = 1; break; } if (str_list_match_item(pvids, pvl->pv->dev->pvid)) continue; /* * PV not marked as belonging to this VG in cache. * Check it's an orphan without metadata area * not ignored. */ if (!(info = info_from_pvid(pvl->pv->dev->pvid, 1)) || !info->vginfo || !is_orphan_vg(info->vginfo->vgname)) { inconsistent_pvs = 1; break; } if (dm_list_size(&info->mdas)) { if (!fid_add_mdas(fid, &info->mdas, info->dev->pvid, ID_LEN)) return_NULL; log_debug("Empty mda found for VG %s.", vgname); if (inconsistent_mdas) continue; /* * If any newly-added mdas are in-use then their * metadata needs updating. */ dm_list_iterate_items(mda, &info->mdas) if (!mda_is_ignored(mda)) { inconsistent_mdas = 1; break; } } } /* If the check passed, let's update VG and recalculate pvids */ if (!inconsistent_pvs) { log_debug("Updating cache for PVs without mdas " "in VG %s.", vgname); /* * If there is no precommitted metadata, committed metadata * is read and stored in the cache even if use_precommitted is set */ lvmcache_update_vg(correct_vg, correct_vg->status & PRECOMMITTED); if (!(pvids = lvmcache_get_pvids(cmd, vgname, vgid))) return_NULL; } } if (dm_list_size(&correct_vg->pvs) != dm_list_size(pvids) + vg_missing_pv_count(correct_vg)) { log_debug("Cached VG %s had incorrect PV list", vgname); if (critical_section()) inconsistent = 1; else { free_vg(correct_vg); correct_vg = NULL; } } else dm_list_iterate_items(pvl, &correct_vg->pvs) { if (is_missing_pv(pvl->pv)) continue; if (!str_list_match_item(pvids, pvl->pv->dev->pvid)) { log_debug("Cached VG %s had incorrect PV list", vgname); free_vg(correct_vg); correct_vg = NULL; break; } } if (correct_vg && inconsistent_mdas) { free_vg(correct_vg); correct_vg = NULL; } } dm_list_init(&all_pvs); /* Failed to find VG where we expected it - full scan and retry */ if (!correct_vg) { inconsistent = 0; /* Independent MDAs aren't supported under low memory */ if (!cmd->independent_metadata_areas && critical_section()) return_NULL; lvmcache_label_scan(cmd, 2); if (!(fmt = fmt_from_vgname(vgname, vgid, 0))) return_NULL; if (precommitted && !(fmt->features & FMT_PRECOMMIT)) use_precommitted = 0; /* create format instance with appropriate metadata area */ fic.type = FMT_INSTANCE_VG | FMT_INSTANCE_MDAS | FMT_INSTANCE_AUX_MDAS; fic.context.vg_ref.vg_name = vgname; fic.context.vg_ref.vg_id = vgid; if (!(fid = fmt->ops->create_instance(fmt, &fic))) { log_error("Failed to create format instance"); return NULL; } /* Ensure contents of all metadata areas match - else recover */ inconsistent_mda_count=0; dm_list_iterate_items(mda, &fid->metadata_areas_in_use) { if ((use_precommitted && !(vg = mda->ops->vg_read_precommit(fid, vgname, mda))) || (!use_precommitted && !(vg = mda->ops->vg_read(fid, vgname, mda)))) { inconsistent = 1; continue; } if (!correct_vg) { correct_vg = vg; if (!_update_pv_list(cmd->mem, &all_pvs, correct_vg)) { _free_pv_list(&all_pvs); free_vg(vg); return_NULL; } continue; } if (!id_equal(&vg->id, &correct_vg->id)) { inconsistent = 1; inconsistent_vgid = 1; } /* FIXME Also ensure contents same - checksums same? */ if (correct_vg->seqno != vg->seqno) { /* Ignore inconsistent seqno if told to skip repair logic */ if (cmd->metadata_read_only) log_very_verbose("Not repairing VG %s metadata seqno (%d != %d) " "as global/metadata_read_only is set.", vgname, vg->seqno, correct_vg->seqno); else { inconsistent = 1; inconsistent_seqno = 1; } if (!_update_pv_list(cmd->mem, &all_pvs, vg)) { _free_pv_list(&all_pvs); free_vg(vg); free_vg(correct_vg); return_NULL; } if (vg->seqno > correct_vg->seqno) { free_vg(correct_vg); correct_vg = vg; } else { mda->status |= MDA_INCONSISTENT; ++inconsistent_mda_count; } } if (vg != correct_vg) free_vg(vg); } /* Give up looking */ if (!correct_vg) { _free_pv_list(&all_pvs); return_NULL; } } /* * If there is no precommitted metadata, committed metadata * is read and stored in the cache even if use_precommitted is set */ lvmcache_update_vg(correct_vg, correct_vg->status & PRECOMMITTED & (inconsistent ? INCONSISTENT_VG : 0)); if (inconsistent) { /* FIXME Test should be if we're *using* precommitted metadata not if we were searching for it */ if (use_precommitted) { log_error("Inconsistent pre-commit metadata copies " "for volume group %s", vgname); /* * Check whether all of the inconsistent MDAs were on * MISSING PVs -- in that case, we should be safe. */ dm_list_iterate_items(mda, &fid->metadata_areas_in_use) { if (mda->status & MDA_INCONSISTENT) { log_debug("Checking inconsistent MDA: %s", dev_name(mda_get_device(mda))); dm_list_iterate_items(pvl, &correct_vg->pvs) { if (mda_get_device(mda) == pvl->pv->dev && (pvl->pv->status & MISSING_PV)) --inconsistent_mda_count; } } } if (inconsistent_mda_count < 0) log_error(INTERNAL_ERROR "Too many inconsistent MDAs."); if (!inconsistent_mda_count) { *consistent = 0; _free_pv_list(&all_pvs); return correct_vg; } _free_pv_list(&all_pvs); free_vg(correct_vg); return NULL; } if (!*consistent) { _free_pv_list(&all_pvs); return correct_vg; } /* Don't touch if vgids didn't match */ if (inconsistent_vgid) { log_error("Inconsistent metadata UUIDs found for " "volume group %s", vgname); *consistent = 0; _free_pv_list(&all_pvs); return correct_vg; } log_warn("WARNING: Inconsistent metadata found for VG %s - updating " "to use version %u", vgname, correct_vg->seqno); /* * If PV is marked missing but we found it, * update metadata and remove MISSING flag */ dm_list_iterate_items(pvl, &all_pvs) check_reappeared_pv(correct_vg, pvl->pv); cmd->handles_missing_pvs = 1; if (!vg_write(correct_vg)) { log_error("Automatic metadata correction failed"); _free_pv_list(&all_pvs); free_vg(correct_vg); cmd->handles_missing_pvs = saved_handles_missing_pvs; return NULL; } cmd->handles_missing_pvs = saved_handles_missing_pvs; if (!vg_commit(correct_vg)) { log_error("Automatic metadata correction commit " "failed"); free_vg(correct_vg); return NULL; } dm_list_iterate_items(pvl, &all_pvs) { dm_list_iterate_items(pvl2, &correct_vg->pvs) { if (pvl->pv->dev == pvl2->pv->dev) goto next_pv; } if (!id_write_format(&pvl->pv->id, uuid, sizeof(uuid))) { _free_pv_list(&all_pvs); free_vg(correct_vg); return_NULL; } log_error("Removing PV %s (%s) that no longer belongs to VG %s", pv_dev_name(pvl->pv), uuid, correct_vg->name); if (!pv_write_orphan(cmd, pvl->pv)) { _free_pv_list(&all_pvs); free_vg(correct_vg); return_NULL; } /* Refresh metadata after orphan write */ drop_cached_metadata(correct_vg); next_pv: ; } } _free_pv_list(&all_pvs); if (vg_missing_pv_count(correct_vg)) { log_verbose("There are %d physical volumes missing.", vg_missing_pv_count(correct_vg)); vg_mark_partial_lvs(correct_vg, 1); } if ((correct_vg->status & PVMOVE) && !pvmove_mode()) { log_error("WARNING: Interrupted pvmove detected in " "volume group %s", correct_vg->name); log_error("Please restore the metadata by running " "vgcfgrestore."); free_vg(correct_vg); return NULL; } *consistent = 1; return correct_vg; } struct volume_group *vg_read_internal(struct cmd_context *cmd, const char *vgname, const char *vgid, int warnings, int *consistent) { struct volume_group *vg; struct lv_list *lvl; if (!(vg = _vg_read(cmd, vgname, vgid, warnings, consistent, 0))) return NULL; if (!check_pv_segments(vg)) { log_error(INTERNAL_ERROR "PV segments corrupted in %s.", vg->name); free_vg(vg); return NULL; } dm_list_iterate_items(lvl, &vg->lvs) { if (!check_lv_segments(lvl->lv, 0)) { log_error(INTERNAL_ERROR "LV segments corrupted in %s.", lvl->lv->name); free_vg(vg); return NULL; } } dm_list_iterate_items(lvl, &vg->lvs) { /* * Checks that cross-reference other LVs. */ if (!check_lv_segments(lvl->lv, 1)) { log_error(INTERNAL_ERROR "LV segments corrupted in %s.", lvl->lv->name); free_vg(vg); return NULL; } } return vg; } void free_pv_fid(struct physical_volume *pv) { if (!pv) return; if (pv->fid) pv->fid->fmt->ops->destroy_instance(pv->fid); } void free_vg(struct volume_group *vg) { if (!vg) return; vg_set_fid(vg, NULL); if (vg->cmd && vg->vgmem == vg->cmd->mem) { log_error(INTERNAL_ERROR "global memory pool used for VG %s", vg->name); return; } dm_pool_destroy(vg->vgmem); } /* This is only called by lv_from_lvid, which is only called from * activate.c so we know the appropriate VG lock is already held and * the vg_read_internal is therefore safe. */ static struct volume_group *_vg_read_by_vgid(struct cmd_context *cmd, const char *vgid, unsigned precommitted) { const char *vgname; struct dm_list *vgnames; struct volume_group *vg; struct lvmcache_vginfo *vginfo; struct str_list *strl; int consistent = 0; /* Is corresponding vgname already cached? */ if ((vginfo = vginfo_from_vgid(vgid)) && vginfo->vgname && !is_orphan_vg(vginfo->vgname)) { if ((vg = _vg_read(cmd, NULL, vgid, 1, &consistent, precommitted)) && id_equal(&vg->id, (const struct id *)vgid)) { if (!consistent) log_error("Volume group %s metadata is " "inconsistent", vg->name); return vg; } free_vg(vg); } /* Mustn't scan if memory locked: ensure cache gets pre-populated! */ if (critical_section()) return_NULL; /* FIXME Need a genuine read by ID here - don't vg_read_internal by name! */ /* FIXME Disabled vgrenames while active for now because we aren't * allowed to do a full scan here any more. */ // The slow way - full scan required to cope with vgrename lvmcache_label_scan(cmd, 2); if (!(vgnames = get_vgnames(cmd, 0))) { log_error("vg_read_by_vgid: get_vgnames failed"); return NULL; } dm_list_iterate_items(strl, vgnames) { vgname = strl->str; if (!vgname) continue; // FIXME Unnecessary? consistent = 0; if ((vg = _vg_read(cmd, vgname, vgid, 1, &consistent, precommitted)) && id_equal(&vg->id, (const struct id *)vgid)) { if (!consistent) { log_error("Volume group %s metadata is " "inconsistent", vgname); free_vg(vg); return NULL; } return vg; } free_vg(vg); } return NULL; } /* Only called by activate.c */ struct logical_volume *lv_from_lvid(struct cmd_context *cmd, const char *lvid_s, unsigned precommitted) { struct lv_list *lvl; struct volume_group *vg; const union lvid *lvid; lvid = (const union lvid *) lvid_s; log_very_verbose("Finding volume group for uuid %s", lvid_s); if (!(vg = _vg_read_by_vgid(cmd, (const char *)lvid->id[0].uuid, precommitted))) { log_error("Volume group for uuid not found: %s", lvid_s); return NULL; } log_verbose("Found volume group \"%s\"", vg->name); if (vg->status & EXPORTED_VG) { log_error("Volume group \"%s\" is exported", vg->name); goto out; } if (!(lvl = find_lv_in_vg_by_lvid(vg, lvid))) { log_very_verbose("Can't find logical volume id %s", lvid_s); goto out; } return lvl->lv; out: free_vg(vg); return NULL; } const char *find_vgname_from_pvid(struct cmd_context *cmd, const char *pvid) { char *vgname; struct lvmcache_info *info; vgname = lvmcache_vgname_from_pvid(cmd, pvid); if (is_orphan_vg(vgname)) { if (!(info = info_from_pvid(pvid, 0))) { return_NULL; } /* * If an orphan PV has no MDAs, or it has MDAs but the * MDA is ignored, it may appear to be an orphan until * the metadata is read off another PV in the same VG. * Detecting this means checking every VG by scanning * every PV on the system. */ if (mdas_empty_or_ignored(&info->mdas)) { if (!scan_vgs_for_pvs(cmd, 1)) { log_error("Rescan for PVs without " "metadata areas failed."); return NULL; } /* * Ask lvmcache again - we may have a non-orphan * name now */ vgname = lvmcache_vgname_from_pvid(cmd, pvid); } } return vgname; } const char *find_vgname_from_pvname(struct cmd_context *cmd, const char *pvname) { const char *pvid; pvid = pvid_from_devname(cmd, pvname); if (!pvid) /* Not a PV */ return NULL; return find_vgname_from_pvid(cmd, pvid); } /** * pv_read - read and return a handle to a physical volume * @cmd: LVM command initiating the pv_read * @pv_name: full device name of the PV, including the path * @mdas: list of metadata areas of the PV * @label_sector: sector number where the PV label is stored on @pv_name * @warnings: * * Returns: * PV handle - valid pv_name and successful read of the PV, or * NULL - invalid parameter or error in reading the PV * * Note: * FIXME - liblvm todo - make into function that returns handle */ struct physical_volume *pv_read(struct cmd_context *cmd, const char *pv_name, int warnings, int scan_label_only) { return _pv_read(cmd, cmd->mem, pv_name, NULL, warnings, scan_label_only); } /* FIXME Use label functions instead of PV functions */ static struct physical_volume *_pv_read(struct cmd_context *cmd, struct dm_pool *pvmem, const char *pv_name, struct format_instance *fid, int warnings, int scan_label_only) { struct physical_volume *pv; struct format_instance_ctx fic; struct label *label; struct lvmcache_info *info; struct device *dev; if (!(dev = dev_cache_get(pv_name, cmd->filter))) return_NULL; if (!(label_read(dev, &label, UINT64_C(0)))) { if (warnings) log_error("No physical volume label read from %s", pv_name); return NULL; } info = (struct lvmcache_info *) label->info; pv = _alloc_pv(pvmem, dev); if (!pv) { log_error("pv allocation for '%s' failed", pv_name); return NULL; } pv->label_sector = label->sector; /* FIXME Move more common code up here */ if (!(info->fmt->ops->pv_read(info->fmt, pv_name, pv, scan_label_only))) { log_error("Failed to read existing physical volume '%s'", pv_name); goto bad; } if (!pv->size) goto bad; if (!alloc_pv_segment_whole_pv(pvmem, pv)) goto_bad; if (fid) fid_add_mdas(fid, &info->mdas, (const char *) &pv->id, ID_LEN); else { fic.type = FMT_INSTANCE_PV | FMT_INSTANCE_MDAS | FMT_INSTANCE_AUX_MDAS; fic.context.pv_id = (const char *) &pv->id; if (!(fid = pv->fmt->ops->create_instance(pv->fmt, &fic))) { log_error("_pv_read: Couldn't create format instance " "for PV %s", pv_name); goto bad; } pv_set_fid(pv, fid); } return pv; bad: free_pv_fid(pv); dm_pool_free(pvmem, pv); return NULL; } /* May return empty list */ struct dm_list *get_vgnames(struct cmd_context *cmd, int include_internal) { return lvmcache_get_vgnames(cmd, include_internal); } struct dm_list *get_vgids(struct cmd_context *cmd, int include_internal) { return lvmcache_get_vgids(cmd, include_internal); } static int _get_pvs(struct cmd_context *cmd, int warnings, struct dm_list **pvslist) { struct str_list *strl; struct dm_list * uninitialized_var(results); const char *vgname, *vgid; struct pv_list *pvl, *pvl_copy; struct dm_list *vgids; struct volume_group *vg; int consistent = 0; int old_pvmove; lvmcache_label_scan(cmd, 0); if (pvslist) { if (!(results = dm_pool_alloc(cmd->mem, sizeof(*results)))) { log_error("PV list allocation failed"); return 0; } dm_list_init(results); } /* Get list of VGs */ if (!(vgids = get_vgids(cmd, 1))) { log_error("get_pvs: get_vgids failed"); return 0; } /* Read every VG to ensure cache consistency */ /* Orphan VG is last on list */ old_pvmove = pvmove_mode(); init_pvmove(1); dm_list_iterate_items(strl, vgids) { vgid = strl->str; if (!vgid) continue; /* FIXME Unnecessary? */ consistent = 0; if (!(vgname = vgname_from_vgid(NULL, vgid))) { stack; continue; } if (!(vg = vg_read_internal(cmd, vgname, vgid, warnings, &consistent))) { stack; continue; } if (!consistent) log_warn("WARNING: Volume Group %s is not consistent", vgname); /* Move PVs onto results list */ if (pvslist) dm_list_iterate_items(pvl, &vg->pvs) { if (!(pvl_copy = _copy_pvl(cmd->mem, pvl))) { log_error("PV list allocation failed"); free_vg(vg); return 0; } dm_list_add(results, &pvl_copy->list); } free_vg(vg); } init_pvmove(old_pvmove); if (pvslist) *pvslist = results; else dm_pool_free(cmd->mem, vgids); return 1; } struct dm_list *get_pvs(struct cmd_context *cmd) { struct dm_list *results; if (!_get_pvs(cmd, 1, &results)) return NULL; return results; } int scan_vgs_for_pvs(struct cmd_context *cmd, int warnings) { return _get_pvs(cmd, warnings, NULL); } int pv_write(struct cmd_context *cmd __attribute__((unused)), struct physical_volume *pv, int allow_non_orphan) { if (!pv->fmt->ops->pv_write) { log_error("Format does not support writing physical volumes"); return 0; } /* * FIXME: Try to remove this restriction. This requires checking * that the PV and the VG are in a consistent state. We need * to provide some revert mechanism since PV label together * with VG metadata write is not atomic. */ if (!allow_non_orphan && (!is_orphan_vg(pv->vg_name) || pv->pe_alloc_count)) { log_error("Assertion failed: can't _pv_write non-orphan PV " "(in VG %s)", pv->vg_name); return 0; } if (!pv->fmt->ops->pv_write(pv->fmt, pv)) return_0; return 1; } int pv_write_orphan(struct cmd_context *cmd, struct physical_volume *pv) { const char *old_vg_name = pv->vg_name; pv->vg_name = cmd->fmt->orphan_vg_name; pv->status = ALLOCATABLE_PV; pv->pe_alloc_count = 0; if (!dev_get_size(pv->dev, &pv->size)) { log_error("%s: Couldn't get size.", pv_dev_name(pv)); return 0; } if (!pv_write(cmd, pv, 0)) { log_error("Failed to clear metadata from physical " "volume \"%s\" after removal from \"%s\"", pv_dev_name(pv), old_vg_name); return 0; } return 1; } int is_global_vg(const char *vg_name) { return (vg_name && !strcmp(vg_name, VG_GLOBAL)) ? 1 : 0; } /** * is_orphan_vg - Determine whether a vg_name is an orphan * @vg_name: pointer to the vg_name */ int is_orphan_vg(const char *vg_name) { return (vg_name && !strncmp(vg_name, ORPHAN_PREFIX, sizeof(ORPHAN_PREFIX) - 1)) ? 1 : 0; } /* * Exclude pseudo VG names used for locking. */ int is_real_vg(const char *vg_name) { return (vg_name && *vg_name != '#'); } /* * Returns: * 0 - fail * 1 - success */ int pv_analyze(struct cmd_context *cmd, const char *pv_name, uint64_t label_sector) { struct label *label; struct device *dev; struct metadata_area *mda; struct lvmcache_info *info; dev = dev_cache_get(pv_name, cmd->filter); if (!dev) { log_error("Device %s not found (or ignored by filtering).", pv_name); return 0; } /* * First, scan for LVM labels. */ if (!label_read(dev, &label, label_sector)) { log_error("Could not find LVM label on %s", pv_name); return 0; } log_print("Found label on %s, sector %"PRIu64", type=%s", pv_name, label->sector, label->type); /* * Next, loop through metadata areas */ info = label->info; dm_list_iterate_items(mda, &info->mdas) mda->ops->pv_analyze_mda(info->fmt, mda); return 1; } /* FIXME: remove / combine this with locking? */ int vg_check_write_mode(struct volume_group *vg) { if (vg->open_mode != 'w') { log_errno(EPERM, "Attempt to modify a read-only VG"); return 0; } return 1; } /* * Performs a set of checks against a VG according to bits set in status * and returns FAILED_* bits for those that aren't acceptable. * * FIXME Remove the unnecessary duplicate definitions and return bits directly. */ static uint32_t _vg_bad_status_bits(const struct volume_group *vg, uint64_t status) { uint32_t failure = 0; if ((status & CLUSTERED) && (vg_is_clustered(vg)) && !locking_is_clustered()) { log_error("Skipping clustered volume group %s", vg->name); /* Return because other flags are considered undefined. */ return FAILED_CLUSTERED; } if ((status & EXPORTED_VG) && vg_is_exported(vg)) { log_error("Volume group %s is exported", vg->name); failure |= FAILED_EXPORTED; } if ((status & LVM_WRITE) && !(vg->status & LVM_WRITE)) { log_error("Volume group %s is read-only", vg->name); failure |= FAILED_READ_ONLY; } if ((status & RESIZEABLE_VG) && !vg_is_resizeable(vg)) { log_error("Volume group %s is not resizeable.", vg->name); failure |= FAILED_RESIZEABLE; } return failure; } /** * vg_check_status - check volume group status flags and log error * @vg - volume group to check status flags * @status - specific status flags to check (e.g. EXPORTED_VG) */ int vg_check_status(const struct volume_group *vg, uint64_t status) { return !_vg_bad_status_bits(vg, status); } static struct volume_group *_recover_vg(struct cmd_context *cmd, const char *vg_name, const char *vgid) { int consistent = 1; struct volume_group *vg; unlock_vg(cmd, vg_name); dev_close_all(); if (!lock_vol(cmd, vg_name, LCK_VG_WRITE)) return_NULL; if (!(vg = vg_read_internal(cmd, vg_name, vgid, 1, &consistent))) return_NULL; if (!consistent) { free_vg(vg); return_NULL; } return (struct volume_group *)vg; } /* * Consolidated locking, reading, and status flag checking. * * If the metadata is inconsistent, setting READ_ALLOW_INCONSISTENT in * misc_flags will return it with FAILED_INCONSISTENT set instead of * giving you nothing. * * Use vg_read_error(vg) to determine the result. Nonzero means there were * problems reading the volume group. * Zero value means that the VG is open and appropriate locks are held. */ static struct volume_group *_vg_lock_and_read(struct cmd_context *cmd, const char *vg_name, const char *vgid, uint32_t lock_flags, uint64_t status_flags, uint32_t misc_flags) { struct volume_group *vg = NULL; int consistent = 1; int consistent_in; uint32_t failure = 0; int already_locked; if (misc_flags & READ_ALLOW_INCONSISTENT || lock_flags != LCK_VG_WRITE) consistent = 0; if (!validate_name(vg_name) && !is_orphan_vg(vg_name)) { log_error("Volume group name %s has invalid characters", vg_name); return NULL; } already_locked = vgname_is_locked(vg_name); if (!already_locked && !(misc_flags & READ_WITHOUT_LOCK) && !lock_vol(cmd, vg_name, lock_flags)) { log_error("Can't get lock for %s", vg_name); return _vg_make_handle(cmd, vg, FAILED_LOCKING); } if (is_orphan_vg(vg_name)) status_flags &= ~LVM_WRITE; consistent_in = consistent; /* If consistent == 1, we get NULL here if correction fails. */ if (!(vg = vg_read_internal(cmd, vg_name, vgid, 1, &consistent))) { if (consistent_in && !consistent) { log_error("Volume group \"%s\" inconsistent.", vg_name); failure |= FAILED_INCONSISTENT; goto_bad; } log_error("Volume group \"%s\" not found", vg_name); failure |= FAILED_NOTFOUND; goto_bad; } if (vg_is_clustered(vg) && !locking_is_clustered()) { log_error("Skipping clustered volume group %s", vg->name); failure |= FAILED_CLUSTERED; goto_bad; } /* consistent == 0 when VG is not found, but failed == FAILED_NOTFOUND */ if (!consistent && !failure) { free_vg(vg); if (!(vg = _recover_vg(cmd, vg_name, vgid))) { log_error("Recovery of volume group \"%s\" failed.", vg_name); failure |= FAILED_INCONSISTENT; goto_bad; } } /* * Check that the tool can handle tricky cases -- missing PVs and * unknown segment types. */ if (!cmd->handles_missing_pvs && vg_missing_pv_count(vg) && lock_flags == LCK_VG_WRITE) { log_error("Cannot change VG %s while PVs are missing.", vg->name); log_error("Consider vgreduce --removemissing."); failure |= FAILED_INCONSISTENT; /* FIXME new failure code here? */ goto_bad; } if (!cmd->handles_unknown_segments && vg_has_unknown_segments(vg) && lock_flags == LCK_VG_WRITE) { log_error("Cannot change VG %s with unknown segments in it!", vg->name); failure |= FAILED_INCONSISTENT; /* FIXME new failure code here? */ goto_bad; } failure |= _vg_bad_status_bits(vg, status_flags); if (failure) goto_bad; return _vg_make_handle(cmd, vg, failure); bad: if (!already_locked && !(misc_flags & READ_WITHOUT_LOCK)) unlock_vg(cmd, vg_name); return _vg_make_handle(cmd, vg, failure); } /* * vg_read: High-level volume group metadata read function. * * vg_read_error() must be used on any handle returned to check for errors. * * - metadata inconsistent and automatic correction failed: FAILED_INCONSISTENT * - VG is read-only: FAILED_READ_ONLY * - VG is EXPORTED, unless flags has READ_ALLOW_EXPORTED: FAILED_EXPORTED * - VG is not RESIZEABLE: FAILED_RESIZEABLE * - locking failed: FAILED_LOCKING * * On failures, all locks are released, unless one of the following applies: * - vgname_is_locked(lock_name) is true * FIXME: remove the above 2 conditions if possible and make an error always * release the lock. * * Volume groups are opened read-only unless flags contains READ_FOR_UPDATE. * * Checking for VG existence: * * FIXME: We want vg_read to attempt automatic recovery after acquiring a * temporary write lock: if that fails, we bail out as usual, with failed & * FAILED_INCONSISTENT. If it works, we are good to go. Code that's been in * toollib just set lock_flags to LCK_VG_WRITE and called vg_read_internal with * *consistent = 1. */ struct volume_group *vg_read(struct cmd_context *cmd, const char *vg_name, const char *vgid, uint32_t flags) { uint64_t status = UINT64_C(0); uint32_t lock_flags = LCK_VG_READ; if (flags & READ_FOR_UPDATE) { status |= EXPORTED_VG | LVM_WRITE; lock_flags = LCK_VG_WRITE; } if (flags & READ_ALLOW_EXPORTED) status &= ~EXPORTED_VG; return _vg_lock_and_read(cmd, vg_name, vgid, lock_flags, status, flags); } /* * A high-level volume group metadata reading function. Open a volume group for * later update (this means the user code can change the metadata and later * request the new metadata to be written and committed). */ struct volume_group *vg_read_for_update(struct cmd_context *cmd, const char *vg_name, const char *vgid, uint32_t flags) { return vg_read(cmd, vg_name, vgid, flags | READ_FOR_UPDATE); } /* * Test the validity of a VG handle returned by vg_read() or vg_read_for_update(). */ uint32_t vg_read_error(struct volume_group *vg_handle) { if (!vg_handle) return FAILED_ALLOCATION; return vg_handle->read_status; } /* * Lock a vgname and/or check for existence. * Takes a WRITE lock on the vgname before scanning. * If scanning fails or vgname found, release the lock. * NOTE: If you find the return codes confusing, you might think of this * function as similar to an open() call with O_CREAT and O_EXCL flags * (open returns fail with -EEXIST if file already exists). * * Returns: * FAILED_LOCKING - Cannot lock name * FAILED_EXIST - VG name already exists - cannot reserve * SUCCESS - VG name does not exist in system and WRITE lock held */ uint32_t vg_lock_newname(struct cmd_context *cmd, const char *vgname) { if (!lock_vol(cmd, vgname, LCK_VG_WRITE)) { return FAILED_LOCKING; } /* Find the vgname in the cache */ /* If it's not there we must do full scan to be completely sure */ if (!fmt_from_vgname(vgname, NULL, 1)) { lvmcache_label_scan(cmd, 0); if (!fmt_from_vgname(vgname, NULL, 1)) { /* Independent MDAs aren't supported under low memory */ if (!cmd->independent_metadata_areas && critical_section()) { /* * FIXME: Disallow calling this function if * critical_section() is true. */ unlock_vg(cmd, vgname); return FAILED_LOCKING; } lvmcache_label_scan(cmd, 2); if (!fmt_from_vgname(vgname, NULL, 0)) { /* vgname not found after scanning */ return SUCCESS; } } } /* Found vgname so cannot reserve. */ unlock_vg(cmd, vgname); return FAILED_EXIST; } struct format_instance *alloc_fid(const struct format_type *fmt, const struct format_instance_ctx *fic) { struct dm_pool *mem; struct format_instance *fid; if (!(mem = dm_pool_create("format_instance", 1024))) return_NULL; if (!(fid = dm_pool_zalloc(mem, sizeof(*fid)))) { log_error("Couldn't allocate format_instance object."); goto bad; } fid->ref_count = 1; fid->mem = mem; fid->type = fic->type; fid->fmt = fmt; dm_list_init(&fid->metadata_areas_in_use); dm_list_init(&fid->metadata_areas_ignored); return fid; bad: dm_pool_destroy(mem); return NULL; } void pv_set_fid(struct physical_volume *pv, struct format_instance *fid) { if (fid) fid->ref_count++; if (pv->fid) pv->fid->fmt->ops->destroy_instance(pv->fid); pv->fid = fid; } void vg_set_fid(struct volume_group *vg, struct format_instance *fid) { struct pv_list *pvl; if (fid) fid->ref_count++; dm_list_iterate_items(pvl, &vg->pvs) pv_set_fid(pvl->pv, fid); dm_list_iterate_items(pvl, &vg->removed_pvs) pv_set_fid(pvl->pv, fid); if (vg->fid) vg->fid->fmt->ops->destroy_instance(vg->fid); vg->fid = fid; } static int _convert_key_to_string(const char *key, size_t key_len, unsigned sub_key, char *buf, size_t buf_len) { memcpy(buf, key, key_len); buf += key_len; buf_len -= key_len; if ((dm_snprintf(buf, buf_len, "_%u", sub_key) == -1)) return_0; return 1; } int fid_add_mda(struct format_instance *fid, struct metadata_area *mda, const char *key, size_t key_len, const unsigned sub_key) { char full_key[PATH_MAX]; dm_list_add(mda_is_ignored(mda) ? &fid->metadata_areas_ignored : &fid->metadata_areas_in_use, &mda->list); /* Return if the mda is not supposed to be indexed. */ if (!key) return 1; /* Add metadata area to index. */ if (fid->type & FMT_INSTANCE_VG) { if (!_convert_key_to_string(key, key_len, sub_key, full_key, PATH_MAX)) return_0; dm_hash_insert(fid->metadata_areas_index.hash, full_key, mda); } else fid->metadata_areas_index.array[sub_key] = mda; return 1; } int fid_add_mdas(struct format_instance *fid, struct dm_list *mdas, const char *key, size_t key_len) { struct metadata_area *mda, *mda_new; unsigned mda_index = 0; dm_list_iterate_items(mda, mdas) { mda_new = mda_copy(fid->mem, mda); if (!mda_new) return_0; fid_remove_mda(fid, NULL, key, key_len, mda_index); fid_add_mda(fid, mda_new, key, key_len, mda_index); mda_index++; } return 1; } struct metadata_area *fid_get_mda_indexed(struct format_instance *fid, const char *key, size_t key_len, const unsigned sub_key) { char full_key[PATH_MAX]; struct metadata_area *mda = NULL; if (fid->type & FMT_INSTANCE_VG) { if (!_convert_key_to_string(key, key_len, sub_key, full_key, PATH_MAX)) return_NULL; mda = (struct metadata_area *) dm_hash_lookup(fid->metadata_areas_index.hash, full_key); } else mda = fid->metadata_areas_index.array[sub_key]; return mda; } int fid_remove_mda(struct format_instance *fid, struct metadata_area *mda, const char *key, size_t key_len, const unsigned sub_key) { struct metadata_area *mda_indexed = NULL; char full_key[PATH_MAX]; /* At least one of mda or key must be specified. */ if (!mda && !key) return 1; if (key) { /* * If both mda and key specified, check given mda * with what we find using the index and return * immediately if these two do not match. */ if (!(mda_indexed = fid_get_mda_indexed(fid, key, key_len, sub_key)) || (mda && mda != mda_indexed)) return 1; mda = mda_indexed; if (fid->type & FMT_INSTANCE_VG) { if (!_convert_key_to_string(key, key_len, sub_key, full_key, PATH_MAX)) return_0; dm_hash_remove(fid->metadata_areas_index.hash, full_key); } else fid->metadata_areas_index.array[sub_key] = NULL; } dm_list_del(&mda->list); return 1; } /* * Copy constructor for a metadata_area. */ struct metadata_area *mda_copy(struct dm_pool *mem, struct metadata_area *mda) { struct metadata_area *mda_new; if (!(mda_new = dm_pool_alloc(mem, sizeof(*mda_new)))) { log_error("metadata_area allocation failed"); return NULL; } memcpy(mda_new, mda, sizeof(*mda)); if (mda->ops->mda_metadata_locn_copy && mda->metadata_locn) { mda_new->metadata_locn = mda->ops->mda_metadata_locn_copy(mem, mda->metadata_locn); if (!mda_new->metadata_locn) { dm_pool_free(mem, mda_new); return NULL; } } dm_list_init(&mda_new->list); return mda_new; } /* * This function provides a way to answer the question on a format specific * basis - does the format specfic context of these two metadata areas * match? * * A metatdata_area is defined to be independent of the underlying context. * This has the benefit that we can use the same abstraction to read disks * (see _metadata_text_raw_ops) or files (see _metadata_text_file_ops). * However, one downside is there is no format-independent way to determine * whether a given metadata_area is attached to a specific device - in fact, * it may not be attached to a device at all. * * Thus, LVM is structured such that an mda is not a member of struct * physical_volume. The location of the mda depends on whether * the PV is in a volume group. A PV not in a VG has an mda on the * 'info->mda' list in lvmcache, while a PV in a VG has an mda on * the vg->fid->metadata_areas_in_use list. For further details, see _vg_read(), * and the sequence of creating the format_instance with fid->metadata_areas_in_use * list, as well as the construction of the VG, with list of PVs (comes * after the construction of the fid and list of mdas). */ unsigned mda_locns_match(struct metadata_area *mda1, struct metadata_area *mda2) { if (!mda1->ops->mda_locns_match || !mda2->ops->mda_locns_match || mda1->ops->mda_locns_match != mda2->ops->mda_locns_match) return 0; return mda1->ops->mda_locns_match(mda1, mda2); } struct device *mda_get_device(struct metadata_area *mda) { if (!mda->ops->mda_get_device) return NULL; return mda->ops->mda_get_device(mda); } unsigned mda_is_ignored(struct metadata_area *mda) { return (mda->status & MDA_IGNORED); } void mda_set_ignored(struct metadata_area *mda, unsigned mda_ignored) { void *locn = mda->metadata_locn; unsigned old_mda_ignored = mda_is_ignored(mda); if (mda_ignored && !old_mda_ignored) mda->status |= MDA_IGNORED; else if (!mda_ignored && old_mda_ignored) mda->status &= ~MDA_IGNORED; else return; /* No change */ log_debug("%s ignored flag for mda %s at offset %" PRIu64 ".", mda_ignored ? "Setting" : "Clearing", mda->ops->mda_metadata_locn_name ? mda->ops->mda_metadata_locn_name(locn) : "", mda->ops->mda_metadata_locn_offset ? mda->ops->mda_metadata_locn_offset(locn) : UINT64_C(0)); } int mdas_empty_or_ignored(struct dm_list *mdas) { struct metadata_area *mda; if (!dm_list_size(mdas)) return 1; dm_list_iterate_items(mda, mdas) { if (mda_is_ignored(mda)) return 1; } return 0; } int pv_change_metadataignore(struct physical_volume *pv, uint32_t mda_ignored) { const char *pv_name = pv_dev_name(pv); if (mda_ignored && !pv_mda_used_count(pv)) { log_error("Metadata areas on physical volume \"%s\" already " "ignored.", pv_name); return 0; } if (!mda_ignored && (pv_mda_used_count(pv) == pv_mda_count(pv))) { log_error("Metadata areas on physical volume \"%s\" already " "marked as in-use.", pv_name); return 0; } if (!pv_mda_count(pv)) { log_error("Physical volume \"%s\" has no metadata " "areas.", pv_name); return 0; } log_verbose("Marking metadata areas on physical volume \"%s\" " "as %s.", pv_name, mda_ignored ? "ignored" : "in-use"); if (!pv_mda_set_ignored(pv, mda_ignored)) return_0; /* * Update vg_mda_copies based on the mdas in this PV. * This is most likely what the user would expect - if they * specify a specific PV to be ignored/un-ignored, they will * most likely not want LVM to turn around and change the * ignore / un-ignore value when it writes the VG to disk. * This does not guarantee this PV's ignore bits will be * preserved in future operations. */ if (!is_orphan(pv) && vg_mda_copies(pv->vg) != VGMETADATACOPIES_UNMANAGED) { log_warn("WARNING: Changing preferred number of copies of VG %s " "metadata from %"PRIu32" to %"PRIu32, pv_vg_name(pv), vg_mda_copies(pv->vg), vg_mda_used_count(pv->vg)); vg_set_mda_copies(pv->vg, vg_mda_used_count(pv->vg)); } return 1; } char *tags_format_and_copy(struct dm_pool *mem, const struct dm_list *tags) { struct str_list *sl; if (!dm_pool_begin_object(mem, 256)) { log_error("dm_pool_begin_object failed"); return NULL; } dm_list_iterate_items(sl, tags) { if (!dm_pool_grow_object(mem, sl->str, strlen(sl->str)) || (sl->list.n != tags && !dm_pool_grow_object(mem, ",", 1))) { log_error("dm_pool_grow_object failed"); return NULL; } } if (!dm_pool_grow_object(mem, "\0", 1)) { log_error("dm_pool_grow_object failed"); return NULL; } return dm_pool_end_object(mem); } /** * pv_by_path - Given a device path return a PV handle if it is a PV * @cmd - handle to the LVM command instance * @pv_name - device path to read for the PV * * Returns: * NULL - device path does not contain a valid PV * non-NULL - PV handle corresponding to device path * * FIXME: merge with find_pv_by_name ? */ struct physical_volume *pv_by_path(struct cmd_context *cmd, const char *pv_name) { return _pv_read(cmd, cmd->mem, pv_name, NULL, 1, 0); }