1
0
mirror of git://sourceware.org/git/lvm2.git synced 2025-01-10 05:18:36 +03:00
lvm2/lib/metadata/metadata.c

6407 lines
162 KiB
C
Raw Normal View History

2001-09-25 16:49:28 +04:00
/*
2008-01-30 17:00:02 +03:00
* Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
* Copyright (C) 2004-2012 Red Hat, Inc. All rights reserved.
2001-09-25 16:49:28 +04:00
*
2004-03-30 23:35:44 +04:00
* This file is part of LVM2.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU Lesser General Public License v.2.1.
2004-03-30 23:35:44 +04:00
*
* You should have received a copy of the GNU Lesser General Public License
2004-03-30 23:35:44 +04:00
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2001-09-25 16:49:28 +04:00
*/
2002-11-18 17:04:08 +03:00
#include "lib.h"
#include "device.h"
2001-10-01 19:14:39 +04:00
#include "metadata.h"
#include "toolcontext.h"
#include "lvm-string.h"
#include "lvm-file.h"
#include "lvm-signal.h"
#include "lvmcache.h"
#include "lvmetad.h"
#include "memlock.h"
2005-04-20 00:52:35 +04:00
#include "str_list.h"
#include "pv_alloc.h"
#include "segtype.h"
#include "activate.h"
#include "display.h"
#include "locking.h"
#include "archiver.h"
#include "defaults.h"
2015-03-05 23:00:44 +03:00
#include "lvmlockd.h"
#include "time.h"
#include "lvmnotify.h"
2001-09-25 16:49:28 +04:00
#include <math.h>
2006-08-17 23:53:36 +04:00
#include <sys/param.h>
2008-01-30 17:00:02 +03:00
static struct physical_volume *_pv_read(struct cmd_context *cmd,
struct dm_pool *pvmem,
const char *pv_name,
struct format_instance *fid,
uint32_t warn_flags, int scan_label_only);
static uint32_t _vg_bad_status_bits(const struct volume_group *vg,
uint64_t status);
Change vg_create() to take only minimal parameters and obtain a lock. vg_t *vg_create(struct cmd_context *cmd, const char *vg_name); This is the first step towards the API called to create a VG. Call vg_lock_newname() inside this function. Use _vg_make_handle() where possible. Now we have 2 ways to construct a volume group: 1) vg_read: Used when constructing an existing VG from disks 2) vg_create: Used when constructing a new VG Both of these interfaces obtain a lock, and return a vg_t *. The usage of _vg_make_handle() inside vg_create() doesn't fit perfectly but it's ok for now. Needs some cleanup though and I've noted "FIXME" in the code. Add the new vg_create() plus vg 'set' functions for non-default VG parameters in the following tools: - vgcreate: Fairly straightforward refactoring. We just moved vg_lock_newname inside vg_create so we check the return via vg_read_error. - vgsplit: The refactoring here is a bit more tricky. Originally we called vg_lock_newname and depending on the error code, we either read the existing vg or created the new one. Now vg_create() calls vg_lock_newname, so we first try to create the VG. If this fails with FAILED_EXIST, we can then do the vg_read. If the create succeeds, we check the input parameters and set any new values on the VG. TODO in future patches: 1. The VG_ORPHAN lock needs some thought. We may want to treat this as any other VG, and require the application to obtain a handle and pass it to other API calls (for example, vg_extend). Or, we may find that hiding the VG_ORPHAN lock inside other APIs is the way to go. I thought of placing the VG_ORPHAN lock inside vg_create() and tying it to the vg handle, but was not certain this was the right approach. 2. Cleanup error paths. Integrate vg_read_error() with vg_create and vg_read* error codes and/or the new error APIs. Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
2009-07-09 14:09:33 +04:00
/*
 * Decide whether a detected alignment should replace the default one.
 *
 * Returns 1 when data_alignment is non-zero and default_pe_align is not
 * an exact multiple of it; returns 0 otherwise.
 */
static int _alignment_overrides_default(unsigned long data_alignment,
					unsigned long default_pe_align)
{
	/* A zero alignment means "nothing detected" and never overrides. */
	if (!data_alignment)
		return 0;

	return (default_pe_align % data_alignment) != 0;
}
unsigned long set_pe_align(struct physical_volume *pv, unsigned long data_alignment)
2006-08-17 23:30:59 +04:00
{
unsigned long default_pe_align, temp_pe_align;
2008-09-19 09:19:09 +04:00
if (pv->pe_align)
goto out;
if (data_alignment) {
/* Always use specified data_alignment */
pv->pe_align = data_alignment;
goto out;
}
default_pe_align = find_config_tree_int(pv->fmt->cmd, devices_default_data_alignment_CFG, NULL);
if (default_pe_align)
/* align on 1 MiB multiple */
default_pe_align *= DEFAULT_PE_ALIGN;
2009-10-06 20:00:38 +04:00
else
/* align on 64 KiB multiple (old default) */
default_pe_align = DEFAULT_PE_ALIGN_OLD;
pv->pe_align = MAX((default_pe_align << SECTOR_SHIFT),
lvm_getpagesize()) >> SECTOR_SHIFT;
2008-09-19 09:19:09 +04:00
if (!pv->dev)
goto out;
/*
* Align to stripe-width of underlying md device if present
*/
if (find_config_tree_bool(pv->fmt->cmd, devices_md_chunk_alignment_CFG, NULL)) {
temp_pe_align = dev_md_stripe_width(pv->fmt->cmd->dev_types, pv->dev);
if (_alignment_overrides_default(temp_pe_align, default_pe_align))
pv->pe_align = temp_pe_align;
}
2008-09-19 09:19:09 +04:00
/*
* Align to topology's minimum_io_size or optimal_io_size if present
* - minimum_io_size - the smallest request the device can perform
* w/o incurring a read-modify-write penalty (e.g. MD's chunk size)
* - optimal_io_size - the device's preferred unit of receiving I/O
* (e.g. MD's stripe width)
*/
if (find_config_tree_bool(pv->fmt->cmd, devices_data_alignment_detection_CFG, NULL)) {
temp_pe_align = dev_minimum_io_size(pv->fmt->cmd->dev_types, pv->dev);
if (_alignment_overrides_default(temp_pe_align, default_pe_align))
pv->pe_align = temp_pe_align;
temp_pe_align = dev_optimal_io_size(pv->fmt->cmd->dev_types, pv->dev);
if (_alignment_overrides_default(temp_pe_align, default_pe_align))
pv->pe_align = temp_pe_align;
}
out:
log_very_verbose("%s: Setting PE alignment to %lu sectors.",
dev_name(pv->dev), pv->pe_align);
return pv->pe_align;
2006-08-17 23:30:59 +04:00
}
/*
 * Establish pv->pe_align_offset (in sectors) and return it.
 *
 * An offset already stored in the PV is kept, then an explicit non-zero
 * data_alignment_offset is preferred; otherwise the offset reported for
 * the device is used when detection is enabled in the configuration.
 */
unsigned long set_pe_align_offset(struct physical_volume *pv,
				  unsigned long data_alignment_offset)
{
	int detected_offset;

	if (pv->pe_align_offset)
		goto out;

	if (data_alignment_offset) {
		/* A caller-specified offset always wins. */
		pv->pe_align_offset = data_alignment_offset;
		goto out;
	}

	if (!pv->dev)
		goto out;

	if (!find_config_tree_bool(pv->fmt->cmd, devices_data_alignment_offset_detection_CFG, NULL))
		goto out;

	detected_offset = dev_alignment_offset(pv->fmt->cmd->dev_types, pv->dev);

	/* A negative alignment_offset means the device is misaligned; use 0. */
	if (detected_offset < 0)
		detected_offset = 0;

	pv->pe_align_offset = MAX(pv->pe_align_offset, detected_offset);

out:
	log_very_verbose("%s: Setting PE alignment offset to %lu sectors.",
			 dev_name(pv->dev), pv->pe_align_offset);

	return pv->pe_align_offset;
}
void add_pvl_to_vgs(struct volume_group *vg, struct pv_list *pvl)
{
dm_list_add(&vg->pvs, &pvl->list);
vg->pv_count++;
pvl->pv->vg = vg;
pv_set_fid(pvl->pv, vg->fid);
}
void del_pvl_from_vgs(struct volume_group *vg, struct pv_list *pvl)
{
struct lvmcache_info *info;
vg->pv_count--;
dm_list_del(&pvl->list);
pvl->pv->vg = vg->fid->fmt->orphan_vg; /* orphan */
if ((info = lvmcache_info_from_pvid((const char *) &pvl->pv->id, pvl->pv->dev, 0)))
lvmcache_fid_add_mdas(info, vg->fid->fmt->orphan_vg->fid,
(const char *) &pvl->pv->id, ID_LEN);
pv_set_fid(pvl->pv, vg->fid->fmt->orphan_vg->fid);
}
/**
* add_pv_to_vg - Add a physical volume to a volume group
* @vg - volume group to add to
* @pv_name - name of the pv (to be removed)
* @pv - physical volume to add to volume group
*
* Returns:
* 0 - failure
* 1 - success
* FIXME: remove pv_name - obtain safely from pv
*/
static int add_pv_to_vg(struct volume_group *vg, const char *pv_name,
struct physical_volume *pv, int new_pv)
2001-10-12 18:25:53 +04:00
{
2001-10-15 22:39:40 +04:00
struct pv_list *pvl;
struct format_instance *fid = vg->fid;
struct dm_pool *mem = vg->vgmem;
char uuid[64] __attribute__((aligned(8)));
int used;
2001-10-12 18:25:53 +04:00
2001-10-15 22:39:40 +04:00
log_verbose("Adding physical volume '%s' to volume group '%s'",
pv_name, vg->name);
2001-10-15 22:39:40 +04:00
if (!(pvl = dm_pool_zalloc(mem, sizeof(*pvl)))) {
2001-10-15 22:39:40 +04:00
log_error("pv_list allocation for '%s' failed", pv_name);
2001-10-12 18:25:53 +04:00
return 0;
}
if (!is_orphan_vg(pv->vg_name)) {
2001-10-15 22:39:40 +04:00
log_error("Physical volume '%s' is already in volume group "
"'%s'", pv_name, pv->vg_name);
return 0;
} else if (!new_pv) {
if ((used = is_used_pv(pv)) < 0)
return_0;
if (used) {
log_error("PV %s is used by a VG but its metadata is missing.", pv_name);
return 0;
}
}
2001-10-12 18:25:53 +04:00
2002-11-18 17:04:08 +03:00
if (pv->fmt != fid->fmt) {
log_error("Physical volume %s is of different format type (%s)",
pv_name, pv->fmt->name);
return 0;
}
/* Ensure PV doesn't depend on another PV already in the VG */
if (pv_uses_vg(pv, vg)) {
log_error("Physical volume %s might be constructed from same "
"volume group %s", pv_name, vg->name);
return 0;
}
if (!(pv->vg_name = dm_pool_strdup(mem, vg->name))) {
2001-10-15 22:39:40 +04:00
log_error("vg->name allocation failed for '%s'", pv_name);
2001-10-12 18:25:53 +04:00
return 0;
}
memcpy(&pv->vgid, &vg->id, sizeof(vg->id));
2001-10-16 00:29:15 +04:00
/* Units of 512-byte sectors */
2001-10-12 18:25:53 +04:00
pv->pe_size = vg->extent_size;
/*
* pe_count must always be calculated by pv_setup
2001-10-12 18:25:53 +04:00
*/
pv->pe_alloc_count = 0;
2001-10-12 18:25:53 +04:00
/* LVM1 stores this outside a VG; LVM2 only stores it inside */
/* FIXME Default from config file? vgextend cmdline flag? */
pv->status |= ALLOCATABLE_PV;
if (!fid->fmt->ops->pv_setup(fid->fmt, pv, vg)) {
2002-01-28 00:30:47 +03:00
log_error("Format-specific setup of physical volume '%s' "
2001-10-15 22:39:40 +04:00
"failed.", pv_name);
return 0;
}
if (find_pv_in_vg(vg, pv_name) ||
find_pv_in_vg_by_uuid(vg, &pv->id)) {
if (!id_write_format(&pv->id, uuid, sizeof(uuid))) {
stack;
uuid[0] = '\0';
}
log_error("Physical volume '%s (%s)' already in the VG.",
pv_name, uuid);
2001-10-12 18:25:53 +04:00
return 0;
}
if (vg->pv_count && (vg->pv_count == vg->max_pv)) {
2001-10-15 22:39:40 +04:00
log_error("No space for '%s' - volume group '%s' "
"holds max %d physical volume(s).", pv_name,
vg->name, vg->max_pv);
return 0;
}
2008-01-30 16:19:47 +03:00
if (!alloc_pv_segment_whole_pv(mem, pv))
return_0;
2001-10-15 22:39:40 +04:00
if ((uint64_t) vg->extent_count + pv->pe_count > MAX_EXTENT_COUNT) {
log_error("Unable to add %s to %s: new extent count (%"
PRIu64 ") exceeds limit (%" PRIu32 ").",
pv_name, vg->name,
(uint64_t) vg->extent_count + pv->pe_count,
MAX_EXTENT_COUNT);
return 0;
}
pvl->pv = pv;
add_pvl_to_vgs(vg, pvl);
2001-11-06 22:02:26 +03:00
vg->extent_count += pv->pe_count;
vg->free_count += pv->pe_count;
2001-10-12 18:25:53 +04:00
dm_list_iterate_items(pvl, &fid->fmt->orphan_vg->pvs)
if (pv == pvl->pv) { /* unlink from orphan */
dm_list_del(&pvl->list);
break;
}
2001-10-12 18:25:53 +04:00
return 1;
}
static int _copy_pv(struct dm_pool *pvmem,
struct physical_volume *pv_to,
2005-04-20 00:44:21 +04:00
struct physical_volume *pv_from)
2005-04-18 03:59:04 +04:00
{
memcpy(pv_to, pv_from, sizeof(*pv_to));
2005-04-20 00:52:35 +04:00
/* We must use pv_set_fid here to update the reference counter! */
pv_to->fid = NULL;
pv_set_fid(pv_to, pv_from->fid);
if (!(pv_to->vg_name = dm_pool_strdup(pvmem, pv_from->vg_name)))
return_0;
if (!str_list_dup(pvmem, &pv_to->tags, &pv_from->tags))
return_0;
2005-04-20 00:52:35 +04:00
if (!peg_dup(pvmem, &pv_to->segments, &pv_from->segments))
2008-01-30 16:19:47 +03:00
return_0;
2005-04-20 00:52:35 +04:00
return 1;
2005-04-18 03:59:04 +04:00
}
/*
 * Allocate a new pv_list entry from pvmem holding a copy of
 * pvl_from->pv.  Returns the new entry, or NULL on failure (in which
 * case the partial allocation is handed back to the pool).
 */
static struct pv_list *_copy_pvl(struct dm_pool *pvmem, struct pv_list *pvl_from)
{
	struct pv_list *copy;

	copy = dm_pool_zalloc(pvmem, sizeof(*copy));
	if (!copy)
		return_NULL;

	copy->pv = dm_pool_alloc(pvmem, sizeof(*copy->pv));
	if (!copy->pv)
		goto_bad;

	if (!_copy_pv(pvmem, copy->pv, pvl_from->pv))
		goto_bad;

	return copy;

bad:
	dm_pool_free(pvmem, copy);
	return NULL;
}
2005-04-18 03:57:44 +04:00
int get_pv_from_vg_by_id(const struct format_type *fmt, const char *vg_name,
const char *vgid, const char *pvid,
struct physical_volume *pv)
2005-04-18 03:57:44 +04:00
{
struct volume_group *vg;
struct pv_list *pvl;
uint32_t warn_flags = WARN_PV_READ | WARN_INCONSISTENT;
int r = 0, consistent = 0;
2005-04-18 03:57:44 +04:00
if (!(vg = vg_read_internal(fmt->cmd, vg_name, vgid, warn_flags, &consistent))) {
log_error("get_pv_from_vg_by_id: vg_read_internal failed to read VG %s",
2005-04-18 03:57:44 +04:00
vg_name);
return 0;
}
dm_list_iterate_items(pvl, &vg->pvs) {
if (id_equal(&pvl->pv->id, (const struct id *) pvid)) {
if (!_copy_pv(fmt->cmd->mem, pv, pvl->pv)) {
log_error("internal PV duplication failed");
r = 0;
goto out;
}
r = 1;
goto out;
2005-04-18 03:57:44 +04:00
}
}
out:
release_vg(vg);
return r;
2005-04-18 03:57:44 +04:00
}
/*
 * Move the PV called pv_name from vg_from to vg_to and adjust both VGs'
 * extent and free counters.
 *
 * When enforce_pv_from_source is zero, a PV that is missing from
 * vg_from but already present in vg_to counts as success.
 *
 * Returns 1 on success, 0 on failure.
 */
static int _move_pv(struct volume_group *vg_from, struct volume_group *vg_to,
		    const char *pv_name, int enforce_pv_from_source)
{
	struct pv_list *pvl;
	struct physical_volume *pv;
	uint32_t total_pe, free_pe;

	/* FIXME: handle tags */
	pvl = find_pv_in_vg(vg_from, pv_name);
	if (!pvl) {
		/*
		 * PV has already been moved.  This can happen if an
		 * LV is being moved that has multiple sub-LVs on the
		 * same PV.
		 */
		if (!enforce_pv_from_source &&
		    find_pv_in_vg(vg_to, pv_name))
			return 1;

		log_error("Physical volume %s not in volume group %s",
			  pv_name, vg_from->name);
		return 0;
	}

	if (_vg_bad_status_bits(vg_from, RESIZEABLE_VG) ||
	    _vg_bad_status_bits(vg_to, RESIZEABLE_VG))
		return 0;

	del_pvl_from_vgs(vg_from, pvl);
	add_pvl_to_vgs(vg_to, pvl);

	pv = pvl->pv;
	total_pe = pv_pe_count(pv);
	free_pe = total_pe - pv_pe_alloc_count(pv);

	vg_from->extent_count -= total_pe;
	vg_to->extent_count += total_pe;

	vg_from->free_count -= free_pe;
	vg_to->free_count += free_pe;

	return 1;
}
/*
 * Move PV pv_name from vg_from to vg_to.  The PV must be found in
 * vg_from.  Returns 1 on success, 0 on failure.
 */
int move_pv(struct volume_group *vg_from, struct volume_group *vg_to,
	    const char *pv_name)
{
	const int enforce_pv_from_source = 1;

	return _move_pv(vg_from, vg_to, pv_name, enforce_pv_from_source);
}
/*
 * Move every PV used by LV lv_name from vg_from to vg_to.
 *
 * Walks the LV's segments, recursing into each segment's log LV and
 * into any sub-LV areas, and moves the PV behind every PV area.
 *
 * Returns 1 on success, 0 on failure.
 */
int move_pvs_used_by_lv(struct volume_group *vg_from,
			struct volume_group *vg_to,
			const char *lv_name)
{
	struct lv_list *lvl;
	struct lv_segment *lvseg;
	struct logical_volume *sub_lv;
	unsigned area;

	/* FIXME: handle tags */
	lvl = find_lv_in_vg(vg_from, lv_name);
	if (!lvl) {
		log_error("Logical volume %s not in volume group %s",
			  lv_name, vg_from->name);
		return 0;
	}

	if (_vg_bad_status_bits(vg_from, RESIZEABLE_VG) ||
	    _vg_bad_status_bits(vg_to, RESIZEABLE_VG))
		return 0;

	dm_list_iterate_items(lvseg, &lvl->lv->segments) {
		if (lvseg->log_lv &&
		    !move_pvs_used_by_lv(vg_from, vg_to, lvseg->log_lv->name))
			return_0;

		for (area = 0; area < lvseg->area_count; area++) {
			if (seg_type(lvseg, area) == AREA_PV) {
				/* The PV may already have been moved for another area. */
				if (!_move_pv(vg_from, vg_to,
					      pv_dev_name(seg_pv(lvseg, area)), 0))
					return_0;
				continue;
			}

			if (seg_type(lvseg, area) != AREA_LV)
				continue;

			sub_lv = seg_lv(lvseg, area);
			if (!move_pvs_used_by_lv(vg_from, vg_to, sub_lv->name))
				return_0;
		}
	}

	return 1;
}
/*
 * Check that vg_name is acceptable as the name of a new VG.
 *
 * The name must pass validate_name_detailed() and no filesystem entry
 * may exist at <cmd->dev_dir><vg_name>.
 *
 * Returns 1 when the name may be used, 0 otherwise (an error is logged).
 *
 * Note: the path is built in a static buffer, so this function is not
 * reentrant.
 */
int validate_new_vg_name(struct cmd_context *cmd, const char *vg_name)
{
	static char vg_path[PATH_MAX];
	name_error_t name_error;
	int len;

	name_error = validate_name_detailed(vg_name);
	if (NAME_VALID != name_error) {
		display_name_error(name_error);
		log_error("New volume group name \"%s\" is invalid.", vg_name);
		return 0;
	}

	/*
	 * A truncated path would make path_exists() test the wrong
	 * location, so treat truncation (or an output error) as failure.
	 */
	len = snprintf(vg_path, sizeof(vg_path), "%s%s", cmd->dev_dir, vg_name);
	if (len < 0 || (size_t) len >= sizeof(vg_path)) {
		log_error("Path for new volume group \"%s\" is too long.", vg_name);
		return 0;
	}

	if (path_exists(vg_path)) {
		log_error("%s: already exists in filesystem", vg_path);
		return 0;
	}

	return 1;
}
/*
 * Validate the parameters of a VG rename.
 *
 * The new name must leave room, within NAME_LEN, for cmd->dev_dir plus
 * two further bytes; it must be a valid new VG name; and it must differ
 * from the old name.
 *
 * Returns 1 when the rename may proceed, 0 otherwise (an error is logged).
 */
int validate_vg_rename_params(struct cmd_context *cmd,
			      const char *vg_name_old,
			      const char *vg_name_new)
{
	const size_t dir_len = strlen(cmd->dev_dir);
	size_t max_name_len;

	/*
	 * Compute the remaining room without letting the unsigned
	 * subtraction wrap: if dev_dir alone uses up the budget, no name
	 * can fit.
	 */
	if (dir_len + 2 < (size_t) NAME_LEN)
		max_name_len = (size_t) NAME_LEN - dir_len - 2;
	else
		max_name_len = 0;

	/* Check sanity of new name */
	if (strlen(vg_name_new) > max_name_len) {
		log_error("New volume group path exceeds maximum length "
			  "of %d!", (int) max_name_len);
		return 0;
	}

	if (!validate_new_vg_name(cmd, vg_name_new))
		return_0;

	if (!strcmp(vg_name_old, vg_name_new)) {
		log_error("Old and new volume group names must differ");
		return 0;
	}

	return 1;
}
/*
 * Rename a VG in memory.
 *
 * The previous name is remembered in vg->old_name, vg->name becomes a
 * copy of new_name, and every PV in the VG gets its own copy of the new
 * name.  All copies are allocated from vg->vgmem.
 *
 * Returns 1 on success, 0 if an allocation fails.
 */
int vg_rename(struct cmd_context *cmd, struct volume_group *vg,
	      const char *new_name)
{
	struct dm_pool *mem = vg->vgmem;
	struct pv_list *pvl;
	struct physical_volume *pv;

	vg->old_name = vg->name;

	vg->name = dm_pool_strdup(mem, new_name);
	if (!vg->name) {
		log_error("vg->name allocation failed for '%s'", new_name);
		return 0;
	}

	dm_list_iterate_items(pvl, &vg->pvs) {
		pv = pvl->pv;
		pv->vg_name = dm_pool_strdup(mem, new_name);
		if (!pv->vg_name) {
			log_error("pv->vg_name allocation failed for '%s'",
				  pv_dev_name(pv));
			return 0;
		}
	}

	return 1;
}
/*
 * Check whether a VG may be removed and archive its metadata.
 *
 * Fails when the VG handle carries a read error, PVs are missing, the
 * EXPORTED_VG status check fails, visible LVs remain, or archiving
 * fails.
 *
 * Returns 1 when removal may proceed, 0 otherwise.
 */
int vg_remove_check(struct volume_group *vg)
{
	unsigned visible_lvs;

	if (vg_read_error(vg) || vg_missing_pv_count(vg)) {
		log_error("Volume group \"%s\" not found, is inconsistent "
			  "or has PVs missing.", vg ? vg->name : "");
		log_error("Consider vgreduce --removemissing if metadata "
			  "is inconsistent.");
		return 0;
	}

	if (!vg_check_status(vg, EXPORTED_VG))
		return 0;

	visible_lvs = vg_visible_lvs(vg);
	if (visible_lvs) {
		log_error("Volume group \"%s\" still contains %u "
			  "logical volume(s)", vg->name, visible_lvs);
		return 0;
	}

	return archive(vg) ? 1 : 0;
}
/*
 * Detach every PV from the VG and queue it on vg->removed_pvs.
 */
void vg_remove_pvs(struct volume_group *vg)
{
	struct pv_list *cur, *next;

	/* The safe iterator is needed: each entry is unlinked as we go. */
	dm_list_iterate_items_safe(cur, next, &vg->pvs) {
		del_pvl_from_vgs(vg, cur);
		dm_list_add(&vg->removed_pvs, &cur->list);
	}
}
2015-03-05 23:00:44 +03:00
/*
 * Remove a VG: drop its metadata areas, rewrite every PV on
 * vg->removed_pvs as an orphan, and update lvmetad, lvmlockd, the
 * notification flag and the backup.
 *
 * Returns 1 if everything succeeded, 0 otherwise.  Failures on
 * individual PVs are logged and do not stop the remaining PVs from
 * being processed.
 */
int vg_remove_direct(struct volume_group *vg)
{
	struct pv_list *pvl;
	struct physical_volume *pv;
	int ok = 1;

	if (!lvmetad_vg_remove_pending(vg)) {
		log_error("Failed to update lvmetad for pending remove.");
		return 0;
	}

	if (!vg_remove_mdas(vg)) {
		log_error("vg_remove_mdas %s failed", vg->name);
		return 0;
	}

	/* init physical volumes */
	dm_list_iterate_items(pvl, &vg->removed_pvs) {
		pv = pvl->pv;

		if (is_missing_pv(pv))
			continue;

		log_verbose("Removing physical volume \"%s\" from "
			    "volume group \"%s\"", pv_dev_name(pv), vg->name);

		pv->vg_name = vg->fid->fmt->orphan_vg_name;
		pv->status &= ~ALLOCATABLE_PV;

		if (!dev_get_size(pv_dev(pv), &pv->size)) {
			log_error("%s: Couldn't get size.", pv_dev_name(pv));
			ok = 0;
		} else if (!pv_write(vg->cmd, pv, 0)) {
			/* FIXME Write to same sector label was read from */
			log_error("Failed to remove physical volume \"%s\""
				  " from volume group \"%s\"",
				  pv_dev_name(pv), vg->name);
			ok = 0;
		}
	}

	if (!lvmetad_vg_remove_finish(vg))
		stack;

	lockd_vg_update(vg);

	set_vg_notify(vg->cmd);

	if (!backup_remove(vg->cmd, vg->name))
		stack;

	if (!ok)
		log_error("Volume group \"%s\" not properly removed", vg->name);
	else
		log_print_unless_silent("Volume group \"%s\" successfully removed", vg->name);

	return ok;
}
/*
 * Remove a VG while holding the orphan-PV write lock.
 *
 * Returns the result of vg_remove_direct(), or 0 if the lock cannot be
 * taken.
 */
int vg_remove(struct volume_group *vg)
{
	int removed;

	if (!lock_vol(vg->cmd, VG_ORPHANS, LCK_VG_WRITE, NULL)) {
		log_error("Can't get lock for orphan PVs");
		return 0;
	}

	removed = vg_remove_direct(vg);

	unlock_vg(vg->cmd, vg, VG_ORPHANS);

	return removed;
}
/*
 * Check that the VG's extent size is not smaller than the device's
 * physical block size.
 *
 * *max_phys_block_size_found is raised to the device's physical block
 * size when that is larger than the value passed in.
 *
 * Returns 1 if the device is acceptable, 0 otherwise.
 */
int check_dev_block_size_for_vg(struct device *dev, const struct volume_group *vg,
				unsigned int *max_phys_block_size_found)
{
	unsigned int phys_block_size, block_size;

	if (!dev_get_block_size(dev, &phys_block_size, &block_size))
		return_0;

	if (*max_phys_block_size_found < phys_block_size)
		*max_phys_block_size_found = phys_block_size;

	/* extent_size is in sectors, so convert the block size before comparing. */
	if ((phys_block_size >> SECTOR_SHIFT) <= vg->extent_size)
		return 1;

	log_error("Physical extent size used for volume group %s "
		  "is less than physical block size that %s uses.",
		  vg->name, dev_name(dev));

	return 0;
}
int vg_check_pv_dev_block_sizes(const struct volume_group *vg)
{
struct pv_list *pvl;
unsigned int max_phys_block_size_found = 0;
dm_list_iterate_items(pvl, &vg->pvs) {
if (!check_dev_block_size_for_vg(pvl->pv->dev, vg, &max_phys_block_size_found))
return 0;
}
return 1;
}
/*
 * Warn about every PV whose device is smaller than the PV size.
 *
 * Does nothing (and returns 1) when the check is disabled in the
 * command context or the VG is an orphan VG.
 *
 * Returns 1 if no mismatch was found, 0 otherwise.
 */
static int _check_pv_dev_sizes(struct volume_group *vg)
{
	struct pv_list *pvl;
	uint64_t actual_sectors, recorded_sectors;
	int all_fit = 1;

	if (!vg->cmd->check_pv_dev_sizes ||
	    is_orphan_vg(vg->name))
		return 1;

	dm_list_iterate_items(pvl, &vg->pvs) {
		if (is_missing_pv(pvl->pv))
			continue;

		/*
		 * Don't compare the sizes if we're not able
		 * to determine the real dev_size. This may
		 * happen if the device has gone since we did
		 * VG read.
		 */
		if (!dev_get_size(pvl->pv->dev, &actual_sectors))
			continue;

		recorded_sectors = pv_size(pvl->pv);
		if (actual_sectors >= recorded_sectors)
			continue;

		log_warn("WARNING: Device %s has size of %" PRIu64 " sectors which "
			 "is smaller than corresponding PV size of %" PRIu64
			 " sectors. Was device resized?",
			 pv_dev_name(pvl->pv), actual_sectors, recorded_sectors);
		all_fit = 0;
	}

	return all_fit;
}
/*
* Extend a VG by a single PV / device path
*
* Parameters:
* - vg: handle of volume group to extend by 'pv_name'
* - pv_name: device path of PV to add to VG
* - pp: parameters to pass to implicit pvcreate; if NULL, do not pvcreate
* - max_phys_block_size: largest physical block size found amongst PVs in a VG
*
*/
static int vg_extend_single_pv(struct volume_group *vg, char *pv_name,
struct pvcreate_params *pp,
unsigned int *max_phys_block_size)
{
struct physical_volume *pv;
struct pv_to_write *pvw;
int new_pv = 0;
metadata: remove spurious "Physical volume <dev_name> not found" This is addendum to commit 2e82a070f3c9224da5c9f383d47e75a1715586cf which fixed these spurious messages that appeared after commit 651d5093edde3e0ebee9d75be1c9834efc152d91 ("avoid pv_read in find_pv_by_name"). There was one more "not found" message issued in case the device could not be found in device cache (commit 2e82a07 fixed this only for PV lookup itself). But if we "allow_unformatted" for find_pv_by_name, we should not issue this message even in case the device can't be found in dev cache as we just need to know whether there's a PV or not for the code to decide on next steps and we don't want to issue any messages if either device itself is not found or PV is not found. For example, when we were creating a new PV (and so allow_unformatted = 1) and the device had a signature on it which caused it to be filtered by device filter (e.g. MD signature if md filtering is enabled), or it was part of some other subsystem (e.g. multipath), this message was issued on find_pv_by_name call which was misleading. Also, remove misleading "stack" call in case find_pv_by_name returns NULL in pvcreate_check - any error state is reported later by pvcreate_check code so no need to "stack" here. There's one more and proper check to issue "not found" message if the device can't be found in device cache within pvcreate_check fn so this situation is still covered properly later in the code. Before this patch (/dev/sda contains MD signature and is therefore filtered): $ pvcreate /dev/sda Physical volume /dev/sda not found WARNING: linux_raid_member signature detected on /dev/sda at offset 4096. Wipe it? [y/n]: With this patch applied: $ pvcreate /dev/sda WARNING: linux_raid_member signature detected on /dev/sda at offset 4096. Wipe it? [y/n]: Non-existent devices are still caught properly: $ pvcreate /dev/sdx Device /dev/sdx not found (or ignored by filtering).
2014-07-31 11:30:25 +04:00
pv = find_pv_by_name(vg->cmd, pv_name, 1, 1);
if (!pv && !pp) {
log_error("%s not identified as an existing "
"physical volume", pv_name);
return 0;
} else if (!pv && pp) {
if (!(pv = pvcreate_vol(vg->cmd, pv_name, pp, 0)))
return_0;
new_pv = 1;
}
if (!(check_dev_block_size_for_vg(pv->dev, (const struct volume_group *) vg,
max_phys_block_size)))
goto_bad;
if (!add_pv_to_vg(vg, pv_name, pv, new_pv))
goto_bad;
if ((pv->fmt->features & FMT_PV_FLAGS) ||
(pv->status & UNLABELLED_PV)) {
if (!(pvw = dm_pool_zalloc(vg->vgmem, sizeof(*pvw)))) {
log_error("pv_to_write allocation for '%s' failed", pv_name);
return 0;
}
pvw->pv = pv;
pvw->pp = new_pv ? pp : NULL;
pvw->new_pv = new_pv;
dm_list_add(&vg->pvs_to_write, &pvw->list);
}
return 1;
bad:
free_pv_fid(pv);
return 0;
}
/*
* FIXME: commands shifting to common code in toollib have left a large
* amount of code only used by liblvm. Either remove this by shifting
* liblvm to use toollib, or isolate all this code into a liblvm-specific
* source file. All the following and more are only used by liblvm:
*
* . vg_extend()
* . vg_extend_single_pv()
* . pvcreate_vol()
* . _pvcreate_check()
* . _pvcreate_write()
* . pvremove_many()
* . pvremove_single()
* . find_pv_by_name()
* . get_pvs()
* . the vg->pvs_to_write list and pv_to_write struct
2016-05-25 23:04:30 +03:00
* . vg_reduce()
*/
/*
* Extend a VG by a single PV / device path
*
* Parameters:
* - vg: handle of volume group to extend by 'pv_name'
* - pv_count: count of device paths of PVs
* - pv_names: device paths of PVs to add to VG
* - pp: parameters to pass to implicit pvcreate; if NULL, do not pvcreate
*
*/
int vg_extend(struct volume_group *vg, int pv_count, const char *const *pv_names,
struct pvcreate_params *pp)
2001-10-16 02:04:27 +04:00
{
int i;
char *pv_name;
unsigned int max_phys_block_size = 0;
if (_vg_bad_status_bits(vg, RESIZEABLE_VG))
return_0;
2001-10-16 02:04:27 +04:00
/* attach each pv */
for (i = 0; i < pv_count; i++) {
if (!(pv_name = dm_strdup(pv_names[i]))) {
log_error("Failed to duplicate pv name %s.", pv_names[i]);
return 0;
}
dm_unescape_colons_and_at_signs(pv_name, NULL, NULL);
if (!vg_extend_single_pv(vg, pv_name, pp, &max_phys_block_size)) {
log_error("Unable to add physical volume '%s' to "
"volume group '%s'.", pv_name, vg->name);
dm_free(pv_name);
return 0;
}
dm_free(pv_name);
}
2001-10-16 02:04:27 +04:00
(void) _check_pv_dev_sizes(vg);
2002-11-18 17:04:08 +03:00
/* FIXME Decide whether to initialise and add new mdahs to format instance */
2001-10-16 02:04:27 +04:00
return 1;
}
/*
 * Add every PV on pp->pvs to the VG, then move the whole list onto
 * vg->pv_write_list.
 *
 * Returns 1 on success, 0 at the first PV that fails the block-size
 * check or cannot be added.
 */
int vg_extend_each_pv(struct volume_group *vg, struct pvcreate_params *pp)
{
	unsigned int max_phys_block_size = 0;
	struct pv_list *pvl;
	struct physical_volume *pv;

	log_debug_metadata("Adding PVs to VG %s.", vg->name);

	if (_vg_bad_status_bits(vg, RESIZEABLE_VG))
		return_0;

	dm_list_iterate_items(pvl, &pp->pvs) {
		pv = pvl->pv;

		log_debug_metadata("Adding PV %s to VG %s.", pv_dev_name(pv), vg->name);

		if (!check_dev_block_size_for_vg(pv->dev,
						 (const struct volume_group *) vg,
						 &max_phys_block_size)) {
			log_error("PV %s has wrong block size.", pv_dev_name(pv));
			return 0;
		}

		if (!add_pv_to_vg(vg, pv_dev_name(pv), pv, 0)) {
			log_error("PV %s cannot be added to VG %s.",
				  pv_dev_name(pv), vg->name);
			return 0;
		}
	}

	/* Size mismatches are only warned about here; the result is ignored. */
	(void) _check_pv_dev_sizes(vg);

	dm_list_splice(&vg->pv_write_list, &pp->pvs);

	return 1;
}
/*
 * Remove the PV called pv_name from the VG.  On success its list entry
 * is queued on vg->removed_pvs.
 *
 * Returns 1 on success, 0 on failure.
 */
int vg_reduce(struct volume_group *vg, const char *pv_name)
{
	struct pv_list *pvl;

	pvl = find_pv_in_vg(vg, pv_name);
	if (!pvl) {
		log_error("Physical volume %s not in volume group %s.",
			  pv_name, vg->name);
		return 0;
	}

	if (!vgreduce_single(vg->cmd, vg, pvl->pv, 0)) {
		log_error("Unable to remove physical volume '%s' from "
			  "volume group '%s'.", pv_name, vg->name);
		return 0;
	}

	dm_list_add(&vg->removed_pvs, &pvl->list);

	return 1;
}
int lv_change_tag(struct logical_volume *lv, const char *tag, int add_tag)
{
char *tag_new;
if (!(lv->vg->fid->fmt->features & FMT_TAGS)) {
log_error("Logical volume %s/%s does not support tags",
lv->vg->name, lv->name);
return 0;
}
if (add_tag) {
if (!(tag_new = dm_pool_strdup(lv->vg->vgmem, tag))) {
log_error("Failed to duplicate tag %s from %s/%s",
tag, lv->vg->name, lv->name);
return 0;
}
if (!str_list_add(lv->vg->vgmem, &lv->tags, tag_new)) {
log_error("Failed to add tag %s to %s/%s",
tag, lv->vg->name, lv->name);
return 0;
}
} else
str_list_del(&lv->tags, tag);
return 1;
}
/*
 * Add (add_tag != 0) or remove (add_tag == 0) a tag on a VG.
 *
 * An added tag is stored as a copy allocated from the VG's pool.
 * Returns 1 on success, 0 if the format lacks tag support or an
 * allocation fails.
 */
int vg_change_tag(struct volume_group *vg, const char *tag, int add_tag)
{
	char *tag_copy;

	if (!(vg->fid->fmt->features & FMT_TAGS)) {
		log_error("Volume group %s does not support tags", vg->name);
		return 0;
	}

	if (!add_tag) {
		str_list_del(&vg->tags, tag);
		return 1;
	}

	tag_copy = dm_pool_strdup(vg->vgmem, tag);
	if (!tag_copy) {
		log_error("Failed to duplicate tag %s from %s",
			  tag, vg->name);
		return 0;
	}

	if (!str_list_add(vg->vgmem, &vg->tags, tag_copy)) {
		log_error("Failed to add tag %s to volume group %s",
			  tag, vg->name);
		return 0;
	}

	return 1;
}
2001-11-12 18:10:01 +03:00
/*
 * Skip a leading dev_dir prefix in vg_name.
 *
 * Returns a pointer into vg_name just past the prefix when vg_name
 * starts with dev_dir, otherwise vg_name itself.
 */
const char *strip_dir(const char *vg_name, const char *dev_dir)
{
	const size_t prefix_len = strlen(dev_dir);

	if (strncmp(vg_name, dev_dir, prefix_len) != 0)
		return vg_name;

	return vg_name + prefix_len;
}
/*
 * Validates major and minor numbers.
 *
 * With a "2.4." kernel version string, or a format that has
 * FMT_RESTRICTED_LVIDS, both numbers must lie in 0-255.  Otherwise the
 * minor must lie in 0-1048575, and a major other than -1 or the
 * device-mapper major must lie in 0-4095 (it is then ignored with a
 * message, since the kernel assigns major numbers dynamically).
 *
 * Returns 1 if the numbers are acceptable, 0 otherwise.  All range
 * errors are reported, not just the first one.
 */
int validate_major_minor(const struct cmd_context *cmd,
			 const struct format_type *fmt,
			 int32_t major, int32_t minor)
{
	int valid = 1;

	if (!strncmp(cmd->kernel_vsn, "2.4.", 4) ||
	    (fmt->features & FMT_RESTRICTED_LVIDS)) {
		if (major < 0 || major > 255) {
			log_error("Major number %d outside range 0-255.", major);
			valid = 0;
		}

		if (minor < 0 || minor > 255) {
			log_error("Minor number %d outside range 0-255.", minor);
			valid = 0;
		}

		return valid;
	}

	/* 12 bits for major number */
	if ((major != -1) &&
	    (major != cmd->dev_types->device_mapper_major)) {
		/* User supplied some major number */
		if (major < 0 || major > 4095) {
			log_error("Major number %d outside range 0-4095.", major);
			valid = 0;
		} else {
			log_print_unless_silent("Ignoring supplied major %d number - "
						"kernel assigns major numbers dynamically.",
						major);
		}
	}

	/* 20 bits for minor number */
	if (minor < 0 || minor > 1048575) {
		log_error("Minor number %d outside range 0-1048575.", minor);
		valid = 0;
	}

	return valid;
}
/*
 * Validate parameters to vg_create() before calling.
 *
 * For formats without FMT_UNLIMITED_VOLS, zero max_lv / max_pv values
 * in *vp are replaced by 255 and values above 255 are rejected.
 *
 * Returns 1 if the parameters are acceptable, 0 otherwise.
 *
 * FIXME: Move inside vg_create library function.
 * FIXME: Change vgcreate_params struct to individual gets/sets
 */
int vgcreate_params_validate(struct cmd_context *cmd,
			     struct vgcreate_params *vp)
{
	if (!validate_new_vg_name(cmd, vp->vg_name))
		return_0;

	if (vp->alloc == ALLOC_INHERIT) {
		log_error("Volume Group allocation policy cannot inherit "
			  "from anything");
		return 0;
	}

	if (!vp->extent_size) {
		log_error("Physical extent size may not be zero");
		return 0;
	}

	/* No volume-count limits apply to this format. */
	if (cmd->fmt->features & FMT_UNLIMITED_VOLS)
		return 1;

	if (!vp->max_lv)
		vp->max_lv = 255;

	if (!vp->max_pv)
		vp->max_pv = 255;

	if (vp->max_lv > 255 || vp->max_pv > 255) {
		log_error("Number of volumes may not exceed 255");
		return 0;
	}

	return 1;
}
/*
 * Update content of precommitted VG
 *
 * Any previous vg_precommitted / cft_precommitted is released, then the
 * VG is exported to a config tree and re-imported from it.
 *
 * Returns 1 on success, 0 on failure (both fields are then NULL).
 *
 * TODO: Optimize in the future, since lvmetad needs similar
 *       config tree processing in lvmetad_vg_update().
 */
static int _vg_update_vg_precommitted(struct volume_group *vg)
{
	/* Drop whatever an earlier call left behind. */
	release_vg(vg->vg_precommitted);
	vg->vg_precommitted = NULL;

	if (vg->cft_precommitted) {
		dm_config_destroy(vg->cft_precommitted);
		vg->cft_precommitted = NULL;
	}

	vg->cft_precommitted = export_vg_to_config_tree(vg);
	if (!vg->cft_precommitted)
		return_0;

	vg->vg_precommitted = import_vg_from_config_tree(vg->cft_precommitted, vg->fid);
	if (vg->vg_precommitted)
		return 1;

	/* Import failed: do not keep a config tree without a matching VG. */
	dm_config_destroy(vg->cft_precommitted);
	vg->cft_precommitted = NULL;

	return_0;
}
/*
 * Make sure vg->vg_committed is populated.
 *
 * Nothing is done when the VG pool is locked, when a committed copy
 * already exists or when 'vg' is an orphan VG.  Otherwise a precommitted
 * copy is generated and promoted to vg_committed.
 *
 * Returns 1 on success (or nothing to do), 0 on failure.
 */
static int _vg_update_vg_committed(struct volume_group *vg)
{
	if (dm_pool_locked(vg->vgmem) ||
	    vg->vg_committed ||		/* we already have it */
	    is_orphan_vg(vg->name))
		return 1;

	if (!_vg_update_vg_precommitted(vg))
		return_0;

	/* Promote the freshly built copy. */
	vg->vg_committed = vg->vg_precommitted;
	vg->vg_precommitted = NULL;

	/* The config tree was only needed to build the copy. */
	if (vg->cft_precommitted) {
		dm_config_destroy(vg->cft_precommitted);
		vg->cft_precommitted = NULL;
	}

	return 1;
}
/*
 * Create a (struct volume_group) volume group handle from a struct
 * volume_group pointer and a possible failure code or zero for success.
 *
 * When 'vg' is NULL, or is dropped because it carries vginfo while
 * 'failure' is not SUCCESS, an empty VG is allocated to carry the status.
 *
 * Returns the handle with read_status set, or NULL if allocation fails.
 */
static struct volume_group *_vg_make_handle(struct cmd_context *cmd,
					    struct volume_group *vg,
					    uint32_t failure)
{
	/* Never return a cached VG structure for a failure */
	if ((failure != SUCCESS) && vg && vg->vginfo) {
		release_vg(vg);
		vg = NULL;
	}

	if (!vg) {
		vg = alloc_vg("vg_make_handle", cmd, NULL);
		if (!vg)
			return_NULL;
	}

	/* Store the status only when it actually changes. */
	if (vg->read_status != failure)
		vg->read_status = failure;

	/* A VG with a format instance also gets its committed copy. */
	if (vg->fid && !_vg_update_vg_committed(vg))
		vg->read_status |= FAILED_ALLOCATION;

	return vg;
}
int lv_has_unknown_segments(const struct logical_volume *lv)
{
struct lv_segment *seg;
/* foreach segment */
dm_list_iterate_items(seg, &lv->segments)
if (seg_unknown(seg))
return 1;
return 0;
}
int vg_has_unknown_segments(const struct volume_group *vg)
{
struct lv_list *lvl;
/* foreach LV */
dm_list_iterate_items(lvl, &vg->lvs)
if (lv_has_unknown_segments(lvl->lv))
return 1;
return 0;
}
/*
 * Validate 'vg_name', take the lock for it as a new name and create the VG.
 *
 * Returns NULL for an invalid name.  If locking does not return SUCCESS,
 * a handle carrying the lock result is returned instead of a created VG.
 */
struct volume_group *vg_lock_and_create(struct cmd_context *cmd, const char *vg_name)
{
	uint32_t lock_result;

	if (!validate_name(vg_name)) {
		log_error("Invalid vg name %s", vg_name);
		/* FIXME: use _vg_make_handle() w/proper error code */
		return NULL;
	}

	lock_result = vg_lock_newname(cmd, vg_name);

	if (lock_result == SUCCESS)
		return vg_create(cmd, vg_name);

	/* NOTE: let caller decide - this may be check for existence */
	return _vg_make_handle(cmd, NULL, lock_result);
}
/*
 * Create a VG with default parameters.
 * Returns:
 * - struct volume_group* with SUCCESS code: VG structure created
 * - NULL or struct volume_group* with FAILED_* code: error creating VG structure
 * Use vg_read_error() to determine success or failure.
 * FIXME: cleanup usage of _vg_make_handle()
 */
struct volume_group *vg_create(struct cmd_context *cmd, const char *vg_name)
2001-10-12 18:25:53 +04:00
{
struct volume_group *vg;
struct format_instance_ctx fic = {
.type = FMT_INSTANCE_MDAS | FMT_INSTANCE_AUX_MDAS,
.context.vg_ref.vg_name = vg_name
};
struct format_instance *fid;
if (!(vg = alloc_vg("vg_create", cmd, vg_name)))
goto_bad;
2001-10-12 18:25:53 +04:00
if (!id_create(&vg->id)) {
log_error("Couldn't create uuid for volume group '%s'.",
vg_name);
2001-10-12 18:25:53 +04:00
goto bad;
}
vg->status = (RESIZEABLE_VG | LVM_READ | LVM_WRITE);
vg->system_id = NULL;
if (!(vg->lvm1_system_id = dm_pool_zalloc(vg->vgmem, NAME_LEN + 1)))
goto_bad;
Change vg_create() to take only minimal parameters and obtain a lock. vg_t *vg_create(struct cmd_context *cmd, const char *vg_name); This is the first step towards the API called to create a VG. Call vg_lock_newname() inside this function. Use _vg_make_handle() where possible. Now we have 2 ways to construct a volume group: 1) vg_read: Used when constructing an existing VG from disks 2) vg_create: Used when constructing a new VG Both of these interfaces obtain a lock, and return a vg_t *. The usage of _vg_make_handle() inside vg_create() doesn't fit perfectly but it's ok for now. Needs some cleanup though and I've noted "FIXME" in the code. Add the new vg_create() plus vg 'set' functions for non-default VG parameters in the following tools: - vgcreate: Fairly straightforward refactoring. We just moved vg_lock_newname inside vg_create so we check the return via vg_read_error. - vgsplit: The refactoring here is a bit more tricky. Originally we called vg_lock_newname and depending on the error code, we either read the existing vg or created the new one. Now vg_create() calls vg_lock_newname, so we first try to create the VG. If this fails with FAILED_EXIST, we can then do the vg_read. If the create succeeds, we check the input parameters and set any new values on the VG. TODO in future patches: 1. The VG_ORPHAN lock needs some thought. We may want to treat this as any other VG, and require the application to obtain a handle and pass it to other API calls (for example, vg_extend). Or, we may find that hiding the VG_ORPHAN lock inside other APIs is the way to go. I thought of placing the VG_ORPHAN lock inside vg_create() and tying it to the vg handle, but was not certain this was the right approach. 2. Cleanup error paths. Integrate vg_read_error() with vg_create and vg_read* error codes and/or the new error APIs. Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
2009-07-09 14:09:33 +04:00
vg->extent_size = DEFAULT_EXTENT_SIZE * 2;
vg->max_lv = DEFAULT_MAX_LV;
vg->max_pv = DEFAULT_MAX_PV;
vg->alloc = DEFAULT_ALLOC_POLICY;
vg->mda_copies = DEFAULT_VGMETADATACOPIES;
if (!(fid = cmd->fmt->ops->create_instance(cmd->fmt, &fic))) {
log_error("Failed to create format instance");
goto bad;
}
vg_set_fid(vg, fid);
if (vg->fid->fmt->ops->vg_setup &&
!vg->fid->fmt->ops->vg_setup(vg->fid, vg)) {
2001-10-15 22:39:40 +04:00
log_error("Format specific setup of volume group '%s' failed.",
vg_name);
2001-10-12 18:25:53 +04:00
goto bad;
}
Change vg_create() to take only minimal parameters and obtain a lock. vg_t *vg_create(struct cmd_context *cmd, const char *vg_name); This is the first step towards the API called to create a VG. Call vg_lock_newname() inside this function. Use _vg_make_handle() where possible. Now we have 2 ways to construct a volume group: 1) vg_read: Used when constructing an existing VG from disks 2) vg_create: Used when constructing a new VG Both of these interfaces obtain a lock, and return a vg_t *. The usage of _vg_make_handle() inside vg_create() doesn't fit perfectly but it's ok for now. Needs some cleanup though and I've noted "FIXME" in the code. Add the new vg_create() plus vg 'set' functions for non-default VG parameters in the following tools: - vgcreate: Fairly straightforward refactoring. We just moved vg_lock_newname inside vg_create so we check the return via vg_read_error. - vgsplit: The refactoring here is a bit more tricky. Originally we called vg_lock_newname and depending on the error code, we either read the existing vg or created the new one. Now vg_create() calls vg_lock_newname, so we first try to create the VG. If this fails with FAILED_EXIST, we can then do the vg_read. If the create succeeds, we check the input parameters and set any new values on the VG. TODO in future patches: 1. The VG_ORPHAN lock needs some thought. We may want to treat this as any other VG, and require the application to obtain a handle and pass it to other API calls (for example, vg_extend). Or, we may find that hiding the VG_ORPHAN lock inside other APIs is the way to go. I thought of placing the VG_ORPHAN lock inside vg_create() and tying it to the vg handle, but was not certain this was the right approach. 2. Cleanup error paths. Integrate vg_read_error() with vg_create and vg_read* error codes and/or the new error APIs. Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
2009-07-09 14:09:33 +04:00
return _vg_make_handle(cmd, vg, SUCCESS);
2001-10-12 18:25:53 +04:00
Change vg_create() to take only minimal parameters and obtain a lock. vg_t *vg_create(struct cmd_context *cmd, const char *vg_name); This is the first step towards the API called to create a VG. Call vg_lock_newname() inside this function. Use _vg_make_handle() where possible. Now we have 2 ways to construct a volume group: 1) vg_read: Used when constructing an existing VG from disks 2) vg_create: Used when constructing a new VG Both of these interfaces obtain a lock, and return a vg_t *. The usage of _vg_make_handle() inside vg_create() doesn't fit perfectly but it's ok for now. Needs some cleanup though and I've noted "FIXME" in the code. Add the new vg_create() plus vg 'set' functions for non-default VG parameters in the following tools: - vgcreate: Fairly straightforward refactoring. We just moved vg_lock_newname inside vg_create so we check the return via vg_read_error. - vgsplit: The refactoring here is a bit more tricky. Originally we called vg_lock_newname and depending on the error code, we either read the existing vg or created the new one. Now vg_create() calls vg_lock_newname, so we first try to create the VG. If this fails with FAILED_EXIST, we can then do the vg_read. If the create succeeds, we check the input parameters and set any new values on the VG. TODO in future patches: 1. The VG_ORPHAN lock needs some thought. We may want to treat this as any other VG, and require the application to obtain a handle and pass it to other API calls (for example, vg_extend). Or, we may find that hiding the VG_ORPHAN lock inside other APIs is the way to go. I thought of placing the VG_ORPHAN lock inside vg_create() and tying it to the vg handle, but was not certain this was the right approach. 2. Cleanup error paths. Integrate vg_read_error() with vg_create and vg_read* error codes and/or the new error APIs. Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
2009-07-09 14:09:33 +04:00
bad:
unlock_and_release_vg(cmd, vg, vg_name);
Change vg_create() to take only minimal parameters and obtain a lock. vg_t *vg_create(struct cmd_context *cmd, const char *vg_name); This is the first step towards the API called to create a VG. Call vg_lock_newname() inside this function. Use _vg_make_handle() where possible. Now we have 2 ways to construct a volume group: 1) vg_read: Used when constructing an existing VG from disks 2) vg_create: Used when constructing a new VG Both of these interfaces obtain a lock, and return a vg_t *. The usage of _vg_make_handle() inside vg_create() doesn't fit perfectly but it's ok for now. Needs some cleanup though and I've noted "FIXME" in the code. Add the new vg_create() plus vg 'set' functions for non-default VG parameters in the following tools: - vgcreate: Fairly straightforward refactoring. We just moved vg_lock_newname inside vg_create so we check the return via vg_read_error. - vgsplit: The refactoring here is a bit more tricky. Originally we called vg_lock_newname and depending on the error code, we either read the existing vg or created the new one. Now vg_create() calls vg_lock_newname, so we first try to create the VG. If this fails with FAILED_EXIST, we can then do the vg_read. If the create succeeds, we check the input parameters and set any new values on the VG. TODO in future patches: 1. The VG_ORPHAN lock needs some thought. We may want to treat this as any other VG, and require the application to obtain a handle and pass it to other API calls (for example, vg_extend). Or, we may find that hiding the VG_ORPHAN lock inside other APIs is the way to go. I thought of placing the VG_ORPHAN lock inside vg_create() and tying it to the vg handle, but was not certain this was the right approach. 2. Cleanup error paths. Integrate vg_read_error() with vg_create and vg_read* error codes and/or the new error APIs. Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
2009-07-09 14:09:33 +04:00
/* FIXME: use _vg_make_handle() w/proper error code */
2001-10-12 18:25:53 +04:00
return NULL;
}
/* Rounds up by default */
uint32_t extents_from_size(struct cmd_context *cmd, uint64_t size,
uint32_t extent_size)
{
if (size % extent_size) {
size += extent_size - size % extent_size;
log_print_unless_silent("Rounding up size to full physical extent %s",
display_size(cmd, size));
}
if (size > (uint64_t) MAX_EXTENT_COUNT * extent_size) {
log_error("Volume too large (%s) for extent size %s. "
"Upper limit is less then %s.",
display_size(cmd, size),
display_size(cmd, (uint64_t) extent_size),
display_size(cmd, (uint64_t) MAX_EXTENT_COUNT *
extent_size));
return 0;
}
return (uint32_t) (size / extent_size);
}
/*
 * Converts size according to percentage with specified rounding to extents
 *
 * For PERCENT_NONE, size is in standard sector units.
 * For every other percent type, size is in DM_PERCENT_1 base units
 * (supports decimal point) and is applied to a base number of extents
 * selected by the percent type.
 *
 * Return value of 0 extents is an error.
 */
uint32_t extents_from_percent_size(struct volume_group *vg, const struct dm_list *pvh,
				   uint32_t extents, int roundup,
				   percent_type_t percent, uint64_t size)
{
	uint32_t converted;
	uint64_t remainder;

	switch (percent) {
	case PERCENT_NONE:
		/* Absolute size: optionally round down, then convert. */
		if (!roundup) {
			remainder = size % vg->extent_size;
			if (remainder) {
				size -= remainder;
				if (!size) {
					log_error("Specified size is smaller then physical extent boundary.");
					return 0;
				}
				log_print_unless_silent("Rounding size to boundary between physical extents: %s.",
							display_size(vg->cmd, size));
			}
		}
		return extents_from_size(vg->cmd, size, vg->extent_size);

	case PERCENT_LV:
		/* Base extents already passed in. */
		break;

	case PERCENT_VG:
		extents = vg->extent_count;
		break;

	case PERCENT_PVS:
		if (pvh != &vg->pvs) {
			/* Physical volumes are specified on cmdline */
			extents = pv_list_extents_free(pvh);
			if (!extents) {
				log_error("No free extents in the list of physical volumes.");
				return 0;
			}
			break;
		}
		/* fall through to use all PVs in VG like %FREE */
	case PERCENT_FREE:
		extents = vg->free_count;
		if (!extents) {
			log_error("No free extents in Volume group %s.", vg->name);
			return 0;
		}
		break;

	default:
		log_error(INTERNAL_ERROR "Unsupported percent type %u.", percent);
		return 0;
	}

	converted = percent_of_extents(size, extents, roundup);

	if (converted)
		log_verbose("Converted %.2f%%%s into %" PRIu32 " extents.",
			    (double) size / DM_PERCENT_1, get_percent_string(percent), converted);
	else
		log_error("Converted %.2f%%%s into 0 extents.",
			  (double) size / DM_PERCENT_1, get_percent_string(percent));

	return converted;
}
/*
* Allocate a bitset of num_bits bits from 'mem' and set num_set_bits of
* them, selected with lvm_even_rand() using the caller's seed.
*
* The selected bit numbers are also accumulated as a space-separated
* string in a pool object on 'mem' and reported via log_debug_metadata().
*
* Returns the bitset, or NULL after logging an error.
*
* NOTE(review): num_bits - num_set_bits is unsigned arithmetic, so this
* presumably relies on callers passing num_set_bits <= num_bits - confirm.
*/
static dm_bitset_t _bitset_with_random_bits(struct dm_pool *mem, uint32_t num_bits,
uint32_t num_set_bits, unsigned *seed)
{
dm_bitset_t bs;
unsigned bit_selected;
char buf[32];
/* Starting here makes the loop below run num_set_bits times. */
uint32_t i = num_bits - num_set_bits;
if (!(bs = dm_bitset_create(mem, (unsigned) num_bits))) {
log_error("Failed to allocate bitset for setting random bits.");
return NULL;
}
/* Pool object that collects the textual list of selected bits. */
if (!dm_pool_begin_object(mem, 512)) {
log_error("dm_pool_begin_object failed for random list of bits.");
dm_pool_free(mem, bs);
return NULL;
}
/* Perform loop num_set_bits times, selecting one bit each time */
while (i++ < num_bits) {
/* Select a random bit between 0 and (i-1) inclusive. */
bit_selected = lvm_even_rand(seed, i);
/*
* If the bit was already set, set the new bit that became
* choosable for the first time during this pass.
* This maintains a uniform probability distribution by compensating
* for being unable to select it until this pass.
*/
if (dm_bit(bs, bit_selected))
bit_selected = i - 1;
dm_bit_set(bs, bit_selected);
/* Append "<bit> " to the debug string. */
if (dm_snprintf(buf, sizeof(buf), "%u ", bit_selected) < 0) {
log_error("snprintf random bit failed.");
dm_pool_free(mem, bs);
return NULL;
}
if (!dm_pool_grow_object(mem, buf, strlen(buf))) {
log_error("Failed to generate list of random bits.");
dm_pool_free(mem, bs);
return NULL;
}
}
/* Terminate the debug string before finishing the object. */
if (!dm_pool_grow_object(mem, "\0", 1)) {
log_error("Failed to finish list of random bits.");
dm_pool_free(mem, bs);
return NULL;
}
log_debug_metadata("Selected %" PRIu32 " random bits from %" PRIu32 ": %s", num_set_bits, num_bits, (char *) dm_pool_end_object(mem));
return bs;
}
static int _vg_ignore_mdas(struct volume_group *vg, uint32_t num_to_ignore)
{
struct metadata_area *mda;
uint32_t mda_used_count = vg_mda_used_count(vg);
dm_bitset_t mda_to_ignore_bs;
int r = 1;
log_debug_metadata("Adjusting ignored mdas for %s: %" PRIu32 " of %" PRIu32 " mdas in use "
"but %" PRIu32 " required. Changing %" PRIu32 " mda.",
vg->name, mda_used_count, vg_mda_count(vg), vg_mda_copies(vg), num_to_ignore);
2010-06-30 17:51:11 +04:00
if (!num_to_ignore)
return 1;
2010-06-30 17:51:11 +04:00
if (!(mda_to_ignore_bs = _bitset_with_random_bits(vg->vgmem, mda_used_count,
num_to_ignore, &vg->cmd->rand_seed)))
return_0;
2010-06-30 23:28:35 +04:00
dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use)
if (!mda_is_ignored(mda) && (--mda_used_count,
dm_bit(mda_to_ignore_bs, mda_used_count))) {
mda_set_ignored(mda, 1);
2010-06-30 23:28:35 +04:00
if (!--num_to_ignore)
goto out;
}
2010-06-30 17:51:11 +04:00
log_error(INTERNAL_ERROR "Unable to find %"PRIu32" metadata areas to ignore "
"on volume group %s", num_to_ignore, vg->name);
2010-06-30 17:51:11 +04:00
r = 0;
out:
dm_pool_free(vg->vgmem, mda_to_ignore_bs);
return r;
}
static int _vg_unignore_mdas(struct volume_group *vg, uint32_t num_to_unignore)
{
struct metadata_area *mda, *tmda;
uint32_t mda_used_count = vg_mda_used_count(vg);
uint32_t mda_count = vg_mda_count(vg);
uint32_t mda_free_count = mda_count - mda_used_count;
dm_bitset_t mda_to_unignore_bs;
int r = 1;
if (!num_to_unignore)
return 1;
2010-06-30 17:51:11 +04:00
log_debug_metadata("Adjusting ignored mdas for %s: %" PRIu32 " of %" PRIu32 " mdas in use "
"but %" PRIu32 " required. Changing %" PRIu32 " mda.",
vg->name, mda_used_count, mda_count, vg_mda_copies(vg), num_to_unignore);
if (!(mda_to_unignore_bs = _bitset_with_random_bits(vg->vgmem, mda_free_count,
num_to_unignore, &vg->cmd->rand_seed)))
return_0;
2010-06-30 17:51:11 +04:00
2010-06-30 23:28:35 +04:00
dm_list_iterate_items_safe(mda, tmda, &vg->fid->metadata_areas_ignored)
if (mda_is_ignored(mda) && (--mda_free_count,
dm_bit(mda_to_unignore_bs, mda_free_count))) {
mda_set_ignored(mda, 0);
dm_list_move(&vg->fid->metadata_areas_in_use,
&mda->list);
2010-06-30 23:28:35 +04:00
if (!--num_to_unignore)
goto out;
}
2010-06-30 17:51:11 +04:00
2010-06-30 23:28:35 +04:00
dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use)
if (mda_is_ignored(mda) && (--mda_free_count,
dm_bit(mda_to_unignore_bs, mda_free_count))) {
mda_set_ignored(mda, 0);
2010-06-30 23:28:35 +04:00
if (!--num_to_unignore)
goto out;
}
2010-06-30 17:51:11 +04:00
log_error(INTERNAL_ERROR "Unable to find %"PRIu32" metadata areas to unignore "
"on volume group %s", num_to_unignore, vg->name);
r = 0;
out:
dm_pool_free(vg->vgmem, mda_to_unignore_bs);
return r;
}
static int _vg_adjust_ignored_mdas(struct volume_group *vg)
{
2010-06-30 23:28:35 +04:00
uint32_t mda_copies_used = vg_mda_used_count(vg);
2010-06-30 23:28:35 +04:00
if (vg->mda_copies == VGMETADATACOPIES_UNMANAGED) {
/* Ensure at least one mda is in use. */
if (!mda_copies_used && vg_mda_count(vg) && !_vg_unignore_mdas(vg, 1))
return_0;
else
2010-06-30 23:28:35 +04:00
return 1;
}
2010-06-30 17:51:11 +04:00
2010-06-30 23:28:35 +04:00
/* Not an error to have vg_mda_count larger than total mdas. */
if (vg->mda_copies == VGMETADATACOPIES_ALL ||
vg->mda_copies >= vg_mda_count(vg)) {
/* Use all */
if (!_vg_unignore_mdas(vg, vg_mda_count(vg) - mda_copies_used))
return_0;
} else if (mda_copies_used < vg->mda_copies) {
if (!_vg_unignore_mdas(vg, vg->mda_copies - mda_copies_used))
return_0;
} else if (mda_copies_used > vg->mda_copies)
if (!_vg_ignore_mdas(vg, mda_copies_used - vg->mda_copies))
return_0;
/*
* The VGMETADATACOPIES_ALL value will never be written disk.
* It is a special cmdline value that means 2 things:
* 1. clear all ignore bits in all mdas in this vg
* 2. set the "unmanaged" policy going forward for metadata balancing
*/
if (vg->mda_copies == VGMETADATACOPIES_ALL)
vg->mda_copies = VGMETADATACOPIES_UNMANAGED;
2010-06-30 17:51:11 +04:00
return 1;
}
/*
 * Return the smallest value reported by mda_total_sectors() across the
 * metadata areas in 'mdas'.  Areas whose ops lack mda_total_sectors are
 * skipped.  Returns 0 when no smaller value than UINT64_MAX was seen.
 */
uint64_t find_min_mda_size(struct dm_list *mdas)
{
	struct metadata_area *mda;
	uint64_t sectors;
	uint64_t smallest = UINT64_MAX;

	dm_list_iterate_items(mda, mdas) {
		if (mda->ops->mda_total_sectors) {
			sectors = mda->ops->mda_total_sectors(mda);
			if (sectors < smallest)
				smallest = sectors;
		}
	}

	return (smallest == UINT64_MAX) ? UINT64_C(0) : smallest;
}
/*
 * Walk 'mdas_from' and take out every metadata area that mda_in_vg()
 * reports as no longer belonging to 'vg_from': it is moved to 'mdas_to',
 * or simply unlinked when 'vg_to' is an orphan VG.
 *
 * Returns 1 if any area has no mda_in_vg operation (such areas are left
 * in place), 0 otherwise.
 */
static int _move_mdas(struct volume_group *vg_from, struct volume_group *vg_to,
		      struct dm_list *mdas_from, struct dm_list *mdas_to)
{
	struct metadata_area *cur, *next;
	int found_common = 0;

	dm_list_iterate_items_safe(cur, next, mdas_from) {
		if (!cur->ops->mda_in_vg) {
			found_common = 1;
			continue;
		}

		/* Still part of the source VG: leave it where it is. */
		if (cur->ops->mda_in_vg(vg_from->fid, vg_from, cur))
			continue;

		if (is_orphan_vg(vg_to->name))
			dm_list_del(&cur->list);
		else
			dm_list_move(mdas_to, &cur->list);
	}

	return found_common;
}
/*
 * Separate metadata areas after splitting a VG.
 * Also accepts orphan VG as destination (for vgreduce).
 *
 * Both the in-use and the ignored mda lists of 'vg_from' are processed.
 *
 * Returns 1 when both VGs end up with metadata areas (an orphan
 * destination needs none).  When one side is left without any, the
 * result is 1 only if a common mda (one without an mda_in_vg operation)
 * was seen in either list, 0 otherwise.
 */
int vg_split_mdas(struct cmd_context *cmd __attribute__((unused)),
		  struct volume_group *vg_from, struct volume_group *vg_to)
{
	struct dm_list *mdas_from_in_use, *mdas_to_in_use;
	struct dm_list *mdas_from_ignored, *mdas_to_ignored;
	int common_mda = 0;

	mdas_from_in_use = &vg_from->fid->metadata_areas_in_use;
	mdas_from_ignored = &vg_from->fid->metadata_areas_ignored;
	mdas_to_in_use = &vg_to->fid->metadata_areas_in_use;
	mdas_to_ignored = &vg_to->fid->metadata_areas_ignored;

	common_mda = _move_mdas(vg_from, vg_to,
				mdas_from_in_use, mdas_to_in_use);

	/*
	 * Both lists must always be processed; combine the results so a
	 * common mda found in the in-use list is not forgotten.
	 */
	if (_move_mdas(vg_from, vg_to,
		       mdas_from_ignored, mdas_to_ignored))
		common_mda = 1;

	if ((dm_list_empty(mdas_from_in_use) &&
	     dm_list_empty(mdas_from_ignored)) ||
	    ((!is_orphan_vg(vg_to->name) &&
	      dm_list_empty(mdas_to_in_use) &&
	      dm_list_empty(mdas_to_ignored))))
		return common_mda;

	return 1;
}
/*
 * Reset 'pp' to its defaults: the structure is zeroed, the scalar
 * options and PV attributes are set explicitly and every embedded list
 * is initialised as empty.
 */
void pvcreate_params_set_defaults(struct pvcreate_params *pp)
{
	memset(pp, 0, sizeof(*pp));

	/* Command-level options */
	pp->force = PROMPT;
	pp->yes = 0;
	pp->zero = 1;
	pp->uuid_str = NULL;
	pp->restorefile = NULL;

	/* Size and layout of the new PV */
	pp->pva.size = 0;
	pp->pva.extent_size = 0;
	pp->pva.extent_count = 0;
	pp->pva.pe_start = PV_PE_START_CALC;
	pp->pva.ba_start = 0;
	pp->pva.ba_size = 0;
	pp->pva.data_alignment = UINT64_C(0);
	pp->pva.data_alignment_offset = UINT64_C(0);
	pp->pva.label_sector = DEFAULT_LABELSECTOR;

	/* Metadata area settings */
	pp->pva.pvmetadatacopies = DEFAULT_PVMETADATACOPIES;
	pp->pva.pvmetadatasize = DEFAULT_PVMETADATASIZE;
	pp->pva.metadataignore = DEFAULT_PVMETADATAIGNORE;

	/* Work lists start out empty */
	dm_list_init(&pp->prompts);
	dm_list_init(&pp->arg_devices);
	dm_list_init(&pp->arg_process);
	dm_list_init(&pp->arg_confirm);
	dm_list_init(&pp->arg_create);
	dm_list_init(&pp->arg_remove);
	dm_list_init(&pp->arg_fail);
	dm_list_init(&pp->pvs);
}
/*
* See if we may pvcreate on this device.
*
* Looks for an existing PV on 'name', enforces the -ff requirement and
* prompts where needed, then checks that the device can be found and
* opened exclusively and wipes known foreign signatures from it.
*
* *wiped is reset to 0 on entry and handed to wipe_known_signatures().
* Before returning, filters are refreshed and/or labels rescanned if
* that was flagged along the way; a failure there also yields 0.
*
* 0 indicates we may not.
*/
static int _pvcreate_check(struct cmd_context *cmd, const char *name,
struct pvcreate_params *pp, int *wiped)
{
static const char really_init_msg[] = "Really INITIALIZE physical volume";
static const char not_init_msg[] = "physical volume not initialized";
struct physical_volume *pv;
struct device *dev;
int r = 0;
int scan_needed = 0;
int filter_refresh_needed = 0;
/* Only assigned, and only read, when an existing pv was found. */
int used;
/* FIXME Check partition type is LVM unless --force is given */
*wiped = 0;
/* Is there a pv here already? */
pv = find_pv_by_name(cmd, name, 1, 1);
/* Allow partial & exported VGs to be destroyed. */
/* We must have -ff to overwrite a non orphan */
if (pv) {
if (!is_orphan(pv) && pp->force != DONT_PROMPT_OVERRIDE) {
log_error("Can't initialize physical volume \"%s\" of "
"volume group \"%s\" without -ff.", name, pv_vg_name(pv));
goto out;
}
/* A negative result from is_used_pv() is treated as failure. */
if ((used = is_used_pv(pv)) < 0)
goto_out;
if (used && pp->force != DONT_PROMPT_OVERRIDE) {
log_error("PV %s is used by a VG but its metadata is missing.", name);
log_error("Can't initialize PV '%s' without -ff.", name);
goto out;
}
}
/* prompt */
if (pv && !pp->yes) {
if (is_orphan(pv)) {
/* An orphan only needs confirmation if it is marked as used. */
if (used) {
if (yes_no_prompt("%s \"%s\" that is marked as belonging to a VG [y/n]? ",
really_init_msg, name) == 'n') {
log_error("%s: %s", name, not_init_msg);
goto out;
}
}
} else {
if (yes_no_prompt("%s \"%s\" of volume group \"%s\" [y/n]? ",
really_init_msg, name, pv_vg_name(pv)) == 'n') {
log_error("%s: %s", name, not_init_msg);
goto out;
}
}
}
if (sigint_caught())
goto_out;
dev = dev_cache_get(name, cmd->full_filter);
/*
* Refresh+rescan at the end is needed if:
* - we don't obtain device list from udev,
* hence persistent cache file is used
* and we need to trash it and reevaluate
* for any changes done outside - adding
* any new foreign signature which may affect
* filtering - before we do pvcreate, we
* need to be sure that we have up-to-date
* view for filters
*
* - we have wiped existing foreign signatures
* from dev as this may affect what's filtered
* as well
*
*
* Only rescan at the end is needed if:
* - we've just checked whether dev is filtered
* by MD filter. We do the refresh in-situ,
* so no need to require the refresh at the
* end of this fn. This is to allow for
* wiping MD signature during pvcreate for
* the dev - the dev would normally be
* filtered because of MD filter.
* This is an exception.
*/
/* Is there an md superblock here? */
if (!dev && md_filtering()) {
if (!refresh_filters(cmd))
goto_out;
/* Look the device up again with MD filtering switched off. */
init_md_filtering(0);
dev = dev_cache_get(name, cmd->full_filter);
init_md_filtering(1);
scan_needed = 1;
} else if (!obtain_device_list_from_udev())
filter_refresh_needed = scan_needed = 1;
if (!dev) {
log_error("Device %s not found (or ignored by filtering).", name);
goto out;
}
/*
* This test will fail if the device belongs to an MD array.
*/
if (!dev_test_excl(dev)) {
/* FIXME Detect whether device-mapper itself is still using it */
log_error("Can't open %s exclusively. Mounted filesystem?",
name);
goto out;
}
/* Wipe known signatures, passing the LVM1/LVM2 member types. */
if (!wipe_known_signatures(cmd, dev, name,
TYPE_LVM1_MEMBER | TYPE_LVM2_MEMBER,
0, pp->yes, pp->force, wiped)) {
log_error("Aborting pvcreate on %s.", name);
goto out;
}
/* Wiping may change what the filters accept. */
if (*wiped)
filter_refresh_needed = scan_needed = 1;
if (sigint_caught())
goto_out;
if (pv && !is_orphan(pv) && pp->force)
log_warn("WARNING: Forcing physical volume creation on "
"%s%s%s%s", name,
!is_orphan(pv) ? " of volume group \"" : "",
pv_vg_name(pv),
!is_orphan(pv) ? "\"" : "");
r = 1;
out:
/* Deferred refresh/rescan; a failure here overrides a successful check. */
if (filter_refresh_needed)
if (!refresh_filters(cmd)) {
stack;
r = 0;
}
if (scan_needed) {
lvmcache_force_next_label_scan();
if (!lvmcache_label_scan(cmd)) {
stack;
r = 0;
}
}
free_pv_fid(pv);
return r;
}
/*
 * Write the PV described by 'pvw' to its device.
 *
 * For a new PV the existing label is removed first and, if pp->zero is
 * set, the start of the device is zeroed (dev_set() at offset 0 with
 * length 2048).
 *
 * Returns 1 on success, 0 on failure.
 */
static int _pvcreate_write(struct cmd_context *cmd, struct pv_to_write *pvw)
{
	struct physical_volume *pv = pvw->pv;
	struct device *dev = pv->dev;
	const char *pv_name = dev_name(dev);
	int zeroed;

	if (pvw->new_pv) {
		/* Wipe existing label first */
		if (!label_remove(pv_dev(pv))) {
			log_error("Failed to wipe existing label on %s", pv_name);
			return 0;
		}

		if (pvw->pp->zero) {
			log_verbose("Zeroing start of device %s", pv_name);

			if (!dev_open_quiet(dev)) {
				log_error("%s not opened: device not zeroed", pv_name);
				return 0;
			}

			zeroed = dev_set(dev, UINT64_C(0), (size_t) 2048, 0);
			if (!zeroed)
				log_error("%s not wiped: aborting", pv_name);

			/* The device is closed whether or not zeroing worked. */
			if (!dev_close(dev))
				stack;

			if (!zeroed)
				return 0;
		}
	}

	log_verbose("Writing physical volume data to disk \"%s\"",
		    pv_name);

	if (!(pv_write(cmd, pv, 1))) {
		log_error("Failed to write physical volume \"%s\"", pv_name);
		return 0;
	}

	if (!pvw->new_pv)
		log_verbose("Physical volume \"%s\" successfully written", pv_name);
	else
		log_print_unless_silent("Physical volume \"%s\" successfully created", pv_name);

	return 1;
}
/*
 * Range-check the PV creation arguments in 'pp': at most 2 metadata
 * copies, and data alignment / alignment offset no larger than UINT32_MAX.
 *
 * Returns 1 if all checks pass, 0 after logging the first violation.
 *
 * FIXME: Some of these checks are duplicates in pvcreate_params_validate.
 */
static int _verify_pv_create_params(struct pvcreate_params *pp)
{
	const char *problem = NULL;

	if (pp->pva.pvmetadatacopies > 2)
		problem = "Metadatacopies may only be 0, 1 or 2";
	else if (pp->pva.data_alignment > UINT32_MAX)
		problem = "Physical volume data alignment is too big.";
	else if (pp->pva.data_alignment_offset > UINT32_MAX)
		problem = "Physical volume data alignment offset is too big.";

	if (!problem)
		return 1;

	log_error("%s", problem);

	return 0;
}
/*
 * pvcreate_vol() - initialize a device with PV label and metadata area
 *
 * Parameters:
 * - pv_name: device path to initialize
 * - pp: parameters to pass to pv_create; if NULL, use default values
 *
 * Returns:
 * NULL: error
 * struct physical_volume * (non-NULL): handle to physical volume created
 */
struct physical_volume *pvcreate_vol(struct cmd_context *cmd, const char *pv_name,
struct pvcreate_params *pp, int write_now)
{
struct physical_volume *pv = NULL;
struct device *dev;
int wiped = 0;
struct dm_list mdas;
struct pvcreate_params default_pp;
char buffer[64] __attribute__((aligned(8)));
dev_ext_t dev_ext_src;
pvcreate_params_set_defaults(&default_pp);
if (!pp)
pp = &default_pp;
if (!_verify_pv_create_params(pp)) {
goto bad;
}
if (pp->pva.idp) {
if ((dev = lvmcache_device_from_pvid(cmd, pp->pva.idp, NULL, NULL)) &&
(dev != dev_cache_get(pv_name, cmd->full_filter))) {
if (!id_write_format((const struct id*)&pp->pva.idp->uuid,
buffer, sizeof(buffer)))
goto_bad;
log_error("uuid %s already in use on \"%s\"", buffer,
dev_name(dev));
goto bad;
}
}
if (!_pvcreate_check(cmd, pv_name, pp, &wiped))
goto_bad;
if (sigint_caught())
goto_bad;
/*
* wipe_known_signatures called in _pvcreate_check fires
* WATCH event to update udev database. But at the moment,
* we have no way to synchronize with such event - we may
* end up still seeing the old info in udev db and pvcreate
* can fail to proceed because of the device still being
* filtered (because of the stale info in udev db).
* Disable udev dev-ext source temporarily here for
* this reason and rescan with DEV_EXT_NONE dev-ext
* source (so filters use DEV_EXT_NONE source).
*/
dev_ext_src = external_device_info_source();
if (wiped && (dev_ext_src == DEV_EXT_UDEV))
init_external_device_info_source(DEV_EXT_NONE);
dev = dev_cache_get(pv_name, cmd->full_filter);
init_external_device_info_source(dev_ext_src);
if (!dev) {
log_error("%s: Couldn't find device. Check your filters?",
pv_name);
goto bad;
}
dm_list_init(&mdas);
if (!(pv = pv_create(cmd, dev, &pp->pva))) {
log_error("Failed to setup physical volume \"%s\"", pv_name);
goto bad;
}
log_verbose("Set up physical volume for \"%s\" with %" PRIu64
" available sectors", pv_name, pv_size(pv));
pv->status |= UNLABELLED_PV;
if (write_now) {
struct pv_to_write pvw;
pvw.pp = pp;
pvw.pv = pv;
pvw.new_pv = 1;
if (!_pvcreate_write(cmd, &pvw))
goto bad;
}
return pv;
bad:
return NULL;
}
/*
 * Allocate a zeroed physical_volume from pool 'mem', attach 'dev' to it
 * and initialise its tag and segment lists.
 *
 * Returns the new structure, or NULL (after logging) if allocation fails.
 */
static struct physical_volume *_alloc_pv(struct dm_pool *mem, struct device *dev)
{
	struct physical_volume *new_pv = dm_pool_zalloc(mem, sizeof(*new_pv));

	if (!new_pv) {
		log_error("Failed to allocate pv structure.");
		return NULL;
	}

	dm_list_init(&new_pv->tags);
	dm_list_init(&new_pv->segments);
	new_pv->dev = dev;

	return new_pv;
}
/**
* pv_create - initialize a physical volume for use with a volume group
* created PV belongs to Orphan VG.
*
* Returns:
* PV handle - physical volume initialized successfully
* NULL - invalid parameter or problem initializing the physical volume
*/
struct physical_volume *pv_create(const struct cmd_context *cmd,
struct device *dev,
struct pv_create_args *pva)
2001-09-25 16:49:28 +04:00
{
const struct format_type *fmt = cmd->fmt;
struct dm_pool *mem = fmt->orphan_vg->vgmem;
struct physical_volume *pv = _alloc_pv(mem, dev);
unsigned mda_index;
struct pv_list *pvl;
uint64_t size = pva->size;
unsigned long data_alignment = pva->data_alignment;
unsigned long data_alignment_offset = pva->data_alignment_offset;
unsigned pvmetadatacopies = pva->pvmetadatacopies;
uint64_t pvmetadatasize = pva->pvmetadatasize;
unsigned metadataignore = pva->metadataignore;
if (!pv)
return_NULL;
if (pva->idp)
memcpy(&pv->id, pva->idp, sizeof(*pva->idp));
else if (!id_create(&pv->id)) {
log_error("Failed to create random uuid for %s.",
dev_name(dev));
goto bad;
}
if (!dev_get_size(pv->dev, &pv->size)) {
log_error("%s: Couldn't get size.", pv_dev_name(pv));
goto bad;
}
if (size) {
if (size > pv->size)
log_warn("WARNING: %s: Overriding real size. "
"You could lose data.", pv_dev_name(pv));
log_verbose("%s: Pretending size is %" PRIu64 " sectors.",
pv_dev_name(pv), size);
pv->size = size;
}
if (pv->size < pv_min_size()) {
log_error("%s: Size must exceed minimum of %" PRIu64 " sectors.",
pv_dev_name(pv), pv_min_size());
goto bad;
}
pvcreate: fix alignment to incorporate alignment offset if PV has 0 MDAs If zero metadata copies are used, there's no further recalculation of PV alignment that happens when adding metadata areas to the PV and which actually calculates the alignment correctly as a matter of fact. So fix this for "PV without MDA" case as well. Before this patch: [1] raw/~ # pvcreate --dataalignment 8m --dataalignmentoffset 4m --metadatacopies 1 /dev/sda Physical volume "/dev/sda" successfully created [1] raw/~ # pvs -o pv_name,pe_start PV 1st PE /dev/sda 12.00m [1] raw/~ # pvcreate --dataalignment 8m --dataalignmentoffset 4m --metadatacopies 0 /dev/sda Physical volume "/dev/sda" successfully created [1] raw/~ # pvs -o pv_name,pe_start PV 1st PE /dev/sda 8.00m After this patch: [1] raw/~ # pvcreate --dataalignment 8m --dataalignmentoffset 4m --metadatacopies 1 /dev/sda Physical volume "/dev/sda" successfully created [1] raw/~ # pvs -o pv_name,pe_start PV 1st PE /dev/sda 12.00m [1] raw/~ # pvcreate --dataalignment 8m --dataalignmentoffset 4m --metadatacopies 0 /dev/sda Physical volume "/dev/sda" successfully created [1] raw/~ # pvs -o pv_name,pe_start PV 1st PE /dev/sda 12.00m Also, remove a superfluous condition "pv->pe_start < pv->pe_align" in: if (pe_start == PV_PE_START_CALC && pv->pe_start < pv->pe_align) pv->pe_start = pv->pe_align ... This part of the condition is not reachable as with the PV_PE_START_CALC, we always have pv->pe_start set to 0 from the PV struct initialisation (...the pv->pe_start value is just being calculated).
2013-02-21 17:47:49 +04:00
if (pv->size < data_alignment + data_alignment_offset) {
log_error("%s: Data alignment must not exceed device size.",
pv_dev_name(pv));
goto bad;
}
if (!(pvl = dm_pool_zalloc(mem, sizeof(*pvl)))) {
log_error("pv_list allocation in pv_create failed");
goto bad;
}
pvl->pv = pv;
add_pvl_to_vgs(fmt->orphan_vg, pvl);
fmt->orphan_vg->extent_count += pv->pe_count;
fmt->orphan_vg->free_count += pv->pe_count;
2002-11-18 17:04:08 +03:00
pv->fmt = fmt;
2008-02-06 18:47:28 +03:00
pv->vg_name = fmt->orphan_vg_name;
2002-02-15 17:33:59 +03:00
if (!fmt->ops->pv_initialise(fmt, pva, pv)) {
log_error("Format-specific initialisation of physical "
"volume %s failed.", pv_dev_name(pv));
2002-02-15 17:33:59 +03:00
goto bad;
}
for (mda_index = 0; mda_index < pvmetadatacopies; mda_index++) {
if (pv->fmt->ops->pv_add_metadata_area &&
!pv->fmt->ops->pv_add_metadata_area(pv->fmt, pv,
pva->pe_start != PV_PE_START_CALC,
mda_index, pvmetadatasize,
metadataignore)) {
log_error("Failed to add metadata area for "
"new physical volume %s", pv_dev_name(pv));
goto bad;
}
}
return pv;
2001-10-15 22:39:40 +04:00
bad:
// FIXME: detach from orphan in error path
//free_pv_fid(pv);
//dm_pool_free(mem, pv);
return NULL;
2001-09-25 16:49:28 +04:00
}
/* FIXME: liblvm todo - make into function that returns handle */
2008-03-14 01:51:24 +03:00
struct pv_list *find_pv_in_vg(const struct volume_group *vg,
const char *pv_name)
2001-10-15 22:39:40 +04:00
{
struct pv_list *pvl;
struct device *dev = dev_cache_get(pv_name, vg->cmd->filter);
/*
* If the device does not exist or is filtered out, don't bother trying
* to find it in the list. This also prevents accidentally finding a
* non-NULL PV which happens to be missing (i.e. its pv->dev is NULL)
* for such devices.
*/
if (!dev)
return NULL;
dm_list_iterate_items(pvl, &vg->pvs)
if (pvl->pv->dev == dev)
return pvl;
2001-09-25 16:49:28 +04:00
2001-10-15 22:39:40 +04:00
return NULL;
2002-11-18 17:04:08 +03:00
}
struct pv_list *find_pv_in_pv_list(const struct dm_list *pl,
const struct physical_volume *pv)
{
struct pv_list *pvl;
dm_list_iterate_items(pvl, pl)
if (pvl->pv == pv)
return pvl;
2008-04-10 23:59:43 +04:00
return NULL;
}
int pv_is_in_vg(struct volume_group *vg, struct physical_volume *pv)
{
2005-06-01 20:51:55 +04:00
struct pv_list *pvl;
dm_list_iterate_items(pvl, &vg->pvs)
2005-06-01 20:51:55 +04:00
if (pv == pvl->pv)
return 1;
return 0;
}
/**
* find_pv_in_vg_by_uuid - Find PV in VG by PV UUID
* @vg: volume group to search
* @id: UUID of the PV to match
*
* Returns:
* struct pv_list within owning struct volume_group - if UUID of PV found in VG
* NULL - invalid parameter or UUID of PV not found in VG
*
* Note
* FIXME - liblvm todo - make into function that takes VG handle
*/
struct pv_list *find_pv_in_vg_by_uuid(const struct volume_group *vg,
const struct id *id)
{
struct pv_list *pvl;
dm_list_iterate_items(pvl, &vg->pvs)
if (id_equal(&pvl->pv->id, id))
return pvl;
return NULL;
}
2008-03-14 01:51:24 +03:00
struct lv_list *find_lv_in_vg(const struct volume_group *vg,
const char *lv_name)
2001-10-29 16:52:23 +03:00
{
struct lv_list *lvl;
2001-10-29 16:52:23 +03:00
const char *ptr;
/* Use last component */
if ((ptr = strrchr(lv_name, '/')))
ptr++;
else
ptr = lv_name;
2001-10-31 15:47:01 +03:00
dm_list_iterate_items(lvl, &vg->lvs)
2002-01-21 19:49:32 +03:00
if (!strcmp(lvl->lv->name, ptr))
return lvl;
2001-10-29 16:52:23 +03:00
return NULL;
2001-10-29 16:52:23 +03:00
}
struct lv_list *find_lv_in_lv_list(const struct dm_list *ll,
const struct logical_volume *lv)
{
struct lv_list *lvl;
dm_list_iterate_items(lvl, ll)
if (lvl->lv == lv)
return lvl;
2008-04-10 23:59:43 +04:00
return NULL;
}
struct logical_volume *find_lv_in_vg_by_lvid(struct volume_group *vg,
const union lvid *lvid)
{
struct lv_list *lvl;
dm_list_iterate_items(lvl, &vg->lvs)
if (!strncmp(lvl->lv->lvid.s, lvid->s, sizeof(*lvid)))
return lvl->lv;
return NULL;
}
2008-03-14 01:51:24 +03:00
struct logical_volume *find_lv(const struct volume_group *vg,
const char *lv_name)
2001-10-29 16:52:23 +03:00
{
struct lv_list *lvl = find_lv_in_vg(vg, lv_name);
2002-01-21 19:49:32 +03:00
return lvl ? lvl->lv : NULL;
2001-10-29 16:52:23 +03:00
}
struct generic_logical_volume *find_historical_glv(const struct volume_group *vg,
const char *historical_lv_name,
2016-03-01 17:26:57 +03:00
int check_removed_list,
struct glv_list **glvl_found)
{
struct glv_list *glvl;
const char *ptr;
2016-03-01 17:26:57 +03:00
const struct dm_list *list = check_removed_list ? &vg->removed_historical_lvs
: &vg->historical_lvs;
/* Use last component */
if ((ptr = strrchr(historical_lv_name, '/')))
ptr++;
else
ptr = historical_lv_name;
2016-03-01 17:26:57 +03:00
dm_list_iterate_items(glvl, list) {
if (!strcmp(glvl->glv->historical->name, ptr)) {
if (glvl_found)
*glvl_found = glvl;
return glvl->glv;
}
}
if (glvl_found)
*glvl_found = NULL;
return NULL;
}
/*
 * Report whether 'name' is taken in 'vg' by a live LV or by a historical LV
 * (live LVs are checked first).
 *
 * If 'historical' is non-NULL and the name is found, it is set to 0 for a
 * live LV or 1 for a historical one; it is left untouched otherwise.
 *
 * Returns 1 if the name is in use, 0 if not.
 */
int lv_name_is_used_in_vg(const struct volume_group *vg, const char *name, int *historical)
{
	int is_historical;

	if (find_lv(vg, name))
		is_historical = 0;
	else if (find_historical_glv(vg, name, 0, NULL))
		is_historical = 1;
	else
		return 0;

	if (historical)
		*historical = is_historical;

	return 1;
}
struct physical_volume *find_pv(struct volume_group *vg, struct device *dev)
2001-10-29 16:52:23 +03:00
{
2005-06-01 20:51:55 +04:00
struct pv_list *pvl;
dm_list_iterate_items(pvl, &vg->pvs)
2005-06-01 20:51:55 +04:00
if (dev == pvl->pv->dev)
return pvl->pv;
return NULL;
2001-10-29 16:52:23 +03:00
}
/* FIXME: liblvm todo - make into function that returns handle */
2004-05-05 15:04:28 +04:00
struct physical_volume *find_pv_by_name(struct cmd_context *cmd,
const char *pv_name,
int allow_orphan, int allow_unformatted)
2004-05-05 15:04:28 +04:00
{
struct device *dev;
struct pv_list *pvl;
struct dm_list *pvslist;
struct physical_volume *pv = NULL;
lvmcache_seed_infos_from_lvmetad(cmd);
2004-05-05 15:04:28 +04:00
if (!(dev = dev_cache_get(pv_name, cmd->filter))) {
metadata: remove spurious "Physical volume <dev_name> not found" This is addendum to commit 2e82a070f3c9224da5c9f383d47e75a1715586cf which fixed these spurious messages that appeared after commit 651d5093edde3e0ebee9d75be1c9834efc152d91 ("avoid pv_read in find_pv_by_name"). There was one more "not found" message issued in case the device could not be found in device cache (commit 2e82a07 fixed this only for PV lookup itself). But if we "allow_unformatted" for find_pv_by_name, we should not issue this message even in case the device can't be found in dev cache as we just need to know whether there's a PV or not for the code to decide on next steps and we don't want to issue any messages if either device itself is not found or PV is not found. For example, when we were creating a new PV (and so allow_unformatted = 1) and the device had a signature on it which caused it to be filtered by device filter (e.g. MD signature if md filtering is enabled), or it was part of some other subsystem (e.g. multipath), this message was issued on find_pv_by_name call which was misleading. Also, remove misleading "stack" call in case find_pv_by_name returns NULL in pvcreate_check - any error state is reported later by pvcreate_check code so no need to "stack" here. There's one more and proper check to issue "not found" message if the device can't be found in device cache within pvcreate_check fn so this situation is still covered properly later in the code. Before this patch (/dev/sda contains MD signature and is therefore filtered): $ pvcreate /dev/sda Physical volume /dev/sda not found WARNING: linux_raid_member signature detected on /dev/sda at offset 4096. Wipe it? [y/n]: With this patch applied: $ pvcreate /dev/sda WARNING: linux_raid_member signature detected on /dev/sda at offset 4096. Wipe it? [y/n]: Non-existent devices are still caught properly: $ pvcreate /dev/sdx Device /dev/sdx not found (or ignored by filtering).
2014-07-31 11:30:25 +04:00
if (!allow_unformatted)
log_error("Physical volume %s not found", pv_name);
return_NULL;
2004-05-05 15:04:28 +04:00
}
if (!(pvslist = get_pvs(cmd)))
return_NULL;
dm_list_iterate_items(pvl, pvslist)
if (pvl->pv->dev == dev)
pv = pvl->pv;
else
free_pv_fid(pvl->pv);
if (!pv && !allow_unformatted)
log_error("Physical volume %s not found", pv_name);
if (pv && !allow_orphan && is_orphan_vg(pv->vg_name)) {
2004-05-05 15:04:28 +04:00
log_error("Physical volume %s not in a volume group", pv_name);
goto bad;
2004-05-05 15:04:28 +04:00
}
return pv;
bad:
free_pv_fid(pv);
return NULL;
2004-05-05 15:04:28 +04:00
}
/* Find segment at a given logical extent in an LV */
2007-12-20 21:55:46 +03:00
struct lv_segment *find_seg_by_le(const struct logical_volume *lv, uint32_t le)
{
struct lv_segment *seg;
dm_list_iterate_items(seg, &lv->segments)
if (le >= seg->le && le < seg->le + seg->len)
return seg;
return NULL;
}
2007-12-20 21:55:46 +03:00
struct lv_segment *first_seg(const struct logical_volume *lv)
2005-10-28 16:48:50 +04:00
{
struct lv_segment *seg;
2005-10-28 16:48:50 +04:00
dm_list_iterate_items(seg, &lv->segments)
return seg;
2005-10-28 16:48:50 +04:00
return NULL;
2005-10-28 16:48:50 +04:00
}
struct lv_segment *last_seg(const struct logical_volume *lv)
{
2012-02-27 13:51:31 +04:00
struct lv_segment *seg;
2012-02-27 13:51:31 +04:00
dm_list_iterate_back_items(seg, &lv->segments)
return seg;
2012-02-27 13:51:31 +04:00
return NULL;
}
/*
 * Call the vg_remove operation of every metadata area listed in
 * vg->fid->metadata_areas_in_use. Areas without a vg_remove operation are
 * skipped; the walk stops at the first operation that fails.
 *
 * Returns 1 on success, 0 on the first failure.
 */
int vg_remove_mdas(struct volume_group *vg)
{
	struct metadata_area *mda;

	/* FIXME Improve recovery situation? */
	/* Remove each copy of the metadata */
	dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) {
		if (!mda->ops->vg_remove)
			continue;

		if (!mda->ops->vg_remove(vg->fid, vg, mda))
			return_0;
	}

	return 1;
}
/*
 * Determine whether two vgs are compatible for merging.
 *
 * Checks, in order: no active LVs in vg_from, equal extent sizes, vg_to's
 * max_pv and max_lv limits (when non-zero), same metadata format, same
 * clustered attribute, no LV name present in both VGs, and no PV of either
 * VG reported by pv_uses_vg() as using the other VG.
 *
 * Returns 1 if compatible, 0 (after logging the reason) otherwise.
 */
int vgs_are_compatible(struct cmd_context *cmd __attribute__((unused)),
		       struct volume_group *vg_from,
		       struct volume_group *vg_to)
{
	struct lv_list *to_lvl, *from_lvl;
	struct pv_list *pvl;

	if (lvs_in_vg_activated(vg_from)) {
		log_error("Logical volumes in \"%s\" must be inactive",
			  vg_from->name);
		return 0;
	}

	/* Check compatibility */
	if (vg_to->extent_size != vg_from->extent_size) {
		log_error("Extent sizes differ: %d (%s) and %d (%s)",
			  vg_to->extent_size, vg_to->name,
			  vg_from->extent_size, vg_from->name);
		return 0;
	}

	/* A limit of zero is not enforced here. */
	if (vg_to->max_pv &&
	    (vg_to->max_pv < vg_to->pv_count + vg_from->pv_count)) {
		log_error("Maximum number of physical volumes (%d) exceeded "
			  " for \"%s\" and \"%s\"", vg_to->max_pv, vg_to->name,
			  vg_from->name);
		return 0;
	}

	if (vg_to->max_lv &&
	    (vg_to->max_lv < vg_visible_lvs(vg_to) + vg_visible_lvs(vg_from))) {
		log_error("Maximum number of logical volumes (%d) exceeded "
			  " for \"%s\" and \"%s\"", vg_to->max_lv, vg_to->name,
			  vg_from->name);
		return 0;
	}

	/* Metadata types must be the same */
	if (vg_to->fid->fmt != vg_from->fid->fmt) {
		log_error("Metadata types differ for \"%s\" and \"%s\"",
			  vg_to->name, vg_from->name);
		return 0;
	}

	/* Clustering attribute must be the same */
	if (vg_is_clustered(vg_to) != vg_is_clustered(vg_from)) {
		log_error("Clustered attribute differs for \"%s\" and \"%s\"",
			  vg_to->name, vg_from->name);
		return 0;
	}

	/* Check no conflicts with LV names */
	dm_list_iterate_items(to_lvl, &vg_to->lvs) {
		dm_list_iterate_items(from_lvl, &vg_from->lvs) {
			if (strcmp(to_lvl->lv->name, from_lvl->lv->name))
				continue;

			log_error("Duplicate logical volume "
				  "name \"%s\" "
				  "in \"%s\" and \"%s\"",
				  to_lvl->lv->name, vg_to->name, vg_from->name);
			return 0;
		}
	}

	/* Check no PVs are constructed from either VG */
	dm_list_iterate_items(pvl, &vg_to->pvs) {
		if (!pv_uses_vg(pvl->pv, vg_from))
			continue;

		log_error("Physical volume %s might be constructed "
			  "from same volume group %s.",
			  pv_dev_name(pvl->pv), vg_from->name);
		return 0;
	}

	dm_list_iterate_items(pvl, &vg_from->pvs) {
		if (!pv_uses_vg(pvl->pv, vg_to))
			continue;

		log_error("Physical volume %s might be constructed "
			  "from same volume group %s.",
			  pv_dev_name(pvl->pv), vg_to->name);
		return 0;
	}

	return 1;
}
/*
 * Bundles the user callback and its argument so they can be carried through
 * _lv_each_dependency() as a single void pointer; _lv_postorder_level()
 * unpacks it and forwards both to _lv_postorder_visit().
 */
struct _lv_postorder_baton {
/* Callback invoked for each visited LV. */
int (*fn)(struct logical_volume *lv, void *data);
/* Opaque argument passed to fn unchanged. */
void *data;
};
/*
 * Forward declaration: _lv_postorder_level() calls _lv_postorder_visit()
 * before the latter is defined.
 */
static int _lv_postorder_visit(struct logical_volume *,
int (*fn)(struct logical_volume *lv, void *data),
void *data);
/*
 * Call fn(dep, data) for every LV that 'lv' directly refers to:
 *  - lv->rdevice's lv and slog (when they are not 'lv' itself),
 *  - lv->snapshot's origin and cow,
 *  - per segment: external_lv, log_lv, rlog_lv, pool_lv, metadata_lv and
 *    every area of type AREA_LV,
 *  - when 'lv' is an origin, the cow of each entry on lv->snapshot_segs.
 *
 * Stops at the first callback returning zero.
 *
 * Returns 1 if every callback succeeded, 0 otherwise.
 */
static int _lv_each_dependency(struct logical_volume *lv,
			       int (*fn)(struct logical_volume *lv, void *data),
			       void *data)
{
	unsigned idx, area;
	struct lv_segment *seg;
	struct dm_list *snh;
	/* NULL slots are skipped below. */
	struct logical_volume *direct[] = {
		(lv->rdevice && lv != lv->rdevice->lv) ? lv->rdevice->lv : NULL,
		(lv->rdevice && lv != lv->rdevice->slog) ? lv->rdevice->slog : NULL,
		lv->snapshot ? lv->snapshot->origin : NULL,
		lv->snapshot ? lv->snapshot->cow : NULL
	};

	for (idx = 0; idx < DM_ARRAY_SIZE(direct); ++idx) {
		if (!direct[idx])
			continue;
		if (!fn(direct[idx], data))
			return_0;
	}

	dm_list_iterate_items(seg, &lv->segments) {
		if (seg->external_lv && !fn(seg->external_lv, data))
			return_0;

		if (seg->log_lv && !fn(seg->log_lv, data))
			return_0;

		if (seg->rlog_lv && !fn(seg->rlog_lv, data))
			return_0;

		if (seg->pool_lv && !fn(seg->pool_lv, data))
			return_0;

		if (seg->metadata_lv && !fn(seg->metadata_lv, data))
			return_0;

		for (area = 0; area < seg->area_count; ++area) {
			if (seg_type(seg, area) != AREA_LV)
				continue;
			if (!fn(seg_lv(seg, area), data))
				return_0;
		}
	}

	if (lv_is_origin(lv)) {
		dm_list_iterate(snh, &lv->snapshot_segs) {
			if (!fn(dm_list_struct_base(snh, struct lv_segment, origin_list)->cow, data))
				return_0;
		}
	}

	return 1;
}
/*
 * Clear POSTORDER_FLAG on 'lv' and, recursively, on its dependencies.
 * An LV that does not carry the flag ends the recursion on that branch.
 *
 * Returns 1 on success, 0 if recursion into a dependency failed.
 */
static int _lv_postorder_cleanup(struct logical_volume *lv, void *data)
{
	if (lv->status & POSTORDER_FLAG) {
		lv->status &= ~POSTORDER_FLAG;

		if (!_lv_each_dependency(lv, _lv_postorder_cleanup, data))
			return_0;
	}

	return 1;
}
/*
 * Adapter with the callback signature expected by _lv_each_dependency():
 * unpacks the _lv_postorder_baton passed as 'data' and continues the
 * postorder walk on 'lv'.
 *
 * Returns the result of _lv_postorder_visit(), or 0 if 'data' is NULL.
 */
static int _lv_postorder_level(struct logical_volume *lv, void *data)
{
	struct _lv_postorder_baton *baton = data;

	if (!baton)
		return 0;

	return _lv_postorder_visit(lv, baton->fn, baton->data);
}
static int _lv_postorder_visit(struct logical_volume *lv,
int (*fn)(struct logical_volume *lv, void *data),
void *data)
{
struct _lv_postorder_baton baton;
int r;
if (lv->status & POSTORDER_FLAG)
return 1;
if (lv->status & POSTORDER_OPEN_FLAG)
return 1; // a data structure loop has closed...
lv->status |= POSTORDER_OPEN_FLAG;
baton.fn = fn;
baton.data = data;
r = _lv_each_dependency(lv, _lv_postorder_level, &baton);
2009-05-30 05:54:29 +04:00
if (r)
r = fn(lv, data);
2009-05-30 05:54:29 +04:00
lv->status &= ~POSTORDER_OPEN_FLAG;
lv->status |= POSTORDER_FLAG;
return r;
}
/*
 * This will walk the LV dependency graph in depth-first order and in the
 * postorder, call a callback function "fn". The void *data is passed along all
 * the calls. The callback may return zero to indicate an error and terminate
 * the depth-first walk. The error is propagated to return value of
 * _lv_postorder.
 *
 * If the VG memory pool is locked on entry it is unlocked for the duration
 * of the walk and locked again afterwards; failing to do either returns 0.
 * The visit flags are cleared again before returning.
 */
static int _lv_postorder(struct logical_volume *lv,
			 int (*fn)(struct logical_volume *lv, void *data),
			 void *data)
{
	int walk_result;
	int relock = dm_pool_locked(lv->vg->vgmem);

	if (relock && !dm_pool_unlock(lv->vg->vgmem, 0))
		return_0;

	walk_result = _lv_postorder_visit(lv, fn, data);
	_lv_postorder_cleanup(lv, NULL);

	if (relock && !dm_pool_lock(lv->vg->vgmem, 0))
		return_0;

	return walk_result;
}
/*
* Calls _lv_postorder() on each LV from VG. Avoids duplicate transitivity visits.
* Clears with _lv_postorder_cleanup() when all LVs were visited by postorder.
*/
static int _lv_postorder_vg(struct volume_group *vg,
int (*fn)(struct logical_volume *lv, void *data),
void *data)
{
struct lv_list *lvl;
int r = 1;
int pool_locked = dm_pool_locked(vg->vgmem);
if (pool_locked && !dm_pool_unlock(vg->vgmem, 0))
return_0;
dm_list_iterate_items(lvl, &vg->lvs)
if (!_lv_postorder_visit(lvl->lv, fn, data)) {
stack;
r = 0;
}
dm_list_iterate_items(lvl, &vg->lvs)
_lv_postorder_cleanup(lvl->lv, 0);
if (pool_locked && !dm_pool_lock(vg->vgmem, 0))
return_0;
return r;
}
/*
 * Result holder passed to _lv_mark_if_partial_collect() while the direct
 * dependencies of one LV are inspected.
 */
struct _lv_mark_if_partial_baton {
/* Set to 1 once any inspected dependency is partial. */
int partial;
};
/*
 * Dependency callback: record in the baton passed as 'data' that 'lv' is
 * partial. A NULL baton is tolerated. Always returns 1 so that the
 * dependency walk continues.
 */
static int _lv_mark_if_partial_collect(struct logical_volume *lv, void *data)
{
	struct _lv_mark_if_partial_baton *result = data;

	if (!result)
		return 1;

	if (lv_is_partial(lv))
		result->partial = 1;

	return 1;
}
static int _lv_mark_if_partial_single(struct logical_volume *lv, void *data)
{
unsigned s;
struct _lv_mark_if_partial_baton baton = { .partial = 0 };
struct lv_segment *lvseg;
dm_list_iterate_items(lvseg, &lv->segments) {
for (s = 0; s < lvseg->area_count; ++s) {
if (seg_type(lvseg, s) == AREA_PV) {
2010-03-16 17:37:38 +03:00
if (is_missing_pv(seg_pv(lvseg, s)))
lv->status |= PARTIAL_LV;
}
}
}
if (!_lv_each_dependency(lv, _lv_mark_if_partial_collect, &baton))
return_0;
if (baton.partial)
lv->status |= PARTIAL_LV;
return 1;
}
/*
* Mark LVs with missing PVs using PARTIAL_LV status flag. The flag is
* propagated transitively, so LVs referencing other LVs are marked
* partial as well, if any of their referenced LVs are marked partial.
*/
int vg_mark_partial_lvs(struct volume_group *vg, int clear)
{
struct lv_list *lvl;
if (clear)
dm_list_iterate_items(lvl, &vg->lvs)
lvl->lv->status &= ~PARTIAL_LV;
if (!_lv_postorder_vg(vg, _lv_mark_if_partial_single, NULL))
return_0;
return 1;
}
/*
 * Be sure that all PV devices have cached read ahead in dev-cache
 * Currently it takes read_ahead from first PV segment only
 *
 * 'data' must point to a uint32_t holding the largest read ahead seen so
 * far; it is raised when this LV's first area (if it is an AREA_PV)
 * reports a larger value.
 *
 * Returns 1 on success, 0 if 'data' is NULL.
 */
static int _lv_read_ahead_single(struct logical_volume *lv, void *data)
{
	struct lv_segment *seg = first_seg(lv);
	uint32_t *largest = data;
	uint32_t pv_read_ahead = 0;

	if (!largest) {
		log_error(INTERNAL_ERROR "Read ahead data missing.");
		return 0;
	}

	if (seg && seg->area_count && seg_type(seg, 0) == AREA_PV)
		dev_get_read_ahead(seg_pv(seg, 0)->dev, &pv_read_ahead);

	if (*largest < pv_read_ahead)
		*largest = pv_read_ahead;

	return 1;
}
/*
 * Calculate readahead for logical volume from underlying PV devices.
 * If read_ahead is NULL, only ensure that readahead of PVs are preloaded
 * into PV struct device in dev cache.
 *
 * The PVs are only consulted when lv->read_ahead is DM_READ_AHEAD_AUTO;
 * otherwise the value reported through read_ahead is 0.
 */
void lv_calculate_readahead(const struct logical_volume *lv, uint32_t *read_ahead)
{
	uint32_t computed = 0;

	if (lv->read_ahead == DM_READ_AHEAD_AUTO)
		_lv_postorder((struct logical_volume *)lv, _lv_read_ahead_single, &computed);

	if (!read_ahead)
		return;

	log_debug_metadata("Calculated readahead of LV %s is %u", lv->name, computed);
	*read_ahead = computed;
}
/*
 * Lookup tables used while validating a VG.
 *
 * NOTE(review): only the population of lvname, lvid and pvid is visible near
 * this definition; the remaining tables are presumably filled the same way
 * further down in vg_validate() - confirm there.
 */
struct validate_hash {
	struct dm_hash_table *lvname;		/* LV name -> struct lv_list */
	struct dm_hash_table *historical_lvname;
	struct dm_hash_table *lvid;		/* lv->lvid.id[1] -> LV */
	struct dm_hash_table *historical_lvid;
	struct dm_hash_table *pvid;		/* pv->id -> PV */
	struct dm_hash_table *lv_lock_args;
};
/*
 * Check that an LV and all its PV references are correctly listed in vg->lvs
 * and vg->pvs, respectively. This only looks at a single LV, but *not* at the
 * LVs it is using. To do the latter, you should use _lv_postorder with this
 * function. C.f. vg_validate.
 *
 * 'data' is a struct validate_hash whose lvid and pvid tables are consulted.
 * Every mismatch is logged; checking continues after the first one.
 *
 * Returns 1 if all references resolve to the expected objects, 0 otherwise.
 */
static int _lv_validate_references_single(struct logical_volume *lv, void *data)
{
	struct validate_hash *tables = data;
	struct volume_group *vg = lv->vg;
	struct physical_volume *pv;
	struct lv_segment *seg;
	unsigned area;
	int ok = 1;

	if (dm_hash_lookup_binary(tables->lvid, &lv->lvid.id[1],
				  sizeof(lv->lvid.id[1])) != lv) {
		log_error(INTERNAL_ERROR
			  "Referenced LV %s not listed in VG %s.",
			  lv->name, vg->name);
		ok = 0;
	}

	dm_list_iterate_items(seg, &lv->segments) {
		for (area = 0; area < seg->area_count; ++area) {
			if (seg_type(seg, area) != AREA_PV)
				continue;

			pv = seg_pv(seg, area);

			/* look up the reference in vg->pvs */
			if (dm_hash_lookup_binary(tables->pvid, &pv->id,
						  sizeof(pv->id)) != pv) {
				log_error(INTERNAL_ERROR
					  "Referenced PV %s not listed in VG %s.",
					  pv_dev_name(pv), vg->name);
				ok = 0;
			}
		}
	}

	return ok;
}
2015-07-09 21:24:28 +03:00
/*
* Format is <version>:<info>
*/
static int _validate_lock_args_chars(const char *lock_args)
{
unsigned i;
2015-07-09 21:24:28 +03:00
char c;
int found_colon = 0;
int r = 1;
for (i = 0; i < strlen(lock_args); i++) {
c = lock_args[i];
if (!isalnum(c) && c != '.' && c != '_' && c != '-' && c != '+' && c != ':') {
log_error(INTERNAL_ERROR "Invalid character at index %u of lock_args \"%s\"",
2015-07-09 21:24:28 +03:00
i, lock_args);
r = 0;
}
if (c == ':' && found_colon) {
log_error(INTERNAL_ERROR "Invalid colon at index %u of lock_args \"%s\"",
2015-07-09 21:24:28 +03:00
i, lock_args);
r = 0;
}
if (c == ':')
found_colon = 1;
}
return r;
}
/*
 * Validate the characters of vg->lock_args.
 *
 * Returns 1 if they are valid, 0 (after logging) otherwise.
 */
static int _validate_vg_lock_args(struct volume_group *vg)
{
	if (_validate_lock_args_chars(vg->lock_args))
		return 1;

	log_error(INTERNAL_ERROR "VG %s has invalid lock_args chars", vg->name);

	return 0;
}
/*
 * For lock_type sanlock, LV lock_args are <version>:<info>
 * For lock_type dlm, LV lock_args are not used, and lock_args is
 * just set to "dlm".
 *
 * Any other lock_type is accepted without inspecting lock_args.
 *
 * Returns 1 if lv->lock_args is acceptable for the VG's lock_type,
 * 0 (after logging) otherwise.
 */
static int _validate_lv_lock_args(struct logical_volume *lv)
{
	if (!strcmp(lv->vg->lock_type, "sanlock")) {
		if (!_validate_lock_args_chars(lv->lock_args)) {
			log_error(INTERNAL_ERROR "LV %s/%s has invalid lock_args chars",
				  lv->vg->name, display_lvname(lv));
			return 0;
		}

		return 1;
	}

	if (!strcmp(lv->vg->lock_type, "dlm") &&
	    strcmp(lv->lock_args, "dlm")) {
		log_error(INTERNAL_ERROR "LV %s/%s has invalid lock_args \"%s\"",
			  lv->vg->name, display_lvname(lv), lv->lock_args);
		return 0;
	}

	return 1;
}
int vg_validate(struct volume_group *vg)
{
struct pv_list *pvl;
struct lv_list *lvl;
struct glv_list *glvl;
struct historical_logical_volume *hlv;
struct lv_segment *seg;
struct dm_str_list *sl;
char uuid[64] __attribute__((aligned(8)));
char uuid2[64] __attribute__((aligned(8)));
int r = 1;
2013-07-05 19:10:11 +04:00
unsigned hidden_lv_count = 0, lv_count = 0, lv_visible_count = 0;
unsigned pv_count = 0;
unsigned num_snapshots = 0;
unsigned spare_count = 0;
size_t vg_name_len = strlen(vg->name);
size_t dev_name_len;
struct validate_hash vhash = { NULL };
if (vg->alloc == ALLOC_CLING_BY_TAGS) {
log_error(INTERNAL_ERROR "VG %s allocation policy set to invalid cling_by_tags.",
vg->name);
r = 0;
}
if (vg->status & LVM_WRITE_LOCKED) {
log_error(INTERNAL_ERROR "VG %s has external flag LVM_WRITE_LOCKED set internally.",
vg->name);
r = 0;
}
/* FIXME Also check there's no data/metadata overlap */
if (!(vhash.pvid = dm_hash_create(vg->pv_count))) {
log_error("Failed to allocate pvid hash.");
return 0;
}
dm_list_iterate_items(sl, &vg->tags)
if (!validate_tag(sl->str)) {
log_error(INTERNAL_ERROR "VG %s tag %s has invalid form.",
vg->name, sl->str);
r = 0;
}
dm_list_iterate_items(pvl, &vg->pvs) {
if (++pv_count > vg->pv_count) {
log_error(INTERNAL_ERROR "PV list corruption detected in VG %s.", vg->name);
/* FIXME Dump list structure? */
r = 0;
}
if (pvl->pv->vg != vg) {
log_error(INTERNAL_ERROR "VG %s PV list entry points "
"to different VG %s.", vg->name,
pvl->pv->vg ? pvl->pv->vg->name : "NULL");
r = 0;
}
if (strcmp(pvl->pv->vg_name, vg->name)) {
log_error(INTERNAL_ERROR "VG name for PV %s is corrupted.",
pv_dev_name(pvl->pv));
r = 0;
}
if (dm_hash_lookup_binary(vhash.pvid, &pvl->pv->id,
sizeof(pvl->pv->id))) {
if (!id_write_format(&pvl->pv->id, uuid,
sizeof(uuid)))
stack;
log_error(INTERNAL_ERROR "Duplicate PV id "
"%s detected for %s in %s.",
uuid, pv_dev_name(pvl->pv),
vg->name);
r = 0;
}
dm_list_iterate_items(sl, &pvl->pv->tags)
if (!validate_tag(sl->str)) {
log_error(INTERNAL_ERROR "PV %s tag %s has invalid form.",
pv_dev_name(pvl->pv), sl->str);
r = 0;
}
if (!dm_hash_insert_binary(vhash.pvid, &pvl->pv->id,
sizeof(pvl->pv->id), pvl->pv)) {
log_error("Failed to hash pvid.");
r = 0;
break;
}
}
if (!check_pv_segments(vg)) {
log_error(INTERNAL_ERROR "PV segments corrupted in %s.",
vg->name);
r = 0;
}
dm_list_iterate_items(lvl, &vg->removed_lvs) {
if (!(lvl->lv->status & LV_REMOVED)) {
log_error(INTERNAL_ERROR "LV %s is not marked as removed while it's part "
"of removed LV list for VG %s", lvl->lv->name, vg->name);
r = 0;
}
}
/*
* Count all non-snapshot invisible LVs
*/
dm_list_iterate_items(lvl, &vg->lvs) {
lv_count++;
if (lvl->lv->status & LV_REMOVED) {
log_error(INTERNAL_ERROR "LV %s is marked as removed while it's "
"still part of the VG %s", lvl->lv->name, vg->name);
r = 0;
}
if (lvl->lv->status & LVM_WRITE_LOCKED) {
log_error(INTERNAL_ERROR "LV %s has external flag LVM_WRITE_LOCKED set internally.",
lvl->lv->name);
r = 0;
}
dev_name_len = strlen(lvl->lv->name) + vg_name_len + 3;
if (dev_name_len >= NAME_LEN) {
log_error(INTERNAL_ERROR "LV name \"%s/%s\" length %"
PRIsize_t " is not supported.",
vg->name, lvl->lv->name, dev_name_len);
r = 0;
}
if (!id_equal(&lvl->lv->lvid.id[0], &lvl->lv->vg->id)) {
if (!id_write_format(&lvl->lv->lvid.id[0], uuid,
sizeof(uuid)))
stack;
if (!id_write_format(&lvl->lv->vg->id, uuid2,
sizeof(uuid2)))
stack;
log_error(INTERNAL_ERROR "LV %s has VG UUID %s but its VG %s has UUID %s",
lvl->lv->name, uuid, lvl->lv->vg->name, uuid2);
r = 0;
}
if (lv_is_pool_metadata_spare(lvl->lv)) {
if (++spare_count > 1) {
2014-11-11 17:13:00 +03:00
log_error(INTERNAL_ERROR "LV %s is extra pool metadata spare volume. %u found but only 1 allowed.",
lvl->lv->name, spare_count);
r = 0;
}
if (vg->pool_metadata_spare_lv != lvl->lv) {
2014-11-11 17:13:00 +03:00
log_error(INTERNAL_ERROR "LV %s is not the VG's pool metadata spare volume.",
lvl->lv->name);
r = 0;
}
}
if (lv_is_cow(lvl->lv))
num_snapshots++;
if (lv_is_visible(lvl->lv))
lv_visible_count++;
if (!check_lv_segments(lvl->lv, 0)) {
log_error(INTERNAL_ERROR "LV segments corrupted in %s.",
lvl->lv->name);
r = 0;
}
if (lvl->lv->alloc == ALLOC_CLING_BY_TAGS) {
log_error(INTERNAL_ERROR "LV %s allocation policy set to invalid cling_by_tags.",
lvl->lv->name);
r = 0;
}
if (!validate_name(lvl->lv->name)) {
log_error(INTERNAL_ERROR "LV name %s has invalid form.", lvl->lv->name);
r = 0;
}
dm_list_iterate_items(sl, &lvl->lv->tags)
if (!validate_tag(sl->str)) {
log_error(INTERNAL_ERROR "LV %s tag %s has invalid form.",
lvl->lv->name, sl->str);
r = 0;
}
if (lvl->lv->status & VISIBLE_LV)
continue;
/* snapshots */
if (lv_is_cow(lvl->lv))
continue;
/* virtual origins are always hidden */
if (lv_is_origin(lvl->lv) && !lv_is_virtual_origin(lvl->lv))
continue;
/* count other non-snapshot invisible volumes */
hidden_lv_count++;
/*
* FIXME: add check for unreferenced invisible LVs
* - snapshot cow & origin
* - mirror log & images
* - mirror conversion volumes (_mimagetmp*)
*/
}
/*
* all volumes = visible LVs + snapshot_cows + invisible LVs
*/
if (lv_count != lv_visible_count + num_snapshots + hidden_lv_count) {
2013-07-05 19:10:11 +04:00
log_error(INTERNAL_ERROR "#LVs (%u) != #visible LVs (%u) "
"+ #snapshots (%u) + #internal LVs (%u) in VG %s",
lv_count, lv_visible_count, num_snapshots,
hidden_lv_count, vg->name);
2008-06-06 23:28:35 +04:00
r = 0;
}
/* Avoid endless loop if lv->segments list is corrupt */
if (!r)
goto out;
if (!(vhash.lvname = dm_hash_create(lv_count))) {
log_error("Failed to allocate lv_name hash");
r = 0;
goto out;
}
if (!(vhash.lvid = dm_hash_create(lv_count))) {
log_error("Failed to allocate uuid hash");
r = 0;
goto out;
}
dm_list_iterate_items(lvl, &vg->lvs) {
if (dm_hash_lookup(vhash.lvname, lvl->lv->name)) {
log_error(INTERNAL_ERROR
"Duplicate LV name %s detected in %s.",
lvl->lv->name, vg->name);
r = 0;
}
if (dm_hash_lookup_binary(vhash.lvid, &lvl->lv->lvid.id[1],
sizeof(lvl->lv->lvid.id[1]))) {
if (!id_write_format(&lvl->lv->lvid.id[1], uuid,
sizeof(uuid)))
stack;
log_error(INTERNAL_ERROR "Duplicate LV id "
"%s detected for %s in %s.",
uuid, lvl->lv->name, vg->name);
r = 0;
}
2005-10-28 01:51:28 +04:00
if (!check_lv_segments(lvl->lv, 1)) {
log_error(INTERNAL_ERROR "LV segments corrupted in %s.",
2005-06-01 20:51:55 +04:00
lvl->lv->name);
r = 0;
2005-06-01 20:51:55 +04:00
}
if (!dm_hash_insert(vhash.lvname, lvl->lv->name, lvl)) {
log_error("Failed to hash lvname.");
r = 0;
break;
}
if (!dm_hash_insert_binary(vhash.lvid, &lvl->lv->lvid.id[1],
2011-03-11 01:39:36 +03:00
sizeof(lvl->lv->lvid.id[1]), lvl->lv)) {
log_error("Failed to hash lvid.");
r = 0;
break;
}
2005-06-01 20:51:55 +04:00
}
if (!_lv_postorder_vg(vg, _lv_validate_references_single, &vhash)) {
stack;
r = 0;
}
dm_list_iterate_items(lvl, &vg->lvs) {
if (!lv_is_pvmove(lvl->lv))
continue;
dm_list_iterate_items(seg, &lvl->lv->segments) {
if (seg_is_mirrored(seg)) {
if (seg->area_count != 2) {
log_error(INTERNAL_ERROR
"Segment in %s is not 2-way.",
lvl->lv->name);
r = 0;
}
} else if (seg->area_count != 1) {
log_error(INTERNAL_ERROR
"Segment in %s has wrong number of areas: %d.",
lvl->lv->name, seg->area_count);
r = 0;
}
}
}
if (!(vg->fid->fmt->features & FMT_UNLIMITED_VOLS) &&
(!vg->max_lv || !vg->max_pv)) {
log_error(INTERNAL_ERROR "Volume group %s has limited PV/LV count"
" but limit is not set.", vg->name);
r = 0;
}
if (vg->pool_metadata_spare_lv &&
!lv_is_pool_metadata_spare(vg->pool_metadata_spare_lv)) {
log_error(INTERNAL_ERROR "VG references non pool metadata spare LV %s.",
vg->pool_metadata_spare_lv->name);
r = 0;
}
if (vg_max_lv_reached(vg))
stack;
2015-03-05 23:00:44 +03:00
if (!(vhash.lv_lock_args = dm_hash_create(lv_count))) {
log_error("Failed to allocate lv_lock_args hash");
r = 0;
goto out;
}
if (is_lockd_type(vg->lock_type)) {
if (!vg->lock_args) {
log_error(INTERNAL_ERROR "VG %s with lock_type %s without lock_args",
vg->name, vg->lock_type);
r = 0;
}
if (vg_is_clustered(vg)) {
log_error(INTERNAL_ERROR "VG %s with lock_type %s is clustered",
vg->name, vg->lock_type);
r = 0;
}
if (vg->system_id && vg->system_id[0]) {
log_error(INTERNAL_ERROR "VG %s with lock_type %s has system_id %s",
vg->name, vg->lock_type, vg->system_id);
r = 0;
}
if (strcmp(vg->lock_type, "sanlock") && strcmp(vg->lock_type, "dlm")) {
log_error(INTERNAL_ERROR "VG %s has unknown lock_type %s",
vg->name, vg->lock_type);
r = 0;
}
2015-07-09 21:24:28 +03:00
if (!_validate_vg_lock_args(vg))
2015-07-09 21:24:28 +03:00
r = 0;
2015-03-05 23:00:44 +03:00
} else {
if (vg->lock_args) {
log_error(INTERNAL_ERROR "VG %s has lock_args %s without lock_type",
vg->name, vg->lock_args);
r = 0;
}
}
dm_list_iterate_items(lvl, &vg->lvs) {
if (is_lockd_type(vg->lock_type)) {
if (lockd_lv_uses_lock(lvl->lv)) {
2015-07-09 21:24:28 +03:00
if (vg->skip_validate_lock_args)
2015-03-05 23:00:44 +03:00
continue;
2015-07-09 21:24:28 +03:00
/*
* FIXME: make missing lock_args an error.
* There are at least two cases where this
* check doesn't work correctly:
*
* 1. When creating a cow snapshot,
* (lvcreate -s -L1M -n snap1 vg/lv1),
* lockd_lv_uses_lock() uses lv_is_cow()
* which depends on lv->snapshot being
* set, but it's not set at this point,
* so lockd_lv_uses_lock() cannot identify
* the LV as a cow_lv, and thinks it needs
* a lock when it doesn't. To fix this we
* probably need to validate by finding the
* origin LV, then finding all its snapshots
* which will have no lock_args.
*
* 2. When converting an LV to a thin pool
* without using an existing metadata LV,
* (lvconvert --type thin-pool vg/poolX),
* there is an intermediate LV created,
* probably for the metadata LV, and
* validate is called on the VG in this
* intermediate state, which finds the
* newly created LV which is not yet
* identified as a metadata LV, and
* does not have any lock_args. To fix
* this we might be able to find the place
* where the intermediate LV is created,
* and set new variable on it like for vgs,
* lv->skip_validate_lock_args.
*/
2015-07-09 21:24:28 +03:00
if (!lvl->lv->lock_args) {
/*
log_verbose("LV %s/%s missing lock_args",
vg->name, lvl->lv->name);
2015-03-05 23:00:44 +03:00
r = 0;
*/
2015-07-09 21:24:28 +03:00
continue;
}
if (!_validate_lv_lock_args(lvl->lv)) {
r = 0;
continue;
}
if (!strcmp(vg->lock_type, "sanlock")) {
2015-03-05 23:00:44 +03:00
if (dm_hash_lookup(vhash.lv_lock_args, lvl->lv->lock_args)) {
log_error(INTERNAL_ERROR "LV %s/%s has duplicate lock_args %s.",
vg->name, lvl->lv->name, lvl->lv->lock_args);
r = 0;
}
if (!dm_hash_insert(vhash.lv_lock_args, lvl->lv->lock_args, lvl)) {
log_error("Failed to hash lvname.");
r = 0;
}
}
} else {
if (lvl->lv->lock_args) {
log_error(INTERNAL_ERROR "LV %s/%s shouldn't have lock_args",
vg->name, lvl->lv->name);
r = 0;
}
}
} else {
if (lvl->lv->lock_args) {
log_error(INTERNAL_ERROR "LV %s/%s with no lock_type has lock_args %s",
vg->name, lvl->lv->name, lvl->lv->lock_args);
r = 0;
}
}
}
if (!(vhash.historical_lvname = dm_hash_create(dm_list_size(&vg->historical_lvs)))) {
log_error("Failed to allocate historical LV name hash");
r = 0;
goto out;
}
if (!(vhash.historical_lvid = dm_hash_create(dm_list_size(&vg->historical_lvs)))) {
log_error("Failed to allocate historical LV uuid hash");
r = 0;
goto out;
}
dm_list_iterate_items(glvl, &vg->historical_lvs) {
if (!glvl->glv->is_historical) {
log_error(INTERNAL_ERROR "LV %s/%s appearing in VG's historical list is not a historical LV",
vg->name, glvl->glv->live->name);
r = 0;
continue;
}
hlv = glvl->glv->historical;
if (hlv->vg != vg) {
log_error(INTERNAL_ERROR "Historical LV %s points to different VG %s while it is listed in VG %s",
hlv->name, hlv->vg->name, vg->name);
r = 0;
continue;
}
if (!id_equal(&hlv->lvid.id[0], &hlv->vg->id)) {
if (!id_write_format(&hlv->lvid.id[0], uuid, sizeof(uuid)))
stack;
if (!id_write_format(&hlv->vg->id, uuid2, sizeof(uuid2)))
stack;
log_error(INTERNAL_ERROR "Historical LV %s has VG UUID %s but its VG %s has UUID %s",
hlv->name, uuid, hlv->vg->name, uuid2);
r = 0;
continue;
}
if (dm_hash_lookup_binary(vhash.historical_lvid, &hlv->lvid.id[1], sizeof(hlv->lvid.id[1]))) {
if (!id_write_format(&hlv->lvid.id[1], uuid,sizeof(uuid)))
stack;
log_error(INTERNAL_ERROR "Duplicate historical LV id %s detected for %s in %s",
uuid, hlv->name, vg->name);
r = 0;
}
if (dm_hash_lookup(vhash.historical_lvname, hlv->name)) {
log_error(INTERNAL_ERROR "Duplicate historical LV name %s detected in %s", hlv->name, vg->name);
r = 0;
continue;
}
if (!dm_hash_insert(vhash.historical_lvname, hlv->name, hlv)) {
log_error("Failed to hash historical LV name");
r = 0;
break;
}
if (!dm_hash_insert_binary(vhash.historical_lvid, &hlv->lvid.id[1], sizeof(hlv->lvid.id[1]), hlv)) {
log_error("Failed to hash historical LV id");
r = 0;
break;
}
if (dm_hash_lookup(vhash.lvname, hlv->name)) {
log_error(INTERNAL_ERROR "Name %s appears as live and historical LV at the same time in VG %s",
hlv->name, vg->name);
r = 0;
continue;
}
if (!hlv->indirect_origin && !dm_list_size(&hlv->indirect_glvs)) {
log_error(INTERNAL_ERROR "Historical LV %s is not part of any LV chain in VG %s", hlv->name, vg->name);
r = 0;
continue;
}
}
out:
if (vhash.lvid)
dm_hash_destroy(vhash.lvid);
if (vhash.lvname)
dm_hash_destroy(vhash.lvname);
if (vhash.historical_lvid)
dm_hash_destroy(vhash.historical_lvid);
if (vhash.historical_lvname)
dm_hash_destroy(vhash.historical_lvname);
if (vhash.pvid)
dm_hash_destroy(vhash.pvid);
2015-03-05 23:00:44 +03:00
if (vhash.lv_lock_args)
dm_hash_destroy(vhash.lv_lock_args);
return r;
}
static int _pv_in_pv_list(struct physical_volume *pv, struct dm_list *head)
{
struct pv_list *pvl;
dm_list_iterate_items(pvl, head) {
if (pvl->pv == pv)
return 1;
}
return 0;
}
/*
 * Look for PVs in the VG whose format reports that the PV needs a
 * rewrite, queue each of them once on vg->pv_write_list and, if the
 * queue ends up non-empty, write and commit the VG.
 *
 * Nothing is done for formats without FMT_PV_FLAGS.
 *
 * Returns 1 on success (including "nothing to do"), 0 on failure.
 */
static int _vg_update_old_pv_ext_if_needed(struct volume_group *vg)
{
	struct pv_list *pvl, *queued;
	struct physical_volume *pv;
	int needs_rewrite;

	if (!(vg->fid->fmt->features & FMT_PV_FLAGS))
		return 1;

	dm_list_iterate_items(pvl, &vg->pvs) {
		pv = pvl->pv;

		/* Missing PVs and formats without the callback are skipped. */
		if (is_missing_pv(pv) || !pv->fmt->ops->pv_needs_rewrite)
			continue;

		/* Already scheduled. */
		if (_pv_in_pv_list(pv, &vg->pv_write_list))
			continue;

		if (!pv->fmt->ops->pv_needs_rewrite(pv->fmt, pv, &needs_rewrite))
			return_0;

		if (!needs_rewrite)
			continue;

		/*
		 * Schedule PV for writing only once!
		 * (checked again after the format callback has run)
		 */
		if (_pv_in_pv_list(pv, &vg->pv_write_list))
			continue;

		queued = dm_pool_zalloc(vg->vgmem, sizeof(*queued));
		if (!queued) {
			log_error("pv_to_write allocation for '%s' failed", pv_dev_name(pv));
			return 0;
		}

		queued->pv = pv;
		dm_list_add(&vg->pv_write_list, &queued->list);
		log_debug("PV %s has old extension header, updating to newest version.",
			  pv_dev_name(pv));
	}

	/* Nothing queued: no metadata update required. */
	if (dm_list_empty(&vg->pv_write_list))
		return 1;

	if (!vg_write(vg) || !vg_commit(vg)) {
		log_error("Failed to update old PV extension headers in VG %s.", vg->name);
		return 0;
	}

	return 1;
}
/*
 * Decide (and cache in hlv->valid / hlv->checked) whether a historical
 * LV is valid.
 *
 * A historical LV is valid when its indirect_glvs list contains a live
 * LV, or a historical LV that is itself valid (checked recursively).
 * The result of a previous evaluation is reused while hlv->checked is
 * set.
 *
 * Returns the value of hlv->valid.
 */
static int _check_historical_lv_is_valid(struct historical_logical_volume *hlv)
{
	struct glv_list *item;

	if (!hlv->checked) {
		hlv->valid = 0;

		dm_list_iterate_items(item, &hlv->indirect_glvs) {
			/* A historical entry only counts if it is valid itself. */
			if (item->glv->is_historical &&
			    !_check_historical_lv_is_valid(item->glv->historical))
				continue;

			hlv->valid = 1;
			break;
		}

		hlv->checked = 1;
	}

	return hlv->valid;
}
static int _handle_historical_lvs(struct volume_group *vg)
{
struct glv_list *glvl, *tglvl;
time_t current_timestamp = 0;
struct historical_logical_volume *hlv;
int valid = 1;
dm_list_iterate_items(glvl, &vg->historical_lvs)
glvl->glv->historical->checked = 0;
dm_list_iterate_items(glvl, &vg->historical_lvs) {
hlv = glvl->glv->historical;
valid &= _check_historical_lv_is_valid(hlv);
if (!hlv->timestamp_removed) {
if (!current_timestamp)
current_timestamp = time(NULL);
hlv->timestamp_removed = (uint64_t) current_timestamp;
}
}
if (valid)
return 1;
dm_list_iterate_items_safe(glvl, tglvl, &vg->historical_lvs) {
hlv = glvl->glv->historical;
if (hlv->checked && hlv->valid)
continue;
log_print_unless_silent("Automatically removing historical "
"logical volume %s/%s%s.",
vg->name, HISTORICAL_LV_PREFIX, hlv->name);
if (!historical_glv_remove(glvl->glv))
return_0;
}
return 1;
}
/*
* After vg_write() returns success,
* caller MUST call either vg_commit() or vg_revert()
*/
int vg_write(struct volume_group *vg)
{
struct dm_list *mdah;
struct pv_to_write *pv_to_write, *pv_to_write_safe;
struct pv_list *pvl, *pvl_safe;
struct metadata_area *mda;
2015-03-05 23:00:44 +03:00
struct lv_list *lvl;
int revert = 0, wrote = 0;
2015-03-05 23:00:44 +03:00
dm_list_iterate_items(lvl, &vg->lvs) {
if (lvl->lv->lock_args && !strcmp(lvl->lv->lock_args, "pending")) {
if (!lockd_init_lv_args(vg->cmd, vg, lvl->lv, vg->lock_type, &lvl->lv->lock_args)) {
log_error("Cannot allocate lock for new LV.");
return 0;
}
lvl->lv->new_lock_args = 1;
}
}
if (!_handle_historical_lvs(vg)) {
log_error("Failed to handle historical LVs in VG %s.", vg->name);
return 0;
}
2008-01-30 16:19:47 +03:00
if (!vg_validate(vg))
return_0;
if (vg->status & PARTIAL_VG) {
log_error("Cannot update partial volume group %s.", vg->name);
return 0;
}
if (vg_missing_pv_count(vg) && !vg->cmd->handles_missing_pvs) {
log_error("Cannot update volume group %s while physical "
"volumes are missing.", vg->name);
return 0;
}
if (lvmcache_found_duplicate_pvs() && vg_has_duplicate_pvs(vg) &&
!find_config_tree_bool(vg->cmd, devices_allow_changes_with_duplicate_pvs_CFG, NULL)) {
log_error("Cannot update volume group %s with duplicate PV devices.",
vg->name);
return 0;
}
if (vg_has_unknown_segments(vg) && !vg->cmd->handles_unknown_segments) {
log_error("Cannot update volume group %s with unknown segments in it!",
vg->name);
return 0;
}
2010-06-30 23:28:35 +04:00
if ((vg->fid->fmt->features & FMT_MDAS) && !_vg_adjust_ignored_mdas(vg))
2010-06-30 17:51:11 +04:00
return_0;
2010-06-30 23:28:35 +04:00
if (!vg_mda_used_count(vg)) {
2002-11-18 17:04:08 +03:00
log_error("Aborting vg_write: No metadata areas to write to!");
return 0;
}
if (!drop_cached_metadata(vg)) {
log_error("Unable to drop cached metadata for VG %s.", vg->name);
return 0;
}
if (critical_section())
log_error(INTERNAL_ERROR
"Writing metadata in critical section.");
/* Unlock memory if possible */
memlock_unlock(vg->cmd);
vg->seqno++;
dm_list_iterate_items_safe(pvl, pvl_safe, &vg->pv_write_list) {
if (!pv_write(vg->cmd, pvl->pv, 1))
return_0;
dm_list_del(&pvl->list);
}
dm_list_iterate_items_safe(pv_to_write, pv_to_write_safe, &vg->pvs_to_write) {
if (!_pvcreate_write(vg->cmd, pv_to_write))
return_0;
dm_list_del(&pv_to_write->list);
}
/* Write to each copy of the metadata area */
dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) {
2004-03-27 00:07:30 +03:00
if (!mda->ops->vg_write) {
log_error("Format does not support writing volume"
"group metadata areas");
revert = 1;
break;
}
2002-11-18 17:04:08 +03:00
if (!mda->ops->vg_write(vg->fid, vg, mda)) {
if (vg->cmd->handles_missing_pvs) {
log_warn("WARNING: Failed to write an MDA of VG %s.", vg->name);
mda->status |= MDA_FAILED;
} else {
stack;
revert = 1;
break;
}
} else
++ wrote;
}
2005-06-01 20:51:55 +04:00
if (revert || !wrote) {
log_error("Failed to write VG %s.", vg->name);
dm_list_uniterate(mdah, &vg->fid->metadata_areas_in_use, &mda->list) {
mda = dm_list_item(mdah, struct metadata_area);
if (mda->ops->vg_revert &&
!mda->ops->vg_revert(vg->fid, vg, mda)) {
stack;
}
}
return 0;
}
/* Now pre-commit each copy of the new metadata */
dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) {
if (mda->status & MDA_FAILED)
continue;
if (mda->ops->vg_precommit &&
!mda->ops->vg_precommit(vg->fid, vg, mda)) {
stack;
/* Revert */
dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) {
if (mda->status & MDA_FAILED)
continue;
if (mda->ops->vg_revert &&
!mda->ops->vg_revert(vg->fid, vg, mda)) {
stack;
}
}
return 0;
}
}
if (!_vg_update_vg_precommitted(vg)) /* prepare precommited */
return_0;
lockd_vg_update(vg);
lvmetad: two phase vg_update Previously, a command sent lvmetad new VG metadata in vg_commit(). In vg_commit(), devices are suspended, so any memory allocation done by the command while sending to lvmetad, or by lvmetad while updating its cache could deadlock if memory reclaim was triggered. Now lvmetad is updated in unlock_vg(), after devices are resumed. The new method for updating VG metadata in lvmetad is in two phases: 1. In vg_write(), before devices are suspended, the command sends lvmetad a short message ("set_vg_info") telling it what the new VG seqno will be. lvmetad sees that the seqno is newer than the seqno of its cached VG, so it sets the INVALID flag for the cached VG. If sending the message to lvmetad fails, the command fails before the metadata is committed and the change is not made. If sending the message succeeds, vg_commit() is called. 2. In unlock_vg(), after devices are resumed, the command sends lvmetad the standard vg_update message with the new metadata. lvmetad sees that the seqno in the new metadata matches the seqno it saved from set_vg_info, and knows it has the latest copy, so it clears the INVALID flag for the cached VG. If a command fails between 1 and 2 (after committing the VG on disk, but before sending lvmetad the new metadata), the cached VG retains the INVALID flag in lvmetad. A subsequent command will read the cached VG from lvmetad, see the INVALID flag, ignore the cached copy, read the VG from disk instead, update the lvmetad copy with the latest copy from disk, (this clears the INVALID flag in lvmetad), and use the correct VG metadata for the command. (This INVALID mechanism already existed for use by lvmlockd.)
2016-06-08 22:42:03 +03:00
/*
* This tells lvmetad the new seqno it should expect to receive
* the metadata for after the commit. The cached VG will be
* invalid in lvmetad until this command sends the new metadata
* after it's committed.
*/
if (!lvmetad_vg_update_pending(vg)) {
log_error("Failed to prepare new VG metadata in lvmetad cache.");
return 0;
}
return 1;
}
static int _vg_commit_mdas(struct volume_group *vg)
{
Before committing each mda, arrange mdas so ignored mdas get committed first. Arrange mdas so mdas that are to be ignored come first. This is an optimization that ensures consistency on disk for the longest period of time. This was noted by agk in review of the v4 patchset of pvchange-based mda balance. Note the following example for an explanation of the background: Assume the initial state on disk is as follows: PV0 (v1, non-ignored) PV1 (v1, non-ignored) PV2 (v1, non-ignored) PV3 (v1, non-ignored) If we did not sort the list, we would have a commit sequence something like this: PV0 (v2, non-ignored) PV1 (v2, ignored) PV2 (v2, ignored) PV3 (v2, non-ignored) After the commit of PV0's mdas, we'd have an on-disk state like this: PV0 (v2, non-ignored) PV1 (v1, non-ignored) PV2 (v1, non-ignored) PV3 (v1, non-ignored) This is an inconsistent state of the disk. If the machine fails, the next time it was brought back up, the auto-correct mechanism in vg_read would update the metadata on PV1-PV3. However, if possible we try to avoid inconsistent on-disk states. Clearly, because we did not sort, we have a greater chance of on-disk inconsistency - from the time the commit of PV0 is complete until the time PV3 is complete. We could improve the amount of time the on-disk state is consistent by simply sorting the commit order as follows: PV1 (v2, ignored) PV2 (v2, ignored) PV0 (v2, non-ignored) PV3 (v2, non-ignored) Thus, after the first PV is committed (in this case PV1), on-disk we would have: PV0 (v1, non-ignored) PV1 (v2, ignored) PV2 (v1, non-ignored) PV3 (v1, non-ignored) This is clearly a consistent state. PV1 will be read but the mda will be ignored. All other PVs contain v1 metadata, and no auto-correct will be required. In fact, if we commit all PVs with ignored mdas first, we'll only have an inconsistent state when we start writing non-ignored PVs, and thus the chances we'll get an inconsistent state on disk is much less with the sorted method. 
Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
2010-06-29 00:35:49 +04:00
struct metadata_area *mda, *tmda;
struct dm_list ignored;
int failed = 0;
int cache_updated = 0;
Before committing each mda, arrange mdas so ignored mdas get committed first. Arrange mdas so mdas that are to be ignored come first. This is an optimization that ensures consistency on disk for the longest period of time. This was noted by agk in review of the v4 patchset of pvchange-based mda balance. Note the following example for an explanation of the background: Assume the initial state on disk is as follows: PV0 (v1, non-ignored) PV1 (v1, non-ignored) PV2 (v1, non-ignored) PV3 (v1, non-ignored) If we did not sort the list, we would have a commit sequence something like this: PV0 (v2, non-ignored) PV1 (v2, ignored) PV2 (v2, ignored) PV3 (v2, non-ignored) After the commit of PV0's mdas, we'd have an on-disk state like this: PV0 (v2, non-ignored) PV1 (v1, non-ignored) PV2 (v1, non-ignored) PV3 (v1, non-ignored) This is an inconsistent state of the disk. If the machine fails, the next time it was brought back up, the auto-correct mechanism in vg_read would update the metadata on PV1-PV3. However, if possible we try to avoid inconsistent on-disk states. Clearly, because we did not sort, we have a greater chance of on-disk inconsistency - from the time the commit of PV0 is complete until the time PV3 is complete. We could improve the amount of time the on-disk state is consistent by simply sorting the commit order as follows: PV1 (v2, ignored) PV2 (v2, ignored) PV0 (v2, non-ignored) PV3 (v2, non-ignored) Thus, after the first PV is committed (in this case PV1), on-disk we would have: PV0 (v1, non-ignored) PV1 (v2, ignored) PV2 (v1, non-ignored) PV3 (v1, non-ignored) This is clearly a consistent state. PV1 will be read but the mda will be ignored. All other PVs contain v1 metadata, and no auto-correct will be required. In fact, if we commit all PVs with ignored mdas first, we'll only have an inconsistent state when we start writing non-ignored PVs, and thus the chances we'll get an inconsistent state on disk is much less with the sorted method. 
Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
2010-06-29 00:35:49 +04:00
/* Rearrange the metadata_areas_in_use so ignored mdas come first. */
dm_list_init(&ignored);
dm_list_iterate_items_safe(mda, tmda, &vg->fid->metadata_areas_in_use)
Before committing each mda, arrange mdas so ignored mdas get committed first. Arrange mdas so mdas that are to be ignored come first. This is an optimization that ensures consistency on disk for the longest period of time. This was noted by agk in review of the v4 patchset of pvchange-based mda balance. Note the following example for an explanation of the background: Assume the initial state on disk is as follows: PV0 (v1, non-ignored) PV1 (v1, non-ignored) PV2 (v1, non-ignored) PV3 (v1, non-ignored) If we did not sort the list, we would have a commit sequence something like this: PV0 (v2, non-ignored) PV1 (v2, ignored) PV2 (v2, ignored) PV3 (v2, non-ignored) After the commit of PV0's mdas, we'd have an on-disk state like this: PV0 (v2, non-ignored) PV1 (v1, non-ignored) PV2 (v1, non-ignored) PV3 (v1, non-ignored) This is an inconsistent state of the disk. If the machine fails, the next time it was brought back up, the auto-correct mechanism in vg_read would update the metadata on PV1-PV3. However, if possible we try to avoid inconsistent on-disk states. Clearly, because we did not sort, we have a greater chance of on-disk inconsistency - from the time the commit of PV0 is complete until the time PV3 is complete. We could improve the amount of time the on-disk state is consistent by simply sorting the commit order as follows: PV1 (v2, ignored) PV2 (v2, ignored) PV0 (v2, non-ignored) PV3 (v2, non-ignored) Thus, after the first PV is committed (in this case PV1), on-disk we would have: PV0 (v1, non-ignored) PV1 (v2, ignored) PV2 (v1, non-ignored) PV3 (v1, non-ignored) This is clearly a consistent state. PV1 will be read but the mda will be ignored. All other PVs contain v1 metadata, and no auto-correct will be required. In fact, if we commit all PVs with ignored mdas first, we'll only have an inconsistent state when we start writing non-ignored PVs, and thus the chances we'll get an inconsistent state on disk is much less with the sorted method. 
Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
2010-06-29 00:35:49 +04:00
if (mda_is_ignored(mda))
dm_list_move(&ignored, &mda->list);
dm_list_iterate_items_safe(mda, tmda, &ignored)
Before committing each mda, arrange mdas so ignored mdas get committed first. Arrange mdas so mdas that are to be ignored come first. This is an optimization that ensures consistency on disk for the longest period of time. This was noted by agk in review of the v4 patchset of pvchange-based mda balance. Note the following example for an explanation of the background: Assume the initial state on disk is as follows: PV0 (v1, non-ignored) PV1 (v1, non-ignored) PV2 (v1, non-ignored) PV3 (v1, non-ignored) If we did not sort the list, we would have a commit sequence something like this: PV0 (v2, non-ignored) PV1 (v2, ignored) PV2 (v2, ignored) PV3 (v2, non-ignored) After the commit of PV0's mdas, we'd have an on-disk state like this: PV0 (v2, non-ignored) PV1 (v1, non-ignored) PV2 (v1, non-ignored) PV3 (v1, non-ignored) This is an inconsistent state of the disk. If the machine fails, the next time it was brought back up, the auto-correct mechanism in vg_read would update the metadata on PV1-PV3. However, if possible we try to avoid inconsistent on-disk states. Clearly, because we did not sort, we have a greater chance of on-disk inconsistency - from the time the commit of PV0 is complete until the time PV3 is complete. We could improve the amount of time the on-disk state is consistent by simply sorting the commit order as follows: PV1 (v2, ignored) PV2 (v2, ignored) PV0 (v2, non-ignored) PV3 (v2, non-ignored) Thus, after the first PV is committed (in this case PV1), on-disk we would have: PV0 (v1, non-ignored) PV1 (v2, ignored) PV2 (v1, non-ignored) PV3 (v1, non-ignored) This is clearly a consistent state. PV1 will be read but the mda will be ignored. All other PVs contain v1 metadata, and no auto-correct will be required. In fact, if we commit all PVs with ignored mdas first, we'll only have an inconsistent state when we start writing non-ignored PVs, and thus the chances we'll get an inconsistent state on disk is much less with the sorted method. 
Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
2010-06-29 00:35:49 +04:00
dm_list_move(&vg->fid->metadata_areas_in_use, &mda->list);
/* Commit to each copy of the metadata area */
dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) {
if (mda->status & MDA_FAILED)
continue;
failed = 0;
2002-11-18 17:04:08 +03:00
if (mda->ops->vg_commit &&
!mda->ops->vg_commit(vg->fid, vg, mda)) {
stack;
failed = 1;
}
/* Update cache first time we succeed */
if (!failed && !cache_updated) {
2008-03-17 19:51:31 +03:00
lvmcache_update_vg(vg, 0);
// lvmetad_vg_commit(vg);
cache_updated = 1;
}
}
return cache_updated;
}
/*
 * Commit pending changes.
 *
 * The VG name must be locked.  On success the pre-committed VG copy
 * becomes the committed one; on failure the cached metadata is dropped.
 *
 * Returns non-zero if at least one mda commit succeeded, 0 otherwise.
 */
int vg_commit(struct volume_group *vg)
{
	int committed;

	if (!lvmcache_vgname_is_locked(vg->name)) {
		log_error(INTERNAL_ERROR "Attempt to write new VG metadata "
			  "without locking %s", vg->name);
		return 0;
	}

	committed = _vg_commit_mdas(vg);

	/* Issued whether or not the commit succeeded. */
	set_vg_notify(vg->cmd);

	if (!committed) {
		/* If update failed, remove any cached precommitted metadata. */
		if (!drop_cached_metadata(vg))
			log_error("Attempt to drop cached metadata failed "
				  "after commit for VG %s.", vg->name);

		return committed;
	}

	/* Instruct remote nodes to upgrade cached metadata. */
	if (!remote_commit_cached_metadata(vg))
		stack; /* FIXME: What should we do? */

	/*
	 * We need to clear old_name after a successful commit.
	 * The volume_group structure could be reused later.
	 */
	vg->old_name = NULL;

	/* The precommitted copy *is* the original now that it's committed. */
	release_vg(vg->vg_committed);
	vg->vg_committed = vg->vg_precommitted;
	vg->vg_precommitted = NULL;

	if (vg->cft_precommitted) {
		dm_config_destroy(vg->cft_precommitted);
		vg->cft_precommitted = NULL;
	}

	/* If at least one mda commit succeeded, it was committed */
	return committed;
}
/*
 * Don't commit any pending changes.
 *
 * Frees the LV locks that were newly set up for this update, discards
 * the pre-committed VG copy and config tree, asks every metadata area in
 * use to revert, and drops the local and remote cached metadata.
 * Failures of the individual steps are logged but not reported to the
 * caller.
 */
void vg_revert(struct volume_group *vg)
{
	struct metadata_area *area;
	struct lv_list *entry;
	struct logical_volume *lv;

	/*
	 * This will leave the cached copy in lvmetad INVALID (from
	 * lvmetad_vg_update_pending) and means the VG will be reread from disk
	 * to update the lvmetad copy, which is what we want to ensure that the
	 * cached copy is correct.
	 */
	vg->lvmetad_update_pending = 0;

	/* Give back the locks of LVs flagged with new_lock_args. */
	dm_list_iterate_items(entry, &vg->lvs) {
		lv = entry->lv;

		if (!lv->new_lock_args)
			continue;

		lockd_free_lv(vg->cmd, vg, lv->name, &lv->lvid.id[1], lv->lock_args);
		lv->new_lock_args = 0;
	}

	/* The pre-committed VG is no longer needed. */
	release_vg(vg->vg_precommitted);
	vg->vg_precommitted = NULL;

	if (vg->cft_precommitted) {
		dm_config_destroy(vg->cft_precommitted);
		vg->cft_precommitted = NULL;
	}

	dm_list_iterate_items(area, &vg->fid->metadata_areas_in_use) {
		if (!area->ops->vg_revert)
			continue;

		if (!area->ops->vg_revert(vg->fid, vg, area))
			stack;
	}

	if (!drop_cached_metadata(vg))
		log_error("Attempt to drop cached metadata failed "
			  "after reverted update for VG %s.", vg->name);

	if (!remote_revert_cached_metadata(vg))
		stack; /* FIXME: What should we do? */
}
/*
 * Per-mda callback: store 1 in the int that '_in_use' points to when the
 * mda is not ignored.  The int is never reset here, so the caller must
 * initialise it.
 *
 * Always returns 1.
 */
static int _check_mda_in_use(struct metadata_area *mda, void *_in_use)
{
	if (mda_is_ignored(mda))
		return 1;

	*(int *) _in_use = 1;

	return 1;
}
/*
 * State shared between _vg_read_orphans() and the per-PV callback
 * _vg_read_orphan_pv() it runs through lvmcache_foreach_pv().
 */
struct _vg_read_orphan_baton {
/* Command context; passed to pv_write() when a PV header is repaired. */
struct cmd_context *cmd;
/* Orphan VG the PVs read by the callback are added to. */
struct volume_group *vg;
/* Warning flags passed through to _pv_read(). */
uint32_t warn_flags;
/* Out: cleared to 0 when an inconsistent orphan PV is found or a repair fails. */
int consistent;
/* In: non-zero to repair inconsistent PV headers instead of only reporting them. */
int repair;
};
/*
 * Check the PV header extension of one orphan PV and, when b->repair is
 * set, repair it.
 *
 * An orphan PV that carries the PV_EXT_USED flag while it has metadata
 * areas is in an incorrect state: the flag needs to be dropped.
 *  - Without b->repair, b->consistent is cleared if at least one of the
 *    PV's mdas is in use (not ignored).
 *  - With b->repair, the PV is rewritten in that case; b->consistent is
 *    cleared and 0 is returned only if the rewrite fails.
 *
 * A PV that needs no check never touches b->consistent, so an
 * inconsistency recorded for a PV visited earlier is not lost.
 *
 * Returns 1 on success, 0 if a repair was attempted and failed.
 */
static int _check_or_repair_orphan_pv_ext(struct physical_volume *pv,
					  struct lvmcache_info *info,
					  struct _vg_read_orphan_baton *b)
{
	uint32_t ext_version = lvmcache_ext_version(info);
	uint32_t ext_flags = lvmcache_ext_flags(info);
	int at_least_one_mda_used = 0;

	/*
	 * Nothing to do if PV header extension < 2:
	 *  - version 0 is PV header without any extensions,
	 *  - version 1 has bootloader area support only and
	 *    we're not checking anything for that one here.
	 *
	 * b->consistent is left as it is: it describes all PVs visited so
	 * far and must not be reset by a PV that is simply not checked.
	 */
	if (ext_version < 2)
		return 1;

	/* Only a PV marked as used that also has mdas needs attention. */
	if (!(ext_flags & PV_EXT_USED) || !lvmcache_mda_count(info))
		return 1;

	lvmcache_foreach_mda(info, _check_mda_in_use, &at_least_one_mda_used);

	if (!b->repair) {
		/* mark as inconsistent only if there's at least 1 MDA used */
		if (at_least_one_mda_used)
			b->consistent = 0;

		return 1;
	}

	if (at_least_one_mda_used) {
		log_warn("WARNING: Repairing flag incorrectly marking "
			 "Physical Volume %s as used.", pv_dev_name(pv));

		/* pv_write will set correct ext_flags */
		if (!pv_write(b->cmd, pv, 0)) {
			b->consistent = 0;
			log_error("Failed to repair physical volume \"%s\".",
				  pv_dev_name(pv));
			return 0;
		}
	}

	b->consistent = 1;

	return 1;
}
static int _vg_read_orphan_pv(struct lvmcache_info *info, void *baton)
{
struct _vg_read_orphan_baton *b = baton;
struct physical_volume *pv = NULL;
struct pv_list *pvl;
if (!(pv = _pv_read(b->vg->cmd, b->vg->vgmem, dev_name(lvmcache_device(info)),
b->vg->fid, b->warn_flags, 0))) {
stack;
return 1;
}
if (!(pvl = dm_pool_zalloc(b->vg->vgmem, sizeof(*pvl)))) {
log_error("pv_list allocation failed");
free_pv_fid(pv);
return 0;
}
pvl->pv = pv;
add_pvl_to_vgs(b->vg, pvl);
if (!_check_or_repair_orphan_pv_ext(pv, info, baton)) {
stack;
return 0;
}
return 1;
}
2012-03-01 13:46:38 +04:00
/* Make orphan PVs look like a VG. */
2008-02-06 18:47:28 +03:00
static struct volume_group *_vg_read_orphans(struct cmd_context *cmd,
uint32_t warn_flags,
const char *orphan_vgname,
int *consistent)
2002-11-18 17:04:08 +03:00
{
const struct format_type *fmt;
struct lvmcache_vginfo *vginfo;
struct volume_group *vg = NULL;
struct _vg_read_orphan_baton baton;
struct pv_list *pvl, *tpvl;
struct pv_list head;
2002-11-18 17:04:08 +03:00
dm_list_init(&head.list);
lvmcache_label_scan(cmd);
lvmcache_seed_infos_from_lvmetad(cmd);
if (!(vginfo = lvmcache_vginfo_from_vgname(orphan_vgname, NULL)))
return_NULL;
if (!(fmt = lvmcache_fmt_from_vgname(cmd, orphan_vgname, NULL, 0)))
2008-01-30 16:19:47 +03:00
return_NULL;
2002-11-18 17:04:08 +03:00
vg = fmt->orphan_vg;
dm_list_iterate_items_safe(pvl, tpvl, &vg->pvs)
if (pvl->pv->status & UNLABELLED_PV )
dm_list_move(&head.list, &pvl->list);
else
pv_set_fid(pvl->pv, NULL);
dm_list_init(&vg->pvs);
vg->pv_count = 0;
vg->extent_count = 0;
vg->free_count = 0;
2008-04-08 02:12:37 +04:00
baton.cmd = cmd;
baton.warn_flags = warn_flags;
baton.vg = vg;
baton.consistent = 1;
baton.repair = *consistent;
while ((pvl = (struct pv_list *) dm_list_first(&head.list))) {
dm_list_del(&pvl->list);
add_pvl_to_vgs(vg, pvl);
vg->extent_count += pvl->pv->pe_count;
vg->free_count += pvl->pv->pe_count;
}
if (!lvmcache_foreach_pv(vginfo, _vg_read_orphan_pv, &baton))
2012-03-01 13:46:38 +04:00
return_NULL;
2002-11-18 17:04:08 +03:00
*consistent = baton.consistent;
2002-11-18 17:04:08 +03:00
return vg;
}
/*
 * Append to all_pvs a copy of every PV of vg whose device is not already
 * represented on all_pvs.  Copies are made with _copy_pvl() from pvmem.
 *
 * Returns 1 on success, 0 if a copy could not be allocated.
 */
static int _update_pv_list(struct dm_pool *pvmem, struct dm_list *all_pvs, struct volume_group *vg)
{
	struct pv_list *vg_pvl, *known, *copy;
	int already_listed;

	dm_list_iterate_items(vg_pvl, &vg->pvs) {
		already_listed = 0;

		dm_list_iterate_items(known, all_pvs) {
			if (vg_pvl->pv->dev == known->pv->dev) {
				already_listed = 1;
				break;
			}
		}

		if (already_listed)
			continue;

		/* PV is not on list so add it. */
		copy = _copy_pvl(pvmem, vg_pvl);
		if (!copy) {
			log_error("pv_list allocation for '%s' failed",
				  pv_dev_name(vg_pvl->pv));
			return 0;
		}

		dm_list_add(all_pvs, &copy->list);
	}

	return 1;
}
static void _free_pv_list(struct dm_list *all_pvs)
{
struct pv_list *pvl;
dm_list_iterate_items(pvl, all_pvs)
pvl->pv->fid->fmt->ops->destroy_instance(pvl->pv->fid);
}
/*
 * Destroy the format instance *fid points at, if any, and reset the
 * caller's pointer to NULL so it cannot be used again by mistake.
 */
static void _destroy_fid(struct format_instance **fid)
{
	struct format_instance *instance = *fid;

	if (!instance)
		return;

	instance->fmt->ops->destroy_instance(instance);
	*fid = NULL;
}
int vg_missing_pv_count(const struct volume_group *vg)
{
int ret = 0;
struct pv_list *pvl;
dm_list_iterate_items(pvl, &vg->pvs) {
2010-03-16 17:37:38 +03:00
if (is_missing_pv(pvl->pv))
++ ret;
}
return ret;
}
static int _check_reappeared_pv(struct volume_group *correct_vg,
struct physical_volume *pv, int act)
{
struct pv_list *pvl;
int rv = 0;
/*
* Skip these checks in case the tool is going to deal with missing
* PVs, especially since the resulting messages can be pretty
* confusing.
*/
if (correct_vg->cmd->handles_missing_pvs)
return rv;
dm_list_iterate_items(pvl, &correct_vg->pvs)
2010-03-16 17:37:38 +03:00
if (pv->dev == pvl->pv->dev && is_missing_pv(pvl->pv)) {
if (act)
log_warn("WARNING: Missing device %s reappeared, updating "
"metadata for VG %s to version %u.",
pv_dev_name(pvl->pv), pv_vg_name(pvl->pv),
correct_vg->seqno);
if (pvl->pv->pe_alloc_count == 0) {
if (act) {
pv->status &= ~MISSING_PV;
pvl->pv->status &= ~MISSING_PV;
}
++ rv;
} else if (act)
log_warn("WARNING: Device %s still marked missing because of allocated data "
"on it, remove volumes and consider vgreduce --removemissing.",
pv_dev_name(pvl->pv));
}
return rv;
}
2016-02-16 15:44:48 +03:00
/*
 * A VG is foreign when the command has a system ID set and the VG's
 * system ID differs from it.  Returns 1 if foreign, 0 otherwise.
 */
static int _is_foreign_vg(struct volume_group *vg)
{
	if (!vg->cmd->system_id)
		return 0;

	return strcmp(vg->system_id, vg->cmd->system_id) != 0;
}
/*
 * Rewrite and commit the metadata of vg so that all copies carry the
 * version currently held in memory.
 *
 * Repair is skipped (returning 0) when duplicate PVs were found, when the
 * VG is foreign, or when it uses a lockd lock type.
 * cmd->handles_missing_pvs is forced on for the duration of vg_write() and
 * restored afterwards.
 *
 * Returns 1 if the metadata was written and committed, 0 otherwise.
 */
static int _repair_inconsistent_vg(struct volume_group *vg)
{
	unsigned saved_handles_missing_pvs = vg->cmd->handles_missing_pvs;
	int written;

	if (lvmcache_found_duplicate_pvs()) {
		log_debug_metadata("Skip metadata repair with duplicates.");
		return 0;
	}

	/* Cannot write foreign VGs, the owner will repair it. */
	if (_is_foreign_vg(vg)) {
		log_verbose("Skip metadata repair for foreign VG.");
		return 0;
	}

	/* FIXME: do this at higher level where lvmlockd lock can be changed. */
	if (is_lockd_type(vg->lock_type)) {
		log_verbose("Skip metadata repair for shared VG.");
		return 0;
	}

	log_warn("WARNING: Inconsistent metadata found for VG %s - updating to use version %u", vg->name, vg->seqno);

	/* Write with missing-PV handling enabled, then put the flag back. */
	vg->cmd->handles_missing_pvs = 1;
	written = vg_write(vg);
	if (!written)
		log_error("Automatic metadata correction failed");
	vg->cmd->handles_missing_pvs = saved_handles_missing_pvs;

	if (!written)
		return 0;

	if (!vg_commit(vg)) {
		log_error("Automatic metadata correction commit failed");
		return 0;
	}

	return 1;
}
static int _wipe_outdated_pvs(struct cmd_context *cmd, struct volume_group *vg, struct dm_list *to_check)
{
struct pv_list *pvl, *pvl2;
char uuid[64] __attribute__((aligned(8)));
if (lvmcache_found_duplicate_pvs()) {
log_debug_metadata("Skip wiping outdated PVs with duplicates.");
return 0;
}
/*
* Cannot write foreign VGs, the owner will repair it.
* Also, if another host is updating its VG, we may read
* the PVs while some are written but not others, making
* some PVs look outdated to us just because we're reading
* the VG while it's only partially written out.
*/
2016-02-16 15:44:48 +03:00
if (_is_foreign_vg(vg)) {
log_debug_metadata("Skip wiping outdated PVs for foreign VG.");
return 0;
}
/*
* FIXME: do this at higher level where lvmlockd lock can be changed.
* Also if we're reading the VG with the --shared option (not using
* lvmlockd), we can see a VG while it's being written by another
* host, same as the foreign VG case.
*/
if (is_lockd_type(vg->lock_type)) {
log_debug_metadata("Skip wiping outdated PVs for shared VG.");
return 0;
}
dm_list_iterate_items(pvl, to_check) {
dm_list_iterate_items(pvl2, &vg->pvs) {
if (pvl->pv->dev == pvl2->pv->dev)
goto next_pv;
}
if (!id_write_format(&pvl->pv->id, uuid, sizeof(uuid)))
return_0;
log_warn("WARNING: Removing PV %s (%s) that no longer belongs to VG %s",
pv_dev_name(pvl->pv), uuid, vg->name);
if (!pv_write_orphan(cmd, pvl->pv))
return_0;
/* Refresh metadata after orphan write */
if (!drop_cached_metadata(vg)) {
log_error("Unable to drop cached metadata for VG %s while wiping outdated PVs.", vg->name);
return 0;
}
next_pv:
;
}
return 1;
}
/*
 * Verify that every present PV of vg with PV header extension version >= 2
 * has the PV_EXT_USED flag set, optionally rewriting PVs that do not.
 *
 *  repair           - zero: only detect; stop at the first PV lacking the
 *                     flag.  Non-zero: rewrite such PVs, unless the VG is
 *                     foreign or uses a lockd lock type.
 *  inconsistent_pvs - set to 1 if a PV lacking the flag was found and was
 *                     not (or could not be) repaired, otherwise 0.
 *
 * If at least one PV was rewritten, the VG metadata is repaired as well.
 *
 * Returns 1 on success; 0 if cached info for a PV is missing, a PV write
 * failed, or the follow-up VG metadata repair failed.
 */
static int _check_or_repair_pv_ext(struct cmd_context *cmd,
				   struct volume_group *vg,
				   int repair, int *inconsistent_pvs)
{
	struct lvmcache_info *info;
	uint32_t ext_version, ext_flags;
	struct pv_list *pvl;
	unsigned repaired = 0;
	int ok = 0;

	*inconsistent_pvs = 0;

	dm_list_iterate_items(pvl, &vg->pvs) {
		/* Missing PV - nothing to do. */
		if (is_missing_pv(pvl->pv))
			continue;

		info = lvmcache_info_from_pvid(pvl->pv->dev->pvid, pvl->pv->dev, 0);
		if (!info) {
			log_error("Failed to find cached info for PV %s.", pv_dev_name(pvl->pv));
			goto out;
		}

		ext_version = lvmcache_ext_version(info);
		if (ext_version < 2)
			continue;

		/* Flag already set: this PV is fine. */
		ext_flags = lvmcache_ext_flags(info);
		if (ext_flags & PV_EXT_USED)
			continue;

		if (!repair) {
			*inconsistent_pvs = 1;
			/* we're not repairing now, so no need to
			 * check further PVs - inconsistent_pvs is already
			 * set and that will trigger the repair next time */
			return 1;
		}

		if (_is_foreign_vg(vg)) {
			log_verbose("Skip repair of PV %s that is in foreign "
				    "VG %s but not marked as used.",
				    pv_dev_name(pvl->pv), vg->name);
			*inconsistent_pvs = 1;
			continue;
		}

		if (is_lockd_type(vg->lock_type)) {
			/*
			 * FIXME: decide how to handle repair for shared VGs.
			 */
			log_warn("Skip repair of PV %s that is in shared "
				 "VG %s but not marked as used.",
				 pv_dev_name(pvl->pv), vg->name);
			*inconsistent_pvs = 1;
			continue;
		}

		log_warn("WARNING: Repairing Physical Volume %s that is "
			 "in Volume Group %s but not marked as used.",
			 pv_dev_name(pvl->pv), vg->name);

		/* pv write will set correct ext_flags */
		if (!pv_write(cmd, pvl->pv, 1)) {
			*inconsistent_pvs = 1;
			log_error("Failed to repair physical volume \"%s\".",
				  pv_dev_name(pvl->pv));
			goto out;
		}

		repaired++;
	}

	ok = 1;
out:
	/* PVs rewritten so far still require the VG metadata to be rewritten. */
	if ((repaired > 0) && !_repair_inconsistent_vg(vg))
		return_0;

	return ok;
}
/*
 * Read the metadata of the VG identified by vgname and/or vgid.
 *
 * Caller sets consistent to 1 if it's safe for vg_read_internal to correct
 * inconsistent metadata on disk (i.e. the VG write lock is held).
 * This guarantees only consistent metadata is returned.
 * If consistent is 0, caller must check whether consistent == 1 on return
 * and take appropriate action if it isn't (e.g. abort; get write lock
 * and call vg_read_internal again).
 *
 * If precommitted is set, use precommitted metadata if present.
 *
 * Either of vgname or vgid may be NULL.
 *
 * Note: vginfo structs must not be held or used as parameters
 * across the call to this function.
 *
 * Returns the VG on success (ownership passes to the caller, who is
 * presumably expected to release_vg() it - confirm against callers),
 * or NULL on failure.
 */
static struct volume_group *_vg_read(struct cmd_context *cmd,
const char *vgname,
const char *vgid,
uint32_t warn_flags,
2008-03-17 19:51:31 +03:00
int *consistent, unsigned precommitted)
{
struct format_instance *fid = NULL;
struct format_instance_ctx fic;
const struct format_type *fmt;
struct volume_group *vg, *correct_vg = NULL;
2002-11-18 17:04:08 +03:00
struct metadata_area *mda;
struct lvmcache_info *info;
int inconsistent = 0;
int inconsistent_vgid = 0;
int inconsistent_pvs = 0;
int inconsistent_mdas = 0;
int inconsistent_mda_count = 0;
/*
 * *consistent is overwritten further down, so remember the caller's
 * original "repair allowed" setting for the two optional update steps.
 */
int strip_historical_lvs = *consistent;
int update_old_pv_ext = *consistent;
2008-03-17 19:51:31 +03:00
unsigned use_precommitted = precommitted;
struct dm_list *pvids;
struct pv_list *pvl;
struct dm_list all_pvs;
unsigned seqno = 0;
int reappeared = 0;
struct cached_vg_fmtdata *vg_fmtdata = NULL; /* Additional format-specific data about the vg */
unsigned use_previous_vg;
log_very_verbose("Reading VG %s %.32s", vgname ?: "<no name>", vgid ?: "<no vgid>");
/* Orphan VGs are assembled by _vg_read_orphans(); precommitted metadata is rejected for them. */
if (is_orphan_vg(vgname)) {
if (use_precommitted) {
log_error(INTERNAL_ERROR "vg_read_internal requires vgname "
"with pre-commit.");
return NULL;
}
return _vg_read_orphans(cmd, warn_flags, vgname, consistent);
2002-11-18 17:04:08 +03:00
}
/*
 * lvmetad in use and no precommitted metadata wanted: take the VG from
 * lvmcache_get_vg() and return from inside this branch in every case,
 * without reading any metadata area.
 */
if (lvmetad_used() && !use_precommitted) {
if ((correct_vg = lvmcache_get_vg(cmd, vgname, vgid, precommitted))) {
dm_list_iterate_items(pvl, &correct_vg->pvs)
if (pvl->pv->dev)
reappeared += _check_reappeared_pv(correct_vg, pvl->pv, *consistent);
/*
 * If repair is allowed and a missing PV reappeared, *consistent becomes
 * the result of the repair; otherwise the VG is reported consistent
 * only when nothing reappeared.
 */
if (reappeared && *consistent)
*consistent = _repair_inconsistent_vg(correct_vg);
else
*consistent = !reappeared;
if (_wipe_outdated_pvs(cmd, correct_vg, &correct_vg->pvs_outdated)) {
/* clear the list */
dm_list_init(&correct_vg->pvs_outdated);
lvmetad_vg_clear_outdated_pvs(correct_vg);
}
}
if (correct_vg) {
if (update_old_pv_ext && !_vg_update_old_pv_ext_if_needed(correct_vg)) {
release_vg(correct_vg);
return_NULL;
}
if (strip_historical_lvs && !vg_strip_outdated_historical_lvs(correct_vg)) {
release_vg(correct_vg);
return_NULL;
}
}
return correct_vg;
}
/*
 * If cached metadata was inconsistent and *consistent is set
 * then repair it now. Otherwise just return it.
 * Also return if use_precommitted is set due to the FIXME in
 * the missing PV logic below.
 */
if ((correct_vg = lvmcache_get_vg(cmd, vgname, vgid, precommitted)) &&
(use_precommitted || !*consistent)) {
*consistent = 1;
return correct_vg;
} else {
/*
 * Drop the cached copy (if any) and read the metadata areas instead.
 * The cached seqno is remembered, but within this function it is only
 * referenced by the commented-out assignment further down.
 */
if (correct_vg && correct_vg->seqno > seqno)
seqno = correct_vg->seqno;
release_vg(correct_vg);
correct_vg = NULL;
}
2002-11-18 17:04:08 +03:00
/* Find the vgname in the cache */
/* If it's not there we must do full scan to be completely sure */
if (!(fmt = lvmcache_fmt_from_vgname(cmd, vgname, vgid, 1))) {
lvmcache_label_scan(cmd);
if (!(fmt = lvmcache_fmt_from_vgname(cmd, vgname, vgid, 1))) {
/* Independent MDAs aren't supported under low memory */
if (!cmd->independent_metadata_areas && critical_section())
2008-01-30 16:19:47 +03:00
return_NULL;
/* Last attempt: force a fresh label scan before giving up. */
lvmcache_force_next_label_scan();
lvmcache_label_scan(cmd);
if (!(fmt = lvmcache_fmt_from_vgname(cmd, vgname, vgid, 0)))
2008-01-30 16:19:47 +03:00
return_NULL;
}
}
2008-06-06 15:12:50 +04:00
/* Now determine the correct vgname if none was supplied */
if (!vgname && !(vgname = lvmcache_vgname_from_vgid(cmd->mem, vgid))) {
log_debug_metadata("Cache did not find VG name from vgid %.32s", vgid);
2008-06-06 15:12:50 +04:00
return_NULL;
}
/* Determine the correct vgid if none was supplied */
if (!vgid && !(vgid = lvmcache_vgid_from_vgname(cmd, vgname))) {
log_debug_metadata("Cache did not find VG vgid from name %s", vgname);
return_NULL;
}
2008-06-06 15:12:50 +04:00
/* Formats without FMT_PRECOMMIT cannot provide precommitted metadata. */
if (use_precommitted && !(fmt->features & FMT_PRECOMMIT))
use_precommitted = 0;
2002-11-18 17:04:08 +03:00
/* create format instance with appropriate metadata area */
2012-02-13 03:01:19 +04:00
fic.type = FMT_INSTANCE_MDAS | FMT_INSTANCE_AUX_MDAS;
fic.context.vg_ref.vg_name = vgname;
fic.context.vg_ref.vg_id = vgid;
if (!(fid = fmt->ops->create_instance(fmt, &fic))) {
log_error("Failed to create format instance");
return NULL;
}
/* Store pvids for later so we can check if any are missing */
if (!(pvids = lvmcache_get_pvids(cmd, vgname, vgid))) {
_destroy_fid(&fid);
return_NULL;
}
/*
 * We use the fid globally here so prevent the release_vg
 * call to destroy the fid - we may want to reuse it!
 */
fid->ref_count++;
/* Ensure contents of all metadata areas match - else do recovery */
inconsistent_mda_count=0;
dm_list_iterate_items(mda, &fid->metadata_areas_in_use) {
use_previous_vg = 0;
/*
 * Read this mda (precommitted or committed variant).  A NULL result
 * counts as a failed read unless the reader set use_previous_vg.
 */
if ((use_precommitted &&
!(vg = mda->ops->vg_read_precommit(fid, vgname, mda, &vg_fmtdata, &use_previous_vg)) && !use_previous_vg) ||
(!use_precommitted &&
!(vg = mda->ops->vg_read(fid, vgname, mda, &vg_fmtdata, &use_previous_vg, 0)) && !use_previous_vg)) {
2002-11-18 17:04:08 +03:00
inconsistent = 1;
vg_fmtdata = NULL;
continue;
}
/* Use previous VG because checksum matches */
if (!vg) {
vg = correct_vg;
2002-11-18 17:04:08 +03:00
continue;
}
/* First copy successfully read becomes the reference copy. */
if (!correct_vg) {
correct_vg = vg;
continue;
}
2002-11-18 17:04:08 +03:00
/* FIXME Also ensure contents same - checksum compare? */
if (correct_vg->seqno != vg->seqno) {
if (cmd->metadata_read_only)
log_very_verbose("Not repairing VG %s metadata seqno (%d != %d) "
"as global/metadata_read_only is set.",
vgname, vg->seqno, correct_vg->seqno);
else
inconsistent = 1;
/*
 * Keep whichever copy has the higher seqno as correct_vg; if the
 * copy just read is the older one, flag the current mda.
 */
if (vg->seqno > correct_vg->seqno) {
release_vg(correct_vg);
correct_vg = vg;
} else {
mda->status |= MDA_INCONSISTENT;
++inconsistent_mda_count;
}
}
/* Copies that did not become the reference copy are released. */
if (vg != correct_vg) {
release_vg(vg);
vg_fmtdata = NULL;
}
}
fid->ref_count--;
/* Ensure every PV in the VG was in the cache */
if (correct_vg) {
/*
 * Update the seqno from the cache, for the benefit of
 * retro-style metadata formats like LVM1.
 */
// correct_vg->seqno = seqno > correct_vg->seqno ? seqno : correct_vg->seqno;
/*
 * If the VG has PVs without mdas, or ignored mdas, they may
 * still be orphans in the cache: update the cache state here,
 * and update the metadata lists in the vg.
 */
if (!inconsistent &&
dm_list_size(&correct_vg->pvs) > dm_list_size(pvids)) {
dm_list_iterate_items(pvl, &correct_vg->pvs) {
if (!pvl->pv->dev) {
inconsistent_pvs = 1;
break;
}
if (str_list_match_item(pvids, pvl->pv->dev->pvid))
continue;
/*
 * PV not marked as belonging to this VG in cache.
 * Check it's an orphan without metadata area
 * not ignored.
 */
if (!(info = lvmcache_info_from_pvid(pvl->pv->dev->pvid, pvl->pv->dev, 1)) ||
!lvmcache_is_orphan(info)) {
inconsistent_pvs = 1;
break;
}
if (lvmcache_mda_count(info)) {
if (!lvmcache_fid_add_mdas_pv(info, fid)) {
release_vg(correct_vg);
return_NULL;
}
log_debug_metadata("Empty mda found for VG %s.", vgname);
if (inconsistent_mdas)
continue;
/*
 * If any newly-added mdas are in-use then their
 * metadata needs updating.
 */
lvmcache_foreach_mda(info, _check_mda_in_use,
&inconsistent_mdas);
}
}
/* If the check passed, let's update VG and recalculate pvids */
if (!inconsistent_pvs) {
log_debug_metadata("Updating cache for PVs without mdas "
"in VG %s.", vgname);
/*
 * If there is no precommitted metadata, committed metadata
 * is read and stored in the cache even if use_precommitted is set
 */
lvmcache_update_vg(correct_vg, correct_vg->status & PRECOMMITTED);
if (!(pvids = lvmcache_get_pvids(cmd, vgname, vgid))) {
release_vg(correct_vg);
return_NULL;
}
}
}
/*
 * Extra fid reference held while correct_vg may be released below -
 * presumably for the same reason as the earlier ref_count++ (keep the
 * fid alive across release_vg); confirm.
 */
fid->ref_count++;
/*
 * The VG's PV count must equal the cached pvids plus the PVs flagged
 * missing; on mismatch the VG is discarded for a full rescan, except
 * inside a critical section where it is only marked inconsistent.
 */
if (dm_list_size(&correct_vg->pvs) !=
dm_list_size(pvids) + vg_missing_pv_count(correct_vg)) {
log_debug_metadata("Cached VG %s had incorrect PV list",
vgname);
if (critical_section())
inconsistent = 1;
else {
release_vg(correct_vg);
correct_vg = NULL;
}
} else dm_list_iterate_items(pvl, &correct_vg->pvs) {
2010-03-16 17:37:38 +03:00
if (is_missing_pv(pvl->pv))
continue;
if (!str_list_match_item(pvids, pvl->pv->dev->pvid)) {
log_debug_metadata("Cached VG %s had incorrect PV list",
vgname);
release_vg(correct_vg);
correct_vg = NULL;
break;
}
}
/* Newly added in-use mdas also force the full rescan below. */
if (correct_vg && inconsistent_mdas) {
release_vg(correct_vg);
correct_vg = NULL;
}
fid->ref_count--;
}
dm_list_init(&all_pvs);
/* Failed to find VG where we expected it - full scan and retry */
if (!correct_vg) {
/*
 * Free outstanding format instance that remained unassigned
 * from previous step where we tried to get the "correct_vg",
 * but we failed to do so (so there's a dangling fid now).
 */
_destroy_fid(&fid);
vg_fmtdata = NULL;
inconsistent = 0;
/* Independent MDAs aren't supported under low memory */
if (!cmd->independent_metadata_areas && critical_section())
2008-01-30 16:19:47 +03:00
return_NULL;
lvmcache_force_next_label_scan();
lvmcache_label_scan(cmd);
if (!(fmt = lvmcache_fmt_from_vgname(cmd, vgname, vgid, 0)))
2008-01-30 16:19:47 +03:00
return_NULL;
if (precommitted && !(fmt->features & FMT_PRECOMMIT))
use_precommitted = 0;
/* create format instance with appropriate metadata area */
2012-02-13 03:01:19 +04:00
fic.type = FMT_INSTANCE_MDAS | FMT_INSTANCE_AUX_MDAS;
fic.context.vg_ref.vg_name = vgname;
fic.context.vg_ref.vg_id = vgid;
if (!(fid = fmt->ops->create_instance(fmt, &fic))) {
log_error("Failed to create format instance");
return NULL;
}
/*
 * We use the fid globally here so prevent the release_vg
 * call to destroy the fid - we may want to reuse it!
 */
fid->ref_count++;
/* Ensure contents of all metadata areas match - else recover */
/*
 * Same per-mda comparison as the first pass, but this time the PVs of
 * the first copy and of every copy whose seqno differs from the
 * reference are collected on all_pvs, and differing VG ids are
 * detected as well.
 */
inconsistent_mda_count=0;
dm_list_iterate_items(mda, &fid->metadata_areas_in_use) {
use_previous_vg = 0;
if ((use_precommitted &&
!(vg = mda->ops->vg_read_precommit(fid, vgname, mda, &vg_fmtdata, &use_previous_vg)) && !use_previous_vg) ||
(!use_precommitted &&
!(vg = mda->ops->vg_read(fid, vgname, mda, &vg_fmtdata, &use_previous_vg, 0)) && !use_previous_vg)) {
inconsistent = 1;
vg_fmtdata = NULL;
continue;
}
/* Use previous VG because checksum matches */
if (!vg) {
vg = correct_vg;
continue;
}
if (!correct_vg) {
correct_vg = vg;
if (!_update_pv_list(cmd->mem, &all_pvs, correct_vg)) {
_free_pv_list(&all_pvs);
fid->ref_count--;
release_vg(vg);
return_NULL;
}
continue;
}
if (!id_equal(&vg->id, &correct_vg->id)) {
inconsistent = 1;
inconsistent_vgid = 1;
}
/* FIXME Also ensure contents same - checksums same? */
if (correct_vg->seqno != vg->seqno) {
/* Ignore inconsistent seqno if told to skip repair logic */
if (cmd->metadata_read_only)
log_very_verbose("Not repairing VG %s metadata seqno (%d != %d) "
"as global/metadata_read_only is set.",
vgname, vg->seqno, correct_vg->seqno);
else
inconsistent = 1;
if (!_update_pv_list(cmd->mem, &all_pvs, vg)) {
_free_pv_list(&all_pvs);
fid->ref_count--;
release_vg(vg);
release_vg(correct_vg);
return_NULL;
}
if (vg->seqno > correct_vg->seqno) {
release_vg(correct_vg);
correct_vg = vg;
} else {
mda->status |= MDA_INCONSISTENT;
++inconsistent_mda_count;
}
}
if (vg != correct_vg) {
release_vg(vg);
vg_fmtdata = NULL;
}
}
fid->ref_count--;
/* Give up looking */
if (!correct_vg) {
_free_pv_list(&all_pvs);
_destroy_fid(&fid);
2008-01-30 16:19:47 +03:00
return_NULL;
}
}
/*
 * If there is no precommitted metadata, committed metadata
 * is read and stored in the cache even if use_precommitted is set
 */
lvmcache_update_vg(correct_vg, (correct_vg->status & PRECOMMITTED));
2002-11-18 17:04:08 +03:00
/* The metadata copies disagreed (or a read failed): report or repair. */
if (inconsistent) {
/* FIXME Test should be if we're *using* precommitted metadata not if we were searching for it */
if (use_precommitted) {
log_error("Inconsistent pre-commit metadata copies "
"for volume group %s", vgname);
/*
 * Check whether all of the inconsistent MDAs were on
 * MISSING PVs -- in that case, we should be safe.
 */
dm_list_iterate_items(mda, &fid->metadata_areas_in_use) {
if (mda->status & MDA_INCONSISTENT) {
log_debug_metadata("Checking inconsistent MDA: %s", dev_name(mda_get_device(mda)));
dm_list_iterate_items(pvl, &correct_vg->pvs) {
if (mda_get_device(mda) == pvl->pv->dev &&
(pvl->pv->status & MISSING_PV))
--inconsistent_mda_count;
}
}
}
if (inconsistent_mda_count < 0)
log_error(INTERNAL_ERROR "Too many inconsistent MDAs.");
/* Every inconsistent mda was on a missing PV: return the VG, flagged not consistent. */
if (!inconsistent_mda_count) {
*consistent = 0;
_free_pv_list(&all_pvs);
return correct_vg;
}
_free_pv_list(&all_pvs);
release_vg(correct_vg);
return NULL;
}
/* Repair not permitted by the caller: return the VG with *consistent left at 0. */
if (!*consistent) {
_free_pv_list(&all_pvs);
2002-11-18 17:04:08 +03:00
return correct_vg;
}
2002-11-18 17:04:08 +03:00
/* Don't touch if vgids didn't match */
if (inconsistent_vgid) {
log_warn("WARNING: Inconsistent metadata UUIDs found for "
"volume group %s.", vgname);
*consistent = 0;
_free_pv_list(&all_pvs);
return correct_vg;
}
/*
 * If PV is marked missing but we found it,
 * update metadata and remove MISSING flag
 */
dm_list_iterate_items(pvl, &all_pvs)
_check_reappeared_pv(correct_vg, pvl->pv, 1);
if (!_repair_inconsistent_vg(correct_vg)) {
_free_pv_list(&all_pvs);
release_vg(correct_vg);
2005-01-17 21:24:28 +03:00
return NULL;
}
if (!_wipe_outdated_pvs(cmd, correct_vg, &all_pvs)) {
_free_pv_list(&all_pvs);
release_vg(correct_vg);
return_NULL;
}
}
_free_pv_list(&all_pvs);
/* With PVs missing, mark the affected LVs as partial. */
if (vg_missing_pv_count(correct_vg)) {
log_verbose("There are %d physical volumes missing.",
vg_missing_pv_count(correct_vg));
vg_mark_partial_lvs(correct_vg, 1);
}
2003-05-06 16:06:02 +04:00
/* A VG still flagged PVMOVE outside of pvmove mode is refused. */
if ((correct_vg->status & PVMOVE) && !pvmove_mode()) {
log_error("Interrupted pvmove detected in volume group %s.",
correct_vg->name);
log_print("Please restore the metadata by running vgcfgrestore.");
release_vg(correct_vg);
2003-04-30 19:23:43 +04:00
return NULL;
}
/* We have the VG now finally, check if PV ext info is in sync with VG metadata. */
if (!_check_or_repair_pv_ext(cmd, correct_vg, *consistent, &inconsistent_pvs)) {
release_vg(correct_vg);
return_NULL;
}
*consistent = !inconsistent_pvs;
/*
 * The optional update steps run only for a consistent VG and only if the
 * caller originally allowed repair (values captured at function entry).
 */
if (correct_vg && *consistent) {
if (update_old_pv_ext && !_vg_update_old_pv_ext_if_needed(correct_vg)) {
release_vg(correct_vg);
return_NULL;
}
if (strip_historical_lvs && !vg_strip_outdated_historical_lvs(correct_vg)) {
release_vg(correct_vg);
return_NULL;
}
}
return correct_vg;
}
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
#define DEV_LIST_DELIM ", "
static int _check_devs_used_correspond_with_lv(struct dm_pool *mem, struct dm_list *list, struct logical_volume *lv)
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
{
struct device_list *dl;
int found_inconsistent = 0;
struct device *dev;
struct lv_segment *seg;
uint32_t s;
metadata: log warning instead of error if device not found while checking used and assumed devs When checking assumed PVs against real devices used for LVs and if there's no device assigned for an assumed PV (e.g. due to filters), do log_warn instead of log_error and continue checking LV segments and associated assumed PVs further, just like we do log_warn elsewhere in this situation. This way user will see the warning for each LV which couldn't be checked completely against real PVs used. Before, we logged only the very first occurence of missing device for an LV in a VG and we returned from the function doing this check for all the LVs in VG immediately which may be a bit misleading because it didn't tell user about all the other LVs and whether they could be checked or not. For example, we have this setup: [0] fedora/~ # pvs PV VG Fmt Attr PSize PFree /dev/sda lvm2 --- 128.00m 128.00m /dev/vda2 fedora lvm2 a-- 19.49g 0 [0] fedora/~ # lvs -o+devices LV VG Attr LSize Devices root fedora -wi-ao---- 19.00g /dev/vda2(0) swap fedora -wi-ao---- 500.00m /dev/vda2(4864) Before this patch (only the very first LV in a VG is logged to have a problem while checking used and assumed devices): [0] fedora/~ # pvs --config 'devices/filter=["a|/dev/sda|", "r|.*|"]' WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. Couldn't find device for segment belonging to fedora/root while checking used and assumed devices. PV VG Fmt Attr PSize PFree /dev/sda lvm2 --- 128.00m 128.00m [unknown] fedora lvm2 a-m 19.49g 0 With this patch applied (all LVs where we hit problem while checking used and assumed devices are logged and it's warning, not error): [0] fedora/~ # pvs --config 'devices/filter=["a|/dev/sda|", "r|.*|"]' WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. 
WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. WARNING: Couldn't find device for segment belonging to fedora/root while checking used and assumed devices. WARNING: Couldn't find device for segment belonging to fedora/swap while checking used and assumed devices. PV VG Fmt Attr PSize PFree /dev/sda lvm2 --- 128.00m 128.00m [unknown] fedora lvm2 a-m 19.49g 0
2016-04-25 12:15:44 +03:00
int warned_about_no_dev = 0;
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds because this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
char *used_devnames = NULL, *assumed_devnames = NULL;
if (!(list = dev_cache_get_dev_list_for_lvid(lv->lvid.s + ID_LEN)))
return 1;
dm_list_iterate_items(dl, list) {
dev = dl->dev;
if (!(dev->flags & DEV_ASSUMED_FOR_LV)) {
if (!found_inconsistent) {
if (!dm_pool_begin_object(mem, 32))
return_0;
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
found_inconsistent = 1;
} else {
if (!dm_pool_grow_object(mem, DEV_LIST_DELIM, sizeof(DEV_LIST_DELIM) - 1))
return_0;
}
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
if (!dm_pool_grow_object(mem, dev_name(dev), 0))
return_0;
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
}
}
if (!found_inconsistent)
return 1;
if (!dm_pool_grow_object(mem, "\0", 1))
return_0;
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
used_devnames = dm_pool_end_object(mem);
found_inconsistent = 0;
dm_list_iterate_items(seg, &lv->segments) {
for (s = 0; s < seg->area_count; s++) {
if (seg_type(seg, s) == AREA_PV) {
if (!(dev = seg_dev(seg, s))) {
metadata: log warning instead of error if device not found while checking used and assumed devs When checking assumed PVs against real devices used for LVs and if there's no device assigned for an assumed PV (e.g. due to filters), do log_warn instead of log_error and continue checking LV segments and associated assumed PVs further, just like we do log_warn elsewhere in this situation. This way user will see the warning for each LV which couldn't be checked completely against real PVs used. Before, we logged only the very first occurence of missing device for an LV in a VG and we returned from the function doing this check for all the LVs in VG immediately which may be a bit misleading because it didn't tell user about all the other LVs and whether they could be checked or not. For example, we have this setup: [0] fedora/~ # pvs PV VG Fmt Attr PSize PFree /dev/sda lvm2 --- 128.00m 128.00m /dev/vda2 fedora lvm2 a-- 19.49g 0 [0] fedora/~ # lvs -o+devices LV VG Attr LSize Devices root fedora -wi-ao---- 19.00g /dev/vda2(0) swap fedora -wi-ao---- 500.00m /dev/vda2(4864) Before this patch (only the very first LV in a VG is logged to have a problem while checking used and assumed devices): [0] fedora/~ # pvs --config 'devices/filter=["a|/dev/sda|", "r|.*|"]' WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. Couldn't find device for segment belonging to fedora/root while checking used and assumed devices. PV VG Fmt Attr PSize PFree /dev/sda lvm2 --- 128.00m 128.00m [unknown] fedora lvm2 a-m 19.49g 0 With this patch applied (all LVs where we hit problem while checking used and assumed devices are logged and it's warning, not error): [0] fedora/~ # pvs --config 'devices/filter=["a|/dev/sda|", "r|.*|"]' WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. 
WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. WARNING: Couldn't find device for segment belonging to fedora/root while checking used and assumed devices. WARNING: Couldn't find device for segment belonging to fedora/swap while checking used and assumed devices. PV VG Fmt Attr PSize PFree /dev/sda lvm2 --- 128.00m 128.00m [unknown] fedora lvm2 a-m 19.49g 0
2016-04-25 12:15:44 +03:00
if (!warned_about_no_dev) {
log_warn("WARNING: Couldn't find all devices for LV %s "
"while checking used and assumed devices.",
metadata: log warning instead of error if device not found while checking used and assumed devs When checking assumed PVs against real devices used for LVs and if there's no device assigned for an assumed PV (e.g. due to filters), do log_warn instead of log_error and continue checking LV segments and associated assumed PVs further, just like we do log_warn elsewhere in this situation. This way user will see the warning for each LV which couldn't be checked completely against real PVs used. Before, we logged only the very first occurence of missing device for an LV in a VG and we returned from the function doing this check for all the LVs in VG immediately which may be a bit misleading because it didn't tell user about all the other LVs and whether they could be checked or not. For example, we have this setup: [0] fedora/~ # pvs PV VG Fmt Attr PSize PFree /dev/sda lvm2 --- 128.00m 128.00m /dev/vda2 fedora lvm2 a-- 19.49g 0 [0] fedora/~ # lvs -o+devices LV VG Attr LSize Devices root fedora -wi-ao---- 19.00g /dev/vda2(0) swap fedora -wi-ao---- 500.00m /dev/vda2(4864) Before this patch (only the very first LV in a VG is logged to have a problem while checking used and assumed devices): [0] fedora/~ # pvs --config 'devices/filter=["a|/dev/sda|", "r|.*|"]' WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. Couldn't find device for segment belonging to fedora/root while checking used and assumed devices. PV VG Fmt Attr PSize PFree /dev/sda lvm2 --- 128.00m 128.00m [unknown] fedora lvm2 a-m 19.49g 0 With this patch applied (all LVs where we hit problem while checking used and assumed devices are logged and it's warning, not error): [0] fedora/~ # pvs --config 'devices/filter=["a|/dev/sda|", "r|.*|"]' WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. 
WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. WARNING: Couldn't find device for segment belonging to fedora/root while checking used and assumed devices. WARNING: Couldn't find device for segment belonging to fedora/swap while checking used and assumed devices. PV VG Fmt Attr PSize PFree /dev/sda lvm2 --- 128.00m 128.00m [unknown] fedora lvm2 a-m 19.49g 0
2016-04-25 12:15:44 +03:00
display_lvname(lv));
warned_about_no_dev = 1;
}
continue;
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
}
if (!(dev->flags & DEV_USED_FOR_LV)) {
if (!found_inconsistent) {
if (!dm_pool_begin_object(mem, 32))
return_0;
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
found_inconsistent = 1;
} else {
if (!dm_pool_grow_object(mem, DEV_LIST_DELIM, sizeof(DEV_LIST_DELIM) - 1))
return_0;
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
}
if (!dm_pool_grow_object(mem, dev_name(dev), 0))
return_0;
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
}
}
}
}
if (found_inconsistent) {
if (!dm_pool_grow_object(mem, "\0", 1))
return_0;
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
assumed_devnames = dm_pool_end_object(mem);
metadata: log warning instead of error if device not found while checking used and assumed devs When checking assumed PVs against real devices used for LVs and if there's no device assigned for an assumed PV (e.g. due to filters), do log_warn instead of log_error and continue checking LV segments and associated assumed PVs further, just like we do log_warn elsewhere in this situation. This way user will see the warning for each LV which couldn't be checked completely against real PVs used. Before, we logged only the very first occurence of missing device for an LV in a VG and we returned from the function doing this check for all the LVs in VG immediately which may be a bit misleading because it didn't tell user about all the other LVs and whether they could be checked or not. For example, we have this setup: [0] fedora/~ # pvs PV VG Fmt Attr PSize PFree /dev/sda lvm2 --- 128.00m 128.00m /dev/vda2 fedora lvm2 a-- 19.49g 0 [0] fedora/~ # lvs -o+devices LV VG Attr LSize Devices root fedora -wi-ao---- 19.00g /dev/vda2(0) swap fedora -wi-ao---- 500.00m /dev/vda2(4864) Before this patch (only the very first LV in a VG is logged to have a problem while checking used and assumed devices): [0] fedora/~ # pvs --config 'devices/filter=["a|/dev/sda|", "r|.*|"]' WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. Couldn't find device for segment belonging to fedora/root while checking used and assumed devices. PV VG Fmt Attr PSize PFree /dev/sda lvm2 --- 128.00m 128.00m [unknown] fedora lvm2 a-m 19.49g 0 With this patch applied (all LVs where we hit problem while checking used and assumed devices are logged and it's warning, not error): [0] fedora/~ # pvs --config 'devices/filter=["a|/dev/sda|", "r|.*|"]' WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. 
WARNING: Device for PV Qcxpcy-XgtP-UD3s-PmG0-qLyE-Z0ho-DYsxoz not found or rejected by a filter. WARNING: Couldn't find device for segment belonging to fedora/root while checking used and assumed devices. WARNING: Couldn't find device for segment belonging to fedora/swap while checking used and assumed devices. PV VG Fmt Attr PSize PFree /dev/sda lvm2 --- 128.00m 128.00m [unknown] fedora lvm2 a-m 19.49g 0
2016-04-25 12:15:44 +03:00
log_warn("WARNING: Device mismatch detected for %s which is accessing %s instead of %s.",
display_lvname(lv), used_devnames, assumed_devnames);
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
}
return 1;
}
static int _check_devs_used_correspond_with_vg(struct volume_group *vg)
{
struct dm_pool *mem;
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
char vgid[ID_LEN + 1];
struct pv_list *pvl;
struct lv_list *lvl;
struct dm_list *list;
struct device_list *dl;
int found_inconsistent = 0;
if (is_orphan_vg(vg->name))
return 1;
strncpy(vgid, (const char *) vg->id.uuid, sizeof(vgid));
vgid[ID_LEN] = '\0';
/* Mark all PVs in VG as used. */
dm_list_iterate_items(pvl, &vg->pvs) {
/*
* FIXME: It's not clear if the meaning
* of "missing" should always include the
* !pv->dev case, or if "missing" is the
* more narrow case where VG metadata has
* been written with the MISSING flag.
*/
if (!pvl->pv->dev)
continue;
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
if (is_missing_pv(pvl->pv))
continue;
pvl->pv->dev->flags |= DEV_ASSUMED_FOR_LV;
}
if (!(list = dev_cache_get_dev_list_for_vgid(vgid)))
return 1;
dm_list_iterate_items(dl, list) {
if (!(dl->dev->flags & DEV_OPEN_FAILURE) &&
!(dl->dev->flags & DEV_ASSUMED_FOR_LV)) {
found_inconsistent = 1;
break;
}
}
if (found_inconsistent) {
if (!(mem = dm_pool_create("vg_devs_check", 1024)))
return_0;
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
dm_list_iterate_items(lvl, &vg->lvs) {
if (!_check_devs_used_correspond_with_lv(mem, list, lvl->lv)) {
dm_pool_destroy(mem);
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
return_0;
}
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
}
dm_pool_destroy(mem);
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
}
return 1;
}
struct volume_group *vg_read_internal(struct cmd_context *cmd, const char *vgname,
const char *vgid, uint32_t warn_flags, int *consistent)
{
struct volume_group *vg;
struct lv_list *lvl;
if (!(vg = _vg_read(cmd, vgname, vgid, warn_flags, consistent, 0)))
goto_out;
if (!_check_pv_dev_sizes(vg))
log_warn("One or more devices used as PVs in VG %s "
"have changed sizes.", vg->name);
if (!check_pv_segments(vg)) {
log_error(INTERNAL_ERROR "PV segments corrupted in %s.",
vg->name);
release_vg(vg);
vg = NULL;
goto out;
}
dm_list_iterate_items(lvl, &vg->lvs) {
if (!check_lv_segments(lvl->lv, 0)) {
log_error(INTERNAL_ERROR "LV segments corrupted in %s.",
lvl->lv->name);
release_vg(vg);
vg = NULL;
goto out;
}
}
dm_list_iterate_items(lvl, &vg->lvs) {
/*
* Checks that cross-reference other LVs.
*/
2005-10-28 01:51:28 +04:00
if (!check_lv_segments(lvl->lv, 1)) {
log_error(INTERNAL_ERROR "LV segments corrupted in %s.",
lvl->lv->name);
release_vg(vg);
vg = NULL;
goto out;
}
}
dev: detect mismatch between devices used and devices assumed for an LV It's possible for an LVM LV to use a device during activation which then differs from device which LVM assumes based on metadata later on. For example, such device mismatch can occur if LVM doesn't have complete view of devices during activation or if filters are misbehaving or they're incorrectly set during activation. This patch adds code that can detect this mismatch by creating VG UUID and LV UUID index while scanning devices for device cache. The VG UUID index maps VG UUID to a device list. Each device in the list has a device layered above as a holder which is an LVM LV device and for which we know the VG UUID (and similarly for LV UUID index). We can acquire VG and LV UUID by reading /sys/block/<dm_dev_name>/dm/uuid. So these indices represent the actual state of PV device use in the system by LVs and then we compare that to what LVM assumes based on metadata. For example: [0] fedora/~ # lsblk /dev/sdq /dev/sdr /dev/sds /dev/sdt NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdq 65:0 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev1 253:3 0 104M 0 mpath sdr 65:16 0 104M 0 disk `-mpath_dev1 253:3 0 104M 0 mpath sds 65:32 0 104M 0 disk |-vg-lvol0 253:2 0 200M 0 lvm `-mpath_dev2 253:4 0 104M 0 mpath sdt 65:48 0 104M 0 disk `-mpath_dev2 253:4 0 104M 0 mpath In this case the vg-lvol0 is mapped onto sdq and sds becauset this is what was available and seen during activation. Then later on, sdr and sdt appeared and mpath devices were created out of sdq+sdr (mpath_dev1) and sds+sdt (mpath_dev2). Now, LVM assumes (correctly) that mpath_dev1 and mpath_dev2 are the PVs that should be used, not the mpath components (sdq/sdr, sds/sdt). 
[0] fedora/~ # pvs Found duplicate PV xSUix1GJ2SK82ACFuKzFLAQi8xMfFxnO: using /dev/mapper/mpath_dev1 not /dev/sdq Using duplicate PV /dev/mapper/mpath_dev1 from subsystem DM, replacing /dev/sdq Found duplicate PV MvHyMVabtSqr33AbkUrobq1LjP8oiTRm: using /dev/mapper/mpath_dev2 not /dev/sds Using duplicate PV /dev/mapper/mpath_dev2 from subsystem DM, ignoring /dev/sds WARNING: Device mismatch detected for vg/lvol0 which is accessing /dev/sdq, /dev/sds instead of /dev/mapper/mpath_dev1, /dev/mapper/mpath_dev2. PV VG Fmt Attr PSize PFree /dev/mapper/mpath_dev1 vg lvm2 a-- 100.00m 0 /dev/mapper/mpath_dev2 vg lvm2 a-- 100.00m 0
2016-03-16 16:01:26 +03:00
(void) _check_devs_used_correspond_with_vg(vg);
out:
if (!*consistent && (warn_flags & WARN_INCONSISTENT)) {
if (is_orphan_vg(vgname))
log_warn("WARNING: Found inconsistent standalone Physical Volumes.");
else
log_warn("WARNING: Volume Group %s is not consistent.", vgname);
}
return vg;
}
/* Clear the format instance attached to a PV; a NULL pv is ignored. */
void free_pv_fid(struct physical_volume *pv)
{
	if (pv)
		pv_set_fid(pv, NULL);
}
2008-01-30 17:00:02 +03:00
/* This is only called by lv_from_lvid, which is only called from
* activate.c so we know the appropriate VG lock is already held and
* the vg_read_internal is therefore safe.
2002-11-18 17:04:08 +03:00
*/
static struct volume_group *_vg_read_by_vgid(struct cmd_context *cmd,
const char *vgid,
2008-03-17 19:51:31 +03:00
unsigned precommitted)
{
2003-10-16 00:10:11 +04:00
const char *vgname;
struct dm_list *vgnames;
struct volume_group *vg;
struct dm_str_list *strl;
uint32_t warn_flags = WARN_PV_READ | WARN_INCONSISTENT;
2002-11-18 17:04:08 +03:00
int consistent = 0;
/* Is corresponding vgname already cached? */
if (lvmcache_vgid_is_cached(vgid)) {
if ((vg = _vg_read(cmd, NULL, vgid, warn_flags, &consistent, precommitted)) &&
id_equal(&vg->id, (const struct id *)vgid)) {
2002-11-18 17:04:08 +03:00
return vg;
}
release_vg(vg);
2002-11-18 17:04:08 +03:00
}
2015-03-05 23:00:44 +03:00
/*
* When using lvmlockd we should never reach this point.
* The VG is locked, then vg_read() is done, which gets
* the latest VG from lvmetad, or disk if lvmetad has
* been invalidated. When we get here the VG should
* always be cached and returned above.
*/
if (lvmlockd_use())
log_error(INTERNAL_ERROR "vg_read_by_vgid failed with lvmlockd");
2003-10-16 00:10:11 +04:00
/* Mustn't scan if memory locked: ensure cache gets pre-populated! */
if (critical_section())
return_NULL;
/* FIXME Need a genuine read by ID here - don't vg_read_internal by name! */
/* FIXME Disabled vgrenames while active for now because we aren't
* allowed to do a full scan here any more. */
2008-01-30 17:00:02 +03:00
// The slow way - full scan required to cope with vgrename
lvmcache_force_next_label_scan();
lvmcache_label_scan(cmd);
if (!(vgnames = get_vgnames(cmd, 0))) {
log_error("vg_read_by_vgid: get_vgnames failed");
return NULL;
}
dm_list_iterate_items(strl, vgnames) {
2005-06-01 20:51:55 +04:00
vgname = strl->str;
if (!vgname)
2008-01-30 17:00:02 +03:00
continue; // FIXME Unnecessary?
2002-11-18 17:04:08 +03:00
consistent = 0;
if ((vg = _vg_read(cmd, vgname, vgid, warn_flags, &consistent, precommitted)) &&
id_equal(&vg->id, (const struct id *)vgid)) {
2002-11-18 17:04:08 +03:00
if (!consistent) {
release_vg(vg);
return NULL;
2002-11-18 17:04:08 +03:00
}
return vg;
}
release_vg(vg);
}
return NULL;
}
2002-11-18 17:04:08 +03:00
/* Only called by activate.c */
struct logical_volume *lv_from_lvid(struct cmd_context *cmd, const char *lvid_s,
2008-03-17 19:51:31 +03:00
unsigned precommitted)
2002-11-18 17:04:08 +03:00
{
struct logical_volume *lv;
2002-11-18 17:04:08 +03:00
struct volume_group *vg;
const union lvid *lvid;
2002-11-18 17:04:08 +03:00
lvid = (const union lvid *) lvid_s;
2002-11-18 17:04:08 +03:00
log_very_verbose("Finding %svolume group for uuid %s", precommitted ? "precommitted " : "", lvid_s);
if (!(vg = _vg_read_by_vgid(cmd, (const char *)lvid->id[0].uuid, precommitted))) {
2002-11-18 17:04:08 +03:00
log_error("Volume group for uuid not found: %s", lvid_s);
return NULL;
}
log_verbose("Found volume group \"%s\"", vg->name);
if (vg->status & EXPORTED_VG) {
log_error("Volume group \"%s\" is exported", vg->name);
goto out;
2002-11-18 17:04:08 +03:00
}
if (!(lv = find_lv_in_vg_by_lvid(vg, lvid))) {
2002-11-18 17:04:08 +03:00
log_very_verbose("Can't find logical volume id %s", lvid_s);
goto out;
2002-11-18 17:04:08 +03:00
}
return lv;
out:
release_vg(vg);
return NULL;
2002-11-18 17:04:08 +03:00
}
/*
 * Return the name of the VG that the PV with the given pvid belongs to,
 * as reported by lvmcache.
 *
 * If lvmcache reports an orphan VG but ownership of the PV is
 * uncertain, all VGs are rescanned and lvmcache is asked again.
 *
 * Returns NULL if no cache info exists for an apparently orphan PV or
 * if the rescan fails.
 */
const char *find_vgname_from_pvid(struct cmd_context *cmd,
				  const char *pvid)
{
	char *vgname;
	struct lvmcache_info *info;

	vgname = lvmcache_vgname_from_pvid(cmd, pvid);

	/* A non-orphan answer is taken at face value. */
	if (!is_orphan_vg(vgname))
		return vgname;

	if (!(info = lvmcache_info_from_pvid(pvid, NULL, 0)))
		return_NULL;

	/*
	 * If an orphan PV has no MDAs, or it has MDAs but the
	 * MDA is ignored, it may appear to be an orphan until
	 * the metadata is read off another PV in the same VG.
	 * Detecting this means checking every VG by scanning
	 * every PV on the system.
	 */
	if (!lvmcache_uncertain_ownership(info))
		return vgname;

	if (!scan_vgs_for_pvs(cmd, WARN_PV_READ)) {
		log_error("Rescan for PVs without "
			  "metadata areas failed.");
		return NULL;
	}

	/* Ask lvmcache again - we may have a non-orphan name now */
	vgname = lvmcache_vgname_from_pvid(cmd, pvid);

	return vgname;
}
/*
 * Return the VG name for the PV on the named device, or NULL if
 * lvmcache has no pvid for that device (i.e. it is not a PV).
 */
const char *find_vgname_from_pvname(struct cmd_context *cmd,
				    const char *pvname)
{
	const char *pvid = lvmcache_pvid_from_devname(cmd, pvname);

	/* No pvid means this is not a PV */
	return pvid ? find_vgname_from_pvid(cmd, pvid) : NULL;
}
/* FIXME Use label functions instead of PV functions */
2008-01-30 17:00:02 +03:00
static struct physical_volume *_pv_read(struct cmd_context *cmd,
struct dm_pool *pvmem,
const char *pv_name,
struct format_instance *fid,
uint32_t warn_flags, int scan_label_only)
{
struct physical_volume *pv;
2002-11-18 17:04:08 +03:00
struct label *label;
struct lvmcache_info *info;
2002-11-18 17:04:08 +03:00
struct device *dev;
const struct format_type *fmt;
int found;
2008-01-30 16:19:47 +03:00
if (!(dev = dev_cache_get(pv_name, cmd->filter)))
return_NULL;
if (lvmetad_used()) {
info = lvmcache_info_from_pvid(dev->pvid, dev, 0);
if (!info) {
if (!lvmetad_pv_lookup_by_dev(cmd, dev, &found))
return_NULL;
if (!found) {
if (warn_flags & WARN_PV_READ)
log_error("No physical volume found in lvmetad cache for %s",
pv_name);
return NULL;
}
if (!(info = lvmcache_info_from_pvid(dev->pvid, dev, 0))) {
if (warn_flags & WARN_PV_READ)
log_error("No cache info in lvmetad cache for %s.",
pv_name);
return NULL;
}
}
label = lvmcache_get_label(info);
} else {
if (!(label_read(dev, &label, UINT64_C(0)))) {
if (warn_flags & WARN_PV_READ)
log_error("No physical volume label read from %s",
pv_name);
return NULL;
}
info = (struct lvmcache_info *) label->info;
}
fmt = lvmcache_fmt(info);
2002-11-18 17:04:08 +03:00
pv = _alloc_pv(pvmem, dev);
if (!pv) {
log_error("pv allocation for '%s' failed", pv_name);
2005-04-20 00:52:35 +04:00
return NULL;
2002-11-18 17:04:08 +03:00
}
pv->label_sector = label->sector;
2002-11-18 17:04:08 +03:00
/* FIXME Move more common code up here */
if (!(lvmcache_fmt(info)->ops->pv_read(lvmcache_fmt(info), pv_name, pv, scan_label_only))) {
log_error("Failed to read existing physical volume '%s'",
pv_name);
goto bad;
}
if (!pv->size)
goto bad;
if (!alloc_pv_segment_whole_pv(pvmem, pv))
goto_bad;
2005-04-20 00:52:35 +04:00
if (fid)
lvmcache_fid_add_mdas(info, fid, (const char *) &pv->id, ID_LEN);
else {
lvmcache_fid_add_mdas(info, fmt->orphan_vg->fid, (const char *) &pv->id, ID_LEN);
pv_set_fid(pv, fmt->orphan_vg->fid);
}
2005-04-20 00:52:35 +04:00
return pv;
bad:
free_pv_fid(pv);
dm_pool_free(pvmem, pv);
return NULL;
}
2002-11-18 17:04:08 +03:00
/*
 * Return the list of VG names held by lvmcache.
 * Thin wrapper around lvmcache_get_vgnames(); include_internal is
 * passed through unchanged.
 * May return empty list.
 */
struct dm_list *get_vgnames(struct cmd_context *cmd, int include_internal)
{
return lvmcache_get_vgnames(cmd, include_internal);
}
/*
 * Return the list of VG IDs held by lvmcache.
 * Thin wrapper around lvmcache_get_vgids(); include_internal is
 * passed through unchanged.
 */
struct dm_list *get_vgids(struct cmd_context *cmd, int include_internal)
{
return lvmcache_get_vgids(cmd, include_internal);
}
/*
 * Append one vgnameid_list entry to vgnameids, holding a copy of
 * vg_name (allocated from cmd->mem) and a NULL vgid.
 * Returns 1 on success, 0 if either allocation fails.
 */
static int _vgnameids_add_name(struct cmd_context *cmd,
			       struct dm_list *vgnameids,
			       const char *vg_name)
{
	struct vgnameid_list *vgnl;

	if (!(vgnl = dm_pool_alloc(cmd->mem, sizeof(*vgnl)))) {
		log_error("vgnameid_list allocation failed.");
		return 0;
	}

	/* Never put an entry with a NULL name on the list. */
	if (!(vgnl->vg_name = dm_pool_strdup(cmd->mem, vg_name))) {
		log_error("vgnameid_list allocation failed.");
		return 0;
	}

	vgnl->vgid = NULL;
	dm_list_add(vgnameids, &vgnl->list);

	return 1;
}

/*
 * Fill vgnameids with vgnameid_list entries.
 *
 * - If only_this_vgname is set, a single entry with that name and a
 *   NULL vgid is added and nothing else is consulted.
 * - Otherwise, with lvmetad in use, the names/ids come from lvmetad;
 *   if include_internal is set, one entry per format's orphan VG name
 *   is appended as well.
 * - Otherwise the list comes from lvmcache.
 *
 * Returns 1 on success, 0 on allocation failure.
 */
int get_vgnameids(struct cmd_context *cmd, struct dm_list *vgnameids,
		  const char *only_this_vgname, int include_internal)
{
	struct format_type *fmt;

	if (only_this_vgname)
		return _vgnameids_add_name(cmd, vgnameids, only_this_vgname);

	if (lvmetad_used()) {
		/*
		 * This just gets the list of names/ids from lvmetad
		 * and does not populate lvmcache.
		 * NOTE(review): any result of this call is ignored here,
		 * as in the original code - confirm whether a failure
		 * should be propagated.
		 */
		lvmetad_get_vgnameids(cmd, vgnameids);

		if (include_internal) {
			dm_list_iterate_items(fmt, &cmd->formats) {
				if (!_vgnameids_add_name(cmd, vgnameids, fmt->orphan_vg_name))
					return 0;
			}
		}
	} else {
		/*
		 * The non-lvmetad case. This function begins by calling
		 * lvmcache_label_scan() to populate lvmcache.
		 */
		lvmcache_get_vgnameids(cmd, include_internal, vgnameids);
	}

	return 1;
}
static int _get_pvs(struct cmd_context *cmd, uint32_t warn_flags,
struct dm_list *pvslist, struct dm_list *vgslist)
{
struct dm_str_list *strl;
const char *vgname, *vgid;
struct pv_list *pvl, *pvl_copy;
struct dm_list *vgids;
2002-11-18 17:04:08 +03:00
struct volume_group *vg;
int consistent = 0;
2003-04-30 19:23:43 +04:00
int old_pvmove;
struct vg_list *vgl_item = NULL;
int have_pv = 0;
2002-11-18 17:04:08 +03:00
lvmcache_label_scan(cmd);
2002-11-18 17:04:08 +03:00
/* Get list of VGs */
if (!(vgids = get_vgids(cmd, 1))) {
log_error("get_pvs: get_vgids failed");
return 0;
}
2002-11-18 17:04:08 +03:00
/* Read every VG to ensure cache consistency */
/* Orphan VG is last on list */
2003-04-30 19:23:43 +04:00
old_pvmove = pvmove_mode();
init_pvmove(1);
dm_list_iterate_items(strl, vgids) {
vgid = strl->str;
if (!vgid)
2002-11-18 17:04:08 +03:00
continue; /* FIXME Unnecessary? */
consistent = 0;
if (!(vgname = lvmcache_vgname_from_vgid(NULL, vgid))) {
stack;
continue;
}
/*
* When we are retrieving a list to return toliblvm we need
* that list to contain VGs that are modifiable as we are using
* the vgmem pool in the vg to provide allocation for liblvm.
* This is a hack to prevent the vg from getting cached as the
* vgid will be NULL.
* FIXME Remove this hack.
*/
warn_flags |= WARN_INCONSISTENT;
if (!(vg = vg_read_internal(cmd, vgname, (!vgslist) ? vgid : NULL, warn_flags, &consistent))) {
2002-11-18 17:04:08 +03:00
stack;
continue;
}
/* Move PVs onto results list */
if (pvslist)
dm_list_iterate_items(pvl, &vg->pvs) {
if (!(pvl_copy = _copy_pvl(cmd->mem, pvl))) {
log_error("PV list allocation failed");
release_vg(vg);
return 0;
}
/* If we are going to release the VG, don't
* store a pointer to it in the PV structure.
*/
if (!vgslist)
pvl_copy->pv->vg = NULL;
else
/*
* Make sure the vg mode indicates
* writeable.
* FIXME Rework function to take a
* parameter to control this
*/
pvl_copy->pv->vg->open_mode = 'w';
have_pv = 1;
dm_list_add(pvslist, &pvl_copy->list);
}
/*
* In the case of the library we want to preserve the embedded
* volume group as subsequent calls to retrieve data about the
* PV require it.
*/
if (!vgslist || !have_pv)
release_vg(vg);
else {
/*
* Add VG to list of VG objects that will be returned
*/
vgl_item = dm_pool_alloc(cmd->mem, sizeof(*vgl_item));
if (!vgl_item) {
log_error("VG list element allocation failed");
return 0;
}
vgl_item->vg = vg;
vg = NULL;
dm_list_add(vgslist, &vgl_item->list);
}
have_pv = 0;
2002-11-18 17:04:08 +03:00
}
2003-04-30 19:23:43 +04:00
init_pvmove(old_pvmove);
2002-11-18 17:04:08 +03:00
if (!pvslist)
dm_pool_free(cmd->mem, vgids);
return 1;
}
/*
 * Retrieve a list of all physical volumes.
 * @param cmd      Command context
 * @param pvslist  List to append PV entries to.  If NULL, a new list is
 *                 allocated from cmd->mem and initialised here.
 * @param vgslist  Passed through to _get_pvs().  When non-NULL, the VGs
 *                 owning the returned PVs are added to this list (so the
 *                 pv structures keep a valid vg pointer); when NULL, the
 *                 VGs are released and the pv structures carry no vg
 *                 pointer.
 * @returns NULL on errors, else the PV list, which equals the passed-in
 *          pvslist if one was supplied.
 */
struct dm_list *get_pvs_internal(struct cmd_context *cmd,
				 struct dm_list *pvslist,
				 struct dm_list *vgslist)
{
	struct dm_list *results = pvslist;

	if (!results) {
		results = dm_pool_alloc(cmd->mem, sizeof(*results));
		if (!results) {
			log_error("PV list allocation failed");
			return NULL;
		}
		dm_list_init(results);
	}

	if (_get_pvs(cmd, WARN_PV_READ, results, vgslist))
		return results;

	/* Only free the list if it was allocated here. */
	if (!pvslist)
		dm_pool_free(cmd->mem, results);

	return NULL;
}
/*
 * Run _get_pvs() with no PV list and no VG list to collect results in,
 * passing warn_flags through.  Returns the result of _get_pvs().
 */
int scan_vgs_for_pvs(struct cmd_context *cmd, uint32_t warn_flags)
{
return _get_pvs(cmd, warn_flags, NULL, NULL);
}
/*
 * Write a PV using its format's pv_write operation, clear
 * UNLABELLED_PV from its status and report the PV via
 * lvmetad_pv_found().
 *
 * Unless allow_non_orphan is set, the write is refused for a PV that
 * is not in an orphan VG or that has allocated extents.
 *
 * Returns 1 on success, 0 on failure.
 */
int pv_write(struct cmd_context *cmd,
	     struct physical_volume *pv, int allow_non_orphan)
{
	if (!pv->fmt->ops->pv_write) {
		log_error("Format does not support writing physical volumes");
		return 0;
	}

	/*
	 * FIXME: Try to remove this restriction. This requires checking
	 *        that the PV and the VG are in a consistent state. We need
	 *        to provide some revert mechanism since PV label together
	 *        with VG metadata write is not atomic.
	 */
	if (!allow_non_orphan) {
		if (!is_orphan_vg(pv->vg_name) || pv->pe_alloc_count) {
			log_error("Assertion failed: can't _pv_write non-orphan PV "
				  "(in VG %s)", pv_vg_name(pv));
			return 0;
		}
	}

	if (!pv->fmt->ops->pv_write(pv->fmt, pv))
		return_0;

	/* The PV now carries a label. */
	pv->status &= ~UNLABELLED_PV;

	if (!lvmetad_pv_found(cmd, &pv->id, pv->dev, pv->fmt, pv->label_sector, NULL, NULL, NULL))
		return_0;

	return 1;
}
int pv_write_orphan(struct cmd_context *cmd, struct physical_volume *pv)
{
const char *old_vg_name = pv->vg_name;
2008-02-06 18:47:28 +03:00
pv->vg_name = cmd->fmt->orphan_vg_name;
pv->status = ALLOCATABLE_PV;
pv->pe_alloc_count = 0;
if (!dev_get_size(pv->dev, &pv->size)) {
log_error("%s: Couldn't get size.", pv_dev_name(pv));
return 0;
}
if (!pv_write(cmd, pv, 0)) {
log_error("Failed to clear metadata from physical "
"volume \"%s\" after removal from \"%s\"",
pv_dev_name(pv), old_vg_name);
return 0;
}
return 1;
}
/*
 * Return 1 if vg_name is exactly the VG_GLOBAL name, else 0.
 * A NULL name is never global.
 */
int is_global_vg(const char *vg_name)
{
	if (!vg_name)
		return 0;

	return strcmp(vg_name, VG_GLOBAL) == 0;
}
/**
 * is_orphan_vg - Determine whether a vg_name is an orphan
 * @vg_name: pointer to the vg_name (may be NULL)
 *
 * A name counts as an orphan name when it begins with ORPHAN_PREFIX.
 * Returns 1 for an orphan name, 0 otherwise (including for NULL).
 */
int is_orphan_vg(const char *vg_name)
{
	/* Length of the prefix without its terminating NUL. */
	const size_t prefix_len = sizeof(ORPHAN_PREFIX) - 1;

	if (!vg_name)
		return 0;

	return strncmp(vg_name, ORPHAN_PREFIX, prefix_len) == 0;
}
/*
 * Exclude pseudo VG names used for locking.
 *
 * Returns 1 when vg_name is non-NULL and does not start with '#',
 * otherwise 0.
 */
int is_real_vg(const char *vg_name)
{
	if (!vg_name)
		return 0;

	return vg_name[0] != '#';
}
/*
 * Per-mda callback used by pv_analyze() with lvmcache_foreach_mda().
 *
 * mda    Metadata area to analyse.
 * baton  The format_type to hand to the mda's pv_analyze_mda operation.
 *
 * The pv_analyze_mda operation is treated as optional, like the other
 * metadata-area operations in this file: an mda whose ops table does not
 * provide it is skipped instead of calling through a NULL pointer.
 *
 * Always returns 1.
 */
static int _analyze_mda(struct metadata_area *mda, void *baton)
{
	const struct format_type *fmt = baton;

	if (!mda->ops->pv_analyze_mda)
		return 1;

	mda->ops->pv_analyze_mda(fmt, mda);

	return 1;
}
/*
 * Look up a device by name, read its LVM label and run the analysis
 * callback over every metadata area known for it.
 *
 * cmd           Command context; cmd->filter is applied to the lookup.
 * pv_name       Device name to analyse.
 * label_sector  Sector argument passed to label_read().
 *
 * Returns:
 *  0 - fail
 *  1 - success
 */
int pv_analyze(struct cmd_context *cmd, const char *pv_name,
	       uint64_t label_sector)
{
	struct device *dev;
	struct label *label;
	struct lvmcache_info *info;
	void *baton;

	if (!(dev = dev_cache_get(pv_name, cmd->filter))) {
		log_error("Device %s not found (or ignored by filtering).",
			  pv_name);
		return 0;
	}

	/* First, scan for LVM labels. */
	if (!label_read(dev, &label, label_sector)) {
		log_error("Could not find LVM label on %s",
			  pv_name);
		return 0;
	}

	log_print("Found label on %s, sector %"PRIu64", type=%.8s",
		  pv_name, label->sector, label->type);

	/* Next, loop through metadata areas. */
	info = label->info;
	baton = (void *)lvmcache_fmt(info);
	lvmcache_foreach_mda(info, _analyze_mda, baton);

	return 1;
}
/*
 * Return 1 if the VG was opened with open_mode 'w'; otherwise log an
 * EPERM error and return 0.
 *
 * FIXME: remove / combine this with locking?
 */
int vg_check_write_mode(struct volume_group *vg)
{
	if (vg->open_mode == 'w')
		return 1;

	log_errno(EPERM, "Attempt to modify a read-only VG");

	return 0;
}
/*
 * Return 1 if the VG metadata should be written
 * *without* the LVM_WRITE flag in the status line, and
 * *with* the LVM_WRITE_LOCKED flag in the flags line.
 *
 * Writing a VG this way forces versions of lvm that predate the
 * LVM_WRITE_LOCKED flag to treat the VG and its LVs as read-only,
 * because they find no LVM_WRITE flag.  Versions that understand
 * LVM_WRITE_LOCKED know to consult the other access-control methods
 * for the VG, specifically system_id and lock_type.
 *
 * So a VG with a system_id or a lock_type has its access governed by
 * those in addition to its basic writable status.  Older versions know
 * nothing about system_id or lock_type, hence VGs that rely on either
 * should carry LVM_WRITE_LOCKED instead of LVM_WRITE to stop older
 * versions assuming they may write the VG and its LVs.
 *
 * The result is 1 when system_id is a non-empty string, or when
 * lock_type is a non-empty string other than "none"; otherwise 0.
 */
int vg_flag_write_locked(struct volume_group *vg)
{
	const int has_system_id = vg->system_id && vg->system_id[0];
	const int has_lock_type = vg->lock_type && vg->lock_type[0] &&
				  strcmp(vg->lock_type, "none");

	return (has_system_id || has_lock_type) ? 1 : 0;
}
2016-12-25 01:10:06 +03:00
/*
 * Decide whether a VG may be used with respect to clustering.
 *
 * Returns 1 unless the VG is clustered while the current locking is not.
 * In that case 0 is returned and the skip is logged: as an error normally,
 * or only at verbose level when cmd->ignore_clustered_vgs is set.
 */
static int _access_vg_clustered(struct cmd_context *cmd, const struct volume_group *vg)
{
	if (!vg_is_clustered(vg) || locking_is_clustered())
		return 1;

	if (cmd->ignore_clustered_vgs)
		log_verbose("Skipping clustered volume group %s", vg->name);
	else
		log_error("Skipping clustered volume group %s", vg->name);

	return 0;
}
/*
* Performs a set of checks against a VG according to bits set in status
* and returns FAILED_* bits for those that aren't acceptable.
*
* FIXME Remove the unnecessary duplicate definitions and return bits directly.
*/
static uint32_t _vg_bad_status_bits(const struct volume_group *vg,
uint64_t status)
{
uint32_t failure = 0;
2016-12-25 01:10:06 +03:00
if ((status & CLUSTERED) && !_access_vg_clustered(vg->cmd, vg))
/* Return because other flags are considered undefined. */
return FAILED_CLUSTERED;
if ((status & EXPORTED_VG) &&
vg_is_exported(vg)) {
log_error("Volume group %s is exported", vg->name);
failure |= FAILED_EXPORTED;
}
if ((status & LVM_WRITE) &&
!(vg->status & LVM_WRITE)) {
log_error("Volume group %s is read-only", vg->name);
failure |= FAILED_READ_ONLY;
}
if ((status & RESIZEABLE_VG) &&
!vg_is_resizeable(vg)) {
log_error("Volume group %s is not resizeable.", vg->name);
failure |= FAILED_RESIZEABLE;
}
return failure;
}
/**
 * vg_check_status - check volume group status flags and log error
 * @vg - volume group to check status flags
 * @status - specific status flags to check (e.g. EXPORTED_VG)
 *
 * Returns 1 when _vg_bad_status_bits() reports no failure bits, else 0.
 */
int vg_check_status(const struct volume_group *vg, uint64_t status)
{
	return _vg_bad_status_bits(vg, status) == 0;
}
/*
 * Re-read a VG under a write lock, demanding a consistent result.
 *
 * The current lock on vg_name is dropped, all devices are closed, and a
 * LCK_VG_WRITE lock is taken before vg_read_internal() is called with
 * consistent preset to 1.
 *
 * Returns the VG on success.  Returns NULL if the lock cannot be taken,
 * if the read fails, or if the VG read is still flagged inconsistent.
 *
 * VG is left unlocked on failure
 */
static struct volume_group *_recover_vg(struct cmd_context *cmd,
					const char *vg_name, const char *vgid)
{
	struct volume_group *vg;
	int consistent = 1;

	unlock_vg(cmd, NULL, vg_name);
	dev_close_all();

	if (!lock_vol(cmd, vg_name, LCK_VG_WRITE, NULL))
		return_NULL;

	vg = vg_read_internal(cmd, vg_name, vgid, WARN_PV_READ, &consistent);
	if (vg && consistent)
		return vg;

	/* Either nothing was read or what was read is still inconsistent. */
	if (vg)
		release_vg(vg);
	unlock_vg(cmd, NULL, vg_name);

	return_NULL;
}
/*
 * Return 1 if system_id equals one of the non-empty string entries of the
 * local_extra_system_ids_CFG configuration array, else 0.
 *
 * Returns 0 when the setting is not found.  Scanning stops at an
 * empty-array marker; non-string and empty-string entries are skipped.
 */
static int _allow_extra_system_id(struct cmd_context *cmd, const char *system_id)
{
	const struct dm_config_node *cn;
	const struct dm_config_value *cv;
	const char *candidate;

	cn = find_config_tree_array(cmd, local_extra_system_ids_CFG, NULL);
	if (!cn)
		return 0;

	for (cv = cn->v; cv && cv->type != DM_CFG_EMPTY_ARRAY; cv = cv->next) {
		/* Ignore invalid data: Warning message already issued by config.c */
		if (cv->type != DM_CFG_STRING)
			continue;

		candidate = cv->v.str;
		if (*candidate && !strcmp(candidate, system_id))
			return 1;
	}

	return 0;
}
2015-03-05 23:00:44 +03:00
static int _access_vg_lock_type(struct cmd_context *cmd, struct volume_group *vg,
uint32_t lockd_state, uint32_t *failure)
{
if (!is_real_vg(vg->name))
return 1;
2015-03-05 23:00:44 +03:00
if (cmd->lockd_vg_disable)
return 1;
/*
2015-03-05 23:00:44 +03:00
* Local VG requires no lock from lvmlockd.
*/
if (!is_lockd_type(vg->lock_type))
return 1;
/*
* When lvmlockd is not used, lockd VGs are ignored by lvm
* and cannot be used, with two exceptions:
*
* . The --shared option allows them to be revealed with
* reporting/display commands.
*
* . If a command asks to operate on one specifically
* by name, then an error is printed.
*/
2015-03-05 23:00:44 +03:00
if (!lvmlockd_use()) {
/*
* Some reporting/display commands have the --shared option
* (like --foreign) to allow them to reveal lockd VGs that
* are otherwise ignored. The --shared option must only be
* permitted in commands that read the VG for report or display,
* not any that write the VG or activate LVs.
*/
if (cmd->include_shared_vgs)
return 1;
/*
* Some commands want the error printed by vg_read, others by ignore_vg.
* Those using ignore_vg may choose to skip the error.
*/
if (cmd->vg_read_print_access_error) {
log_error("Cannot access VG %s with lock type %s that requires lvmlockd.",
vg->name, vg->lock_type);
}
*failure |= FAILED_LOCK_TYPE;
return 0;
}
2015-03-05 23:00:44 +03:00
/*
* The lock request from lvmlockd failed. If the lock was ex,
* we cannot continue. If the lock was sh, we could also fail
* to continue but since the lock was sh, it means the VG is
* only being read, and it doesn't hurt to allow reading with
* no lock.
*/
if (lockd_state & LDST_FAIL) {
if ((lockd_state & LDST_EX) || cmd->lockd_vg_enforce_sh) {
2015-03-05 23:00:44 +03:00
log_error("Cannot access VG %s due to failed lock.", vg->name);
*failure |= FAILED_LOCK_MODE;
2015-03-05 23:00:44 +03:00
return 0;
} else {
log_warn("Reading VG %s without a lock.", vg->name);
return 1;
}
}
if (test_mode()) {
log_error("Test mode is not yet supported with lock type %s.", vg->lock_type);
return 0;
}
return 1;
}
/*
 * Decide whether this host may use something carrying the given system_id.
 *
 * Returns 1 when:
 *   - system_id is NULL or empty (no owner, usable by anyone), or
 *   - the host's system_id (cmd->system_id) equals system_id, or
 *   - the host has a system_id and system_id matches one of the host's
 *     extra system IDs.
 *
 * Returns 0 otherwise, i.e. when the host has no system_id but the
 * argument is set, or when nothing matches.
 */
int is_system_id_allowed(struct cmd_context *cmd, const char *system_id)
{
	/* Nothing to match against: accessible by anyone. */
	if (!system_id || !system_id[0])
		return 1;

	/* A host without a system_id can match neither rule below. */
	if (!cmd->system_id)
		return 0;

	/* Host and VG system_id's match. */
	if (!strcmp(cmd->system_id, system_id))
		return 1;

	/* One of the host's extra system_id's matches. */
	if (_allow_extra_system_id(cmd, system_id))
		return 1;

	return 0;
}
/*
 * Decide whether a VG may be used with respect to system IDs.
 *
 * Returns 1 if access is permitted, 0 if not.  Depending on the reason
 * and on command settings a rejection is logged as an error, as a
 * warning, or not at all.
 */
static int _access_vg_systemid(struct cmd_context *cmd, struct volume_group *vg)
{
	/*
	 * LVM1 VGs must not be accessed if a new-style LVM2 system ID is set.
	 */
	if (cmd->system_id && systemid_on_pvs(vg)) {
		log_error("Cannot access VG %s with LVM1 system ID %s when host system ID is set.",
			  vg->name, vg->lvm1_system_id);
		return 0;
	}

	/*
	 * A few commands allow read-only access to foreign VGs;
	 * otherwise the VG's system ID has to be acceptable to this host.
	 */
	if (cmd->include_foreign_vgs || is_system_id_allowed(cmd, vg->system_id))
		return 1;

	/*
	 * The VG is foreign.  If the local host nevertheless has active
	 * LVs in it, warn, and allow access only on explicit request.
	 */
	if (lvs_in_vg_activated(vg)) {
		log_warn("WARNING: Found LVs active in VG %s with foreign system ID %s.  Possible data corruption.",
			 vg->name, vg->system_id);
		return cmd->include_active_foreign_vgs ? 1 : 0;
	}

	/*
	 * Print an error when reading a VG that has a system_id
	 * and the host system_id is unknown.
	 */
	if (!cmd->system_id || cmd->unknown_system_id) {
		log_error("Cannot access VG %s with system ID %s with unknown local system ID.",
			  vg->name, vg->system_id);
		return 0;
	}

	/*
	 * Some commands want the error printed by vg_read, others by ignore_vg.
	 * Those using ignore_vg may choose to skip the error; in that case
	 * the foreign VG is ignored silently here.
	 */
	if (cmd->vg_read_print_access_error)
		log_error("Cannot access VG %s with system ID %s with local system ID %s.",
			  vg->name, vg->system_id, cmd->system_id);

	return 0;
}
/*
* FIXME: move _vg_bad_status_bits() checks in here.
*/
2015-03-05 23:00:44 +03:00
static int _vg_access_permitted(struct cmd_context *cmd, struct volume_group *vg,
uint32_t lockd_state, uint32_t *failure)
{
if (!is_real_vg(vg->name)) {
/* Disallow use of LVM1 orphans when a host system ID is set. */
if (cmd->system_id && *cmd->system_id && systemid_on_pvs(vg)) {
*failure |= FAILED_SYSTEMID;
return_0;
}
return 1;
}
if (!_access_vg_clustered(cmd, vg)) {
*failure |= FAILED_CLUSTERED;
return 0;
}
if (!_access_vg_lock_type(cmd, vg, lockd_state, failure)) {
/* Either FAILED_LOCK_TYPE or FAILED_LOCK_MODE were set. */
return 0;
}
if (!_access_vg_systemid(cmd, vg)) {
*failure |= FAILED_SYSTEMID;
return 0;
}
return 1;
}
/*
* Consolidated locking, reading, and status flag checking.
*
* If the metadata is inconsistent, setting READ_ALLOW_INCONSISTENT in
* read_flags will return it with FAILED_INCONSISTENT set instead of
* giving you nothing.
*
* Use vg_read_error(vg) to determine the result. Nonzero means there were
* problems reading the volume group.
* Zero value means that the VG is open and appropriate locks are held.
*/
static struct volume_group *_vg_lock_and_read(struct cmd_context *cmd, const char *vg_name,
const char *vgid,
uint32_t lock_flags,
uint64_t status_flags,
uint32_t read_flags,
2015-03-05 23:00:44 +03:00
uint32_t lockd_state)
{
struct volume_group *vg = NULL;
2014-03-19 03:24:09 +04:00
int consistent = 1;
int consistent_in;
uint32_t failure = 0;
uint32_t warn_flags = 0;
int already_locked;
if ((read_flags & READ_ALLOW_INCONSISTENT) || (lock_flags != LCK_VG_WRITE))
consistent = 0;
if (!validate_name(vg_name) && !is_orphan_vg(vg_name)) {
log_error("Volume group name \"%s\" has invalid characters.",
vg_name);
return NULL;
}
already_locked = lvmcache_vgname_is_locked(vg_name);
if (!already_locked &&
!lock_vol(cmd, vg_name, lock_flags, NULL)) {
log_error("Can't get lock for %s", vg_name);
return _vg_make_handle(cmd, vg, FAILED_LOCKING);
}
if (already_locked)
log_very_verbose("Locking %s already done", vg_name);
if (is_orphan_vg(vg_name))
status_flags &= ~LVM_WRITE;
consistent_in = consistent;
warn_flags = WARN_PV_READ;
if (consistent || (read_flags & READ_WARN_INCONSISTENT))
warn_flags |= WARN_INCONSISTENT;
/* If consistent == 1, we get NULL here if correction fails. */
if (!(vg = vg_read_internal(cmd, vg_name, vgid, warn_flags, &consistent))) {
if (consistent_in && !consistent) {
failure |= FAILED_INCONSISTENT;
goto bad;
}
if (!(read_flags & READ_OK_NOTFOUND))
log_error("Volume group \"%s\" not found", vg_name);
failure |= FAILED_NOTFOUND;
goto bad;
}
2015-03-05 23:00:44 +03:00
if (!_vg_access_permitted(cmd, vg, lockd_state, &failure))
goto bad;
/* consistent == 0 when VG is not found, but failed == FAILED_NOTFOUND */
if (!consistent && !failure) {
release_vg(vg);
if (!(vg = _recover_vg(cmd, vg_name, vgid))) {
if (is_orphan_vg(vg_name))
log_error("Recovery of standalone physical volumes failed.");
else
log_error("Recovery of volume group \"%s\" failed.",
vg_name);
failure |= FAILED_RECOVERY;
goto bad_no_unlock;
}
}
/*
* Check that the tool can handle tricky cases -- missing PVs and
* unknown segment types.
*/
if (!cmd->handles_missing_pvs && vg_missing_pv_count(vg) &&
lock_flags == LCK_VG_WRITE) {
log_error("Cannot change VG %s while PVs are missing.", vg->name);
log_error("Consider vgreduce --removemissing.");
failure |= FAILED_INCONSISTENT; /* FIXME new failure code here? */
goto bad;
}
if (!cmd->handles_unknown_segments && vg_has_unknown_segments(vg) &&
lock_flags == LCK_VG_WRITE) {
log_error("Cannot change VG %s with unknown segments in it!",
vg->name);
failure |= FAILED_INCONSISTENT; /* FIXME new failure code here? */
goto bad;
}
failure |= _vg_bad_status_bits(vg, status_flags);
if (failure)
goto_bad;
return _vg_make_handle(cmd, vg, failure);
bad:
if (!already_locked)
lvmetad: two phase vg_update Previously, a command sent lvmetad new VG metadata in vg_commit(). In vg_commit(), devices are suspended, so any memory allocation done by the command while sending to lvmetad, or by lvmetad while updating its cache could deadlock if memory reclaim was triggered. Now lvmetad is updated in unlock_vg(), after devices are resumed. The new method for updating VG metadata in lvmetad is in two phases: 1. In vg_write(), before devices are suspended, the command sends lvmetad a short message ("set_vg_info") telling it what the new VG seqno will be. lvmetad sees that the seqno is newer than the seqno of its cached VG, so it sets the INVALID flag for the cached VG. If sending the message to lvmetad fails, the command fails before the metadata is committed and the change is not made. If sending the message succeeds, vg_commit() is called. 2. In unlock_vg(), after devices are resumed, the command sends lvmetad the standard vg_update message with the new metadata. lvmetad sees that the seqno in the new metadata matches the seqno it saved from set_vg_info, and knows it has the latest copy, so it clears the INVALID flag for the cached VG. If a command fails between 1 and 2 (after committing the VG on disk, but before sending lvmetad the new metadata), the cached VG retains the INVALID flag in lvmetad. A subsequent command will read the cached VG from lvmetad, see the INVALID flag, ignore the cached copy, read the VG from disk instead, update the lvmetad copy with the latest copy from disk, (this clears the INVALID flag in lvmetad), and use the correct VG metadata for the command. (This INVALID mechanism already existed for use by lvmlockd.)
2016-06-08 22:42:03 +03:00
unlock_vg(cmd, vg, vg_name);
bad_no_unlock:
return _vg_make_handle(cmd, vg, failure);
}
/*
 * vg_read: High-level volume group metadata read function.
 *
 * vg_read_error() must be used on any handle returned to check for errors.
 *
 *  - metadata inconsistent and automatic correction failed: FAILED_INCONSISTENT
 *  - VG is read-only: FAILED_READ_ONLY
 *  - VG is EXPORTED, unless flags has READ_ALLOW_EXPORTED: FAILED_EXPORTED
 *  - VG is not RESIZEABLE: FAILED_RESIZEABLE
 *  - locking failed: FAILED_LOCKING
 *
 * On failures, all locks are released, unless the following applies:
 *  - vgname_is_locked(lock_name) is true
 * FIXME: remove the above condition if possible and make an error always
 * release the lock.
 *
 * Volume groups are opened read-only unless flags contains READ_FOR_UPDATE.
 * With READ_FOR_UPDATE a write lock is requested and the VG is checked for
 * being writable and, unless READ_ALLOW_EXPORTED is also given, for not
 * being exported.  Without READ_FOR_UPDATE a read lock is requested and no
 * status checks are requested.
 *
 * FIXME: We want vg_read to attempt automatic recovery after acquiring a
 * temporary write lock: if that fails, we bail out as usual, with failed &
 * FAILED_INCONSISTENT. If it works, we are good to go. Code that's been in
 * toollib just set lock_flags to LCK_VG_WRITE and called vg_read_internal with
 * *consistent = 1.
 */
struct volume_group *vg_read(struct cmd_context *cmd, const char *vg_name,
			     const char *vgid, uint32_t read_flags, uint32_t lockd_state)
{
	const int for_update = (read_flags & READ_FOR_UPDATE) != 0;
	uint32_t lock_flags = for_update ? LCK_VG_WRITE : LCK_VG_READ;
	uint64_t status_flags = UINT64_C(0);

	if (for_update) {
		status_flags |= LVM_WRITE;
		if (!(read_flags & READ_ALLOW_EXPORTED))
			status_flags |= EXPORTED_VG;
	}

	return _vg_lock_and_read(cmd, vg_name, vgid, lock_flags,
				 status_flags, read_flags, lockd_state);
}
/*
 * A high-level volume group metadata reading function. Open a volume group for
 * later update (this means the user code can change the metadata and later
 * request the new metadata to be written and committed).
 *
 * Equivalent to vg_read() with READ_FOR_UPDATE added to read_flags.
 */
struct volume_group *vg_read_for_update(struct cmd_context *cmd, const char *vg_name,
			 const char *vgid, uint32_t read_flags, uint32_t lockd_state)
{
	const uint32_t update_flags = read_flags | READ_FOR_UPDATE;

	return vg_read(cmd, vg_name, vgid, update_flags, lockd_state);
}
/*
 * Test the validity of a VG handle returned by vg_read() or vg_read_for_update().
 *
 * Returns the handle's read_status, or FAILED_ALLOCATION when there is no
 * handle at all.
 */
uint32_t vg_read_error(struct volume_group *vg_handle)
{
	return vg_handle ? vg_handle->read_status : FAILED_ALLOCATION;
}
/*
 * Lock a vgname and/or check for existence.
 * Takes a WRITE lock on the vgname before scanning.
 * If scanning fails or vgname found, release the lock.
 * NOTE: If you find the return codes confusing, you might think of this
 * function as similar to an open() call with O_CREAT and O_EXCL flags
 * (open returns fail with -EEXIST if file already exists).
 *
 * Returns:
 * FAILED_LOCKING - Cannot lock name
 * FAILED_EXIST - VG name already exists - cannot reserve
 * SUCCESS - VG name does not exist in system and WRITE lock held
 */
uint32_t vg_lock_newname(struct cmd_context *cmd, const char *vgname)
{
	if (!lock_vol(cmd, vgname, LCK_VG_WRITE, NULL))
		return FAILED_LOCKING;

	/* Find the vgname in the cache */
	if (lvmcache_fmt_from_vgname(cmd, vgname, NULL, 1))
		goto found;

	/* If it's not there we must do full scan to be completely sure */
	lvmcache_label_scan(cmd);
	if (lvmcache_fmt_from_vgname(cmd, vgname, NULL, 1))
		goto found;

	/* Independent MDAs aren't supported under low memory */
	if (!cmd->independent_metadata_areas && critical_section()) {
		/*
		 * FIXME: Disallow calling this function if
		 * critical_section() is true.
		 */
		unlock_vg(cmd, NULL, vgname);
		return FAILED_LOCKING;
	}

	lvmcache_force_next_label_scan();
	lvmcache_label_scan(cmd);
	if (!lvmcache_fmt_from_vgname(cmd, vgname, NULL, 0)) {
		/* vgname not found after scanning: name reserved, lock kept */
		return SUCCESS;
	}

found:
	/* Found vgname so cannot reserve. */
	unlock_vg(cmd, NULL, vgname);

	return FAILED_EXIST;
}
/*
 * Allocate a format_instance inside a memory pool of its own.
 *
 * fmt  Format the instance belongs to (stored in fid->fmt).
 * fic  Creation context; only fic->type is used here.
 *
 * The new instance is zero-filled, starts with ref_count 1, owns the pool
 * it was allocated from (fid->mem) and has both metadata-area lists
 * initialised empty.
 *
 * Returns the instance, or NULL on failure, in which case the pool has
 * been destroyed again.
 */
struct format_instance *alloc_fid(const struct format_type *fmt,
				  const struct format_instance_ctx *fic)
{
	struct format_instance *fid;
	struct dm_pool *mem = dm_pool_create("format_instance", 1024);

	if (!mem)
		return_NULL;

	fid = dm_pool_zalloc(mem, sizeof(*fid));
	if (!fid) {
		log_error("Couldn't allocate format_instance object.");
		dm_pool_destroy(mem);
		return NULL;
	}

	fid->ref_count = 1;
	fid->mem = mem;
	fid->type = fic->type;
	fid->fmt = fmt;

	dm_list_init(&fid->metadata_areas_in_use);
	dm_list_init(&fid->metadata_areas_ignored);

	return fid;
}
/*
 * Point a PV at a (possibly NULL) format instance.
 *
 * Nothing happens when the PV already uses fid.  Otherwise the new
 * instance's ref_count is incremented first, then destroy_instance is
 * called on the instance the PV used before (if any), and finally the
 * PV's pointer is switched.
 */
void pv_set_fid(struct physical_volume *pv,
		struct format_instance *fid)
{
	struct format_instance *old_fid = pv->fid;

	if (old_fid == fid)
		return;

	if (fid)
		fid->ref_count++;

	if (old_fid)
		old_fid->fmt->ops->destroy_instance(old_fid);

	pv->fid = fid;
}
/*
 * Point a VG, and every PV on its pvs and removed_pvs lists, at a
 * (possibly NULL) format instance.
 *
 * Nothing happens when the VG already uses fid.  Otherwise the new
 * instance's ref_count is incremented for the VG's own reference, each
 * PV is switched with pv_set_fid(), destroy_instance is called on the
 * instance the VG used before (if any), and the VG's pointer is switched.
 */
void vg_set_fid(struct volume_group *vg,
		 struct format_instance *fid)
{
	struct format_instance *old_fid = vg->fid;
	struct pv_list *pvl;

	if (old_fid == fid)
		return;

	if (fid)
		fid->ref_count++;

	dm_list_iterate_items(pvl, &vg->pvs)
		pv_set_fid(pvl->pv, fid);

	dm_list_iterate_items(pvl, &vg->removed_pvs)
		pv_set_fid(pvl->pv, fid);

	if (old_fid)
		old_fid->fmt->ops->destroy_instance(old_fid);

	vg->fid = fid;
}
/*
 * Build the index key "<key>_<sub_key>" in buf.
 *
 * key      Key bytes; exactly key_len bytes are copied, no NUL is required.
 * key_len  Number of bytes to take from key.
 * sub_key  Number appended in decimal after an underscore.
 * buf      Destination buffer.
 * buf_len  Size of buf in bytes.
 *
 * Returns 1 on success.  Returns 0 if the key alone would not leave any
 * room in buf, or if dm_snprintf() reports that the suffix did not fit.
 * The size check must precede the copy: without it an oversized key would
 * be written past the end of buf and buf_len - key_len would wrap around
 * to a huge remaining size.
 */
static int _convert_key_to_string(const char *key, size_t key_len,
				  unsigned sub_key, char *buf, size_t buf_len)
{
	if (key_len >= buf_len)
		return_0;

	memcpy(buf, key, key_len);
	buf += key_len;
	buf_len -= key_len;

	if ((dm_snprintf(buf, buf_len, "_%u", sub_key) == -1))
		return_0;

	return 1;
}
/*
 * Attach a metadata area to a format instance.
 *
 * The mda is put on the instance's ignored list if mda_is_ignored() says
 * so, otherwise on the in-use list.  When a key is supplied the mda is
 * additionally entered into the instance's index under "<key>_<sub_key>".
 *
 * Returns 1 on success (including when no key is given).  Returns 0 if a
 * key is given but the instance has no index, if the index key cannot be
 * built, or if the hash insertion fails; in those cases the mda has
 * already been added to its list.
 */
int fid_add_mda(struct format_instance *fid, struct metadata_area *mda,
		 const char *key, size_t key_len, const unsigned sub_key)
{
	static char full_key[PATH_MAX];
	struct dm_list *target = mda_is_ignored(mda) ?
		&fid->metadata_areas_ignored : &fid->metadata_areas_in_use;

	dm_list_add(target, &mda->list);

	/* Return if the mda is not supposed to be indexed. */
	if (!key)
		return 1;

	if (!fid->metadata_areas_index)
		return_0;

	/* Add metadata area to index. */
	if (!_convert_key_to_string(key, key_len, sub_key,
				    full_key, sizeof(full_key)))
		return_0;

	if (dm_hash_insert(fid->metadata_areas_index, full_key, mda))
		return 1;

	log_error("Failed to hash mda.");

	return 0;
}
/*
 * Copy every metadata area on 'mdas' into a format instance.
 *
 * Each mda is duplicated into fid->mem with mda_copy().  Any mda already
 * indexed under the same key and position is removed first, then the copy
 * is added under "<key>_<position>", positions counting from 0.
 *
 * Returns 0 if a copy cannot be made, otherwise 1.  The results of
 * fid_remove_mda() and fid_add_mda() are not examined.
 */
int fid_add_mdas(struct format_instance *fid, struct dm_list *mdas,
		 const char *key, size_t key_len)
{
	struct metadata_area *src, *copy;
	unsigned position = 0;

	dm_list_iterate_items(src, mdas) {
		if (!(copy = mda_copy(fid->mem, src)))
			return_0;

		fid_remove_mda(fid, NULL, key, key_len, position);
		fid_add_mda(fid, copy, key, key_len, position);
		position++;
	}

	return 1;
}
/*
 * Look up the metadata area indexed under "<key>_<sub_key>".
 *
 * Returns the mda, or NULL if the instance has no index, the index key
 * cannot be built, or nothing is stored under that key.
 */
struct metadata_area *fid_get_mda_indexed(struct format_instance *fid,
					   const char *key, size_t key_len,
					   const unsigned sub_key)
{
	static char full_key[PATH_MAX];

	if (!fid->metadata_areas_index)
		return_NULL;

	if (!_convert_key_to_string(key, key_len, sub_key,
				    full_key, sizeof(full_key)))
		return_NULL;

	return (struct metadata_area *) dm_hash_lookup(fid->metadata_areas_index,
						       full_key);
}
int fid_remove_mda(struct format_instance *fid, struct metadata_area *mda,
const char *key, size_t key_len, const unsigned sub_key)
{
static char full_key[PATH_MAX];
struct metadata_area *mda_indexed = NULL;
/* At least one of mda or key must be specified. */
if (!mda && !key)
return 1;
if (key) {
/*
* If both mda and key specified, check given mda
* with what we find using the index and return
* immediately if these two do not match.
*/
if (!(mda_indexed = fid_get_mda_indexed(fid, key, key_len, sub_key)) ||
(mda && mda != mda_indexed))
return 1;
mda = mda_indexed;
2012-02-13 03:01:19 +04:00
if (!_convert_key_to_string(key, key_len, sub_key,
full_key, sizeof(full_key)))
return_0;
2012-02-13 03:01:19 +04:00
dm_hash_remove(fid->metadata_areas_index, full_key);
}
dm_list_del(&mda->list);
return 1;
}
/*
 * Copy constructor for a metadata_area.
 *
 * The copy is allocated from 'mem' and starts as a byte-for-byte copy of
 * 'mda'.  If the mda's ops provide mda_metadata_locn_copy and the mda has
 * a metadata_locn, the location is duplicated through that operation as
 * well.  The copy's list node is re-initialised so it is on no list.
 *
 * Returns the copy, or NULL if an allocation fails.
 */
struct metadata_area *mda_copy(struct dm_pool *mem,
			       struct metadata_area *mda)
{
	struct metadata_area *copy = dm_pool_alloc(mem, sizeof(*copy));

	if (!copy) {
		log_error("metadata_area allocation failed");
		return NULL;
	}

	memcpy(copy, mda, sizeof(*mda));

	if (mda->ops->mda_metadata_locn_copy && mda->metadata_locn) {
		copy->metadata_locn =
			mda->ops->mda_metadata_locn_copy(mem, mda->metadata_locn);
		if (!copy->metadata_locn) {
			dm_pool_free(mem, copy);
			return NULL;
		}
	}

	dm_list_init(&copy->list);

	return copy;
}
/*
* This function provides a way to answer the question on a format specific
* basis - does the format specific context of these two metadata areas
* match?
*
* A metadata_area is defined to be independent of the underlying context.
* This has the benefit that we can use the same abstraction to read disks
* (see _metadata_text_raw_ops) or files (see _metadata_text_file_ops).
* However, one downside is there is no format-independent way to determine
* whether a given metadata_area is attached to a specific device - in fact,
* it may not be attached to a device at all.
*
* Thus, LVM is structured such that an mda is not a member of struct
* physical_volume. The location of the mda depends on whether
* the PV is in a volume group. A PV not in a VG has an mda on the
* 'info->mda' list in lvmcache, while a PV in a VG has an mda on
* the vg->fid->metadata_areas_in_use list. For further details, see _vg_read(),
* and the sequence of creating the format_instance with fid->metadata_areas_in_use
* list, as well as the construction of the VG, with list of PVs (comes
* after the construction of the fid and list of mdas).
*/
unsigned mda_locns_match(struct metadata_area *mda1, struct metadata_area *mda2)
{
	/*
	 * The comparison is delegated to the mda_locns_match hook, and only
	 * when mda1 has one and mda2 uses the very same hook.  A missing
	 * hook on either side, or two different hooks, yields 0.
	 */
	if (mda1->ops->mda_locns_match &&
	    mda1->ops->mda_locns_match == mda2->ops->mda_locns_match)
		return mda1->ops->mda_locns_match(mda1, mda2);

	return 0;
}
/*
 * Return the device reported by the mda's mda_get_device hook, or NULL
 * when the mda's ops do not provide that hook.
 */
struct device *mda_get_device(struct metadata_area *mda)
{
	return mda->ops->mda_get_device ? mda->ops->mda_get_device(mda) : NULL;
}
/*
 * Report whether the MDA_IGNORED bit is set in the mda's status.
 * The result is the masked status value itself: zero when the bit is
 * clear, non-zero when it is set.
 */
unsigned mda_is_ignored(struct metadata_area *mda)
{
	const unsigned ignored = (mda->status & MDA_IGNORED);

	return ignored;
}
/*
 * Set (mda_ignored != 0) or clear (mda_ignored == 0) the MDA_IGNORED
 * bit in the mda's status.
 *
 * If the bit already has the requested state nothing is changed and
 * nothing is logged.  Otherwise the change is reported at metadata
 * debug level, using the ops' mda_metadata_locn_name and
 * mda_metadata_locn_offset hooks when available ("" and 0 otherwise).
 */
void mda_set_ignored(struct metadata_area *mda, unsigned mda_ignored)
{
	void *locn = mda->metadata_locn;
	unsigned old_mda_ignored = mda_is_ignored(mda);

	if (mda_ignored && !old_mda_ignored)
		mda->status |= MDA_IGNORED;
	else if (!mda_ignored && old_mda_ignored)
		mda->status &= ~MDA_IGNORED;
	else
		return;	/* No change */

	log_debug_metadata("%s ignored flag for mda %s at offset %" PRIu64 ".",
			   mda_ignored ? "Setting" : "Clearing",
			   mda->ops->mda_metadata_locn_name ? mda->ops->mda_metadata_locn_name(locn) : "",
			   mda->ops->mda_metadata_locn_offset ? mda->ops->mda_metadata_locn_offset(locn) : UINT64_C(0));
}
/*
 * Returns 1 if the list 'mdas' is empty, or if at least one metadata
 * area on it has its ignored flag set (a single ignored mda is enough).
 * Returns 0 when the list is non-empty and none of its mdas is ignored.
 */
int mdas_empty_or_ignored(struct dm_list *mdas)
{
	struct metadata_area *entry;

	if (!dm_list_empty(mdas)) {
		dm_list_iterate_items(entry, mdas) {
			if (mda_is_ignored(entry))
				return 1;
		}

		return 0;
	}

	return 1;
}
/*
 * Mark all metadata areas of a PV as ignored (mda_ignored != 0) or as
 * in-use (mda_ignored == 0).
 *
 * The request is rejected with an error, and 0 is returned, when:
 *   - the PV has no metadata areas at all;
 *   - ignoring is requested but no mda of the PV is currently in use;
 *   - in-use is requested but every mda of the PV is already in use.
 *
 * On success, if the PV belongs to a VG whose metadata copies setting
 * is not VGMETADATACOPIES_UNMANAGED, that setting is updated to the
 * VG's resulting number of in-use mdas and a warning is printed.
 *
 * Returns 1 on success, 0 on failure.
 */
int pv_change_metadataignore(struct physical_volume *pv, uint32_t mda_ignored)
{
	const char *pv_name = pv_dev_name(pv);

	/*
	 * This must be tested first: a PV without any mdas presumably also
	 * has a used count of 0, so either of the two "already ..." tests
	 * below would match it and report a misleading reason.
	 */
	if (!pv_mda_count(pv)) {
		log_error("Physical volume \"%s\" has no metadata "
			  "areas.", pv_name);
		return 0;
	}

	if (mda_ignored && !pv_mda_used_count(pv)) {
		log_error("Metadata areas on physical volume \"%s\" already "
			  "ignored.", pv_name);
		return 0;
	}

	if (!mda_ignored && (pv_mda_used_count(pv) == pv_mda_count(pv))) {
		log_error("Metadata areas on physical volume \"%s\" already "
			  "marked as in-use.", pv_name);
		return 0;
	}

	log_verbose("Marking metadata areas on physical volume \"%s\" "
		    "as %s.", pv_name, mda_ignored ? "ignored" : "in-use");

	if (!pv_mda_set_ignored(pv, mda_ignored))
		return_0;

	/*
	 * Update vg_mda_copies based on the mdas in this PV.
	 * This is most likely what the user would expect - if they
	 * specify a specific PV to be ignored/un-ignored, they will
	 * most likely not want LVM to turn around and change the
	 * ignore / un-ignore value when it writes the VG to disk.
	 * This does not guarantee this PV's ignore bits will be
	 * preserved in future operations.
	 */
	if (!is_orphan(pv) &&
	    vg_mda_copies(pv->vg) != VGMETADATACOPIES_UNMANAGED) {
		log_warn("WARNING: Changing preferred number of copies of VG %s "
			 "metadata from %"PRIu32" to %"PRIu32, pv_vg_name(pv),
			 vg_mda_copies(pv->vg), vg_mda_used_count(pv->vg));
		vg_set_mda_copies(pv->vg, vg_mda_used_count(pv->vg));
	}

	return 1;
}
/*
 * Build a comma-separated string out of the tag list 'tagsl'.
 *
 * The string is assembled as a pool object in 'mem': every tag is
 * appended in list order, with a ',' after each one except the last,
 * and a terminating NUL is added at the end.  An empty list produces
 * an empty string.
 *
 * Returns the finished string, or NULL on failure.  If growing the
 * object fails, the half-built object is abandoned so that the pool is
 * not left with an object still open.
 */
char *tags_format_and_copy(struct dm_pool *mem, const struct dm_list *tagsl)
{
	struct dm_str_list *sl;

	if (!dm_pool_begin_object(mem, 256)) {
		log_error("dm_pool_begin_object failed");
		return NULL;
	}

	dm_list_iterate_items(sl, tagsl) {
		/* sl->list.n == tagsl means this is the last tag: no separator. */
		if (!dm_pool_grow_object(mem, sl->str, strlen(sl->str)) ||
		    (sl->list.n != tagsl && !dm_pool_grow_object(mem, ",", 1)))
			goto bad;
	}

	if (!dm_pool_grow_object(mem, "\0", 1))
		goto bad;

	return dm_pool_end_object(mem);

bad:
	log_error("dm_pool_grow_object failed");
	dm_pool_abandon_object(mem);

	return NULL;
}
/*
 * Map an LV to its counterpart in the committed copy of its VG.
 *
 * Returns:
 *   - NULL if 'lv' is NULL;
 *   - 'lv' itself if its VG has no vg_committed copy;
 *   - otherwise the LV with the same lvid found in vg_committed, or
 *     NULL (after logging an internal error) if there is none.
 */
const struct logical_volume *lv_committed(const struct logical_volume *lv)
{
	struct logical_volume *committed_lv;

	if (!lv)
		return NULL;

	if (!lv->vg->vg_committed)
		return lv;

	committed_lv = find_lv_in_vg_by_lvid(lv->vg->vg_committed, &lv->lvid);
	if (committed_lv)
		return committed_lv;

	log_error(INTERNAL_ERROR "LV %s (UUID %s) not found in committed metadata.",
		  display_lvname(lv), lv->lvid.s);

	return NULL;
}
2015-03-05 23:00:44 +03:00
/*
 * Tell whether a lock_type is one that is handled by lvmlockd.
 *
 * Returns 1 for exactly "dlm" or "sanlock".
 * Returns 0 for anything else (e.g. none, clvm) and for a NULL
 * lock_type.
 */
int is_lockd_type(const char *lock_type)
{
	return (lock_type &&
		(strcmp(lock_type, "dlm") == 0 ||
		 strcmp(lock_type, "sanlock") == 0)) ? 1 : 0;
}
/*
 * Drop records of historical LVs that were removed longer ago than the
 * retention time read from metadata_lvs_history_retention_time_CFG.
 *
 * A retention time of 0 turns the check off and nothing is touched.
 * Records whose removal timestamp lies in the future are skipped.
 *
 * Returns 1 on success, 0 if removing an outdated record failed.
 */
int vg_strip_outdated_historical_lvs(struct volume_group *vg)
{
	struct glv_list *item, *next_item;
	time_t now = time(NULL);
	uint64_t retention = find_config_tree_int(vg->cmd, metadata_lvs_history_retention_time_CFG, NULL);

	if (!retention)
		return 1;

	/* The "safe" iterator is used because items get removed while walking. */
	dm_list_iterate_items_safe(item, next_item, &vg->historical_lvs) {
		/*
		 * Removal time in the future? Not likely,
		 * but skip this item in any case.
		 */
		if (now < (time_t) item->glv->historical->timestamp_removed)
			continue;

		/* Still within the retention period: keep the record. */
		if ((now - item->glv->historical->timestamp_removed) <= retention)
			continue;

		if (!historical_glv_remove(item->glv)) {
			log_error("Failed to destroy record about historical LV %s/%s.",
				  vg->name, item->glv->historical->name);
			return 0;
		}

		log_verbose("Outdated record for historical logical volume \"%s\" "
			    "automatically destroyed.", item->glv->historical->name);
	}

	return 1;
}