1
0
mirror of git://sourceware.org/git/lvm2.git synced 2025-01-18 10:04:20 +03:00
lvm2/lib/metadata/metadata-exported.h

1469 lines
57 KiB
C
Raw Normal View History

/*
* Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
* Copyright (C) 2004-2018 Red Hat, Inc. All rights reserved.
*
* This file is part of LVM2.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU Lesser General Public License v.2.1.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* This is the representation of LVM metadata that is being adapted
* for library export.
*/
#ifndef _LVM_METADATA_EXPORTED_H
#define _LVM_METADATA_EXPORTED_H
#include "lib/uuid/uuid.h"
#include "lib/metadata/pv.h"
#include "lib/metadata/vg.h"
#include "lib/metadata/lv.h"
#include "lib/misc/lvm-percent.h"
#include <stdbool.h>
#define MAX_STRIPES 128U
#define SECTOR_SHIFT 9L
#define SECTOR_SIZE ( 1L << SECTOR_SHIFT )
#define STRIPE_SIZE_MIN ( (unsigned) lvm_getpagesize() >> SECTOR_SHIFT) /* PAGESIZE in sectors */
#define STRIPE_SIZE_MAX ( 512L * 1024L >> SECTOR_SHIFT) /* 512 KB in sectors */
#define STRIPE_SIZE_LIMIT ((UINT_MAX >> 2) + 1)
#define MAX_RESTRICTED_LVS 255 /* Used by FMT_RESTRICTED_LVIDS */
#define MAX_EXTENT_SIZE ((uint32_t) -1)
#define MIN_NON_POWER2_EXTENT_SIZE (128U * 2U) /* 128KB in sectors */
#define HISTORICAL_LV_PREFIX "-"
/* Layer suffix */
#define MIRROR_SYNC_LAYER "_mimagetmp"
/* PV extension flags */
#define PV_EXT_USED UINT32_C(0x00000001)
/* Various flags */
/* Note that the bits no longer necessarily correspond to LVM1 disk format */
#define PARTIAL_VG UINT64_C(0x0000000000000001) /* VG */
#define EXPORTED_VG UINT64_C(0x0000000000000002) /* VG PV */
#define RESIZEABLE_VG UINT64_C(0x0000000000000004) /* VG */
/* May any free extents on this PV be used or must they be left free? */
#define ALLOCATABLE_PV UINT64_C(0x0000000000000008) /* PV */
#define ARCHIVED_VG ALLOCATABLE_PV /* VG, reuse same bit */
Add metadata-based autoactivation property for VG and LV The autoactivation property can be specified in lvcreate or vgcreate for new LVs/VGs, and the property can be changed by lvchange or vgchange for existing LVs/VGs. --setautoactivation y|n enables|disables autoactivation of a VG or LV. Autoactivation is enabled by default, which is consistent with past behavior. The disabled state is stored as a new flag in the VG metadata, and the absence of the flag allows autoactivation. If autoactivation is disabled for the VG, then no LVs in the VG will be autoactivated (the LV autoactivation property will have no effect.) When autoactivation is enabled for the VG, then autoactivation can be controlled on individual LVs. The state of this property can be reported for LVs/VGs using the "-o autoactivation" option in lvs/vgs commands, which will report "enabled", or "" for the disabled state. Previous versions of lvm do not recognize this property. Since autoactivation is enabled by default, the disabled setting will have no effect in older lvm versions. If the VG is modified by older lvm versions, the disabled state will also be dropped from the metadata. The autoactivation property is an alternative to using the lvm.conf auto_activation_volume_list, which is still applied to to VGs/LVs in addition to the new property. If VG or LV autoactivation is disabled either in metadata or in auto_activation_volume_list, it will not be autoactivated. An autoactivation command will silently skip activating an LV when the autoactivation property is disabled. To determine the effective autoactivation behavior for a specific LV, multiple settings would need to be checked: the VG autoactivation property, the LV autoactivation property, the auto_activation_volume_list. The "activation skip" property would also be relevant, since it applies to both normal and auto activation.
2021-04-01 17:20:00 -05:00
#define LV_NOAUTOACTIVATE UINT64_C(0x0000000000000010) /* LV - also a PV flag */
#define NOAUTOACTIVATE UINT64_C(0x0000000000000010) /* VG - also a PV flag */
//#define BADBLOCK_ON UINT64_C(0x0000000000000020) /* LV */
#define VISIBLE_LV UINT64_C(0x0000000000000040) /* LV */
#define FIXED_MINOR UINT64_C(0x0000000000000080) /* LV */
#define LVM_READ UINT64_C(0x0000000000000100) /* LV, VG */
#define LVM_WRITE UINT64_C(0x0000000000000200) /* LV, VG */
2015-03-09 18:53:22 +00:00
#define LVM_WRITE_LOCKED UINT64_C(0x0020000000000000) /* LV, VG */
#define CLUSTERED UINT64_C(0x0000000000000400) /* VG */
//#define SHARED UINT64_C(0x0000000000000800) /* VG */
/* FIXME Remove when metadata restructuring is completed */
#define SNAPSHOT UINT64_C(0x0000000000001000) /* LV - internal use only */
#define PVMOVE UINT64_C(0x0000000000002000) /* VG LV SEG */
#define LOCKED UINT64_C(0x0000000000004000) /* LV */
#define MIRRORED UINT64_C(0x0000000000008000) /* LV - internal use only */
#define VIRTUAL UINT64_C(0x0000000000010000) /* LV - internal use only */
#define MIRROR UINT64_C(0x0002000000000000) /* LV - Internal use only */
#define MIRROR_LOG UINT64_C(0x0000000000020000) /* LV - Internal use only */
#define MIRROR_IMAGE UINT64_C(0x0000000000040000) /* LV - Internal use only */
#define LV_NOTSYNCED UINT64_C(0x0000000000080000) /* LV */
#define LV_REBUILD UINT64_C(0x0000000000100000) /* LV */
//#define PRECOMMITTED UINT64_C(0x0000000000200000) /* VG - internal use only */
#define CONVERTING UINT64_C(0x0000000000400000) /* LV */
#define MISSING_PV UINT64_C(0x0000000000800000) /* PV */
Allow dm-integrity to be used for raid images dm-integrity stores checksums of the data written to an LV, and returns an error if data read from the LV does not match the previously saved checksum. When used on raid images, dm-raid will correct the error by reading the block from another image, and the device user sees no error. The integrity metadata (checksums) are stored on an internal LV allocated by lvm for each linear image. The internal LV is allocated on the same PV as the image. Create a raid LV with an integrity layer over each raid image (for raid levels 1,4,5,6,10): lvcreate --type raidN --raidintegrity y [options] Add an integrity layer to images of an existing raid LV: lvconvert --raidintegrity y LV Remove the integrity layer from images of a raid LV: lvconvert --raidintegrity n LV Settings Use --raidintegritymode journal|bitmap (journal is default) to configure the method used by dm-integrity to ensure crash consistency. Initialization When integrity is added to an LV, the kernel needs to initialize the integrity metadata/checksums for all blocks in the LV. The data corruption checking performed by dm-integrity will only operate on areas of the LV that are already initialized. The progress of integrity initialization is reported by the "syncpercent" LV reporting field (and under the Cpy%Sync lvs column.) Example: create a raid1 LV with integrity: $ lvcreate --type raid1 -m1 --raidintegrity y -n rr -L1G foo Creating integrity metadata LV rr_rimage_0_imeta with size 12.00 MiB. Logical volume "rr_rimage_0_imeta" created. Creating integrity metadata LV rr_rimage_1_imeta with size 12.00 MiB. Logical volume "rr_rimage_1_imeta" created. Logical volume "rr" created. $ lvs -a foo LV VG Attr LSize Origin Cpy%Sync rr foo rwi-a-r--- 1.00g 4.93 [rr_rimage_0] foo gwi-aor--- 1.00g [rr_rimage_0_iorig] 41.02 [rr_rimage_0_imeta] foo ewi-ao---- 12.00m [rr_rimage_0_iorig] foo -wi-ao---- 1.00g [rr_rimage_1] foo gwi-aor--- 1.00g [rr_rimage_1_iorig] 39.45 [rr_rimage_1_imeta] foo ewi-ao---- 12.00m [rr_rimage_1_iorig] foo -wi-ao---- 1.00g [rr_rmeta_0] foo ewi-aor--- 4.00m [rr_rmeta_1] foo ewi-aor--- 4.00m
2019-11-20 16:07:27 -06:00
#define INTEGRITY UINT64_C(0x0000000000800000) /* LV - Internal use only */
#define PV_MOVED_VG UINT64_C(0x4000000000000000) /* PV - Moved to a new VG */
#define PARTIAL_LV UINT64_C(0x0000000001000000) /* LV - derived flag, not
written out in metadata*/
2020-02-24 13:52:12 -06:00
#define WRITECACHE_ORIGIN UINT64_C(0x0000000002000000)
Allow dm-integrity to be used for raid images dm-integrity stores checksums of the data written to an LV, and returns an error if data read from the LV does not match the previously saved checksum. When used on raid images, dm-raid will correct the error by reading the block from another image, and the device user sees no error. The integrity metadata (checksums) are stored on an internal LV allocated by lvm for each linear image. The internal LV is allocated on the same PV as the image. Create a raid LV with an integrity layer over each raid image (for raid levels 1,4,5,6,10): lvcreate --type raidN --raidintegrity y [options] Add an integrity layer to images of an existing raid LV: lvconvert --raidintegrity y LV Remove the integrity layer from images of a raid LV: lvconvert --raidintegrity n LV Settings Use --raidintegritymode journal|bitmap (journal is default) to configure the method used by dm-integrity to ensure crash consistency. Initialization When integrity is added to an LV, the kernel needs to initialize the integrity metadata/checksums for all blocks in the LV. The data corruption checking performed by dm-integrity will only operate on areas of the LV that are already initialized. The progress of integrity initialization is reported by the "syncpercent" LV reporting field (and under the Cpy%Sync lvs column.) Example: create a raid1 LV with integrity: $ lvcreate --type raid1 -m1 --raidintegrity y -n rr -L1G foo Creating integrity metadata LV rr_rimage_0_imeta with size 12.00 MiB. Logical volume "rr_rimage_0_imeta" created. Creating integrity metadata LV rr_rimage_1_imeta with size 12.00 MiB. Logical volume "rr_rimage_1_imeta" created. Logical volume "rr" created. $ lvs -a foo LV VG Attr LSize Origin Cpy%Sync rr foo rwi-a-r--- 1.00g 4.93 [rr_rimage_0] foo gwi-aor--- 1.00g [rr_rimage_0_iorig] 41.02 [rr_rimage_0_imeta] foo ewi-ao---- 12.00m [rr_rimage_0_iorig] foo -wi-ao---- 1.00g [rr_rimage_1] foo gwi-aor--- 1.00g [rr_rimage_1_iorig] 39.45 [rr_rimage_1_imeta] foo ewi-ao---- 12.00m [rr_rimage_1_iorig] foo -wi-ao---- 1.00g [rr_rmeta_0] foo ewi-aor--- 4.00m [rr_rmeta_1] foo ewi-aor--- 4.00m
2019-11-20 16:07:27 -06:00
#define INTEGRITY_METADATA UINT64_C(0x0000000004000000) /* LV - Internal use only */
#define VIRTUAL_ORIGIN UINT64_C(0x0000000008000000) /* LV - internal use only */
#define MERGING UINT64_C(0x0000000010000000) /* LV SEG */
#define UNLABELLED_PV UINT64_C(0x0000000080000000) /* PV -this PV had no label written yet */
#define WRITECACHE UINT64_C(0x0000000080000000) /* LV - shared with UNLABELLED_PV */
#define RAID UINT64_C(0x0000000100000000) /* LV - Internal use only */
#define RAID_META UINT64_C(0x0000000200000000) /* LV - Internal use only */
#define RAID_IMAGE UINT64_C(0x0000000400000000) /* LV - Internal use only */
#define THIN_VOLUME UINT64_C(0x0000001000000000) /* LV - Internal use only */
#define THIN_POOL UINT64_C(0x0000002000000000) /* LV - Internal use only */
#define THIN_POOL_DATA UINT64_C(0x0000004000000000) /* LV - Internal use only */
#define THIN_POOL_METADATA UINT64_C(0x0000008000000000) /* LV - Internal use only */
#define POOL_METADATA_SPARE UINT64_C(0x0000010000000000) /* LV - Internal use only */
#define LV_WRITEMOSTLY UINT64_C(0x0000020000000000) /* LV (RAID1) */
RAID: Add writemostly/writebehind support for RAID1 'lvchange' is used to alter a RAID 1 logical volume's write-mostly and write-behind characteristics. The '--writemostly' parameter takes a PV as an argument with an optional trailing character to specify whether to set ('y'), unset ('n'), or toggle ('t') the value. If no trailing character is given, it will set the flag. Synopsis: lvchange [--writemostly <PV>:{t|y|n}] [--writebehind <count>] vg/lv Example: lvchange --writemostly /dev/sdb1:y --writebehind 512 vg/raid1_lv The last character in the 'lv_attr' field is used to show whether a device has the WriteMostly flag set. It is signified with a 'w'. If the device has failed, the 'p'artial flag has priority. Example ("nosync" raid1 with mismatch_cnt and writemostly): [~]# lvs -a --segment vg LV VG Attr #Str Type SSize raid1 vg Rwi---r-m 2 raid1 500.00m [raid1_rimage_0] vg Iwi---r-- 1 linear 500.00m [raid1_rimage_1] vg Iwi---r-w 1 linear 500.00m [raid1_rmeta_0] vg ewi---r-- 1 linear 4.00m [raid1_rmeta_1] vg ewi---r-- 1 linear 4.00m Example (raid1 with mismatch_cnt, writemostly - but failed drive): [~]# lvs -a --segment vg LV VG Attr #Str Type SSize raid1 vg rwi---r-p 2 raid1 500.00m [raid1_rimage_0] vg Iwi---r-- 1 linear 500.00m [raid1_rimage_1] vg Iwi---r-p 1 linear 500.00m [raid1_rmeta_0] vg ewi---r-- 1 linear 4.00m [raid1_rmeta_1] vg ewi---r-p 1 linear 4.00m A new reportable field has been added for writebehind as well. If write-behind has not been set or the LV is not RAID1, the field will be blank. Example (writebehind is set): [~]# lvs -a -o name,attr,writebehind vg LV Attr WBehind lv rwi-a-r-- 512 [lv_rimage_0] iwi-aor-w [lv_rimage_1] iwi-aor-- [lv_rmeta_0] ewi-aor-- [lv_rmeta_1] ewi-aor-- Example (writebehind is not set): [~]# lvs -a -o name,attr,writebehind vg LV Attr WBehind lv rwi-a-r-- [lv_rimage_0] iwi-aor-w [lv_rimage_1] iwi-aor-- [lv_rmeta_0] ewi-aor-- [lv_rmeta_1] ewi-aor--
2013-04-15 13:59:46 -05:00
#define LV_ACTIVATION_SKIP UINT64_C(0x0000040000000000) /* LV */
activation: flag temporary LVs internally Add LV_TEMPORARY flag for LVs with limited existence during command execution. Such LVs are temporary in way that they need to be activated, some action done and then removed immediately. Such LVs are just like any normal LV - the only difference is that they are removed during LVM command execution. This is also the case for LVs representing future pool metadata spare LVs which we need to initialize by using the usual LV before they are declared as pool metadata spare. We can optimize some other parts like udev to do a better job if it knows that the LV is temporary and any processing on it is just useless. This flag is orthogonal to LV_NOSCAN flag introduced recently as LV_NOSCAN flag is primarily used to mark an LV for the scanning to be avoided before the zeroing of the device happens. The LV_TEMPORARY flag makes a difference between a full-fledged LV visible in the system and the LV just used as a temporary overlay for some action that needs to be done on underlying PVs. For example: lvcreate --thinpool POOL --zero n -L 1G vg - first, the usual LV is created to do a clean up for pool metadata spare. The LV is activated, zeroed, deactivated. - between "activated" and "zeroed" stage, the LV_NOSCAN flag is used to avoid any scanning in udev - betwen "zeroed" and "deactivated" stage, we need to avoid the WATCH udev rule, but since the LV is just a usual LV, we can't make a difference. The LV_TEMPORARY internal LV flag helps here. If we create the LV with this flag, the DM_UDEV_DISABLE_DISK_RULES and DM_UDEV_DISABLE_OTHER_RULES flag are set (just like as it is with "invisible" and non-top-level LVs) - udev is directed to skip WATCH rule use. - if the LV_TEMPORARY flag was not used, there would normally be a WATCH event generated once the LV is closed after "zeroed" stage. This will make problems with immediated deactivation that follows.
2013-10-23 14:06:39 +02:00
#define LV_NOSCAN UINT64_C(0x0000080000000000) /* LV - internal use only - the LV
should not be scanned */
#define LV_TEMPORARY UINT64_C(0x0000100000000000) /* LV - internal use only - the LV
is supposed to be created and
removed or reactivated with
this flag dropped during single
LVM command execution. */
activation: add support for skipping activation of selected LVs Also add -k/--setactivationskip y/n and -K/--ignoreactivationskip options to lvcreate. The --setactivationskip y sets the flag in metadata for an LV to skip the LV during activation. Also, the newly created LV is not activated. Thin snapsots have this flag set automatically if not specified directly by the --setactivationskip y/n option. The --ignoreactivationskip overrides the activation skip flag set in metadata for an LV (just for the run of the command - the flag is not changed in metadata!) A few examples for the lvcreate with the new options: (non-thin snap LV => skip flag not set in MDA + LV activated) raw/~ $ lvcreate -l1 vg Logical volume "lvol0" created raw/~ $ lvs -o lv_name,attr vg/lvol0 LV Attr lvol0 -wi-a---- (non-thin snap LV + -ky => skip flag set in MDA + LV not activated) raw/~ $ lvcreate -l1 -ky vg Logical volume "lvol1" created raw/~ $ lvs -o lv_name,attr vg/lvol1 LV Attr lvol1 -wi------ (non-thin snap LV + -ky + -K => skip flag set in MDA + LV activated) raw/~ $ lvcreate -l1 -ky -K vg Logical volume "lvol2" created raw/~ $ lvs -o lv_name,attr vg/lvol2 LV Attr lvol2 -wi-a---- (thin snap LV => skip flag set in MDA (default behaviour) + LV not activated) raw/~ $ lvcreate -L100M -T vg/pool -V 1T -n thin_lv Logical volume "thin_lv" created raw/~ $ lvcreate -s vg/thin_lv -n thin_snap Logical volume "thin_snap" created raw/~ $ lvs -o name,attr vg LV Attr pool twi-a-tz- thin_lv Vwi-a-tz- thin_snap Vwi---tz- (thin snap LV + -K => skip flag set in MDA (default behaviour) + LV activated) raw/~ $ lvcreate -s vg/thin_lv -n thin_snap -K Logical volume "thin_snap" created raw/~ $ lvs -o name,attr vg/thin_lv LV Attr thin_lv Vwi-a-tz- (thins snap LV + -kn => no skip flag in MDA (default behaviour overridden) + LV activated) [0] raw/~ # lvcreate -s vg/thin_lv -n thin_snap -kn Logical volume "thin_snap" created [0] raw/~ # lvs -o name,attr vg/thin_snap LV Attr thin_snap Vwi-a-tz-
2013-07-10 14:06:50 +02:00
#define CACHE_POOL UINT64_C(0x0000200000000000) /* LV - Internal use only */
#define CACHE_POOL_DATA UINT64_C(0x0000400000000000) /* LV - Internal use only */
#define CACHE_POOL_METADATA UINT64_C(0x0000800000000000) /* LV - Internal use only */
#define CACHE UINT64_C(0x0001000000000000) /* LV - Internal use only */
2014-11-10 10:56:43 +01:00
#define LV_PENDING_DELETE UINT64_C(0x0004000000000000) /* LV - Internal use only */
metadata: process_each_lv_in_vg: get the list of LVs to process first, then do the processing This avoids a problem in which we're using selection on LV list - we need to do the selection on initial state and not on any intermediary state as we process LVs one by one - some of the relations among LVs can be gone during this processing. For example, processing one LV can cause the other LVs to lose the relation to this LV and hence they're not selectable anymore with the original selection criteria as it would be if we did selection on inital state. A perfect example is with thin snapshots: $ lvs -o lv_name,origin,layout,role vg LV Origin Layout Role lvol1 thin,sparse public,origin,thinorigin,multithinorigin lvol2 lvol1 thin,sparse public,snapshot,thinsnapshot lvol3 lvol1 thin,sparse public,snapshot,thinsnapshot pool thin,pool private $ lvremove -ff -S 'lv_name=lvol1 || origin=lvol1' Logical volume "lvol1" successfully removed The lvremove command above was supposed to remove lvol1 as well as all its snapshots which have origin=lvol1. It failed to do so, because once we removed the origin lvol1, the lvol2 and lvol3 which were snapshots before are not snapshots anymore - the relations change as we're processing these LVs one by one. If we do the selection first and then execute any concrete actions on these LVs (which is what this patch does), the behaviour is correct then - the selection is done on the *initial state*: $ lvremove -ff -S 'lv_name=lvol1 || origin=lvol1' Logical volume "lvol1" successfully removed Logical volume "lvol2" successfully removed Logical volume "lvol3" successfully removed Similarly for all the other situations in which relations among LVs are being changed by processing the LVs one by one. This patch also introduces LV_REMOVED internal LV status flag to mark removed LVs so they're not processed further when we iterate over collected list of LVs to be processed. Previously, when we iterated directly over vg->lvs list to process the LVs, we relied on the fact that once the LV is removed, it is also removed from the vg->lvs list we're iterating over. But that was incorrect as we shouldn't remove LVs from the list during one iteration while we're iterating over that exact list (dm_list_iterate_items safe can handle only one removal at one iteration anyway, so it can't be used here).
2015-03-16 17:10:21 +01:00
#define LV_REMOVED UINT64_C(0x0040000000000000) /* LV - Internal use only
This flag is used to mark an LV once it has
been removed from the VG. It might still
be referenced on internal lists of LVs.
Any remaining references should check for
this flag and ignore the LV is set.
FIXME: Remove this flag once we have indexed
vg->removed_lvs for quick lookup.
metadata: process_each_lv_in_vg: get the list of LVs to process first, then do the processing This avoids a problem in which we're using selection on LV list - we need to do the selection on initial state and not on any intermediary state as we process LVs one by one - some of the relations among LVs can be gone during this processing. For example, processing one LV can cause the other LVs to lose the relation to this LV and hence they're not selectable anymore with the original selection criteria as it would be if we did selection on inital state. A perfect example is with thin snapshots: $ lvs -o lv_name,origin,layout,role vg LV Origin Layout Role lvol1 thin,sparse public,origin,thinorigin,multithinorigin lvol2 lvol1 thin,sparse public,snapshot,thinsnapshot lvol3 lvol1 thin,sparse public,snapshot,thinsnapshot pool thin,pool private $ lvremove -ff -S 'lv_name=lvol1 || origin=lvol1' Logical volume "lvol1" successfully removed The lvremove command above was supposed to remove lvol1 as well as all its snapshots which have origin=lvol1. It failed to do so, because once we removed the origin lvol1, the lvol2 and lvol3 which were snapshots before are not snapshots anymore - the relations change as we're processing these LVs one by one. If we do the selection first and then execute any concrete actions on these LVs (which is what this patch does), the behaviour is correct then - the selection is done on the *initial state*: $ lvremove -ff -S 'lv_name=lvol1 || origin=lvol1' Logical volume "lvol1" successfully removed Logical volume "lvol2" successfully removed Logical volume "lvol3" successfully removed Similarly for all the other situations in which relations among LVs are being changed by processing the LVs one by one. This patch also introduces LV_REMOVED internal LV status flag to mark removed LVs so they're not processed further when we iterate over collected list of LVs to be processed. Previously, when we iterated directly over vg->lvs list to process the LVs, we relied on the fact that once the LV is removed, it is also removed from the vg->lvs list we're iterating over. But that was incorrect as we shouldn't remove LVs from the list during one iteration while we're iterating over that exact list (dm_list_iterate_items safe can handle only one removal at one iteration anyway, so it can't be used here).
2015-03-16 17:10:21 +01:00
*/
#define LV_ERROR_WHEN_FULL UINT64_C(0x0008000000000000) /* LV - error when full */
#define PV_ALLOCATION_PROHIBITED UINT64_C(0x0010000000000000) /* PV - internal use only - allocation prohibited
e.g. to prohibit allocation of a RAID image
on a PV already holing an image of the RAID set */
#define LOCKD_SANLOCK_LV UINT64_C(0x0080000000000000) /* LV - Internal use only */
lvconvert: add infrastructure for RaidLV reshaping support In order to support striped raid5/6/10 LV reshaping (change of LV type, stripesize or number of legs), this patch introduces infrastructure prerequisites to be used by raid_manip.c extensions in followup patches. This base is needed for allocation of out-of-place reshape space required by the MD raid personalities to avoid writing over data in-place when reading off the current RAID layout or number of legs and writing out the new layout or to a different number of legs (i.e. restripe) Changes: - add members reshape_len to 'struct lv_segment' to store out-of-place reshape length per component rimage - add member data_copies to struct lv_segment to support more than 2 raid10 data copies - make alloc_lv_segment() aware of both reshape_len and data_copies - adjust all alloc_lv_segment() callers to the new API - add functions to retrieve the current data offset (needed for out-of-place reshaping space allocation) and the devices count from the kernel - make libdm deptree code aware of reshape_len - add LV flags for disk add/remove reshaping - support import/export of the new 'struct lv_segment' members - enhance lv_extend/_lv_reduce to cope with reshape_len - add seg_is_*/segtype_is_* macros related to reshaping - add target version check for reshaping - grow rebuilds/writemostly bitmaps to 246 bit to support kernel maximal - enhance libdm deptree code to support data_offset (out-of-place reshaping) and delta_disk (legs add/remove reshaping) target arguments Related: rhbz834579 Related: rhbz1191935 Related: rhbz1191978
2017-02-24 00:50:00 +01:00
#define LV_RESHAPE_DELTA_DISKS_PLUS UINT64_C(0x0100000000000000) /* LV reshape flag delta disks plus image(s) */
#define LV_RESHAPE_DELTA_DISKS_MINUS UINT64_C(0x0200000000000000) /* LV reshape flag delta disks minus image(s) */
#define LV_REMOVE_AFTER_RESHAPE UINT64_C(0x0400000000000000) /* LV needs to be removed after a shrinking reshape */
#define LV_METADATA_FORMAT UINT64_C(0x0800000000000000) /* LV has segments with metadata format */
thin: improve 16g support for thin pool metadata Initial support for thin-pool used slightly smaller max size 15.81GiB for thin-pool metadata. However the real limit later settled at 15.88GiB (difference is ~64MiB - 16448 4K blocks). lvm2 could not simply increase the size as it has been using hard cropping of the loaded metadata device to avoid warnings printing warning of kernel when the size was bigger (i.e. due to bigger extent_size). This patch adds the new lvm.conf configurable setting: allocation/thin_pool_crop_metadata which defaults to 0 -> no crop of metadata beyond 15.81GiB. Only user with these sizes of metadata will be affected. Without cropping lvm2 now limits metadata allocation size to 15.88GiB. Any space beyond is currently not used by thin-pool target. Even if i.e. bigger LV is used for metadata via lvconvert, or allocated bigger because of to large extent size. With cropping enabled (=1) lvm2 preserves the old limitation 15.81GiB and should allow to work in the evironement with older lvm2 tools (i.e. older distribution). Thin-pool metadata with size bigger then 15.81G is now using CROP_METADATA flag within lvm2 metadata, so older lvm2 recognizes an incompatible thin-pool and cannot activate such pool! Users should use uncropped version as it is not suffering from various issues between thin_repair results and allocated metadata LV as thin_repair limit is 15.88GiB Users should use cropping only when really needed! Patch also better handles resize of thin-pool metadata and prevents resize beoyond usable size 15.88GiB. Resize beyond 15.81GiB automatically switches pool to no-crop version. Even with existing bigger thin-pool metadata command 'lvextend -l+1 vg/pool_tmeta' does the change. Patch gives better controls 'coverted' metadata LV and reports less confusing message during conversion. Patch set also moves the code for updating min/max into pool_manip.c for better sharing with cache_pool code.
2021-01-12 17:59:29 +01:00
#define LV_CROP_METADATA UINT64_C(0x0000000000000400) /* LV - also VG CLUSTERED */
#define LV_RESHAPE UINT64_C(0x1000000000000000) /* Ongoing reshape (number of stripes, stripesize or raid algorithm change):
used as SEGTYPE_FLAG to prevent activation on old runtime */
#define LV_RESHAPE_DATA_OFFSET UINT64_C(0x2000000000000000) /* LV reshape flag data offset (out of place reshaping) */
#define LV_VDO UINT64_C(0x0000000020000000) /* LV - Internal user only */
#define LV_VDO_POOL UINT64_C(0x0000000040000000) /* LV - Internal user only */
#define LV_VDO_POOL_DATA UINT64_C(0x8000000000000000) /* LV - Internal user only */
#define LV_CACHE_VOL UINT64_C(0x0010000000000000) /* LV - also a PV flag */
#define LV_CACHE_USES_CACHEVOL UINT64_C(0x4000000000000000) /* LV - also a PV flag */
Allow dm-cache cache device to be standard LV If a single, standard LV is specified as the cache, use it directly instead of converting it into a cache-pool object with two separate LVs (for data and metadata). With a single LV as the cache, lvm will use blocks at the beginning for metadata, and the rest for data. Separate dm linear devices are set up to point at the metadata and data areas of the LV. These dm devs are given to the dm-cache target to use. The single LV cache cannot be resized without recreating it. If the --poolmetadata option is used to specify an LV for metadata, then a cache pool will be created (with separate LVs for data and metadata.) Usage: $ lvcreate -n main -L 128M vg /dev/loop0 $ lvcreate -n fast -L 64M vg /dev/loop1 $ lvs -a vg LV VG Attr LSize Type Devices main vg -wi-a----- 128.00m linear /dev/loop0(0) fast vg -wi-a----- 64.00m linear /dev/loop1(0) $ lvconvert --type cache --cachepool fast vg/main $ lvs -a vg LV VG Attr LSize Origin Pool Type Devices [fast] vg Cwi---C--- 64.00m linear /dev/loop1(0) main vg Cwi---C--- 128.00m [main_corig] [fast] cache main_corig(0) [main_corig] vg owi---C--- 128.00m linear /dev/loop0(0) $ lvchange -ay vg/main $ dmsetup ls vg-fast_cdata (253:4) vg-fast_cmeta (253:5) vg-main_corig (253:6) vg-main (253:24) vg-fast (253:3) $ dmsetup table vg-fast_cdata: 0 98304 linear 253:3 32768 vg-fast_cmeta: 0 32768 linear 253:3 0 vg-main_corig: 0 262144 linear 7:0 2048 vg-main: 0 262144 cache 253:5 253:4 253:6 128 2 metadata2 writethrough mq 0 vg-fast: 0 131072 linear 7:1 2048 $ lvchange -an vg/min $ lvconvert --splitcache vg/main $ lvs -a vg LV VG Attr LSize Type Devices fast vg -wi------- 64.00m linear /dev/loop1(0) main vg -wi------- 128.00m linear /dev/loop0(0)
2018-08-17 15:45:52 -05:00
Add metadata-based autoactivation property for VG and LV The autoactivation property can be specified in lvcreate or vgcreate for new LVs/VGs, and the property can be changed by lvchange or vgchange for existing LVs/VGs. --setautoactivation y|n enables|disables autoactivation of a VG or LV. Autoactivation is enabled by default, which is consistent with past behavior. The disabled state is stored as a new flag in the VG metadata, and the absence of the flag allows autoactivation. If autoactivation is disabled for the VG, then no LVs in the VG will be autoactivated (the LV autoactivation property will have no effect.) When autoactivation is enabled for the VG, then autoactivation can be controlled on individual LVs. The state of this property can be reported for LVs/VGs using the "-o autoactivation" option in lvs/vgs commands, which will report "enabled", or "" for the disabled state. Previous versions of lvm do not recognize this property. Since autoactivation is enabled by default, the disabled setting will have no effect in older lvm versions. If the VG is modified by older lvm versions, the disabled state will also be dropped from the metadata. The autoactivation property is an alternative to using the lvm.conf auto_activation_volume_list, which is still applied to to VGs/LVs in addition to the new property. If VG or LV autoactivation is disabled either in metadata or in auto_activation_volume_list, it will not be autoactivated. An autoactivation command will silently skip activating an LV when the autoactivation property is disabled. To determine the effective autoactivation behavior for a specific LV, multiple settings would need to be checked: the VG autoactivation property, the LV autoactivation property, the auto_activation_volume_list. The "activation skip" property would also be relevant, since it applies to both normal and auto activation.
2021-04-01 17:20:00 -05:00
/* Format features flags */
#define FMT_SEGMENTS 0x00000001U /* Arbitrary segment params? */
// #define FMT_MDAS 0x00000002U /* Proper metadata areas? */
#define FMT_TAGS 0x00000004U /* Tagging? */
#define FMT_UNLIMITED_VOLS 0x00000008U /* Unlimited PVs/LVs? */
#define FMT_RESTRICTED_LVIDS 0x00000010U /* LVID <= 255 */
#define FMT_ORPHAN_ALLOCATABLE 0x00000020U /* Orphan PV allocatable? */
//#define FMT_PRECOMMIT 0x00000040U /* Supports pre-commit? */
#define FMT_RESIZE_PV 0x00000080U /* Supports pvresize? */
#define FMT_UNLIMITED_STRIPESIZE 0x00000100U /* Unlimited stripe size? */
#define FMT_RESTRICTED_READAHEAD 0x00000200U /* Readahead restricted to 2-120? */
// #define FMT_BAS 0x000000400U /* Supports bootloader areas? */
#define FMT_CONFIG_PROFILE 0x000000800U /* Supports configuration profiles? */
// #define FMT_OBSOLETE 0x000001000U /* Obsolete format? */
#define FMT_NON_POWER2_EXTENTS 0x000002000U /* Non-power-of-2 extent sizes? */
// #define FMT_SYSTEMID_ON_PVS 0x000004000U /* System ID is stored on PVs not VG */
#define FMT_PV_FLAGS 0x000008000U /* Supports PV flags */
2007-12-20 22:37:42 +00:00
/* Mirror conversion type flags */
#define MIRROR_BY_SEG 0x00000001U /* segment-by-segment mirror */
2008-01-17 17:17:09 +00:00
#define MIRROR_BY_LV 0x00000002U /* mirror using whole mimage LVs */
2014-06-19 13:40:47 +01:00
#define MIRROR_BY_SEGMENTED_LV 0x00000004U /* mirror using whole mimage LVs that
* preserve the segment structure */
#define MIRROR_SKIP_INIT_SYNC 0x00000010U /* skip initial sync */
2007-12-20 22:37:42 +00:00
/* vg_read and vg_read_for_update flags */
#define READ_OK_NOTFOUND 0x00040000U
#define READ_WARN_INCONSISTENT 0x00080000U
#define READ_FOR_UPDATE 0x00100000U /* command tells vg_read it plans to write the vg */
#define PROCESS_SKIP_SCAN 0x00200000U /* skip lvmcache_label_scan in process_each_pv */
#define READ_FOR_ACTIVATE 0x00400000U /* command tells vg_read it plans to activate the vg */
#define READ_WITHOUT_LOCK 0x00800000U /* caller responsible for vg lock */
improve reading and repairing vg metadata The fact that vg repair is implemented as a part of vg read has led to a messy and complicated implementation of vg_read, and limited and uncontrolled repair capability. This splits read and repair apart. Summary ------- - take all kinds of various repairs out of vg_read - vg_read no longer writes anything - vg_read now simply reads and returns vg metadata - vg_read ignores bad or old copies of metadata - vg_read proceeds with a single good copy of metadata - improve error checks and handling when reading - keep track of bad (corrupt) copies of metadata in lvmcache - keep track of old (seqno) copies of metadata in lvmcache - keep track of outdated PVs in lvmcache - vg_write will do basic repairs - new command vgck --updatemetdata will do all repairs Details ------- - In scan, do not delete dev from lvmcache if reading/processing fails; the dev is still present, and removing it makes it look like the dev is not there. Records are now kept about the problems with each PV so they be fixed/repaired in the appropriate places. - In scan, record a bad mda on failure, and delete the mda from mda in use list so it will not be used by vg_read or vg_write, only by repair. - In scan, succeed if any good mda on a device is found, instead of failing if any is bad. The bad/old copies of metadata should not interfere with normal usage while good copies can be used. - In scan, add a record of old mdas in lvmcache for later, do not repair them while reading, and do not let them prevent us from finding and using a good copy of metadata from elsewhere. One result is that "inconsistent metadata" is no longer a read error, but instead a record in lvmcache that can be addressed separate from the read. - Treat a dev with no good mdas like a dev with no mdas, which is an existing case we already handle. - Don't use a fake vg "handle" for returning an error from vg_read, or the vg_read_error function for getting that error number; just return null if the vg cannot be read or used, and an error_flags arg with flags set for the specific kind of error (which can be used later for determining the kind of repair.) - Saving an original copy of the vg metadata, for purposes of reverting a write, is now done explicitly in vg_read instead of being hidden in the vg_make_handle function. - When a vg is not accessible due to "access restrictions" but is otherwise fine, return the vg through the new error_vg arg so that process_each_pv can skip the PVs in the VG while processing. (This is a temporary accomodation for the way process_each_pv tracks which devs have been looked at, and can be dropped later when process_each_pv implementation dev tracking is changed.) - vg_read does not try to fix or recover a vg, but now just reads the metadata, checks access restrictions and returns it. (Checking access restrictions might be better done outside of vg_read, but this is a later improvement.) - _vg_read now simply makes one attempt to read metadata from each mda, and uses the most recent copy to return to the caller in the form of a 'vg' struct. (bad mdas were excluded during the scan and are not retried) (old mdas were not excluded during scan and are retried here) - vg_read uses _vg_read to get the latest copy of metadata from mdas, and then makes various checks against it to produce warnings, and to check if VG access is allowed (access restrictions include: writable, foreign, shared, clustered, missing pvs). - Things that were previously silently/automatically written by vg_read that are now done by vg_write, based on the records made in lvmcache during the scan and read: . clearing the missing flag . updating old copies of metadata . clearing outdated pvs . updating pv header flags - Bad/corrupt metadata are now repaired; they were not before. Test changes ------------ - A read command no longer writes the VG to repair it, so add a write command to do a repair. (inconsistent-metadata, unlost-pv) - When a missing PV is removed from a VG, and then the device is enabled again, vgck --updatemetadata is needed to clear the outdated PV before it can be used again, where it wasn't before. (lvconvert-repair-policy, lvconvert-repair-raid, lvconvert-repair, mirror-vgreduce-removemissing, pv-ext-flags, unlost-pv) Reading bad/old metadata ------------------------ - "bad metadata": the mda_header or metadata text has invalid fields or can't be parsed by lvm. This is a form of corruption that would not be caused by known failure scenarios. A checksum error is typically included among the errors reported. - "old metadata": a valid copy of the metadata that has a smaller seqno than other copies of the metadata. This can happen if the device failed, or io failed, or lvm failed while commiting new metadata to all the metadata areas. Old metadata on a PV that has been removed from the VG is the "outdated" case below. When a VG has some PVs with bad/old metadata, lvm can simply ignore the bad/old copies, and use a good copy. This is why there are multiple copies of the metadata -- so it's available even when some of the copies cannot be used. The bad/old copies do not have to be repaired before the VG can be used (the repair can happen later.) A PV with no good copies of the metadata simply falls back to being treated like a PV with no mdas; a common and harmless configuration. When bad/old metadata exists, lvm warns the user about it, and suggests repairing it using a new metadata repair command. Bad metadata in particular is something that users will want to investigate and repair themselves, since it should not happen and may indicate some other problem that needs to be fixed. PVs with bad/old metadata are not the same as missing devices. Missing devices will block various kinds of VG modification or activation, but bad/old metadata will not. Previously, lvm would attempt to repair bad/old metadata whenever it was read. This was unnecessary since lvm does not require every copy of the metadata to be used. It would also hide potential problems that should be investigated by the user. It was also dangerous in cases where the VG was on shared storage. The user is now allowed to investigate potential problems and decide how and when to repair them. Repairing bad/old metadata -------------------------- When label scan sees bad metadata in an mda, that mda is removed from the lvmcache info->mdas list. This means that vg_read will skip it, and not attempt to read/process it again. If it was the only in-use mda on a PV, that PV is treated like a PV with no mdas. It also means that vg_write will also skip the bad mda, and not attempt to write new metadata to it. The only way to repair bad metadata is with the metadata repair command. When label scan sees old metadata in an mda, that mda is kept in the lvmcache info->mdas list. This means that vg_read will read/process it again, and likely see the same mismatch with the other copies of the metadata. Like the label_scan, the vg_read will simply ignore the old copy of the metadata and use the latest copy. If the command is modifying the vg (e.g. lvcreate), then vg_write, which writes new metadata to every mda on info->mdas, will write the new metadata to the mda that had the old version. If successful, this will resolve the old metadata problem (without needing to run a metadata repair command.) Outdated PVs ------------ An outdated PV is a PV that has an old copy of VG metadata that shows it is a member of the VG, but the latest copy of the VG metadata does not include this PV. This happens if the PV is disconnected, vgreduce --removemissing is run to remove the PV from the VG, then the PV is reconnected. In this case, the outdated PV needs have its outdated metadata removed and the PV used flag needs to be cleared. This repair will be done by the subsequent repair command. It is also done if vgremove is run on the VG. MISSING PVs ----------- When a device is missing, most commands will refuse to modify the VG. This is the simple case. More complicated is when a command is allowed to modify the VG while it is missing a device. When a VG is written while a device is missing for one of it's PVs, the VG metadata is written to disk with the MISSING flag on the PV with the missing device. When the VG is next used, it is treated as if the PV with the MISSING flag still has a missing device, even if that device has reappeared. If all LVs that were using a PV with the MISSING flag are removed or repaired so that the MISSING PV is no longer used, then the next time the VG metadata is written, the MISSING flag will be dropped. Alternative methods of clearing the MISSING flag are: vgreduce --removemissing will remove PVs with missing devices, or PVs with the MISSING flag where the device has reappeared. vgextend --restoremissing will clear the MISSING flag on PVs where the device has reappeared, allowing the VG to be used normally. This must be done with caution since the reappeared device may have old data that is inconsistent with data on other PVs. Bad mda repair -------------- The new command: vgck --updatemetadata VG first uses vg_write to repair old metadata, and other basic issues mentioned above (old metadata, outdated PVs, pv_header flags, MISSING_PV flags). It will also go further and repair bad metadata: . text metadata that has a bad checksum . text metadata that is not parsable . corrupt mda_header checksum and version fields (To keep a clean diff, #if 0 is added around functions that are replaced by new code. These commented functions are removed by the following commit.)
2019-05-24 12:04:37 -05:00
/* vg_read returns these in error_flags */
#define FAILED_NOT_ENABLED 0x00000001U
#define FAILED_LOCKING 0x00000002U
#define FAILED_NOTFOUND 0x00000004U
#define FAILED_READ_ONLY 0x00000008U
#define FAILED_EXPORTED 0x00000010U
#define FAILED_RESIZEABLE 0x00000020U
#define FAILED_CLUSTERED 0x00000040U
#define FAILED_ALLOCATION 0x00000080U
#define FAILED_EXIST 0x00000100U
#define FAILED_RECOVERY 0x00000200U
#define FAILED_SYSTEMID 0x00000400U
#define FAILED_LOCK_TYPE 0x00000800U
#define FAILED_LOCK_MODE 0x00001000U
improve reading and repairing vg metadata The fact that vg repair is implemented as a part of vg read has led to a messy and complicated implementation of vg_read, and limited and uncontrolled repair capability. This splits read and repair apart. Summary ------- - take all kinds of various repairs out of vg_read - vg_read no longer writes anything - vg_read now simply reads and returns vg metadata - vg_read ignores bad or old copies of metadata - vg_read proceeds with a single good copy of metadata - improve error checks and handling when reading - keep track of bad (corrupt) copies of metadata in lvmcache - keep track of old (seqno) copies of metadata in lvmcache - keep track of outdated PVs in lvmcache - vg_write will do basic repairs - new command vgck --updatemetdata will do all repairs Details ------- - In scan, do not delete dev from lvmcache if reading/processing fails; the dev is still present, and removing it makes it look like the dev is not there. Records are now kept about the problems with each PV so they be fixed/repaired in the appropriate places. - In scan, record a bad mda on failure, and delete the mda from mda in use list so it will not be used by vg_read or vg_write, only by repair. - In scan, succeed if any good mda on a device is found, instead of failing if any is bad. The bad/old copies of metadata should not interfere with normal usage while good copies can be used. - In scan, add a record of old mdas in lvmcache for later, do not repair them while reading, and do not let them prevent us from finding and using a good copy of metadata from elsewhere. One result is that "inconsistent metadata" is no longer a read error, but instead a record in lvmcache that can be addressed separate from the read. - Treat a dev with no good mdas like a dev with no mdas, which is an existing case we already handle. - Don't use a fake vg "handle" for returning an error from vg_read, or the vg_read_error function for getting that error number; just return null if the vg cannot be read or used, and an error_flags arg with flags set for the specific kind of error (which can be used later for determining the kind of repair.) - Saving an original copy of the vg metadata, for purposes of reverting a write, is now done explicitly in vg_read instead of being hidden in the vg_make_handle function. - When a vg is not accessible due to "access restrictions" but is otherwise fine, return the vg through the new error_vg arg so that process_each_pv can skip the PVs in the VG while processing. (This is a temporary accomodation for the way process_each_pv tracks which devs have been looked at, and can be dropped later when process_each_pv implementation dev tracking is changed.) - vg_read does not try to fix or recover a vg, but now just reads the metadata, checks access restrictions and returns it. (Checking access restrictions might be better done outside of vg_read, but this is a later improvement.) - _vg_read now simply makes one attempt to read metadata from each mda, and uses the most recent copy to return to the caller in the form of a 'vg' struct. (bad mdas were excluded during the scan and are not retried) (old mdas were not excluded during scan and are retried here) - vg_read uses _vg_read to get the latest copy of metadata from mdas, and then makes various checks against it to produce warnings, and to check if VG access is allowed (access restrictions include: writable, foreign, shared, clustered, missing pvs). - Things that were previously silently/automatically written by vg_read that are now done by vg_write, based on the records made in lvmcache during the scan and read: . clearing the missing flag . updating old copies of metadata . clearing outdated pvs . updating pv header flags - Bad/corrupt metadata are now repaired; they were not before. Test changes ------------ - A read command no longer writes the VG to repair it, so add a write command to do a repair. (inconsistent-metadata, unlost-pv) - When a missing PV is removed from a VG, and then the device is enabled again, vgck --updatemetadata is needed to clear the outdated PV before it can be used again, where it wasn't before. (lvconvert-repair-policy, lvconvert-repair-raid, lvconvert-repair, mirror-vgreduce-removemissing, pv-ext-flags, unlost-pv) Reading bad/old metadata ------------------------ - "bad metadata": the mda_header or metadata text has invalid fields or can't be parsed by lvm. This is a form of corruption that would not be caused by known failure scenarios. A checksum error is typically included among the errors reported. - "old metadata": a valid copy of the metadata that has a smaller seqno than other copies of the metadata. This can happen if the device failed, or io failed, or lvm failed while commiting new metadata to all the metadata areas. Old metadata on a PV that has been removed from the VG is the "outdated" case below. When a VG has some PVs with bad/old metadata, lvm can simply ignore the bad/old copies, and use a good copy. This is why there are multiple copies of the metadata -- so it's available even when some of the copies cannot be used. The bad/old copies do not have to be repaired before the VG can be used (the repair can happen later.) A PV with no good copies of the metadata simply falls back to being treated like a PV with no mdas; a common and harmless configuration. When bad/old metadata exists, lvm warns the user about it, and suggests repairing it using a new metadata repair command. Bad metadata in particular is something that users will want to investigate and repair themselves, since it should not happen and may indicate some other problem that needs to be fixed. PVs with bad/old metadata are not the same as missing devices. Missing devices will block various kinds of VG modification or activation, but bad/old metadata will not. Previously, lvm would attempt to repair bad/old metadata whenever it was read. This was unnecessary since lvm does not require every copy of the metadata to be used. It would also hide potential problems that should be investigated by the user. It was also dangerous in cases where the VG was on shared storage. The user is now allowed to investigate potential problems and decide how and when to repair them. Repairing bad/old metadata -------------------------- When label scan sees bad metadata in an mda, that mda is removed from the lvmcache info->mdas list. This means that vg_read will skip it, and not attempt to read/process it again. If it was the only in-use mda on a PV, that PV is treated like a PV with no mdas. It also means that vg_write will also skip the bad mda, and not attempt to write new metadata to it. The only way to repair bad metadata is with the metadata repair command. When label scan sees old metadata in an mda, that mda is kept in the lvmcache info->mdas list. This means that vg_read will read/process it again, and likely see the same mismatch with the other copies of the metadata. Like the label_scan, the vg_read will simply ignore the old copy of the metadata and use the latest copy. If the command is modifying the vg (e.g. lvcreate), then vg_write, which writes new metadata to every mda on info->mdas, will write the new metadata to the mda that had the old version. If successful, this will resolve the old metadata problem (without needing to run a metadata repair command.) Outdated PVs ------------ An outdated PV is a PV that has an old copy of VG metadata that shows it is a member of the VG, but the latest copy of the VG metadata does not include this PV. This happens if the PV is disconnected, vgreduce --removemissing is run to remove the PV from the VG, then the PV is reconnected. In this case, the outdated PV needs have its outdated metadata removed and the PV used flag needs to be cleared. This repair will be done by the subsequent repair command. It is also done if vgremove is run on the VG. MISSING PVs ----------- When a device is missing, most commands will refuse to modify the VG. This is the simple case. More complicated is when a command is allowed to modify the VG while it is missing a device. When a VG is written while a device is missing for one of it's PVs, the VG metadata is written to disk with the MISSING flag on the PV with the missing device. When the VG is next used, it is treated as if the PV with the MISSING flag still has a missing device, even if that device has reappeared. If all LVs that were using a PV with the MISSING flag are removed or repaired so that the MISSING PV is no longer used, then the next time the VG metadata is written, the MISSING flag will be dropped. Alternative methods of clearing the MISSING flag are: vgreduce --removemissing will remove PVs with missing devices, or PVs with the MISSING flag where the device has reappeared. vgextend --restoremissing will clear the MISSING flag on PVs where the device has reappeared, allowing the VG to be used normally. This must be done with caution since the reappeared device may have old data that is inconsistent with data on other PVs. Bad mda repair -------------- The new command: vgck --updatemetadata VG first uses vg_write to repair old metadata, and other basic issues mentioned above (old metadata, outdated PVs, pv_header flags, MISSING_PV flags). It will also go further and repair bad metadata: . text metadata that has a bad checksum . text metadata that is not parsable . corrupt mda_header checksum and version fields (To keep a clean diff, #if 0 is added around functions that are replaced by new code. These commented functions are removed by the following commit.)
2019-05-24 12:04:37 -05:00
#define FAILED_INTERNAL_ERROR 0x00002000U
#define SUCCESS 0x00000000U
2010-06-30 19:28:35 +00:00
#define VGMETADATACOPIES_ALL UINT32_MAX
#define VGMETADATACOPIES_UNMANAGED 0
#define vg_is_archived(vg) (((vg)->status & ARCHIVED_VG) ? 1 : 0)
#define lv_is_locked(lv) (((lv)->status & LOCKED) ? 1 : 0)
#define lv_is_partial(lv) (((lv)->status & PARTIAL_LV) ? 1 : 0)
#define lv_is_virtual(lv) (((lv)->status & VIRTUAL) ? 1 : 0)
#define lv_is_merging(lv) (((lv)->status & MERGING) ? 1 : 0)
#define lv_is_merging_origin(lv) (lv_is_merging(lv) && (lv)->snapshot)
#define lv_is_snapshot(lv) (((lv)->status & SNAPSHOT) ? 1 : 0)
#define lv_is_converting(lv) (((lv)->status & CONVERTING) ? 1 : 0)
#define lv_is_external_origin(lv) (((lv)->external_count > 0) ? 1 : 0)
#define lv_is_virtual_origin(lv) (((lv)->status & VIRTUAL_ORIGIN) ? 1 : 0)
2014-06-27 00:02:58 +02:00
#define lv_is_thin_volume(lv) (((lv)->status & THIN_VOLUME) ? 1 : 0)
#define lv_is_thin_pool(lv) (((lv)->status & THIN_POOL) ? 1 : 0)
#define lv_is_new_thin_pool(lv) (lv_is_thin_pool(lv) && !first_seg(lv)->transaction_id)
#define lv_is_used_thin_pool(lv) (lv_is_thin_pool(lv) && !dm_list_empty(&(lv)->segs_using_this_lv))
#define lv_is_thin_pool_data(lv) (((lv)->status & THIN_POOL_DATA) ? 1 : 0)
#define lv_is_thin_pool_metadata(lv) (((lv)->status & THIN_POOL_METADATA) ? 1 : 0)
2014-06-27 00:02:58 +02:00
#define lv_is_thin_type(lv) (((lv)->status & (THIN_POOL | THIN_VOLUME | THIN_POOL_DATA | THIN_POOL_METADATA)) ? 1 : 0)
#define lv_is_mirrored(lv) (((lv)->status & MIRRORED) ? 1 : 0)
#define lv_is_mirror_image(lv) (((lv)->status & MIRROR_IMAGE) ? 1 : 0)
#define lv_is_mirror_log(lv) (((lv)->status & MIRROR_LOG) ? 1 : 0)
#define lv_is_mirror(lv) (((lv)->status & MIRROR) ? 1 : 0)
#define lv_is_mirror_type(lv) (((lv)->status & (MIRROR | MIRROR_LOG | MIRROR_IMAGE)) ? 1 : 0)
2016-07-14 14:21:01 +01:00
#define lv_is_not_synced(lv) (((lv)->status & LV_NOTSYNCED) ? 1 : 0)
2014-11-10 10:56:43 +01:00
#define lv_is_pending_delete(lv) (((lv)->status & LV_PENDING_DELETE) ? 1 : 0)
#define lv_is_error_when_full(lv) (((lv)->status & LV_ERROR_WHEN_FULL) ? 1 : 0)
#define lv_is_pvmove(lv) (((lv)->status & PVMOVE) ? 1 : 0)
#define lv_is_raid(lv) (((lv)->status & RAID) ? 1 : 0)
#define lv_is_raid_image(lv) (((lv)->status & RAID_IMAGE) ? 1 : 0)
#define lv_is_raid_image_with_tracking(lv) ((lv_is_raid_image(lv) && !((lv)->status & LVM_WRITE)) ? 1 : 0)
#define lv_is_raid_metadata(lv) (((lv)->status & RAID_META) ? 1 : 0)
#define lv_is_raid_type(lv) (((lv)->status & (RAID | RAID_IMAGE | RAID_META)) ? 1 : 0)
#define lv_is_cache(lv) (((lv)->status & CACHE) ? 1 : 0)
#define lv_is_cache_pool(lv) (((lv)->status & CACHE_POOL) ? 1 : 0)
#define lv_is_cache_vol(lv) (((lv)->status & LV_CACHE_VOL) ? 1 : 0)
#define lv_is_used_cache_pool(lv) (lv_is_cache_pool(lv) && !dm_list_empty(&(lv)->segs_using_this_lv))
#define lv_is_cache_pool_data(lv) (((lv)->status & CACHE_POOL_DATA) ? 1 : 0)
#define lv_is_cache_pool_metadata(lv) (((lv)->status & CACHE_POOL_METADATA) ? 1 : 0)
#define lv_is_cache_type(lv) (((lv)->status & (CACHE | CACHE_POOL | LV_CACHE_VOL | CACHE_POOL_DATA | CACHE_POOL_METADATA)) ? 1 : 0)
#define lv_is_pool(lv) (((lv)->status & (CACHE_POOL | THIN_POOL)) ? 1 : 0)
#define lv_is_pool_data(lv) (((lv)->status & (CACHE_POOL_DATA | THIN_POOL_DATA)) ? 1 : 0)
#define lv_is_pool_metadata(lv) (((lv)->status & (CACHE_POOL_METADATA | THIN_POOL_METADATA)) ? 1 : 0)
#define lv_is_pool_metadata_spare(lv) (((lv)->status & POOL_METADATA_SPARE) ? 1 : 0)
2015-03-05 14:00:44 -06:00
#define lv_is_lockd_sanlock_lv(lv) (((lv)->status & LOCKD_SANLOCK_LV) ? 1 : 0)
#define lv_is_writecache(lv) (((lv)->status & WRITECACHE) ? 1 : 0)
Allow dm-integrity to be used for raid images dm-integrity stores checksums of the data written to an LV, and returns an error if data read from the LV does not match the previously saved checksum. When used on raid images, dm-raid will correct the error by reading the block from another image, and the device user sees no error. The integrity metadata (checksums) are stored on an internal LV allocated by lvm for each linear image. The internal LV is allocated on the same PV as the image. Create a raid LV with an integrity layer over each raid image (for raid levels 1,4,5,6,10): lvcreate --type raidN --raidintegrity y [options] Add an integrity layer to images of an existing raid LV: lvconvert --raidintegrity y LV Remove the integrity layer from images of a raid LV: lvconvert --raidintegrity n LV Settings Use --raidintegritymode journal|bitmap (journal is default) to configure the method used by dm-integrity to ensure crash consistency. Initialization When integrity is added to an LV, the kernel needs to initialize the integrity metadata/checksums for all blocks in the LV. The data corruption checking performed by dm-integrity will only operate on areas of the LV that are already initialized. The progress of integrity initialization is reported by the "syncpercent" LV reporting field (and under the Cpy%Sync lvs column.) Example: create a raid1 LV with integrity: $ lvcreate --type raid1 -m1 --raidintegrity y -n rr -L1G foo Creating integrity metadata LV rr_rimage_0_imeta with size 12.00 MiB. Logical volume "rr_rimage_0_imeta" created. Creating integrity metadata LV rr_rimage_1_imeta with size 12.00 MiB. Logical volume "rr_rimage_1_imeta" created. Logical volume "rr" created. $ lvs -a foo LV VG Attr LSize Origin Cpy%Sync rr foo rwi-a-r--- 1.00g 4.93 [rr_rimage_0] foo gwi-aor--- 1.00g [rr_rimage_0_iorig] 41.02 [rr_rimage_0_imeta] foo ewi-ao---- 12.00m [rr_rimage_0_iorig] foo -wi-ao---- 1.00g [rr_rimage_1] foo gwi-aor--- 1.00g [rr_rimage_1_iorig] 39.45 [rr_rimage_1_imeta] foo ewi-ao---- 12.00m [rr_rimage_1_iorig] foo -wi-ao---- 1.00g [rr_rmeta_0] foo ewi-aor--- 4.00m [rr_rmeta_1] foo ewi-aor--- 4.00m
2019-11-20 16:07:27 -06:00
#define lv_is_integrity(lv) (((lv)->status & INTEGRITY) ? 1 : 0)
#define lv_is_integrity_metadata(lv) (((lv)->status & INTEGRITY_METADATA) ? 1 : 0)
#define lv_is_vdo(lv) (((lv)->status & LV_VDO) ? 1 : 0)
#define lv_is_vdo_pool(lv) (((lv)->status & LV_VDO_POOL) ? 1 : 0)
#define lv_is_vdo_pool_data(lv) (((lv)->status & LV_VDO_POOL_DATA) ? 1 : 0)
#define lv_is_vdo_type(lv) (((lv)->status & (LV_VDO | LV_VDO_POOL | LV_VDO_POOL_DATA)) ? 1 : 0)
metadata: process_each_lv_in_vg: get the list of LVs to process first, then do the processing This avoids a problem in which we're using selection on LV list - we need to do the selection on initial state and not on any intermediary state as we process LVs one by one - some of the relations among LVs can be gone during this processing. For example, processing one LV can cause the other LVs to lose the relation to this LV and hence they're not selectable anymore with the original selection criteria as it would be if we did selection on inital state. A perfect example is with thin snapshots: $ lvs -o lv_name,origin,layout,role vg LV Origin Layout Role lvol1 thin,sparse public,origin,thinorigin,multithinorigin lvol2 lvol1 thin,sparse public,snapshot,thinsnapshot lvol3 lvol1 thin,sparse public,snapshot,thinsnapshot pool thin,pool private $ lvremove -ff -S 'lv_name=lvol1 || origin=lvol1' Logical volume "lvol1" successfully removed The lvremove command above was supposed to remove lvol1 as well as all its snapshots which have origin=lvol1. It failed to do so, because once we removed the origin lvol1, the lvol2 and lvol3 which were snapshots before are not snapshots anymore - the relations change as we're processing these LVs one by one. If we do the selection first and then execute any concrete actions on these LVs (which is what this patch does), the behaviour is correct then - the selection is done on the *initial state*: $ lvremove -ff -S 'lv_name=lvol1 || origin=lvol1' Logical volume "lvol1" successfully removed Logical volume "lvol2" successfully removed Logical volume "lvol3" successfully removed Similarly for all the other situations in which relations among LVs are being changed by processing the LVs one by one. This patch also introduces LV_REMOVED internal LV status flag to mark removed LVs so they're not processed further when we iterate over collected list of LVs to be processed. Previously, when we iterated directly over vg->lvs list to process the LVs, we relied on the fact that once the LV is removed, it is also removed from the vg->lvs list we're iterating over. But that was incorrect as we shouldn't remove LVs from the list during one iteration while we're iterating over that exact list (dm_list_iterate_items safe can handle only one removal at one iteration anyway, so it can't be used here).
2015-03-16 17:10:21 +01:00
#define lv_is_removed(lv) (((lv)->status & LV_REMOVED) ? 1 : 0)
/* Recognize component LV (matching lib/misc/lvm-string.c _lvname_has_reserved_component_string()) */
#define lv_is_component(lv) (lv_is_cache_origin(lv) || \
lv_is_writecache_origin(lv) || \
Allow dm-integrity to be used for raid images dm-integrity stores checksums of the data written to an LV, and returns an error if data read from the LV does not match the previously saved checksum. When used on raid images, dm-raid will correct the error by reading the block from another image, and the device user sees no error. The integrity metadata (checksums) are stored on an internal LV allocated by lvm for each linear image. The internal LV is allocated on the same PV as the image. Create a raid LV with an integrity layer over each raid image (for raid levels 1,4,5,6,10): lvcreate --type raidN --raidintegrity y [options] Add an integrity layer to images of an existing raid LV: lvconvert --raidintegrity y LV Remove the integrity layer from images of a raid LV: lvconvert --raidintegrity n LV Settings Use --raidintegritymode journal|bitmap (journal is default) to configure the method used by dm-integrity to ensure crash consistency. Initialization When integrity is added to an LV, the kernel needs to initialize the integrity metadata/checksums for all blocks in the LV. The data corruption checking performed by dm-integrity will only operate on areas of the LV that are already initialized. The progress of integrity initialization is reported by the "syncpercent" LV reporting field (and under the Cpy%Sync lvs column.) Example: create a raid1 LV with integrity: $ lvcreate --type raid1 -m1 --raidintegrity y -n rr -L1G foo Creating integrity metadata LV rr_rimage_0_imeta with size 12.00 MiB. Logical volume "rr_rimage_0_imeta" created. Creating integrity metadata LV rr_rimage_1_imeta with size 12.00 MiB. Logical volume "rr_rimage_1_imeta" created. Logical volume "rr" created. $ lvs -a foo LV VG Attr LSize Origin Cpy%Sync rr foo rwi-a-r--- 1.00g 4.93 [rr_rimage_0] foo gwi-aor--- 1.00g [rr_rimage_0_iorig] 41.02 [rr_rimage_0_imeta] foo ewi-ao---- 12.00m [rr_rimage_0_iorig] foo -wi-ao---- 1.00g [rr_rimage_1] foo gwi-aor--- 1.00g [rr_rimage_1_iorig] 39.45 [rr_rimage_1_imeta] foo ewi-ao---- 12.00m [rr_rimage_1_iorig] foo -wi-ao---- 1.00g [rr_rmeta_0] foo ewi-aor--- 4.00m [rr_rmeta_1] foo ewi-aor--- 4.00m
2019-11-20 16:07:27 -06:00
lv_is_integrity_origin(lv) || \
((lv)->status & (\
CACHE_POOL_DATA |\
CACHE_POOL_METADATA |\
Allow dm-integrity to be used for raid images dm-integrity stores checksums of the data written to an LV, and returns an error if data read from the LV does not match the previously saved checksum. When used on raid images, dm-raid will correct the error by reading the block from another image, and the device user sees no error. The integrity metadata (checksums) are stored on an internal LV allocated by lvm for each linear image. The internal LV is allocated on the same PV as the image. Create a raid LV with an integrity layer over each raid image (for raid levels 1,4,5,6,10): lvcreate --type raidN --raidintegrity y [options] Add an integrity layer to images of an existing raid LV: lvconvert --raidintegrity y LV Remove the integrity layer from images of a raid LV: lvconvert --raidintegrity n LV Settings Use --raidintegritymode journal|bitmap (journal is default) to configure the method used by dm-integrity to ensure crash consistency. Initialization When integrity is added to an LV, the kernel needs to initialize the integrity metadata/checksums for all blocks in the LV. The data corruption checking performed by dm-integrity will only operate on areas of the LV that are already initialized. The progress of integrity initialization is reported by the "syncpercent" LV reporting field (and under the Cpy%Sync lvs column.) Example: create a raid1 LV with integrity: $ lvcreate --type raid1 -m1 --raidintegrity y -n rr -L1G foo Creating integrity metadata LV rr_rimage_0_imeta with size 12.00 MiB. Logical volume "rr_rimage_0_imeta" created. Creating integrity metadata LV rr_rimage_1_imeta with size 12.00 MiB. Logical volume "rr_rimage_1_imeta" created. Logical volume "rr" created. $ lvs -a foo LV VG Attr LSize Origin Cpy%Sync rr foo rwi-a-r--- 1.00g 4.93 [rr_rimage_0] foo gwi-aor--- 1.00g [rr_rimage_0_iorig] 41.02 [rr_rimage_0_imeta] foo ewi-ao---- 12.00m [rr_rimage_0_iorig] foo -wi-ao---- 1.00g [rr_rimage_1] foo gwi-aor--- 1.00g [rr_rimage_1_iorig] 39.45 [rr_rimage_1_imeta] foo ewi-ao---- 12.00m [rr_rimage_1_iorig] foo -wi-ao---- 1.00g [rr_rmeta_0] foo ewi-aor--- 4.00m [rr_rmeta_1] foo ewi-aor--- 4.00m
2019-11-20 16:07:27 -06:00
INTEGRITY_METADATA |\
LV_CACHE_VOL |\
LV_VDO_POOL_DATA |\
MIRROR_IMAGE |\
MIRROR_LOG |\
RAID_IMAGE |\
RAID_META |\
THIN_POOL_DATA |\
THIN_POOL_METADATA)) ? 1 : 0)
int lv_layout_and_role(struct dm_pool *mem, const struct logical_volume *lv,
struct dm_list **layout, struct dm_list **role);
Add lv_layout_and_type fn, lv_layout and lv_type reporting fields. The lv_layout and lv_type fields together help with LV identification. We can do basic identification using the lv_attr field which provides very condensed view. In contrast to that, the new lv_layout and lv_type fields provide more detialed information on exact layout and type used for LVs. For top-level LVs which are pure types not combined with any other LV types, the lv_layout value is equal to lv_type value. For non-top-level LVs which may be combined with other types, the lv_layout describes the underlying layout used, while the lv_type describes the use/type/usage of the LV. These two new fields are both string lists so selection (-S/--select) criteria can be defined using the list operators easily: [] for strict matching {} for subset matching. For example, let's consider this: $ lvs -a -o name,vg_name,lv_attr,layout,type LV VG Attr Layout Type [lvol1_pmspare] vg ewi------- linear metadata,pool,spare pool vg twi-a-tz-- pool,thin pool,thin [pool_tdata] vg rwi-aor--- level10,raid data,pool,thin [pool_tdata_rimage_0] vg iwi-aor--- linear image,raid [pool_tdata_rimage_1] vg iwi-aor--- linear image,raid [pool_tdata_rimage_2] vg iwi-aor--- linear image,raid [pool_tdata_rimage_3] vg iwi-aor--- linear image,raid [pool_tdata_rmeta_0] vg ewi-aor--- linear metadata,raid [pool_tdata_rmeta_1] vg ewi-aor--- linear metadata,raid [pool_tdata_rmeta_2] vg ewi-aor--- linear metadata,raid [pool_tdata_rmeta_3] vg ewi-aor--- linear metadata,raid [pool_tmeta] vg ewi-aor--- level1,raid metadata,pool,thin [pool_tmeta_rimage_0] vg iwi-aor--- linear image,raid [pool_tmeta_rimage_1] vg iwi-aor--- linear image,raid [pool_tmeta_rmeta_0] vg ewi-aor--- linear metadata,raid [pool_tmeta_rmeta_1] vg ewi-aor--- linear metadata,raid thin_snap1 vg Vwi---tz-k thin snapshot,thin thin_snap2 vg Vwi---tz-k thin snapshot,thin thin_vol1 vg Vwi-a-tz-- thin thin thin_vol2 vg Vwi-a-tz-- thin multiple,origin,thin Which is a situation with thin pool, thin volumes and thin snapshots. We can see internal 'pool_tdata' volume that makes up thin pool has actually a level10 raid layout and the internal 'pool_tmeta' has level1 raid layout. Also, we can see that 'thin_snap1' and 'thin_snap2' are both thin snapshots while 'thin_vol1' is thin origin (having multiple snapshots). Such reporting scheme provides much better base for selection criteria in addition to providing more detailed information, for example: $ lvs -a -o name,vg_name,lv_attr,layout,type -S 'type=metadata' LV VG Attr Layout Type [lvol1_pmspare] vg ewi------- linear metadata,pool,spare [pool_tdata_rmeta_0] vg ewi-aor--- linear metadata,raid [pool_tdata_rmeta_1] vg ewi-aor--- linear metadata,raid [pool_tdata_rmeta_2] vg ewi-aor--- linear metadata,raid [pool_tdata_rmeta_3] vg ewi-aor--- linear metadata,raid [pool_tmeta] vg ewi-aor--- level1,raid metadata,pool,thin [pool_tmeta_rmeta_0] vg ewi-aor--- linear metadata,raid [pool_tmeta_rmeta_1] vg ewi-aor--- linear metadata,raid (selected all LVs which are related to metadata of any type) lvs -a -o name,vg_name,lv_attr,layout,type -S 'type={metadata,thin}' LV VG Attr Layout Type [pool_tmeta] vg ewi-aor--- level1,raid metadata,pool,thin (selected all LVs which hold metadata related to thin) lvs -a -o name,vg_name,lv_attr,layout,type -S 'type={thin,snapshot}' LV VG Attr Layout Type thin_snap1 vg Vwi---tz-k thin snapshot,thin thin_snap2 vg Vwi---tz-k thin snapshot,thin (selected all LVs which are thin snapshots) lvs -a -o name,vg_name,lv_attr,layout,type -S 'layout=raid' LV VG Attr Layout Type [pool_tdata] vg rwi-aor--- level10,raid data,pool,thin [pool_tmeta] vg ewi-aor--- level1,raid metadata,pool,thin (selected all LVs with raid layout, any raid layout) lvs -a -o name,vg_name,lv_attr,layout,type -S 'layout={raid,level1}' LV VG Attr Layout Type [pool_tmeta] vg ewi-aor--- level1,raid metadata,pool,thin (selected all LVs with raid level1 layout exactly) And so on...
2014-08-13 10:03:45 +02:00
/* Ordered list - see lv_manip.c */
typedef enum {
AREA_UNASSIGNED,
AREA_PV,
AREA_LV
} area_type_t;
/* Whether or not to force an operation */
typedef enum {
PROMPT = 0, /* Issue yes/no prompt to confirm operation */
DONT_PROMPT = 1, /* Add more prompts */
DONT_PROMPT_OVERRIDE = 2 /* Add even more dangerous prompts */
} force_t;
enum {
MIRROR_LOG_CORE,
MIRROR_LOG_DISK,
MIRROR_LOG_MIRRORED,
};
typedef enum {
THIN_ZERO_UNSELECTED = 0,
THIN_ZERO_NO,
THIN_ZERO_YES,
} thin_zero_t;
typedef enum {
THIN_DISCARDS_UNSELECTED = 0,
THIN_DISCARDS_IGNORE,
THIN_DISCARDS_NO_PASSDOWN,
THIN_DISCARDS_PASSDOWN,
} thin_discards_t;
thin: improve 16g support for thin pool metadata Initial support for thin-pool used slightly smaller max size 15.81GiB for thin-pool metadata. However the real limit later settled at 15.88GiB (difference is ~64MiB - 16448 4K blocks). lvm2 could not simply increase the size as it has been using hard cropping of the loaded metadata device to avoid warnings printing warning of kernel when the size was bigger (i.e. due to bigger extent_size). This patch adds the new lvm.conf configurable setting: allocation/thin_pool_crop_metadata which defaults to 0 -> no crop of metadata beyond 15.81GiB. Only user with these sizes of metadata will be affected. Without cropping lvm2 now limits metadata allocation size to 15.88GiB. Any space beyond is currently not used by thin-pool target. Even if i.e. bigger LV is used for metadata via lvconvert, or allocated bigger because of to large extent size. With cropping enabled (=1) lvm2 preserves the old limitation 15.81GiB and should allow to work in the evironement with older lvm2 tools (i.e. older distribution). Thin-pool metadata with size bigger then 15.81G is now using CROP_METADATA flag within lvm2 metadata, so older lvm2 recognizes an incompatible thin-pool and cannot activate such pool! Users should use uncropped version as it is not suffering from various issues between thin_repair results and allocated metadata LV as thin_repair limit is 15.88GiB Users should use cropping only when really needed! Patch also better handles resize of thin-pool metadata and prevents resize beoyond usable size 15.88GiB. Resize beyond 15.81GiB automatically switches pool to no-crop version. Even with existing bigger thin-pool metadata command 'lvextend -l+1 vg/pool_tmeta' does the change. Patch gives better controls 'coverted' metadata LV and reports less confusing message during conversion. Patch set also moves the code for updating min/max into pool_manip.c for better sharing with cache_pool code.
2021-01-12 17:59:29 +01:00
typedef enum {
THIN_CROP_METADATA_UNSELECTED = 0, /* 'auto' selects */
THIN_CROP_METADATA_NO,
THIN_CROP_METADATA_YES,
} thin_crop_metadata_t;
typedef enum {
CACHE_MODE_UNSELECTED = 0,
CACHE_MODE_WRITETHROUGH,
CACHE_MODE_WRITEBACK,
CACHE_MODE_PASSTHROUGH,
} cache_mode_t;
/* ATM used for cache only */
typedef enum {
CACHE_METADATA_FORMAT_UNSELECTED = 0, /* On input means 'auto' */
CACHE_METADATA_FORMAT_1,
CACHE_METADATA_FORMAT_2,
} cache_metadata_format_t;
2015-03-05 14:00:44 -06:00
typedef enum {
LOCK_TYPE_INVALID = -1,
LOCK_TYPE_NONE = 0,
LOCK_TYPE_CLVM = 1,
LOCK_TYPE_DLM = 2,
LOCK_TYPE_SANLOCK = 3,
LOCK_TYPE_IDM = 4,
2015-03-05 14:00:44 -06:00
} lock_type_t;
struct cmd_context;
struct format_handler;
struct labeller;
struct format_type {
struct dm_list list;
struct cmd_context *cmd;
struct format_handler *ops;
struct dm_list mda_ops; /* List of permissible mda ops. */
struct labeller *labeller;
const char *name;
const char *alias;
char orphan_vg_name[ID_LEN];
struct volume_group *orphan_vg; /* Only one ever exists. */
uint32_t features;
void *library;
void *private;
};
struct pv_segment {
struct dm_list list; /* Member of pv->segments: ordered list
* covering entire data area on this PV */
struct physical_volume *pv;
uint32_t pe;
uint32_t len;
struct lv_segment *lvseg; /* NULL if free space */
uint32_t lv_area; /* Index to area in LV segment */
};
#define pvseg_is_allocated(pvseg) ((pvseg)->lvseg ? 1 : 0)
/*
2012-02-12 23:01:19 +00:00
* Properties of each format instance type.
* The primary role of the format_instance is to temporarily store metadata
* area information we are working with.
*/
2012-05-10 11:03:07 +00:00
/* Include any existing PV ("on-disk") mdas during format_instance initialisation. */
#define FMT_INSTANCE_MDAS 0x00000002U
2012-05-10 10:37:49 +00:00
/*
* Include any auxiliary mdas during format_instance intialisation.
* Currently, this includes metadata areas as defined by
* metadata/dirs and metadata/raws setting.
*/
#define FMT_INSTANCE_AUX_MDAS 0x00000004U
2012-05-10 11:03:07 +00:00
/*
* Include any other format-specific mdas during format_instance initialisation.
* For example metadata areas used during backup/restore/archive handling.
*/
#define FMT_INSTANCE_PRIVATE_MDAS 0x00000008U
/*
* Each VG has its own fid struct. The fid for a VG describes where
* the metadata for that VG can be found. The lists hold mda locations.
*
* label scan finds the metadata locations (devs and offsets) for a VG,
* and saves this info in lvmcache vginfo/info lists.
*
* vg_read() then creates an fid for a given VG, and the mda locations
* from lvmcache are copied onto the fid lists. Those mda locations
* are read again by vg_read() to get VG metadata that is used to
* create the 'vg' struct.
*/
struct format_instance {
2011-04-28 20:29:59 +00:00
unsigned ref_count; /* Refs to this fid from VG and PV structs */
struct dm_pool *mem;
uint32_t type;
const struct format_type *fmt;
/*
* Each mda in a vg is on exactly one of the below lists.
* MDAs on the 'in_use' list will be read from / written to
* disk, while MDAs on the 'ignored' list will not be read
* or written to.
*/
/* FIXME: Try to use the index only. Remove these lists. */
struct dm_list metadata_areas_in_use;
struct dm_list metadata_areas_ignored;
struct dm_hash_table *metadata_areas_index;
void *private;
};
/* There will be one area for each stripe */
struct lv_segment_area {
area_type_t type;
union {
struct {
struct pv_segment *pvseg;
} pv;
struct {
struct logical_volume *lv;
uint32_t le;
} lv;
} u;
};
struct lv_thin_message {
struct dm_list list; /* Chained list of messages */
dm_thin_message_t type; /* Use dm thin message datatype */
union {
struct logical_volume *lv; /* For: create_thin, create_snap, trim */
uint32_t delete_id; /* For delete, needs device_id */
} u;
};
struct segment_type;
struct lv_segment {
struct dm_list list;
struct logical_volume *lv;
const struct segment_type *segtype;
uint32_t le;
uint32_t len;
lvconvert: add infrastructure for RaidLV reshaping support In order to support striped raid5/6/10 LV reshaping (change of LV type, stripesize or number of legs), this patch introduces infrastructure prerequisites to be used by raid_manip.c extensions in followup patches. This base is needed for allocation of out-of-place reshape space required by the MD raid personalities to avoid writing over data in-place when reading off the current RAID layout or number of legs and writing out the new layout or to a different number of legs (i.e. restripe) Changes: - add members reshape_len to 'struct lv_segment' to store out-of-place reshape length per component rimage - add member data_copies to struct lv_segment to support more than 2 raid10 data copies - make alloc_lv_segment() aware of both reshape_len and data_copies - adjust all alloc_lv_segment() callers to the new API - add functions to retrieve the current data offset (needed for out-of-place reshaping space allocation) and the devices count from the kernel - make libdm deptree code aware of reshape_len - add LV flags for disk add/remove reshaping - support import/export of the new 'struct lv_segment' members - enhance lv_extend/_lv_reduce to cope with reshape_len - add seg_is_*/segtype_is_* macros related to reshaping - add target version check for reshaping - grow rebuilds/writemostly bitmaps to 246 bit to support kernel maximal - enhance libdm deptree code to support data_offset (out-of-place reshaping) and delta_disk (legs add/remove reshaping) target arguments Related: rhbz834579 Related: rhbz1191935 Related: rhbz1191978
2017-02-24 00:50:00 +01:00
uint32_t reshape_len; /* For RAID: user hidden additional out of place reshaping length off area_len and len */
uint64_t status;
/* FIXME Fields depend on segment type */
uint32_t stripe_size; /* For stripe and RAID - in sectors */
RAID: Add writemostly/writebehind support for RAID1 'lvchange' is used to alter a RAID 1 logical volume's write-mostly and write-behind characteristics. The '--writemostly' parameter takes a PV as an argument with an optional trailing character to specify whether to set ('y'), unset ('n'), or toggle ('t') the value. If no trailing character is given, it will set the flag. Synopsis: lvchange [--writemostly <PV>:{t|y|n}] [--writebehind <count>] vg/lv Example: lvchange --writemostly /dev/sdb1:y --writebehind 512 vg/raid1_lv The last character in the 'lv_attr' field is used to show whether a device has the WriteMostly flag set. It is signified with a 'w'. If the device has failed, the 'p'artial flag has priority. Example ("nosync" raid1 with mismatch_cnt and writemostly): [~]# lvs -a --segment vg LV VG Attr #Str Type SSize raid1 vg Rwi---r-m 2 raid1 500.00m [raid1_rimage_0] vg Iwi---r-- 1 linear 500.00m [raid1_rimage_1] vg Iwi---r-w 1 linear 500.00m [raid1_rmeta_0] vg ewi---r-- 1 linear 4.00m [raid1_rmeta_1] vg ewi---r-- 1 linear 4.00m Example (raid1 with mismatch_cnt, writemostly - but failed drive): [~]# lvs -a --segment vg LV VG Attr #Str Type SSize raid1 vg rwi---r-p 2 raid1 500.00m [raid1_rimage_0] vg Iwi---r-- 1 linear 500.00m [raid1_rimage_1] vg Iwi---r-p 1 linear 500.00m [raid1_rmeta_0] vg ewi---r-- 1 linear 4.00m [raid1_rmeta_1] vg ewi---r-p 1 linear 4.00m A new reportable field has been added for writebehind as well. If write-behind has not been set or the LV is not RAID1, the field will be blank. Example (writebehind is set): [~]# lvs -a -o name,attr,writebehind vg LV Attr WBehind lv rwi-a-r-- 512 [lv_rimage_0] iwi-aor-w [lv_rimage_1] iwi-aor-- [lv_rmeta_0] ewi-aor-- [lv_rmeta_1] ewi-aor-- Example (writebehind is not set): [~]# lvs -a -o name,attr,writebehind vg LV Attr WBehind lv rwi-a-r-- [lv_rimage_0] iwi-aor-w [lv_rimage_1] iwi-aor-- [lv_rmeta_0] ewi-aor-- [lv_rmeta_1] ewi-aor--
2013-04-15 13:59:46 -05:00
uint32_t writebehind; /* For RAID (RAID1 only) */
uint32_t min_recovery_rate; /* For RAID */
uint32_t max_recovery_rate; /* For RAID */
lvconvert: add infrastructure for RaidLV reshaping support In order to support striped raid5/6/10 LV reshaping (change of LV type, stripesize or number of legs), this patch introduces infrastructure prerequisites to be used by raid_manip.c extensions in followup patches. This base is needed for allocation of out-of-place reshape space required by the MD raid personalities to avoid writing over data in-place when reading off the current RAID layout or number of legs and writing out the new layout or to a different number of legs (i.e. restripe) Changes: - add members reshape_len to 'struct lv_segment' to store out-of-place reshape length per component rimage - add member data_copies to struct lv_segment to support more than 2 raid10 data copies - make alloc_lv_segment() aware of both reshape_len and data_copies - adjust all alloc_lv_segment() callers to the new API - add functions to retrieve the current data offset (needed for out-of-place reshaping space allocation) and the devices count from the kernel - make libdm deptree code aware of reshape_len - add LV flags for disk add/remove reshaping - support import/export of the new 'struct lv_segment' members - enhance lv_extend/_lv_reduce to cope with reshape_len - add seg_is_*/segtype_is_* macros related to reshaping - add target version check for reshaping - grow rebuilds/writemostly bitmaps to 246 bit to support kernel maximal - enhance libdm deptree code to support data_offset (out-of-place reshaping) and delta_disk (legs add/remove reshaping) target arguments Related: rhbz834579 Related: rhbz1191935 Related: rhbz1191978
2017-02-24 00:50:00 +01:00
uint32_t data_offset; /* For RAID: data offset in sectors on each data component image */
uint32_t area_count;
uint32_t area_len;
uint32_t chunk_size; /* For snapshots/thin_pool. In sectors. */
/* For thin_pool, 128..2097152. */
struct logical_volume *origin; /* snap and thin */
2016-03-01 15:18:42 +01:00
struct generic_logical_volume *indirect_origin;
2013-11-29 15:51:28 +01:00
struct logical_volume *merge_lv; /* thin, merge descendent lv into this ancestor */
struct logical_volume *cow;
struct dm_list origin_list;
uint32_t region_size; /* For raids/mirrors - in sectors */
lvconvert: add infrastructure for RaidLV reshaping support In order to support striped raid5/6/10 LV reshaping (change of LV type, stripesize or number of legs), this patch introduces infrastructure prerequisites to be used by raid_manip.c extensions in followup patches. This base is needed for allocation of out-of-place reshape space required by the MD raid personalities to avoid writing over data in-place when reading off the current RAID layout or number of legs and writing out the new layout or to a different number of legs (i.e. restripe) Changes: - add members reshape_len to 'struct lv_segment' to store out-of-place reshape length per component rimage - add member data_copies to struct lv_segment to support more than 2 raid10 data copies - make alloc_lv_segment() aware of both reshape_len and data_copies - adjust all alloc_lv_segment() callers to the new API - add functions to retrieve the current data offset (needed for out-of-place reshaping space allocation) and the devices count from the kernel - make libdm deptree code aware of reshape_len - add LV flags for disk add/remove reshaping - support import/export of the new 'struct lv_segment' members - enhance lv_extend/_lv_reduce to cope with reshape_len - add seg_is_*/segtype_is_* macros related to reshaping - add target version check for reshaping - grow rebuilds/writemostly bitmaps to 246 bit to support kernel maximal - enhance libdm deptree code to support data_offset (out-of-place reshaping) and delta_disk (legs add/remove reshaping) target arguments Related: rhbz834579 Related: rhbz1191935 Related: rhbz1191978
2017-02-24 00:50:00 +01:00
uint32_t data_copies; /* For RAID: number of data copies (e.g. 3 for RAID 6 */
uint32_t extents_copied;/* Number of extents synced for raids/mirrors */
struct logical_volume *log_lv;
struct lv_segment *pvmove_source_seg;
void *segtype_private;
struct dm_list tags;
struct lv_segment_area *areas;
2011-09-06 22:43:56 +00:00
struct lv_segment_area *meta_areas; /* For RAID */
struct logical_volume *metadata_lv; /* For thin_pool */
uint64_t transaction_id; /* For thin_pool, thin */
thin_zero_t zero_new_blocks; /* For thin_pool */
thin_discards_t discards; /* For thin_pool */
thin: improve 16g support for thin pool metadata Initial support for thin-pool used slightly smaller max size 15.81GiB for thin-pool metadata. However the real limit later settled at 15.88GiB (difference is ~64MiB - 16448 4K blocks). lvm2 could not simply increase the size as it has been using hard cropping of the loaded metadata device to avoid warnings printing warning of kernel when the size was bigger (i.e. due to bigger extent_size). This patch adds the new lvm.conf configurable setting: allocation/thin_pool_crop_metadata which defaults to 0 -> no crop of metadata beyond 15.81GiB. Only user with these sizes of metadata will be affected. Without cropping lvm2 now limits metadata allocation size to 15.88GiB. Any space beyond is currently not used by thin-pool target. Even if i.e. bigger LV is used for metadata via lvconvert, or allocated bigger because of to large extent size. With cropping enabled (=1) lvm2 preserves the old limitation 15.81GiB and should allow to work in the evironement with older lvm2 tools (i.e. older distribution). Thin-pool metadata with size bigger then 15.81G is now using CROP_METADATA flag within lvm2 metadata, so older lvm2 recognizes an incompatible thin-pool and cannot activate such pool! Users should use uncropped version as it is not suffering from various issues between thin_repair results and allocated metadata LV as thin_repair limit is 15.88GiB Users should use cropping only when really needed! Patch also better handles resize of thin-pool metadata and prevents resize beoyond usable size 15.88GiB. Resize beyond 15.81GiB automatically switches pool to no-crop version. Even with existing bigger thin-pool metadata command 'lvextend -l+1 vg/pool_tmeta' does the change. Patch gives better controls 'coverted' metadata LV and reports less confusing message during conversion. Patch set also moves the code for updating min/max into pool_manip.c for better sharing with cache_pool code.
2021-01-12 17:59:29 +01:00
thin_crop_metadata_t crop_metadata; /* For thin_pool */
struct dm_list thin_messages; /* For thin_pool */
struct logical_volume *external_lv; /* For thin */
2014-11-09 20:18:00 +01:00
struct logical_volume *pool_lv; /* For thin, cache */
uint32_t device_id; /* For thin, 24bit */
Allow dm-cache cache device to be standard LV If a single, standard LV is specified as the cache, use it directly instead of converting it into a cache-pool object with two separate LVs (for data and metadata). With a single LV as the cache, lvm will use blocks at the beginning for metadata, and the rest for data. Separate dm linear devices are set up to point at the metadata and data areas of the LV. These dm devs are given to the dm-cache target to use. The single LV cache cannot be resized without recreating it. If the --poolmetadata option is used to specify an LV for metadata, then a cache pool will be created (with separate LVs for data and metadata.) Usage: $ lvcreate -n main -L 128M vg /dev/loop0 $ lvcreate -n fast -L 64M vg /dev/loop1 $ lvs -a vg LV VG Attr LSize Type Devices main vg -wi-a----- 128.00m linear /dev/loop0(0) fast vg -wi-a----- 64.00m linear /dev/loop1(0) $ lvconvert --type cache --cachepool fast vg/main $ lvs -a vg LV VG Attr LSize Origin Pool Type Devices [fast] vg Cwi---C--- 64.00m linear /dev/loop1(0) main vg Cwi---C--- 128.00m [main_corig] [fast] cache main_corig(0) [main_corig] vg owi---C--- 128.00m linear /dev/loop0(0) $ lvchange -ay vg/main $ dmsetup ls vg-fast_cdata (253:4) vg-fast_cmeta (253:5) vg-main_corig (253:6) vg-main (253:24) vg-fast (253:3) $ dmsetup table vg-fast_cdata: 0 98304 linear 253:3 32768 vg-fast_cmeta: 0 32768 linear 253:3 0 vg-main_corig: 0 262144 linear 7:0 2048 vg-main: 0 262144 cache 253:5 253:4 253:6 128 2 metadata2 writethrough mq 0 vg-fast: 0 131072 linear 7:1 2048 $ lvchange -an vg/min $ lvconvert --splitcache vg/main $ lvs -a vg LV VG Attr LSize Type Devices fast vg -wi------- 64.00m linear /dev/loop1(0) main vg -wi------- 128.00m linear /dev/loop0(0)
2018-08-17 15:45:52 -05:00
uint64_t metadata_start; /* For cache */
uint64_t metadata_len; /* For cache */
uint64_t data_start; /* For cache */
uint64_t data_len; /* For cache */
struct id *metadata_id; /* For cache, when NULL uses CVOL id */
struct id *data_id; /* For cache, when NULL uses CVOL id */
Allow dm-cache cache device to be standard LV If a single, standard LV is specified as the cache, use it directly instead of converting it into a cache-pool object with two separate LVs (for data and metadata). With a single LV as the cache, lvm will use blocks at the beginning for metadata, and the rest for data. Separate dm linear devices are set up to point at the metadata and data areas of the LV. These dm devs are given to the dm-cache target to use. The single LV cache cannot be resized without recreating it. If the --poolmetadata option is used to specify an LV for metadata, then a cache pool will be created (with separate LVs for data and metadata.) Usage: $ lvcreate -n main -L 128M vg /dev/loop0 $ lvcreate -n fast -L 64M vg /dev/loop1 $ lvs -a vg LV VG Attr LSize Type Devices main vg -wi-a----- 128.00m linear /dev/loop0(0) fast vg -wi-a----- 64.00m linear /dev/loop1(0) $ lvconvert --type cache --cachepool fast vg/main $ lvs -a vg LV VG Attr LSize Origin Pool Type Devices [fast] vg Cwi---C--- 64.00m linear /dev/loop1(0) main vg Cwi---C--- 128.00m [main_corig] [fast] cache main_corig(0) [main_corig] vg owi---C--- 128.00m linear /dev/loop0(0) $ lvchange -ay vg/main $ dmsetup ls vg-fast_cdata (253:4) vg-fast_cmeta (253:5) vg-main_corig (253:6) vg-main (253:24) vg-fast (253:3) $ dmsetup table vg-fast_cdata: 0 98304 linear 253:3 32768 vg-fast_cmeta: 0 32768 linear 253:3 0 vg-main_corig: 0 262144 linear 7:0 2048 vg-main: 0 262144 cache 253:5 253:4 253:6 128 2 metadata2 writethrough mq 0 vg-fast: 0 131072 linear 7:1 2048 $ lvchange -an vg/min $ lvconvert --splitcache vg/main $ lvs -a vg LV VG Attr LSize Type Devices fast vg -wi------- 64.00m linear /dev/loop1(0) main vg -wi------- 128.00m linear /dev/loop0(0)
2018-08-17 15:45:52 -05:00
cache_metadata_format_t cache_metadata_format;/* For cache_pool */
cache_mode_t cache_mode; /* For cache_pool */
const char *policy_name; /* For cache_pool */
struct dm_config_node *policy_settings; /* For cache_pool */
2014-11-09 20:18:00 +01:00
unsigned cleaner_policy; /* For cache */
struct logical_volume *writecache; /* For writecache */
uint32_t writecache_block_size; /* For writecache */
struct writecache_settings writecache_settings; /* For writecache */
Allow dm-integrity to be used for raid images dm-integrity stores checksums of the data written to an LV, and returns an error if data read from the LV does not match the previously saved checksum. When used on raid images, dm-raid will correct the error by reading the block from another image, and the device user sees no error. The integrity metadata (checksums) are stored on an internal LV allocated by lvm for each linear image. The internal LV is allocated on the same PV as the image. Create a raid LV with an integrity layer over each raid image (for raid levels 1,4,5,6,10): lvcreate --type raidN --raidintegrity y [options] Add an integrity layer to images of an existing raid LV: lvconvert --raidintegrity y LV Remove the integrity layer from images of a raid LV: lvconvert --raidintegrity n LV Settings Use --raidintegritymode journal|bitmap (journal is default) to configure the method used by dm-integrity to ensure crash consistency. Initialization When integrity is added to an LV, the kernel needs to initialize the integrity metadata/checksums for all blocks in the LV. The data corruption checking performed by dm-integrity will only operate on areas of the LV that are already initialized. The progress of integrity initialization is reported by the "syncpercent" LV reporting field (and under the Cpy%Sync lvs column.) Example: create a raid1 LV with integrity: $ lvcreate --type raid1 -m1 --raidintegrity y -n rr -L1G foo Creating integrity metadata LV rr_rimage_0_imeta with size 12.00 MiB. Logical volume "rr_rimage_0_imeta" created. Creating integrity metadata LV rr_rimage_1_imeta with size 12.00 MiB. Logical volume "rr_rimage_1_imeta" created. Logical volume "rr" created. $ lvs -a foo LV VG Attr LSize Origin Cpy%Sync rr foo rwi-a-r--- 1.00g 4.93 [rr_rimage_0] foo gwi-aor--- 1.00g [rr_rimage_0_iorig] 41.02 [rr_rimage_0_imeta] foo ewi-ao---- 12.00m [rr_rimage_0_iorig] foo -wi-ao---- 1.00g [rr_rimage_1] foo gwi-aor--- 1.00g [rr_rimage_1_iorig] 39.45 [rr_rimage_1_imeta] foo ewi-ao---- 12.00m [rr_rimage_1_iorig] foo -wi-ao---- 1.00g [rr_rmeta_0] foo ewi-aor--- 4.00m [rr_rmeta_1] foo ewi-aor--- 4.00m
2019-11-20 16:07:27 -06:00
uint64_t integrity_data_sectors;
struct logical_volume *integrity_meta_dev;
struct integrity_settings integrity_settings;
uint32_t integrity_recalculate;
struct dm_vdo_target_params vdo_params; /* For VDO-pool */
uint32_t vdo_pool_header_size; /* For VDO-pool */
uint32_t vdo_pool_virtual_extents; /* For VDO-pool */
};
#define seg_type(seg, s) (seg)->areas[(s)].type
#define seg_pv(seg, s) (seg)->areas[(s)].u.pv.pvseg->pv
#define seg_lv(seg, s) (seg)->areas[(s)].u.lv.lv
#define seg_metalv(seg, s) (seg)->meta_areas[(s)].u.lv.lv
#define seg_metatype(seg, s) (seg)->meta_areas[(s)].type
struct pe_range {
struct dm_list list;
uint32_t start; /* PEs */
uint32_t count; /* PEs */
};
struct pv_list {
struct dm_list list;
struct physical_volume *pv;
struct dm_list *mdas; /* Metadata areas */
struct dm_list *pe_ranges; /* Ranges of PEs e.g. for allocation */
};
struct lv_list {
struct dm_list list;
struct logical_volume *lv;
};
2016-03-01 15:18:42 +01:00
struct glv_list {
struct dm_list list;
struct generic_logical_volume *glv;
};
struct vg_list {
struct dm_list list;
struct volume_group *vg;
};
struct vgnameid_list {
struct dm_list list;
const char *vg_name;
const char *vgid;
};
struct device_id_list {
struct dm_list list;
struct device *dev;
char pvid[ID_LEN + 1];
};
#define PV_PE_START_CALC ((uint64_t) -1) /* Calculate pe_start value */
/*
* Values used by pv_create().
*/
struct pv_create_args {
Place the first PE at 1 MiB for all defaults . When using default settings, this commit should change nothing. The first PE continues to be placed at 1 MiB resulting in a metadata area size of 1020 KiB (for 4K page sizes; slightly smaller for larger page sizes.) . When default_data_alignment is disabled in lvm.conf, align pe_start at 1 MiB, based on a default metadata area size that adapts to the page size. Previously, disabling this option would result in mda_size that was too small for common use, and produced a 64 KiB aligned pe_start. . Customized pe_start and mda_size values continue to be set as before in lvm.conf and command line. . Remove the configure option for setting default_data_alignment at build time. . Improve alignment related option descriptions. . Add section about alignment to pvcreate man page. Previously, DEFAULT_PVMETADATASIZE was 255 sectors. However, the fact that the config setting named "default_data_alignment" has a default value of 1 (MiB) meant that DEFAULT_PVMETADATASIZE was having no effect. The metadata area size is the space between the start of the metadata area (page size offset from the start of the device) and the first PE (1 MiB by default due to default_data_alignment 1.) The result is a 1020 KiB metadata area on machines with 4KiB page size (1024 KiB - 4 KiB), and smaller on machines with larger page size. If default_data_alignment was set to 0 (disabled), then DEFAULT_PVMETADATASIZE 255 would take effect, and produce a metadata area that was 188 KiB and pe_start of 192 KiB. This was too small for common use. This is fixed by making the default metadata area size a computed value that matches the value produced by default_data_alignment.
2018-11-13 15:00:11 -06:00
uint64_t size; /* in sectors */
uint64_t data_alignment; /* in sectors */
uint64_t data_alignment_offset; /* in sectors */
uint64_t label_sector;
int pvmetadatacopies;
Place the first PE at 1 MiB for all defaults . When using default settings, this commit should change nothing. The first PE continues to be placed at 1 MiB resulting in a metadata area size of 1020 KiB (for 4K page sizes; slightly smaller for larger page sizes.) . When default_data_alignment is disabled in lvm.conf, align pe_start at 1 MiB, based on a default metadata area size that adapts to the page size. Previously, disabling this option would result in mda_size that was too small for common use, and produced a 64 KiB aligned pe_start. . Customized pe_start and mda_size values continue to be set as before in lvm.conf and command line. . Remove the configure option for setting default_data_alignment at build time. . Improve alignment related option descriptions. . Add section about alignment to pvcreate man page. Previously, DEFAULT_PVMETADATASIZE was 255 sectors. However, the fact that the config setting named "default_data_alignment" has a default value of 1 (MiB) meant that DEFAULT_PVMETADATASIZE was having no effect. The metadata area size is the space between the start of the metadata area (page size offset from the start of the device) and the first PE (1 MiB by default due to default_data_alignment 1.) The result is a 1020 KiB metadata area on machines with 4KiB page size (1024 KiB - 4 KiB), and smaller on machines with larger page size. If default_data_alignment was set to 0 (disabled), then DEFAULT_PVMETADATASIZE 255 would take effect, and produce a metadata area that was 188 KiB and pe_start of 192 KiB. This was too small for common use. This is fixed by making the default metadata area size a computed value that matches the value produced by default_data_alignment.
2018-11-13 15:00:11 -06:00
uint64_t pvmetadatasize; /* in sectors */
unsigned metadataignore;
/* used when restoring */
struct id id;
struct id *idp;
Place the first PE at 1 MiB for all defaults . When using default settings, this commit should change nothing. The first PE continues to be placed at 1 MiB resulting in a metadata area size of 1020 KiB (for 4K page sizes; slightly smaller for larger page sizes.) . When default_data_alignment is disabled in lvm.conf, align pe_start at 1 MiB, based on a default metadata area size that adapts to the page size. Previously, disabling this option would result in mda_size that was too small for common use, and produced a 64 KiB aligned pe_start. . Customized pe_start and mda_size values continue to be set as before in lvm.conf and command line. . Remove the configure option for setting default_data_alignment at build time. . Improve alignment related option descriptions. . Add section about alignment to pvcreate man page. Previously, DEFAULT_PVMETADATASIZE was 255 sectors. However, the fact that the config setting named "default_data_alignment" has a default value of 1 (MiB) meant that DEFAULT_PVMETADATASIZE was having no effect. The metadata area size is the space between the start of the metadata area (page size offset from the start of the device) and the first PE (1 MiB by default due to default_data_alignment 1.) The result is a 1020 KiB metadata area on machines with 4KiB page size (1024 KiB - 4 KiB), and smaller on machines with larger page size. If default_data_alignment was set to 0 (disabled), then DEFAULT_PVMETADATASIZE 255 would take effect, and produce a metadata area that was 188 KiB and pe_start of 192 KiB. This was too small for common use. This is fixed by making the default metadata area size a computed value that matches the value produced by default_data_alignment.
2018-11-13 15:00:11 -06:00
uint64_t ba_start; /* in sectors */
uint64_t ba_size; /* in sectors */
uint64_t pe_start; /* in sectors */
uint32_t extent_count;
uint32_t extent_size;
};
struct pvcreate_params {
/*
* From argc and argv.
*/
char **pv_names;
uint32_t pv_count;
/*
* From command line args.
*/
int zero;
force_t force;
unsigned yes;
/*
* From recovery-specific command line args.
*/
const char *restorefile; /* NULL if no --restorefile option */
const char *uuid_str; /* id in printable format, NULL if no id */
/*
* Values used by pv_create().
*/
struct pv_create_args pva;
/*
* Used for command processing.
*/
struct dm_list prompts; /* pvcreate_prompt */
struct dm_list arg_devices; /* pvcreate_device, one for each pv_name */
struct dm_list arg_process; /* pvcreate_device, used for processing */
struct dm_list arg_confirm; /* pvcreate_device, used for processing */
struct dm_list arg_create; /* pvcreate_device, used for pvcreate */
struct dm_list arg_remove; /* pvcreate_device, used for pvremove */
struct dm_list arg_fail; /* pvcreate_device, failed to create */
struct dm_list pvs; /* pv_list, created and usable for vgcreate/vgextend */
const char *orphan_vg_name;
unsigned is_remove : 1; /* is removing PVs, not creating */
unsigned preserve_existing : 1;
unsigned check_failed : 1;
unsigned check_consistent_block_size : 1;
};
struct lvresize_params {
int argc;
char **argv;
const char *vg_name; /* only-used when VG is not yet opened (in /tools) */
const char *lv_name;
const struct segment_type *segtype;
uint64_t poolmetadata_size;
sign_t poolmetadata_sign;
/* Per LV applied parameters */
enum {
LV_ANY = 0,
LV_REDUCE = 1,
LV_EXTEND = 2
} resize;
int use_policies;
alloc_policy_t alloc;
int yes;
int force;
int nosync;
int nofsck;
int resizefs;
unsigned mirrors;
uint32_t stripes;
uint64_t stripe_size;
uint32_t extents;
uint64_t size;
sign_t sign;
percent_type_t percent;
int approx_alloc;
int extents_are_pes; /* Is 'extents' counting PEs or LEs? */
int size_changed; /* Was there actually a size change */
const char *lockopt;
char *lockd_lv_refresh_path; /* set during resize to use for refresh at the end */
char *lockd_lv_refresh_uuid; /* set during resize to use for refresh at the end */
};
void pvcreate_params_set_defaults(struct pvcreate_params *pp);
/*
* Flags that indicate which warnings a library function should issue.
*/
#define WARN_PV_READ 0x00000001
#define WARN_INCONSISTENT 0x00000002
#define SKIP_RESCAN 0x00000004
/*
* Utility functions
*/
int vg_write(struct volume_group *vg);
int vg_commit(struct volume_group *vg);
void vg_revert(struct volume_group *vg);
/*
* Add/remove LV to/from volume group
*/
int link_lv_to_vg(struct volume_group *vg, struct logical_volume *lv);
int unlink_lv_from_vg(struct logical_volume *lv);
void lv_set_visible(struct logical_volume *lv);
2009-05-21 03:04:52 +00:00
void lv_set_hidden(struct logical_volume *lv);
int pv_write(struct cmd_context *cmd, struct physical_volume *pv, int allow_non_orphan);
int move_pv(struct volume_group *vg_from, struct volume_group *vg_to,
const char *pv_name);
int move_pvs_used_by_lv(struct volume_group *vg_from,
struct volume_group *vg_to,
const char *lv_name);
int is_orphan_vg(const char *vg_name);
int is_real_vg(const char *vg_name);
int vg_missing_pv_count(const struct volume_group *vg);
int vgs_are_compatible(struct cmd_context *cmd,
struct volume_group *vg_from,
struct volume_group *vg_to);
uint32_t vg_lock_newname(struct cmd_context *cmd, const char *vgname);
int lv_resize(struct logical_volume *lv,
struct lvresize_params *lp,
struct dm_list *pvh);
improve reading and repairing vg metadata The fact that vg repair is implemented as a part of vg read has led to a messy and complicated implementation of vg_read, and limited and uncontrolled repair capability. This splits read and repair apart. Summary ------- - take all kinds of various repairs out of vg_read - vg_read no longer writes anything - vg_read now simply reads and returns vg metadata - vg_read ignores bad or old copies of metadata - vg_read proceeds with a single good copy of metadata - improve error checks and handling when reading - keep track of bad (corrupt) copies of metadata in lvmcache - keep track of old (seqno) copies of metadata in lvmcache - keep track of outdated PVs in lvmcache - vg_write will do basic repairs - new command vgck --updatemetdata will do all repairs Details ------- - In scan, do not delete dev from lvmcache if reading/processing fails; the dev is still present, and removing it makes it look like the dev is not there. Records are now kept about the problems with each PV so they be fixed/repaired in the appropriate places. - In scan, record a bad mda on failure, and delete the mda from mda in use list so it will not be used by vg_read or vg_write, only by repair. - In scan, succeed if any good mda on a device is found, instead of failing if any is bad. The bad/old copies of metadata should not interfere with normal usage while good copies can be used. - In scan, add a record of old mdas in lvmcache for later, do not repair them while reading, and do not let them prevent us from finding and using a good copy of metadata from elsewhere. One result is that "inconsistent metadata" is no longer a read error, but instead a record in lvmcache that can be addressed separate from the read. - Treat a dev with no good mdas like a dev with no mdas, which is an existing case we already handle. - Don't use a fake vg "handle" for returning an error from vg_read, or the vg_read_error function for getting that error number; just return null if the vg cannot be read or used, and an error_flags arg with flags set for the specific kind of error (which can be used later for determining the kind of repair.) - Saving an original copy of the vg metadata, for purposes of reverting a write, is now done explicitly in vg_read instead of being hidden in the vg_make_handle function. - When a vg is not accessible due to "access restrictions" but is otherwise fine, return the vg through the new error_vg arg so that process_each_pv can skip the PVs in the VG while processing. (This is a temporary accomodation for the way process_each_pv tracks which devs have been looked at, and can be dropped later when process_each_pv implementation dev tracking is changed.) - vg_read does not try to fix or recover a vg, but now just reads the metadata, checks access restrictions and returns it. (Checking access restrictions might be better done outside of vg_read, but this is a later improvement.) - _vg_read now simply makes one attempt to read metadata from each mda, and uses the most recent copy to return to the caller in the form of a 'vg' struct. (bad mdas were excluded during the scan and are not retried) (old mdas were not excluded during scan and are retried here) - vg_read uses _vg_read to get the latest copy of metadata from mdas, and then makes various checks against it to produce warnings, and to check if VG access is allowed (access restrictions include: writable, foreign, shared, clustered, missing pvs). - Things that were previously silently/automatically written by vg_read that are now done by vg_write, based on the records made in lvmcache during the scan and read: . clearing the missing flag . updating old copies of metadata . clearing outdated pvs . updating pv header flags - Bad/corrupt metadata are now repaired; they were not before. Test changes ------------ - A read command no longer writes the VG to repair it, so add a write command to do a repair. (inconsistent-metadata, unlost-pv) - When a missing PV is removed from a VG, and then the device is enabled again, vgck --updatemetadata is needed to clear the outdated PV before it can be used again, where it wasn't before. (lvconvert-repair-policy, lvconvert-repair-raid, lvconvert-repair, mirror-vgreduce-removemissing, pv-ext-flags, unlost-pv) Reading bad/old metadata ------------------------ - "bad metadata": the mda_header or metadata text has invalid fields or can't be parsed by lvm. This is a form of corruption that would not be caused by known failure scenarios. A checksum error is typically included among the errors reported. - "old metadata": a valid copy of the metadata that has a smaller seqno than other copies of the metadata. This can happen if the device failed, or io failed, or lvm failed while commiting new metadata to all the metadata areas. Old metadata on a PV that has been removed from the VG is the "outdated" case below. When a VG has some PVs with bad/old metadata, lvm can simply ignore the bad/old copies, and use a good copy. This is why there are multiple copies of the metadata -- so it's available even when some of the copies cannot be used. The bad/old copies do not have to be repaired before the VG can be used (the repair can happen later.) A PV with no good copies of the metadata simply falls back to being treated like a PV with no mdas; a common and harmless configuration. When bad/old metadata exists, lvm warns the user about it, and suggests repairing it using a new metadata repair command. Bad metadata in particular is something that users will want to investigate and repair themselves, since it should not happen and may indicate some other problem that needs to be fixed. PVs with bad/old metadata are not the same as missing devices. Missing devices will block various kinds of VG modification or activation, but bad/old metadata will not. Previously, lvm would attempt to repair bad/old metadata whenever it was read. This was unnecessary since lvm does not require every copy of the metadata to be used. It would also hide potential problems that should be investigated by the user. It was also dangerous in cases where the VG was on shared storage. The user is now allowed to investigate potential problems and decide how and when to repair them. Repairing bad/old metadata -------------------------- When label scan sees bad metadata in an mda, that mda is removed from the lvmcache info->mdas list. This means that vg_read will skip it, and not attempt to read/process it again. If it was the only in-use mda on a PV, that PV is treated like a PV with no mdas. It also means that vg_write will also skip the bad mda, and not attempt to write new metadata to it. The only way to repair bad metadata is with the metadata repair command. When label scan sees old metadata in an mda, that mda is kept in the lvmcache info->mdas list. This means that vg_read will read/process it again, and likely see the same mismatch with the other copies of the metadata. Like the label_scan, the vg_read will simply ignore the old copy of the metadata and use the latest copy. If the command is modifying the vg (e.g. lvcreate), then vg_write, which writes new metadata to every mda on info->mdas, will write the new metadata to the mda that had the old version. If successful, this will resolve the old metadata problem (without needing to run a metadata repair command.) Outdated PVs ------------ An outdated PV is a PV that has an old copy of VG metadata that shows it is a member of the VG, but the latest copy of the VG metadata does not include this PV. This happens if the PV is disconnected, vgreduce --removemissing is run to remove the PV from the VG, then the PV is reconnected. In this case, the outdated PV needs have its outdated metadata removed and the PV used flag needs to be cleared. This repair will be done by the subsequent repair command. It is also done if vgremove is run on the VG. MISSING PVs ----------- When a device is missing, most commands will refuse to modify the VG. This is the simple case. More complicated is when a command is allowed to modify the VG while it is missing a device. When a VG is written while a device is missing for one of it's PVs, the VG metadata is written to disk with the MISSING flag on the PV with the missing device. When the VG is next used, it is treated as if the PV with the MISSING flag still has a missing device, even if that device has reappeared. If all LVs that were using a PV with the MISSING flag are removed or repaired so that the MISSING PV is no longer used, then the next time the VG metadata is written, the MISSING flag will be dropped. Alternative methods of clearing the MISSING flag are: vgreduce --removemissing will remove PVs with missing devices, or PVs with the MISSING flag where the device has reappeared. vgextend --restoremissing will clear the MISSING flag on PVs where the device has reappeared, allowing the VG to be used normally. This must be done with caution since the reappeared device may have old data that is inconsistent with data on other PVs. Bad mda repair -------------- The new command: vgck --updatemetadata VG first uses vg_write to repair old metadata, and other basic issues mentioned above (old metadata, outdated PVs, pv_header flags, MISSING_PV flags). It will also go further and repair bad metadata: . text metadata that has a bad checksum . text metadata that is not parsable . corrupt mda_header checksum and version fields (To keep a clean diff, #if 0 is added around functions that are replaced by new code. These commented functions are removed by the following commit.)
2019-05-24 12:04:37 -05:00
struct volume_group *vg_read(struct cmd_context *cmd, const char *vg_name, const char *vgid,
uint32_t read_flags, uint32_t lockd_state,
uint32_t *error_flags, struct volume_group **error_vg);
struct volume_group *vg_read_for_update(struct cmd_context *cmd, const char *vg_name,
const char *vgid, uint32_t read_flags, uint32_t lockd_state);
improve reading and repairing vg metadata The fact that vg repair is implemented as a part of vg read has led to a messy and complicated implementation of vg_read, and limited and uncontrolled repair capability. This splits read and repair apart. Summary ------- - take all kinds of various repairs out of vg_read - vg_read no longer writes anything - vg_read now simply reads and returns vg metadata - vg_read ignores bad or old copies of metadata - vg_read proceeds with a single good copy of metadata - improve error checks and handling when reading - keep track of bad (corrupt) copies of metadata in lvmcache - keep track of old (seqno) copies of metadata in lvmcache - keep track of outdated PVs in lvmcache - vg_write will do basic repairs - new command vgck --updatemetdata will do all repairs Details ------- - In scan, do not delete dev from lvmcache if reading/processing fails; the dev is still present, and removing it makes it look like the dev is not there. Records are now kept about the problems with each PV so they be fixed/repaired in the appropriate places. - In scan, record a bad mda on failure, and delete the mda from mda in use list so it will not be used by vg_read or vg_write, only by repair. - In scan, succeed if any good mda on a device is found, instead of failing if any is bad. The bad/old copies of metadata should not interfere with normal usage while good copies can be used. - In scan, add a record of old mdas in lvmcache for later, do not repair them while reading, and do not let them prevent us from finding and using a good copy of metadata from elsewhere. One result is that "inconsistent metadata" is no longer a read error, but instead a record in lvmcache that can be addressed separate from the read. - Treat a dev with no good mdas like a dev with no mdas, which is an existing case we already handle. - Don't use a fake vg "handle" for returning an error from vg_read, or the vg_read_error function for getting that error number; just return null if the vg cannot be read or used, and an error_flags arg with flags set for the specific kind of error (which can be used later for determining the kind of repair.) - Saving an original copy of the vg metadata, for purposes of reverting a write, is now done explicitly in vg_read instead of being hidden in the vg_make_handle function. - When a vg is not accessible due to "access restrictions" but is otherwise fine, return the vg through the new error_vg arg so that process_each_pv can skip the PVs in the VG while processing. (This is a temporary accomodation for the way process_each_pv tracks which devs have been looked at, and can be dropped later when process_each_pv implementation dev tracking is changed.) - vg_read does not try to fix or recover a vg, but now just reads the metadata, checks access restrictions and returns it. (Checking access restrictions might be better done outside of vg_read, but this is a later improvement.) - _vg_read now simply makes one attempt to read metadata from each mda, and uses the most recent copy to return to the caller in the form of a 'vg' struct. (bad mdas were excluded during the scan and are not retried) (old mdas were not excluded during scan and are retried here) - vg_read uses _vg_read to get the latest copy of metadata from mdas, and then makes various checks against it to produce warnings, and to check if VG access is allowed (access restrictions include: writable, foreign, shared, clustered, missing pvs). - Things that were previously silently/automatically written by vg_read that are now done by vg_write, based on the records made in lvmcache during the scan and read: . clearing the missing flag . updating old copies of metadata . clearing outdated pvs . updating pv header flags - Bad/corrupt metadata are now repaired; they were not before. Test changes ------------ - A read command no longer writes the VG to repair it, so add a write command to do a repair. (inconsistent-metadata, unlost-pv) - When a missing PV is removed from a VG, and then the device is enabled again, vgck --updatemetadata is needed to clear the outdated PV before it can be used again, where it wasn't before. (lvconvert-repair-policy, lvconvert-repair-raid, lvconvert-repair, mirror-vgreduce-removemissing, pv-ext-flags, unlost-pv) Reading bad/old metadata ------------------------ - "bad metadata": the mda_header or metadata text has invalid fields or can't be parsed by lvm. This is a form of corruption that would not be caused by known failure scenarios. A checksum error is typically included among the errors reported. - "old metadata": a valid copy of the metadata that has a smaller seqno than other copies of the metadata. This can happen if the device failed, or io failed, or lvm failed while commiting new metadata to all the metadata areas. Old metadata on a PV that has been removed from the VG is the "outdated" case below. When a VG has some PVs with bad/old metadata, lvm can simply ignore the bad/old copies, and use a good copy. This is why there are multiple copies of the metadata -- so it's available even when some of the copies cannot be used. The bad/old copies do not have to be repaired before the VG can be used (the repair can happen later.) A PV with no good copies of the metadata simply falls back to being treated like a PV with no mdas; a common and harmless configuration. When bad/old metadata exists, lvm warns the user about it, and suggests repairing it using a new metadata repair command. Bad metadata in particular is something that users will want to investigate and repair themselves, since it should not happen and may indicate some other problem that needs to be fixed. PVs with bad/old metadata are not the same as missing devices. Missing devices will block various kinds of VG modification or activation, but bad/old metadata will not. Previously, lvm would attempt to repair bad/old metadata whenever it was read. This was unnecessary since lvm does not require every copy of the metadata to be used. It would also hide potential problems that should be investigated by the user. It was also dangerous in cases where the VG was on shared storage. The user is now allowed to investigate potential problems and decide how and when to repair them. Repairing bad/old metadata -------------------------- When label scan sees bad metadata in an mda, that mda is removed from the lvmcache info->mdas list. This means that vg_read will skip it, and not attempt to read/process it again. If it was the only in-use mda on a PV, that PV is treated like a PV with no mdas. It also means that vg_write will also skip the bad mda, and not attempt to write new metadata to it. The only way to repair bad metadata is with the metadata repair command. When label scan sees old metadata in an mda, that mda is kept in the lvmcache info->mdas list. This means that vg_read will read/process it again, and likely see the same mismatch with the other copies of the metadata. Like the label_scan, the vg_read will simply ignore the old copy of the metadata and use the latest copy. If the command is modifying the vg (e.g. lvcreate), then vg_write, which writes new metadata to every mda on info->mdas, will write the new metadata to the mda that had the old version. If successful, this will resolve the old metadata problem (without needing to run a metadata repair command.) Outdated PVs ------------ An outdated PV is a PV that has an old copy of VG metadata that shows it is a member of the VG, but the latest copy of the VG metadata does not include this PV. This happens if the PV is disconnected, vgreduce --removemissing is run to remove the PV from the VG, then the PV is reconnected. In this case, the outdated PV needs have its outdated metadata removed and the PV used flag needs to be cleared. This repair will be done by the subsequent repair command. It is also done if vgremove is run on the VG. MISSING PVs ----------- When a device is missing, most commands will refuse to modify the VG. This is the simple case. More complicated is when a command is allowed to modify the VG while it is missing a device. When a VG is written while a device is missing for one of it's PVs, the VG metadata is written to disk with the MISSING flag on the PV with the missing device. When the VG is next used, it is treated as if the PV with the MISSING flag still has a missing device, even if that device has reappeared. If all LVs that were using a PV with the MISSING flag are removed or repaired so that the MISSING PV is no longer used, then the next time the VG metadata is written, the MISSING flag will be dropped. Alternative methods of clearing the MISSING flag are: vgreduce --removemissing will remove PVs with missing devices, or PVs with the MISSING flag where the device has reappeared. vgextend --restoremissing will clear the MISSING flag on PVs where the device has reappeared, allowing the VG to be used normally. This must be done with caution since the reappeared device may have old data that is inconsistent with data on other PVs. Bad mda repair -------------- The new command: vgck --updatemetadata VG first uses vg_write to repair old metadata, and other basic issues mentioned above (old metadata, outdated PVs, pv_header flags, MISSING_PV flags). It will also go further and repair bad metadata: . text metadata that has a bad checksum . text metadata that is not parsable . corrupt mda_header checksum and version fields (To keep a clean diff, #if 0 is added around functions that are replaced by new code. These commented functions are removed by the following commit.)
2019-05-24 12:04:37 -05:00
struct volume_group *vg_read_orphans(struct cmd_context *cmd, const char *orphan_vgname);
/* pe_start and pe_end relate to any existing data so that new metadata
* areas can avoid overlap */
struct physical_volume *pv_create(const struct cmd_context *cmd,
struct device *dev, struct pv_create_args *pva);
int pv_resize_single(struct cmd_context *cmd,
struct volume_group *vg,
struct physical_volume *pv,
const uint64_t new_size,
int yes);
int pv_analyze(struct cmd_context *cmd, struct device *dev,
uint64_t label_sector);
/* FIXME: move internal to library */
uint32_t pv_list_extents_free(const struct dm_list *pvh);
int validate_new_vg_name(struct cmd_context *cmd, const char *vg_name);
int vg_validate(struct volume_group *vg);
struct volume_group *vg_create(struct cmd_context *cmd, const char *vg_name);
improve reading and repairing vg metadata The fact that vg repair is implemented as a part of vg read has led to a messy and complicated implementation of vg_read, and limited and uncontrolled repair capability. This splits read and repair apart. Summary ------- - take all kinds of various repairs out of vg_read - vg_read no longer writes anything - vg_read now simply reads and returns vg metadata - vg_read ignores bad or old copies of metadata - vg_read proceeds with a single good copy of metadata - improve error checks and handling when reading - keep track of bad (corrupt) copies of metadata in lvmcache - keep track of old (seqno) copies of metadata in lvmcache - keep track of outdated PVs in lvmcache - vg_write will do basic repairs - new command vgck --updatemetdata will do all repairs Details ------- - In scan, do not delete dev from lvmcache if reading/processing fails; the dev is still present, and removing it makes it look like the dev is not there. Records are now kept about the problems with each PV so they be fixed/repaired in the appropriate places. - In scan, record a bad mda on failure, and delete the mda from mda in use list so it will not be used by vg_read or vg_write, only by repair. - In scan, succeed if any good mda on a device is found, instead of failing if any is bad. The bad/old copies of metadata should not interfere with normal usage while good copies can be used. - In scan, add a record of old mdas in lvmcache for later, do not repair them while reading, and do not let them prevent us from finding and using a good copy of metadata from elsewhere. One result is that "inconsistent metadata" is no longer a read error, but instead a record in lvmcache that can be addressed separate from the read. - Treat a dev with no good mdas like a dev with no mdas, which is an existing case we already handle. - Don't use a fake vg "handle" for returning an error from vg_read, or the vg_read_error function for getting that error number; just return null if the vg cannot be read or used, and an error_flags arg with flags set for the specific kind of error (which can be used later for determining the kind of repair.) - Saving an original copy of the vg metadata, for purposes of reverting a write, is now done explicitly in vg_read instead of being hidden in the vg_make_handle function. - When a vg is not accessible due to "access restrictions" but is otherwise fine, return the vg through the new error_vg arg so that process_each_pv can skip the PVs in the VG while processing. (This is a temporary accomodation for the way process_each_pv tracks which devs have been looked at, and can be dropped later when process_each_pv implementation dev tracking is changed.) - vg_read does not try to fix or recover a vg, but now just reads the metadata, checks access restrictions and returns it. (Checking access restrictions might be better done outside of vg_read, but this is a later improvement.) - _vg_read now simply makes one attempt to read metadata from each mda, and uses the most recent copy to return to the caller in the form of a 'vg' struct. (bad mdas were excluded during the scan and are not retried) (old mdas were not excluded during scan and are retried here) - vg_read uses _vg_read to get the latest copy of metadata from mdas, and then makes various checks against it to produce warnings, and to check if VG access is allowed (access restrictions include: writable, foreign, shared, clustered, missing pvs). - Things that were previously silently/automatically written by vg_read that are now done by vg_write, based on the records made in lvmcache during the scan and read: . clearing the missing flag . updating old copies of metadata . clearing outdated pvs . updating pv header flags - Bad/corrupt metadata are now repaired; they were not before. Test changes ------------ - A read command no longer writes the VG to repair it, so add a write command to do a repair. (inconsistent-metadata, unlost-pv) - When a missing PV is removed from a VG, and then the device is enabled again, vgck --updatemetadata is needed to clear the outdated PV before it can be used again, where it wasn't before. (lvconvert-repair-policy, lvconvert-repair-raid, lvconvert-repair, mirror-vgreduce-removemissing, pv-ext-flags, unlost-pv) Reading bad/old metadata ------------------------ - "bad metadata": the mda_header or metadata text has invalid fields or can't be parsed by lvm. This is a form of corruption that would not be caused by known failure scenarios. A checksum error is typically included among the errors reported. - "old metadata": a valid copy of the metadata that has a smaller seqno than other copies of the metadata. This can happen if the device failed, or io failed, or lvm failed while commiting new metadata to all the metadata areas. Old metadata on a PV that has been removed from the VG is the "outdated" case below. When a VG has some PVs with bad/old metadata, lvm can simply ignore the bad/old copies, and use a good copy. This is why there are multiple copies of the metadata -- so it's available even when some of the copies cannot be used. The bad/old copies do not have to be repaired before the VG can be used (the repair can happen later.) A PV with no good copies of the metadata simply falls back to being treated like a PV with no mdas; a common and harmless configuration. When bad/old metadata exists, lvm warns the user about it, and suggests repairing it using a new metadata repair command. Bad metadata in particular is something that users will want to investigate and repair themselves, since it should not happen and may indicate some other problem that needs to be fixed. PVs with bad/old metadata are not the same as missing devices. Missing devices will block various kinds of VG modification or activation, but bad/old metadata will not. Previously, lvm would attempt to repair bad/old metadata whenever it was read. This was unnecessary since lvm does not require every copy of the metadata to be used. It would also hide potential problems that should be investigated by the user. It was also dangerous in cases where the VG was on shared storage. The user is now allowed to investigate potential problems and decide how and when to repair them. Repairing bad/old metadata -------------------------- When label scan sees bad metadata in an mda, that mda is removed from the lvmcache info->mdas list. This means that vg_read will skip it, and not attempt to read/process it again. If it was the only in-use mda on a PV, that PV is treated like a PV with no mdas. It also means that vg_write will also skip the bad mda, and not attempt to write new metadata to it. The only way to repair bad metadata is with the metadata repair command. When label scan sees old metadata in an mda, that mda is kept in the lvmcache info->mdas list. This means that vg_read will read/process it again, and likely see the same mismatch with the other copies of the metadata. Like the label_scan, the vg_read will simply ignore the old copy of the metadata and use the latest copy. If the command is modifying the vg (e.g. lvcreate), then vg_write, which writes new metadata to every mda on info->mdas, will write the new metadata to the mda that had the old version. If successful, this will resolve the old metadata problem (without needing to run a metadata repair command.) Outdated PVs ------------ An outdated PV is a PV that has an old copy of VG metadata that shows it is a member of the VG, but the latest copy of the VG metadata does not include this PV. This happens if the PV is disconnected, vgreduce --removemissing is run to remove the PV from the VG, then the PV is reconnected. In this case, the outdated PV needs have its outdated metadata removed and the PV used flag needs to be cleared. This repair will be done by the subsequent repair command. It is also done if vgremove is run on the VG. MISSING PVs ----------- When a device is missing, most commands will refuse to modify the VG. This is the simple case. More complicated is when a command is allowed to modify the VG while it is missing a device. When a VG is written while a device is missing for one of it's PVs, the VG metadata is written to disk with the MISSING flag on the PV with the missing device. When the VG is next used, it is treated as if the PV with the MISSING flag still has a missing device, even if that device has reappeared. If all LVs that were using a PV with the MISSING flag are removed or repaired so that the MISSING PV is no longer used, then the next time the VG metadata is written, the MISSING flag will be dropped. Alternative methods of clearing the MISSING flag are: vgreduce --removemissing will remove PVs with missing devices, or PVs with the MISSING flag where the device has reappeared. vgextend --restoremissing will clear the MISSING flag on PVs where the device has reappeared, allowing the VG to be used normally. This must be done with caution since the reappeared device may have old data that is inconsistent with data on other PVs. Bad mda repair -------------- The new command: vgck --updatemetadata VG first uses vg_write to repair old metadata, and other basic issues mentioned above (old metadata, outdated PVs, pv_header flags, MISSING_PV flags). It will also go further and repair bad metadata: . text metadata that has a bad checksum . text metadata that is not parsable . corrupt mda_header checksum and version fields (To keep a clean diff, #if 0 is added around functions that are replaced by new code. These commented functions are removed by the following commit.)
2019-05-24 12:04:37 -05:00
struct volume_group *vg_lock_and_create(struct cmd_context *cmd, const char *vg_name, int *exists);
int vg_remove_mdas(struct volume_group *vg);
int vg_remove_check(struct volume_group *vg);
void vg_remove_pvs(struct volume_group *vg);
2015-03-05 14:00:44 -06:00
int vg_remove_direct(struct volume_group *vg);
int vg_remove(struct volume_group *vg);
int vg_rename(struct cmd_context *cmd, struct volume_group *vg,
const char *new_name);
int vg_extend_each_pv(struct volume_group *vg, struct pvcreate_params *pp);
int vgreduce_single(struct cmd_context *cmd, struct volume_group *vg,
struct physical_volume *pv, int commit);
int vg_change_tag(struct volume_group *vg, const char *tag, int add_tag);
int vg_split_mdas(struct cmd_context *cmd, struct volume_group *vg_from,
struct volume_group *vg_to);
/* FIXME: Investigate refactoring these functions to take a pv ISO pv_list */
void add_pvl_to_vgs(struct volume_group *vg, struct pv_list *pvl);
void del_pvl_from_vgs(struct volume_group *vg, struct pv_list *pvl);
/*
* free_pv_fid() must be called on every struct physical_volume allocated
* by pv_create, pv_read, find_pv_by_name or to free it when no longer required.
*/
void free_pv_fid(struct physical_volume *pv);
/* Manipulate LVs */
struct logical_volume *lv_create_empty(const char *name,
union lvid *lvid,
uint64_t status,
alloc_policy_t alloc,
struct volume_group *vg);
struct wipe_params {
uint64_t zero_sectors; /* sector count to zero */
uint8_t zero_value; /* zero-out with this value */
int do_zero; /* should we do zeroing of LV start? */
int do_wipe_signatures; /* should we wipe known signatures found on LV? */
int yes; /* answer yes automatically to all questions */
force_t force; /* force mode */
int is_metadata; /* wipe volume is metadata LV */
};
/* Zero out LV and/or wipe signatures */
int wipe_lv(struct logical_volume *lv, struct wipe_params params);
2007-12-20 22:37:42 +00:00
/* Wipe any signatures and zero first sector on @lv */
int activate_and_wipe_lv(struct logical_volume *lv, int commit);
/* Wipe any signatures and zero first sector of LVs listed on @lv_list */
int activate_and_wipe_lvlist(struct dm_list *lv_list, int commit);
int lv_change_tag(struct logical_volume *lv, const char *tag, int add_tag);
/* Reduce the size of an LV by extents */
int lv_reduce(struct logical_volume *lv, uint32_t extents);
/* Empty an LV prior to deleting it */
int lv_empty(struct logical_volume *lv);
2008-01-17 13:13:54 +00:00
/* Empty an LV and add error segment */
2008-01-17 13:54:05 +00:00
int replace_lv_with_error_segment(struct logical_volume *lv);
2008-01-17 13:13:54 +00:00
int lv_refresh_suspend_resume(const struct logical_volume *lv);
/* Entry point for all LV extent allocations */
int lv_extend(struct logical_volume *lv,
const struct segment_type *segtype,
uint32_t stripes, uint32_t stripe_size,
uint32_t mirrors, uint32_t region_size,
uint32_t extents,
struct dm_list *allocatable_pvs, alloc_policy_t alloc,
int approx_alloc);
/* lv must be part of lv->vg->lvs */
int lv_remove(struct logical_volume *lv);
2016-03-01 15:26:57 +01:00
/* historical_glv must be part of lv->vg->historical_lvs */
int historical_glv_remove(struct generic_logical_volume *historical_glv);
int lv_remove_single(struct cmd_context *cmd, struct logical_volume *lv,
force_t force, int suppress_remove_message);
int lv_remove_with_dependencies(struct cmd_context *cmd, struct logical_volume *lv,
force_t force, unsigned level);
int lv_rename(struct cmd_context *cmd, struct logical_volume *lv,
const char *new_name);
int lv_rename_update(struct cmd_context *cmd, struct logical_volume *lv,
const char *new_name, int update_mda);
int lv_uniq_rename_update(struct cmd_context *cmd, struct logical_volume *lv,
const char *new_name, int update_mda);
/* Updates and reloads metadata for given lv */
int lv_update_and_reload(struct logical_volume *lv);
int lv_update_and_reload_origin(struct logical_volume *lv);
uint32_t extents_from_size(struct cmd_context *cmd, uint64_t size,
uint32_t extent_size);
uint32_t extents_from_percent_size(struct volume_group *vg, const struct dm_list *pvh,
uint32_t extents, int roundup,
percent_type_t percent, uint64_t size);
2014-01-08 10:27:17 +01:00
struct logical_volume *find_pool_lv(const struct logical_volume *lv);
int pool_is_active(const struct logical_volume *lv);
int pool_supports_external_origin(const struct lv_segment *pool_seg, const struct logical_volume *external_lv);
int thin_pool_feature_supported(const struct logical_volume *lv, int feature);
int recalculate_pool_chunk_size_with_dev_hints(struct logical_volume *pool_lv,
int chunk_size_calc_policy);
int validate_cache_chunk_size(struct cmd_context *cmd, uint32_t chunk_size);
int validate_thin_pool_chunk_size(struct cmd_context *cmd, uint32_t chunk_size);
int validate_pool_chunk_size(struct cmd_context *cmd, const struct segment_type *segtype, uint32_t chunk_size);
int update_pool_lv(struct logical_volume *lv, int activate);
int get_default_allocation_thin_pool_chunk_size(struct cmd_context *cmd, struct profile *profile,
uint32_t *chunk_size, int *chunk_size_calc_method);
int update_thin_pool_params(struct cmd_context *cmd,
struct profile *profile,
uint32_t extent_size,
const struct segment_type *segtype,
unsigned attr,
uint32_t pool_data_extents,
uint32_t *pool_metadata_extents,
thin: improve 16g support for thin pool metadata Initial support for thin-pool used slightly smaller max size 15.81GiB for thin-pool metadata. However the real limit later settled at 15.88GiB (difference is ~64MiB - 16448 4K blocks). lvm2 could not simply increase the size as it has been using hard cropping of the loaded metadata device to avoid warnings printing warning of kernel when the size was bigger (i.e. due to bigger extent_size). This patch adds the new lvm.conf configurable setting: allocation/thin_pool_crop_metadata which defaults to 0 -> no crop of metadata beyond 15.81GiB. Only user with these sizes of metadata will be affected. Without cropping lvm2 now limits metadata allocation size to 15.88GiB. Any space beyond is currently not used by thin-pool target. Even if i.e. bigger LV is used for metadata via lvconvert, or allocated bigger because of to large extent size. With cropping enabled (=1) lvm2 preserves the old limitation 15.81GiB and should allow to work in the evironement with older lvm2 tools (i.e. older distribution). Thin-pool metadata with size bigger then 15.81G is now using CROP_METADATA flag within lvm2 metadata, so older lvm2 recognizes an incompatible thin-pool and cannot activate such pool! Users should use uncropped version as it is not suffering from various issues between thin_repair results and allocated metadata LV as thin_repair limit is 15.88GiB Users should use cropping only when really needed! Patch also better handles resize of thin-pool metadata and prevents resize beoyond usable size 15.88GiB. Resize beyond 15.81GiB automatically switches pool to no-crop version. Even with existing bigger thin-pool metadata command 'lvextend -l+1 vg/pool_tmeta' does the change. Patch gives better controls 'coverted' metadata LV and reports less confusing message during conversion. Patch set also moves the code for updating min/max into pool_manip.c for better sharing with cache_pool code.
2021-01-12 17:59:29 +01:00
struct logical_volume *metadata_lv,
unsigned *crop_metadata,
int *chunk_size_calc_method, uint32_t *chunk_size,
thin_discards_t *discards, thin_zero_t *zero_new_blocks);
struct lv_status_thin_pool {
struct dm_pool *mem;
struct dm_status_thin_pool *thin_pool;
dm_percent_t data_usage;
dm_percent_t metadata_usage;
};
struct lv_status_thin {
struct dm_pool *mem;
struct dm_status_thin *thin;
dm_percent_t usage;
};
const char *get_pool_discards_name(thin_discards_t discards);
2014-11-08 01:28:38 +01:00
int set_pool_discards(thin_discards_t *discards, const char *str);
struct logical_volume *alloc_pool_metadata(struct logical_volume *pool_lv,
const char *name, uint32_t read_ahead,
uint32_t stripes, uint32_t stripe_size,
uint32_t extents, alloc_policy_t alloc,
struct dm_list *pvh);
int handle_pool_metadata_spare(struct volume_group *vg, uint32_t extents,
struct dm_list *pvh, int poolmetadataspare);
int vg_set_pool_metadata_spare(struct logical_volume *lv);
int vg_remove_pool_metadata_spare(struct volume_group *vg);
struct logical_volume *data_lv_from_thin_pool(struct logical_volume *pool_lv);
2013-07-09 02:51:24 +01:00
int attach_thin_external_origin(struct lv_segment *seg,
struct logical_volume *external_lv);
int detach_thin_external_origin(struct lv_segment *seg);
int attach_pool_metadata_lv(struct lv_segment *pool_seg,
struct logical_volume *metadata_lv);
2013-07-09 02:51:24 +01:00
int detach_pool_metadata_lv(struct lv_segment *pool_seg,
struct logical_volume **metadata_lv);
2013-07-09 02:51:24 +01:00
int attach_pool_data_lv(struct lv_segment *pool_seg,
struct logical_volume *pool_data_lv);
int is_mirror_image_removable(struct logical_volume *mimage_lv, void *baton);
/*
* Activation options
*/
typedef enum activation_change {
CHANGE_AY = 0, /* activate */
CHANGE_AN = 1, /* deactivate */
CHANGE_AEY = 2, /* activate exclusively */
CHANGE_ALY = 3, /* activate locally */
CHANGE_ALN = 4, /* deactivate locally */
CHANGE_AAY = 5, /* automatic activation */
CHANGE_ASY = 6 /* activate shared */
} activation_change_t;
/* Returns true, when change activates device */
static inline int is_change_activating(activation_change_t change)
{
return ((change != CHANGE_AN) && (change != CHANGE_ALN));
}
/* FIXME: refactor and reduce the size of this struct! */
struct lvcreate_params {
/* flags */
int snapshot; /* snap */
int create_pool; /* pools */
int zero; /* all */
int wipe_signatures; /* all */
int32_t major; /* all */
int32_t minor; /* all */
int log_count; /* mirror/RAID */
int nosync; /* mirror/RAID */
int pool_metadata_spare; /* pools */
int type; /* type arg is given */
activation: flag temporary LVs internally Add LV_TEMPORARY flag for LVs with limited existence during command execution. Such LVs are temporary in way that they need to be activated, some action done and then removed immediately. Such LVs are just like any normal LV - the only difference is that they are removed during LVM command execution. This is also the case for LVs representing future pool metadata spare LVs which we need to initialize by using the usual LV before they are declared as pool metadata spare. We can optimize some other parts like udev to do a better job if it knows that the LV is temporary and any processing on it is just useless. This flag is orthogonal to LV_NOSCAN flag introduced recently as LV_NOSCAN flag is primarily used to mark an LV for the scanning to be avoided before the zeroing of the device happens. The LV_TEMPORARY flag makes a difference between a full-fledged LV visible in the system and the LV just used as a temporary overlay for some action that needs to be done on underlying PVs. For example: lvcreate --thinpool POOL --zero n -L 1G vg - first, the usual LV is created to do a clean up for pool metadata spare. The LV is activated, zeroed, deactivated. - between "activated" and "zeroed" stage, the LV_NOSCAN flag is used to avoid any scanning in udev - betwen "zeroed" and "deactivated" stage, we need to avoid the WATCH udev rule, but since the LV is just a usual LV, we can't make a difference. The LV_TEMPORARY internal LV flag helps here. If we create the LV with this flag, the DM_UDEV_DISABLE_DISK_RULES and DM_UDEV_DISABLE_OTHER_RULES flag are set (just like as it is with "invisible" and non-top-level LVs) - udev is directed to skip WATCH rule use. - if the LV_TEMPORARY flag was not used, there would normally be a WATCH event generated once the LV is closed after "zeroed" stage. This will make problems with immediated deactivation that follows.
2013-10-23 14:06:39 +02:00
int temporary; /* temporary LV */
activation: add support for skipping activation of selected LVs Also add -k/--setactivationskip y/n and -K/--ignoreactivationskip options to lvcreate. The --setactivationskip y sets the flag in metadata for an LV to skip the LV during activation. Also, the newly created LV is not activated. Thin snapsots have this flag set automatically if not specified directly by the --setactivationskip y/n option. The --ignoreactivationskip overrides the activation skip flag set in metadata for an LV (just for the run of the command - the flag is not changed in metadata!) A few examples for the lvcreate with the new options: (non-thin snap LV => skip flag not set in MDA + LV activated) raw/~ $ lvcreate -l1 vg Logical volume "lvol0" created raw/~ $ lvs -o lv_name,attr vg/lvol0 LV Attr lvol0 -wi-a---- (non-thin snap LV + -ky => skip flag set in MDA + LV not activated) raw/~ $ lvcreate -l1 -ky vg Logical volume "lvol1" created raw/~ $ lvs -o lv_name,attr vg/lvol1 LV Attr lvol1 -wi------ (non-thin snap LV + -ky + -K => skip flag set in MDA + LV activated) raw/~ $ lvcreate -l1 -ky -K vg Logical volume "lvol2" created raw/~ $ lvs -o lv_name,attr vg/lvol2 LV Attr lvol2 -wi-a---- (thin snap LV => skip flag set in MDA (default behaviour) + LV not activated) raw/~ $ lvcreate -L100M -T vg/pool -V 1T -n thin_lv Logical volume "thin_lv" created raw/~ $ lvcreate -s vg/thin_lv -n thin_snap Logical volume "thin_snap" created raw/~ $ lvs -o name,attr vg LV Attr pool twi-a-tz- thin_lv Vwi-a-tz- thin_snap Vwi---tz- (thin snap LV + -K => skip flag set in MDA (default behaviour) + LV activated) raw/~ $ lvcreate -s vg/thin_lv -n thin_snap -K Logical volume "thin_snap" created raw/~ $ lvs -o name,attr vg/thin_lv LV Attr thin_lv Vwi-a-tz- (thins snap LV + -kn => no skip flag in MDA (default behaviour overridden) + LV activated) [0] raw/~ # lvcreate -s vg/thin_lv -n thin_snap -kn Logical volume "thin_snap" created [0] raw/~ # lvs -o name,attr vg/thin_snap LV Attr thin_snap Vwi-a-tz-
2013-07-10 14:06:50 +02:00
#define ACTIVATION_SKIP_SET 0x01 /* request to set LV activation skip flag state */
#define ACTIVATION_SKIP_SET_ENABLED 0x02 /* set the LV activation skip flag state to 'enabled' */
#define ACTIVATION_SKIP_IGNORE 0x04 /* request to ignore LV activation skip flag (if any) */
int activation_skip; /* activation skip flags */
Add metadata-based autoactivation property for VG and LV The autoactivation property can be specified in lvcreate or vgcreate for new LVs/VGs, and the property can be changed by lvchange or vgchange for existing LVs/VGs. --setautoactivation y|n enables|disables autoactivation of a VG or LV. Autoactivation is enabled by default, which is consistent with past behavior. The disabled state is stored as a new flag in the VG metadata, and the absence of the flag allows autoactivation. If autoactivation is disabled for the VG, then no LVs in the VG will be autoactivated (the LV autoactivation property will have no effect.) When autoactivation is enabled for the VG, then autoactivation can be controlled on individual LVs. The state of this property can be reported for LVs/VGs using the "-o autoactivation" option in lvs/vgs commands, which will report "enabled", or "" for the disabled state. Previous versions of lvm do not recognize this property. Since autoactivation is enabled by default, the disabled setting will have no effect in older lvm versions. If the VG is modified by older lvm versions, the disabled state will also be dropped from the metadata. The autoactivation property is an alternative to using the lvm.conf auto_activation_volume_list, which is still applied to to VGs/LVs in addition to the new property. If VG or LV autoactivation is disabled either in metadata or in auto_activation_volume_list, it will not be autoactivated. An autoactivation command will silently skip activating an LV when the autoactivation property is disabled. To determine the effective autoactivation behavior for a specific LV, multiple settings would need to be checked: the VG autoactivation property, the LV autoactivation property, the auto_activation_volume_list. The "activation skip" property would also be relevant, since it applies to both normal and auto activation.
2021-04-01 17:20:00 -05:00
int noautoactivate; /* 1 if --setautoactivation n */
activation_change_t activate; /* non-snapshot, non-mirror */
2012-08-09 12:20:47 +02:00
thin_discards_t discards; /* thin */
thin_zero_t zero_new_blocks;
#define THIN_CHUNK_SIZE_CALC_METHOD_GENERIC 0x01
#define THIN_CHUNK_SIZE_CALC_METHOD_PERFORMANCE 0x02
int thin_chunk_size_calc_policy;
unsigned suppress_zero_warn : 1;
2015-03-05 14:00:44 -06:00
unsigned needs_lockd_init : 1;
lvcreate: new cache or writecache lv with single command To create a new cache or writecache LV with a single command: lvcreate --type cache|writecache -n Name -L Size --cachedevice PVfast VG [PVslow ...] - A new main linear|striped LV is created as usual, using the specified -n Name and -L Size, and using the optionally specified PVslow devices. - Then, a new cachevol LV is created internally, using PVfast specified by the cachedevice option. - Then, the cachevol is attached to the main LV, converting the main LV to type cache|writecache. Include --cachesize Size to specify the size of cache|writecache to create from the specified --cachedevice PVs, otherwise the entire cachedevice PV is used. The --cachedevice option can be repeated to create the cache from multiple devices, or the cachedevice option can contain a tag name specifying a set of PVs to allocate the cache from. To create a new cache or writecache LV with a single command using an existing cachevol LV: lvcreate --type cache|writecache -n Name -L Size --cachevol LVfast VG [PVslow ...] - A new main linear|striped LV is created as usual, using the specified -n Name and -L Size, and using the optionally specified PVslow devices. - Then, the cachevol LVfast is attached to the main LV, converting the main LV to type cache|writecache. In cases where more advanced types (for the main LV or cachevol LV) are needed, they should be created independently and then combined with lvconvert. Example ------- user creates a new VG with one slow device and one fast device: $ vgcreate vg /dev/slow1 /dev/fast1 user creates a new 8G main LV on /dev/slow1 that uses all of /dev/fast1 as a writecache: $ lvcreate --type writecache --cachedevice /dev/fast1 -n main -L 8G vg /dev/slow1 Example ------- user creates a new VG with two slow devs and two fast devs: $ vgcreate vg /dev/slow1 /dev/slow2 /dev/fast1 /dev/fast2 user creates a new 8G main LV on /dev/slow1 and /dev/slow2 that uses all of /dev/fast1 and /dev/fast2 as a writecache: $ lvcreate --type writecache --cachedevice /dev/fast1 --cachedevice /dev/fast2 -n main -L 8G vg /dev/slow1 /dev/slow2 Example ------- A user has several slow devices and several fast devices in their VG, the slow devs have tag @slow, the fast devs have tag @fast. user creates a new 8G main LV on the slow devs with a 2G writecache on the fast devs: $ lvcreate --type writecache -n main -L 8G --cachedevice @fast --cachesize 2G vg @slow
2020-04-10 13:17:37 -05:00
unsigned ignore_type : 1;
unsigned is_metadata : 1; /* created LV will be used as metadata LV (and can be zeroed) */
const char *vg_name; /* only-used when VG is not yet opened (in /tools) */
const char *lv_name; /* all */
const char *origin_name; /* snap */
const char *pool_name; /* thin */
2015-03-05 14:00:44 -06:00
const char *lock_args;
uint32_t stripes; /* striped/RAID */
uint32_t stripe_size; /* striped/RAID */
uint32_t chunk_size; /* snapshot */
uint32_t region_size; /* mirror/RAID */
unsigned stripes_supplied; /* striped/RAID */
unsigned stripe_size_supplied; /* striped/RAID */
uint32_t mirrors; /* mirror/RAID */
uint32_t min_recovery_rate; /* RAID */
uint32_t max_recovery_rate; /* RAID */
cache_metadata_format_t cache_metadata_format; /* cache */
cache_mode_t cache_mode; /* cache */
const char *policy_name; /* cache */
struct dm_config_tree *policy_settings; /* cache */
const struct segment_type *segtype; /* all */
unsigned target_attr; /* all */
/* size */
uint32_t extents; /* all */
uint32_t pool_metadata_extents; /* pools */
uint64_t pool_metadata_size; /* pools */
uint32_t pool_data_extents; /* pools */
uint64_t pool_data_size; /* pools */
uint32_t virtual_extents; /* snapshots, thins */
struct dm_list *pvh; /* all */
uint64_t permission; /* all */
unsigned error_when_full; /* when segment supports it */
thin: improve 16g support for thin pool metadata Initial support for thin-pool used slightly smaller max size 15.81GiB for thin-pool metadata. However the real limit later settled at 15.88GiB (difference is ~64MiB - 16448 4K blocks). lvm2 could not simply increase the size as it has been using hard cropping of the loaded metadata device to avoid warnings printing warning of kernel when the size was bigger (i.e. due to bigger extent_size). This patch adds the new lvm.conf configurable setting: allocation/thin_pool_crop_metadata which defaults to 0 -> no crop of metadata beyond 15.81GiB. Only user with these sizes of metadata will be affected. Without cropping lvm2 now limits metadata allocation size to 15.88GiB. Any space beyond is currently not used by thin-pool target. Even if i.e. bigger LV is used for metadata via lvconvert, or allocated bigger because of to large extent size. With cropping enabled (=1) lvm2 preserves the old limitation 15.81GiB and should allow to work in the evironement with older lvm2 tools (i.e. older distribution). Thin-pool metadata with size bigger then 15.81G is now using CROP_METADATA flag within lvm2 metadata, so older lvm2 recognizes an incompatible thin-pool and cannot activate such pool! Users should use uncropped version as it is not suffering from various issues between thin_repair results and allocated metadata LV as thin_repair limit is 15.88GiB Users should use cropping only when really needed! Patch also better handles resize of thin-pool metadata and prevents resize beoyond usable size 15.88GiB. Resize beyond 15.81GiB automatically switches pool to no-crop version. Even with existing bigger thin-pool metadata command 'lvextend -l+1 vg/pool_tmeta' does the change. Patch gives better controls 'coverted' metadata LV and reports less confusing message during conversion. Patch set also moves the code for updating min/max into pool_manip.c for better sharing with cache_pool code.
2021-01-12 17:59:29 +01:00
thin_crop_metadata_t crop_metadata;
uint32_t read_ahead; /* all */
int approx_alloc; /* all */
alloc_policy_t alloc; /* all */
struct dm_vdo_target_params vdo_params; /* vdo */
uint64_t vdo_pool_header_size; /* VDO */
Allow dm-integrity to be used for raid images dm-integrity stores checksums of the data written to an LV, and returns an error if data read from the LV does not match the previously saved checksum. When used on raid images, dm-raid will correct the error by reading the block from another image, and the device user sees no error. The integrity metadata (checksums) are stored on an internal LV allocated by lvm for each linear image. The internal LV is allocated on the same PV as the image. Create a raid LV with an integrity layer over each raid image (for raid levels 1,4,5,6,10): lvcreate --type raidN --raidintegrity y [options] Add an integrity layer to images of an existing raid LV: lvconvert --raidintegrity y LV Remove the integrity layer from images of a raid LV: lvconvert --raidintegrity n LV Settings Use --raidintegritymode journal|bitmap (journal is default) to configure the method used by dm-integrity to ensure crash consistency. Initialization When integrity is added to an LV, the kernel needs to initialize the integrity metadata/checksums for all blocks in the LV. The data corruption checking performed by dm-integrity will only operate on areas of the LV that are already initialized. The progress of integrity initialization is reported by the "syncpercent" LV reporting field (and under the Cpy%Sync lvs column.) Example: create a raid1 LV with integrity: $ lvcreate --type raid1 -m1 --raidintegrity y -n rr -L1G foo Creating integrity metadata LV rr_rimage_0_imeta with size 12.00 MiB. Logical volume "rr_rimage_0_imeta" created. Creating integrity metadata LV rr_rimage_1_imeta with size 12.00 MiB. Logical volume "rr_rimage_1_imeta" created. Logical volume "rr" created. $ lvs -a foo LV VG Attr LSize Origin Cpy%Sync rr foo rwi-a-r--- 1.00g 4.93 [rr_rimage_0] foo gwi-aor--- 1.00g [rr_rimage_0_iorig] 41.02 [rr_rimage_0_imeta] foo ewi-ao---- 12.00m [rr_rimage_0_iorig] foo -wi-ao---- 1.00g [rr_rimage_1] foo gwi-aor--- 1.00g [rr_rimage_1_iorig] 39.45 [rr_rimage_1_imeta] foo ewi-ao---- 12.00m [rr_rimage_1_iorig] foo -wi-ao---- 1.00g [rr_rmeta_0] foo ewi-aor--- 4.00m [rr_rmeta_1] foo ewi-aor--- 4.00m
2019-11-20 16:07:27 -06:00
int raidintegrity;
const char *raidintegritymode;
struct integrity_settings integrity_settings;
struct dm_list tags; /* all */
int yes;
force_t force;
};
struct logical_volume *lv_create_single(struct volume_group *vg,
struct lvcreate_params *lp);
activation: add support for skipping activation of selected LVs Also add -k/--setactivationskip y/n and -K/--ignoreactivationskip options to lvcreate. The --setactivationskip y sets the flag in metadata for an LV to skip the LV during activation. Also, the newly created LV is not activated. Thin snapsots have this flag set automatically if not specified directly by the --setactivationskip y/n option. The --ignoreactivationskip overrides the activation skip flag set in metadata for an LV (just for the run of the command - the flag is not changed in metadata!) A few examples for the lvcreate with the new options: (non-thin snap LV => skip flag not set in MDA + LV activated) raw/~ $ lvcreate -l1 vg Logical volume "lvol0" created raw/~ $ lvs -o lv_name,attr vg/lvol0 LV Attr lvol0 -wi-a---- (non-thin snap LV + -ky => skip flag set in MDA + LV not activated) raw/~ $ lvcreate -l1 -ky vg Logical volume "lvol1" created raw/~ $ lvs -o lv_name,attr vg/lvol1 LV Attr lvol1 -wi------ (non-thin snap LV + -ky + -K => skip flag set in MDA + LV activated) raw/~ $ lvcreate -l1 -ky -K vg Logical volume "lvol2" created raw/~ $ lvs -o lv_name,attr vg/lvol2 LV Attr lvol2 -wi-a---- (thin snap LV => skip flag set in MDA (default behaviour) + LV not activated) raw/~ $ lvcreate -L100M -T vg/pool -V 1T -n thin_lv Logical volume "thin_lv" created raw/~ $ lvcreate -s vg/thin_lv -n thin_snap Logical volume "thin_snap" created raw/~ $ lvs -o name,attr vg LV Attr pool twi-a-tz- thin_lv Vwi-a-tz- thin_snap Vwi---tz- (thin snap LV + -K => skip flag set in MDA (default behaviour) + LV activated) raw/~ $ lvcreate -s vg/thin_lv -n thin_snap -K Logical volume "thin_snap" created raw/~ $ lvs -o name,attr vg/thin_lv LV Attr thin_lv Vwi-a-tz- (thins snap LV + -kn => no skip flag in MDA (default behaviour overridden) + LV activated) [0] raw/~ # lvcreate -s vg/thin_lv -n thin_snap -kn Logical volume "thin_snap" created [0] raw/~ # lvs -o name,attr vg/thin_snap LV Attr thin_snap Vwi-a-tz-
2013-07-10 14:06:50 +02:00
/*
* The activation can be skipped for selected LVs. Some LVs are skipped
* by default (e.g. thin snapshots), others can be skipped on demand by
* overriding the default behaviour. The flag that causes the activation
* skip on next activations is stored directly in metadata for each LV
* as ACTIVATION_SKIP flag.
*/
void lv_set_activation_skip(struct logical_volume *lv, int override_default, int add_skip);
activation: add support for skipping activation of selected LVs Also add -k/--setactivationskip y/n and -K/--ignoreactivationskip options to lvcreate. The --setactivationskip y sets the flag in metadata for an LV to skip the LV during activation. Also, the newly created LV is not activated. Thin snapsots have this flag set automatically if not specified directly by the --setactivationskip y/n option. The --ignoreactivationskip overrides the activation skip flag set in metadata for an LV (just for the run of the command - the flag is not changed in metadata!) A few examples for the lvcreate with the new options: (non-thin snap LV => skip flag not set in MDA + LV activated) raw/~ $ lvcreate -l1 vg Logical volume "lvol0" created raw/~ $ lvs -o lv_name,attr vg/lvol0 LV Attr lvol0 -wi-a---- (non-thin snap LV + -ky => skip flag set in MDA + LV not activated) raw/~ $ lvcreate -l1 -ky vg Logical volume "lvol1" created raw/~ $ lvs -o lv_name,attr vg/lvol1 LV Attr lvol1 -wi------ (non-thin snap LV + -ky + -K => skip flag set in MDA + LV activated) raw/~ $ lvcreate -l1 -ky -K vg Logical volume "lvol2" created raw/~ $ lvs -o lv_name,attr vg/lvol2 LV Attr lvol2 -wi-a---- (thin snap LV => skip flag set in MDA (default behaviour) + LV not activated) raw/~ $ lvcreate -L100M -T vg/pool -V 1T -n thin_lv Logical volume "thin_lv" created raw/~ $ lvcreate -s vg/thin_lv -n thin_snap Logical volume "thin_snap" created raw/~ $ lvs -o name,attr vg LV Attr pool twi-a-tz- thin_lv Vwi-a-tz- thin_snap Vwi---tz- (thin snap LV + -K => skip flag set in MDA (default behaviour) + LV activated) raw/~ $ lvcreate -s vg/thin_lv -n thin_snap -K Logical volume "thin_snap" created raw/~ $ lvs -o name,attr vg/thin_lv LV Attr thin_lv Vwi-a-tz- (thins snap LV + -kn => no skip flag in MDA (default behaviour overridden) + LV activated) [0] raw/~ # lvcreate -s vg/thin_lv -n thin_snap -kn Logical volume "thin_snap" created [0] raw/~ # lvs -o name,attr vg/thin_snap LV Attr thin_snap Vwi-a-tz-
2013-07-10 14:06:50 +02:00
int lv_activation_skip(struct logical_volume *lv, activation_change_t activate,
int override_lv_skip_flag);
activation: add support for skipping activation of selected LVs Also add -k/--setactivationskip y/n and -K/--ignoreactivationskip options to lvcreate. The --setactivationskip y sets the flag in metadata for an LV to skip the LV during activation. Also, the newly created LV is not activated. Thin snapsots have this flag set automatically if not specified directly by the --setactivationskip y/n option. The --ignoreactivationskip overrides the activation skip flag set in metadata for an LV (just for the run of the command - the flag is not changed in metadata!) A few examples for the lvcreate with the new options: (non-thin snap LV => skip flag not set in MDA + LV activated) raw/~ $ lvcreate -l1 vg Logical volume "lvol0" created raw/~ $ lvs -o lv_name,attr vg/lvol0 LV Attr lvol0 -wi-a---- (non-thin snap LV + -ky => skip flag set in MDA + LV not activated) raw/~ $ lvcreate -l1 -ky vg Logical volume "lvol1" created raw/~ $ lvs -o lv_name,attr vg/lvol1 LV Attr lvol1 -wi------ (non-thin snap LV + -ky + -K => skip flag set in MDA + LV activated) raw/~ $ lvcreate -l1 -ky -K vg Logical volume "lvol2" created raw/~ $ lvs -o lv_name,attr vg/lvol2 LV Attr lvol2 -wi-a---- (thin snap LV => skip flag set in MDA (default behaviour) + LV not activated) raw/~ $ lvcreate -L100M -T vg/pool -V 1T -n thin_lv Logical volume "thin_lv" created raw/~ $ lvcreate -s vg/thin_lv -n thin_snap Logical volume "thin_snap" created raw/~ $ lvs -o name,attr vg LV Attr pool twi-a-tz- thin_lv Vwi-a-tz- thin_snap Vwi---tz- (thin snap LV + -K => skip flag set in MDA (default behaviour) + LV activated) raw/~ $ lvcreate -s vg/thin_lv -n thin_snap -K Logical volume "thin_snap" created raw/~ $ lvs -o name,attr vg/thin_lv LV Attr thin_lv Vwi-a-tz- (thins snap LV + -kn => no skip flag in MDA (default behaviour overridden) + LV activated) [0] raw/~ # lvcreate -s vg/thin_lv -n thin_snap -kn Logical volume "thin_snap" created [0] raw/~ # lvs -o name,attr vg/thin_snap LV Attr thin_snap Vwi-a-tz-
2013-07-10 14:06:50 +02:00
/*
* Functions for layer manipulation
*/
int insert_layer_for_segments_on_pv(struct cmd_context *cmd,
struct logical_volume *lv_where,
struct logical_volume *layer_lv,
uint64_t status,
struct pv_list *pvl,
struct dm_list *lvs_changed);
int remove_layers_for_segments(struct cmd_context *cmd,
struct logical_volume *lv,
struct logical_volume *layer_lv,
uint64_t status_mask, struct dm_list *lvs_changed);
int remove_layers_for_segments_all(struct cmd_context *cmd,
struct logical_volume *layer_lv,
uint64_t status_mask,
struct dm_list *lvs_changed);
int split_parent_segments_for_layer(struct cmd_context *cmd,
struct logical_volume *layer_lv);
2007-12-20 18:55:46 +00:00
int remove_layer_from_lv(struct logical_volume *lv,
struct logical_volume *layer_lv);
struct logical_volume *insert_layer_for_lv(struct cmd_context *cmd,
struct logical_volume *lv_where,
uint64_t status,
const char *layer_suffix);
/* Find a PV within a given VG */
2008-03-13 22:51:24 +00:00
struct pv_list *find_pv_in_vg(const struct volume_group *vg,
const char *pv_name);
struct pv_list *find_pv_in_vg_by_uuid(const struct volume_group *vg,
const struct id *id);
/* Find an LV within a given VG */
2008-03-13 22:51:24 +00:00
struct lv_list *find_lv_in_vg(const struct volume_group *vg,
const char *lv_name);
/* FIXME Merge these functions with ones above */
2008-03-13 22:51:24 +00:00
struct logical_volume *find_lv(const struct volume_group *vg,
const char *lv_name);
struct generic_logical_volume *find_historical_glv(const struct volume_group *vg,
const char *historical_lv_name,
2016-03-01 15:26:57 +01:00
int check_removed_list,
struct glv_list **glvl_found);
int lv_name_is_used_in_vg(const struct volume_group *vg, const char *name, int *historical);
RAID: Add writemostly/writebehind support for RAID1 'lvchange' is used to alter a RAID 1 logical volume's write-mostly and write-behind characteristics. The '--writemostly' parameter takes a PV as an argument with an optional trailing character to specify whether to set ('y'), unset ('n'), or toggle ('t') the value. If no trailing character is given, it will set the flag. Synopsis: lvchange [--writemostly <PV>:{t|y|n}] [--writebehind <count>] vg/lv Example: lvchange --writemostly /dev/sdb1:y --writebehind 512 vg/raid1_lv The last character in the 'lv_attr' field is used to show whether a device has the WriteMostly flag set. It is signified with a 'w'. If the device has failed, the 'p'artial flag has priority. Example ("nosync" raid1 with mismatch_cnt and writemostly): [~]# lvs -a --segment vg LV VG Attr #Str Type SSize raid1 vg Rwi---r-m 2 raid1 500.00m [raid1_rimage_0] vg Iwi---r-- 1 linear 500.00m [raid1_rimage_1] vg Iwi---r-w 1 linear 500.00m [raid1_rmeta_0] vg ewi---r-- 1 linear 4.00m [raid1_rmeta_1] vg ewi---r-- 1 linear 4.00m Example (raid1 with mismatch_cnt, writemostly - but failed drive): [~]# lvs -a --segment vg LV VG Attr #Str Type SSize raid1 vg rwi---r-p 2 raid1 500.00m [raid1_rimage_0] vg Iwi---r-- 1 linear 500.00m [raid1_rimage_1] vg Iwi---r-p 1 linear 500.00m [raid1_rmeta_0] vg ewi---r-- 1 linear 4.00m [raid1_rmeta_1] vg ewi---r-p 1 linear 4.00m A new reportable field has been added for writebehind as well. If write-behind has not been set or the LV is not RAID1, the field will be blank. Example (writebehind is set): [~]# lvs -a -o name,attr,writebehind vg LV Attr WBehind lv rwi-a-r-- 512 [lv_rimage_0] iwi-aor-w [lv_rimage_1] iwi-aor-- [lv_rmeta_0] ewi-aor-- [lv_rmeta_1] ewi-aor-- Example (writebehind is not set): [~]# lvs -a -o name,attr,writebehind vg LV Attr WBehind lv rwi-a-r-- [lv_rimage_0] iwi-aor-w [lv_rimage_1] iwi-aor-- [lv_rmeta_0] ewi-aor-- [lv_rmeta_1] ewi-aor--
2013-04-15 13:59:46 -05:00
int lv_is_on_pv(struct logical_volume *lv, struct physical_volume *pv);
int lv_is_on_pvs(struct logical_volume *lv, struct dm_list *pvs);
int get_pv_list_for_lv(struct dm_pool *mem,
struct logical_volume *lv, struct dm_list *pvs);
RAID: Add writemostly/writebehind support for RAID1 'lvchange' is used to alter a RAID 1 logical volume's write-mostly and write-behind characteristics. The '--writemostly' parameter takes a PV as an argument with an optional trailing character to specify whether to set ('y'), unset ('n'), or toggle ('t') the value. If no trailing character is given, it will set the flag. Synopsis: lvchange [--writemostly <PV>:{t|y|n}] [--writebehind <count>] vg/lv Example: lvchange --writemostly /dev/sdb1:y --writebehind 512 vg/raid1_lv The last character in the 'lv_attr' field is used to show whether a device has the WriteMostly flag set. It is signified with a 'w'. If the device has failed, the 'p'artial flag has priority. Example ("nosync" raid1 with mismatch_cnt and writemostly): [~]# lvs -a --segment vg LV VG Attr #Str Type SSize raid1 vg Rwi---r-m 2 raid1 500.00m [raid1_rimage_0] vg Iwi---r-- 1 linear 500.00m [raid1_rimage_1] vg Iwi---r-w 1 linear 500.00m [raid1_rmeta_0] vg ewi---r-- 1 linear 4.00m [raid1_rmeta_1] vg ewi---r-- 1 linear 4.00m Example (raid1 with mismatch_cnt, writemostly - but failed drive): [~]# lvs -a --segment vg LV VG Attr #Str Type SSize raid1 vg rwi---r-p 2 raid1 500.00m [raid1_rimage_0] vg Iwi---r-- 1 linear 500.00m [raid1_rimage_1] vg Iwi---r-p 1 linear 500.00m [raid1_rmeta_0] vg ewi---r-- 1 linear 4.00m [raid1_rmeta_1] vg ewi---r-p 1 linear 4.00m A new reportable field has been added for writebehind as well. If write-behind has not been set or the LV is not RAID1, the field will be blank. Example (writebehind is set): [~]# lvs -a -o name,attr,writebehind vg LV Attr WBehind lv rwi-a-r-- 512 [lv_rimage_0] iwi-aor-w [lv_rimage_1] iwi-aor-- [lv_rmeta_0] ewi-aor-- [lv_rmeta_1] ewi-aor-- Example (writebehind is not set): [~]# lvs -a -o name,attr,writebehind vg LV Attr WBehind lv rwi-a-r-- [lv_rimage_0] iwi-aor-w [lv_rimage_1] iwi-aor-- [lv_rmeta_0] ewi-aor-- [lv_rmeta_1] ewi-aor--
2013-04-15 13:59:46 -05:00
/* Find LV segment containing given LE */
2007-12-20 18:55:46 +00:00
struct lv_segment *first_seg(const struct logical_volume *lv);
struct lv_segment *last_seg(const struct logical_volume *lv);
struct lv_segment *get_only_segment_using_this_lv(const struct logical_volume *lv);
/*
* Useful functions for managing snapshots.
*/
int lv_is_origin(const struct logical_volume *lv);
commands: new method for defining commands . Define a prototype for every lvm command. . Match every user command with one definition. . Generate help text and man pages from them. The new file command-lines.in defines a prototype for every unique lvm command. A unique lvm command is a unique combination of: command name + required option args + required positional args. Each of these prototypes also includes the optional option args and optional positional args that the command will accept, a description, and a unique string ID for the definition. Any valid command will match one of the prototypes. Here's an example of the lvresize command definitions from command-lines.in, there are three unique lvresize commands: lvresize --size SizeMB LV OO: --alloc Alloc, --autobackup Bool, --force, --nofsck, --nosync, --noudevsync, --reportformat String, --resizefs, --stripes Number, --stripesize SizeKB, --poolmetadatasize SizeMB OP: PV ... ID: lvresize_by_size DESC: Resize an LV by a specified size. lvresize LV PV ... OO: --alloc Alloc, --autobackup Bool, --force, --nofsck, --nosync, --noudevsync, --reportformat String, --resizefs, --stripes Number, --stripesize SizeKB ID: lvresize_by_pv DESC: Resize an LV by specified PV extents. FLAGS: SECONDARY_SYNTAX lvresize --poolmetadatasize SizeMB LV_thinpool OO: --alloc Alloc, --autobackup Bool, --force, --nofsck, --nosync, --noudevsync, --reportformat String, --stripes Number, --stripesize SizeKB OP: PV ... ID: lvresize_pool_metadata_by_size DESC: Resize a pool metadata SubLV by a specified size. The three commands have separate definitions because they have different required parameters. Required parameters are specified on the first line of the definition. Optional options are listed after OO, and optional positional args are listed after OP. This data is used to generate corresponding command definition structures for lvm in command-lines.h. usage/help output is also auto generated, so it is always in sync with the definitions. Every user-entered command is compared against the set of command structures, and matched with one. An error is reported if an entered command does not have the required parameters for any definition. The closest match is printed as a suggestion, and running lvresize --help will display the usage for each possible lvresize command. The prototype syntax used for help/man output includes required --option and positional args on the first line, and optional --option and positional args enclosed in [ ] on subsequent lines. command_name <required_opt_args> <required_pos_args> [ <optional_opt_args> ] [ <optional_pos_args> ] Command definitions that are not to be advertised/suggested have the flag SECONDARY_SYNTAX. These commands will not be printed in the normal help output. Man page prototypes are also generated from the same original command definitions, and are always in sync with the code and help text. Very early in command execution, a matching command definition is found. lvm then knows the operation being done, and that the provided args conform to the definition. This will allow lots of ad hoc checking/validation to be removed throughout the code. Each command definition can also be routed to a specific function to implement it. The function is associated with an enum value for the command definition (generated from the ID string.) These per-command-definition implementation functions have not yet been created, so all commands currently fall back to the existing per-command-name implementation functions. Using per-command-definition functions will allow lots of code to be removed which tries to figure out what the command is meant to do. This is currently based on ad hoc and complicated option analysis. When using the new functions, what the command is doing is already known from the associated command definition.
2016-08-12 15:52:18 -05:00
#define lv_is_thick_origin lv_is_origin
int lv_is_thin_origin(const struct logical_volume *lv, unsigned *snap_count);
commands: new method for defining commands . Define a prototype for every lvm command. . Match every user command with one definition. . Generate help text and man pages from them. The new file command-lines.in defines a prototype for every unique lvm command. A unique lvm command is a unique combination of: command name + required option args + required positional args. Each of these prototypes also includes the optional option args and optional positional args that the command will accept, a description, and a unique string ID for the definition. Any valid command will match one of the prototypes. Here's an example of the lvresize command definitions from command-lines.in, there are three unique lvresize commands: lvresize --size SizeMB LV OO: --alloc Alloc, --autobackup Bool, --force, --nofsck, --nosync, --noudevsync, --reportformat String, --resizefs, --stripes Number, --stripesize SizeKB, --poolmetadatasize SizeMB OP: PV ... ID: lvresize_by_size DESC: Resize an LV by a specified size. lvresize LV PV ... OO: --alloc Alloc, --autobackup Bool, --force, --nofsck, --nosync, --noudevsync, --reportformat String, --resizefs, --stripes Number, --stripesize SizeKB ID: lvresize_by_pv DESC: Resize an LV by specified PV extents. FLAGS: SECONDARY_SYNTAX lvresize --poolmetadatasize SizeMB LV_thinpool OO: --alloc Alloc, --autobackup Bool, --force, --nofsck, --nosync, --noudevsync, --reportformat String, --stripes Number, --stripesize SizeKB OP: PV ... ID: lvresize_pool_metadata_by_size DESC: Resize a pool metadata SubLV by a specified size. The three commands have separate definitions because they have different required parameters. Required parameters are specified on the first line of the definition. Optional options are listed after OO, and optional positional args are listed after OP. This data is used to generate corresponding command definition structures for lvm in command-lines.h. usage/help output is also auto generated, so it is always in sync with the definitions. Every user-entered command is compared against the set of command structures, and matched with one. An error is reported if an entered command does not have the required parameters for any definition. The closest match is printed as a suggestion, and running lvresize --help will display the usage for each possible lvresize command. The prototype syntax used for help/man output includes required --option and positional args on the first line, and optional --option and positional args enclosed in [ ] on subsequent lines. command_name <required_opt_args> <required_pos_args> [ <optional_opt_args> ] [ <optional_pos_args> ] Command definitions that are not to be advertised/suggested have the flag SECONDARY_SYNTAX. These commands will not be printed in the normal help output. Man page prototypes are also generated from the same original command definitions, and are always in sync with the code and help text. Very early in command execution, a matching command definition is found. lvm then knows the operation being done, and that the provided args conform to the definition. This will allow lots of ad hoc checking/validation to be removed throughout the code. Each command definition can also be routed to a specific function to implement it. The function is associated with an enum value for the command definition (generated from the ID string.) These per-command-definition implementation functions have not yet been created, so all commands currently fall back to the existing per-command-name implementation functions. Using per-command-definition functions will allow lots of code to be removed which tries to figure out what the command is meant to do. This is currently based on ad hoc and complicated option analysis. When using the new functions, what the command is doing is already known from the associated command definition.
2016-08-12 15:52:18 -05:00
int lv_is_thin_snapshot(const struct logical_volume *lv);
int lv_is_cow(const struct logical_volume *lv);
commands: new method for defining commands . Define a prototype for every lvm command. . Match every user command with one definition. . Generate help text and man pages from them. The new file command-lines.in defines a prototype for every unique lvm command. A unique lvm command is a unique combination of: command name + required option args + required positional args. Each of these prototypes also includes the optional option args and optional positional args that the command will accept, a description, and a unique string ID for the definition. Any valid command will match one of the prototypes. Here's an example of the lvresize command definitions from command-lines.in, there are three unique lvresize commands: lvresize --size SizeMB LV OO: --alloc Alloc, --autobackup Bool, --force, --nofsck, --nosync, --noudevsync, --reportformat String, --resizefs, --stripes Number, --stripesize SizeKB, --poolmetadatasize SizeMB OP: PV ... ID: lvresize_by_size DESC: Resize an LV by a specified size. lvresize LV PV ... OO: --alloc Alloc, --autobackup Bool, --force, --nofsck, --nosync, --noudevsync, --reportformat String, --resizefs, --stripes Number, --stripesize SizeKB ID: lvresize_by_pv DESC: Resize an LV by specified PV extents. FLAGS: SECONDARY_SYNTAX lvresize --poolmetadatasize SizeMB LV_thinpool OO: --alloc Alloc, --autobackup Bool, --force, --nofsck, --nosync, --noudevsync, --reportformat String, --stripes Number, --stripesize SizeKB OP: PV ... ID: lvresize_pool_metadata_by_size DESC: Resize a pool metadata SubLV by a specified size. The three commands have separate definitions because they have different required parameters. Required parameters are specified on the first line of the definition. Optional options are listed after OO, and optional positional args are listed after OP. This data is used to generate corresponding command definition structures for lvm in command-lines.h. usage/help output is also auto generated, so it is always in sync with the definitions. Every user-entered command is compared against the set of command structures, and matched with one. An error is reported if an entered command does not have the required parameters for any definition. The closest match is printed as a suggestion, and running lvresize --help will display the usage for each possible lvresize command. The prototype syntax used for help/man output includes required --option and positional args on the first line, and optional --option and positional args enclosed in [ ] on subsequent lines. command_name <required_opt_args> <required_pos_args> [ <optional_opt_args> ] [ <optional_pos_args> ] Command definitions that are not to be advertised/suggested have the flag SECONDARY_SYNTAX. These commands will not be printed in the normal help output. Man page prototypes are also generated from the same original command definitions, and are always in sync with the code and help text. Very early in command execution, a matching command definition is found. lvm then knows the operation being done, and that the provided args conform to the definition. This will allow lots of ad hoc checking/validation to be removed throughout the code. Each command definition can also be routed to a specific function to implement it. The function is associated with an enum value for the command definition (generated from the ID string.) These per-command-definition implementation functions have not yet been created, so all commands currently fall back to the existing per-command-name implementation functions. Using per-command-definition functions will allow lots of code to be removed which tries to figure out what the command is meant to do. This is currently based on ad hoc and complicated option analysis. When using the new functions, what the command is doing is already known from the associated command definition.
2016-08-12 15:52:18 -05:00
#define lv_is_thick_snapshot lv_is_cow
int lv_is_cache_origin(const struct logical_volume *lv);
int lv_is_writecache_origin(const struct logical_volume *lv);
int lv_is_writecache_cachevol(const struct logical_volume *lv);
int writecache_settings_to_str_list(struct writecache_settings *settings, struct dm_list *result, struct dm_pool *mem);
int lv_writecache_set_cleaner(struct logical_volume *lv);
bool lv_writecache_is_clean(struct cmd_context *cmd, struct logical_volume *lv, uint64_t *dirty_blocks);
bool writecache_cleaner_supported(struct cmd_context *cmd);
commands: new method for defining commands . Define a prototype for every lvm command. . Match every user command with one definition. . Generate help text and man pages from them. The new file command-lines.in defines a prototype for every unique lvm command. A unique lvm command is a unique combination of: command name + required option args + required positional args. Each of these prototypes also includes the optional option args and optional positional args that the command will accept, a description, and a unique string ID for the definition. Any valid command will match one of the prototypes. Here's an example of the lvresize command definitions from command-lines.in, there are three unique lvresize commands: lvresize --size SizeMB LV OO: --alloc Alloc, --autobackup Bool, --force, --nofsck, --nosync, --noudevsync, --reportformat String, --resizefs, --stripes Number, --stripesize SizeKB, --poolmetadatasize SizeMB OP: PV ... ID: lvresize_by_size DESC: Resize an LV by a specified size. lvresize LV PV ... OO: --alloc Alloc, --autobackup Bool, --force, --nofsck, --nosync, --noudevsync, --reportformat String, --resizefs, --stripes Number, --stripesize SizeKB ID: lvresize_by_pv DESC: Resize an LV by specified PV extents. FLAGS: SECONDARY_SYNTAX lvresize --poolmetadatasize SizeMB LV_thinpool OO: --alloc Alloc, --autobackup Bool, --force, --nofsck, --nosync, --noudevsync, --reportformat String, --stripes Number, --stripesize SizeKB OP: PV ... ID: lvresize_pool_metadata_by_size DESC: Resize a pool metadata SubLV by a specified size. The three commands have separate definitions because they have different required parameters. Required parameters are specified on the first line of the definition. Optional options are listed after OO, and optional positional args are listed after OP. This data is used to generate corresponding command definition structures for lvm in command-lines.h. usage/help output is also auto generated, so it is always in sync with the definitions. Every user-entered command is compared against the set of command structures, and matched with one. An error is reported if an entered command does not have the required parameters for any definition. The closest match is printed as a suggestion, and running lvresize --help will display the usage for each possible lvresize command. The prototype syntax used for help/man output includes required --option and positional args on the first line, and optional --option and positional args enclosed in [ ] on subsequent lines. command_name <required_opt_args> <required_pos_args> [ <optional_opt_args> ] [ <optional_pos_args> ] Command definitions that are not to be advertised/suggested have the flag SECONDARY_SYNTAX. These commands will not be printed in the normal help output. Man page prototypes are also generated from the same original command definitions, and are always in sync with the code and help text. Very early in command execution, a matching command definition is found. lvm then knows the operation being done, and that the provided args conform to the definition. This will allow lots of ad hoc checking/validation to be removed throughout the code. Each command definition can also be routed to a specific function to implement it. The function is associated with an enum value for the command definition (generated from the ID string.) These per-command-definition implementation functions have not yet been created, so all commands currently fall back to the existing per-command-name implementation functions. Using per-command-definition functions will allow lots of code to be removed which tries to figure out what the command is meant to do. This is currently based on ad hoc and complicated option analysis. When using the new functions, what the command is doing is already known from the associated command definition.
2016-08-12 15:52:18 -05:00
Allow dm-integrity to be used for raid images dm-integrity stores checksums of the data written to an LV, and returns an error if data read from the LV does not match the previously saved checksum. When used on raid images, dm-raid will correct the error by reading the block from another image, and the device user sees no error. The integrity metadata (checksums) are stored on an internal LV allocated by lvm for each linear image. The internal LV is allocated on the same PV as the image. Create a raid LV with an integrity layer over each raid image (for raid levels 1,4,5,6,10): lvcreate --type raidN --raidintegrity y [options] Add an integrity layer to images of an existing raid LV: lvconvert --raidintegrity y LV Remove the integrity layer from images of a raid LV: lvconvert --raidintegrity n LV Settings Use --raidintegritymode journal|bitmap (journal is default) to configure the method used by dm-integrity to ensure crash consistency. Initialization When integrity is added to an LV, the kernel needs to initialize the integrity metadata/checksums for all blocks in the LV. The data corruption checking performed by dm-integrity will only operate on areas of the LV that are already initialized. The progress of integrity initialization is reported by the "syncpercent" LV reporting field (and under the Cpy%Sync lvs column.) Example: create a raid1 LV with integrity: $ lvcreate --type raid1 -m1 --raidintegrity y -n rr -L1G foo Creating integrity metadata LV rr_rimage_0_imeta with size 12.00 MiB. Logical volume "rr_rimage_0_imeta" created. Creating integrity metadata LV rr_rimage_1_imeta with size 12.00 MiB. Logical volume "rr_rimage_1_imeta" created. Logical volume "rr" created. $ lvs -a foo LV VG Attr LSize Origin Cpy%Sync rr foo rwi-a-r--- 1.00g 4.93 [rr_rimage_0] foo gwi-aor--- 1.00g [rr_rimage_0_iorig] 41.02 [rr_rimage_0_imeta] foo ewi-ao---- 12.00m [rr_rimage_0_iorig] foo -wi-ao---- 1.00g [rr_rimage_1] foo gwi-aor--- 1.00g [rr_rimage_1_iorig] 39.45 [rr_rimage_1_imeta] foo ewi-ao---- 12.00m [rr_rimage_1_iorig] foo -wi-ao---- 1.00g [rr_rmeta_0] foo ewi-aor--- 4.00m [rr_rmeta_1] foo ewi-aor--- 4.00m
2019-11-20 16:07:27 -06:00
int lv_is_integrity_origin(const struct logical_volume *lv);
int lv_is_merging_cow(const struct logical_volume *cow);
uint32_t cow_max_extents(const struct logical_volume *origin, uint32_t chunk_size);
int cow_has_min_chunks(const struct volume_group *vg, uint32_t cow_extents, uint32_t chunk_size);
int lv_is_cow_covering_origin(const struct logical_volume *lv);
/* Test if given LV is visible from user's perspective */
int lv_is_visible(const struct logical_volume *lv);
int lv_is_historical(const struct logical_volume *lv);
int pv_is_in_vg(struct volume_group *vg, struct physical_volume *pv);
2013-11-29 15:51:28 +01:00
/* Given a cow or thin LV, return the snapshot lv_segment that uses it */
struct lv_segment *find_snapshot(const struct logical_volume *lv);
/* Given a cow LV, return its origin */
struct logical_volume *origin_from_cow(const struct logical_volume *lv);
/* Given an internal snapshot LV, return its cow */
struct logical_volume *find_cow(const struct logical_volume *snap);
void init_snapshot_seg(struct lv_segment *seg, struct logical_volume *origin,
struct logical_volume *cow, uint32_t chunk_size, int merge);
void init_snapshot_merge(struct lv_segment *snap_seg, struct logical_volume *origin);
void clear_snapshot_merge(struct logical_volume *origin);
int vg_add_snapshot(struct logical_volume *origin, struct logical_volume *cow,
union lvid *lvid, uint32_t extent_count,
uint32_t chunk_size);
int vg_remove_snapshot(struct logical_volume *cow);
int validate_snapshot_origin(const struct logical_volume *origin_lv);
int vg_check_status(const struct volume_group *vg, uint64_t status);
int vg_check_pv_dev_block_sizes(const struct volume_group *vg);
/*
* Check if the VG reached maximal LVs count (if set)
*/
int vg_max_lv_reached(struct volume_group *vg);
/*
* Mirroring functions
*/
uint32_t get_default_region_size(struct cmd_context *cmd); /* in lv_manip.c */
struct lv_segment *find_mirror_seg(struct lv_segment *seg);
int lv_add_mirrors(struct cmd_context *cmd, struct logical_volume *lv,
uint32_t mirrors, uint32_t stripes, uint32_t stripe_size,
uint32_t region_size, uint32_t log_count,
struct dm_list *pvs, alloc_policy_t alloc, uint32_t flags);
int lv_split_mirror_images(struct logical_volume *lv, const char *split_lv_name,
uint32_t split_count, struct dm_list *removable_pvs);
int lv_remove_mirrors(struct cmd_context *cmd, struct logical_volume *lv,
uint32_t mirrors, uint32_t log_count,
int (*is_removable)(struct logical_volume *, void *),
void *removable_baton, uint64_t status_mask);
const char *get_mirror_log_name(int log_count);
2014-11-08 01:28:38 +01:00
int set_mirror_log_count(int *log_count, const char *mirrorlog);
int cluster_mirror_is_available(struct cmd_context *cmd);
2007-12-20 18:55:46 +00:00
int is_temporary_mirror_layer(const struct logical_volume *lv);
2008-01-16 19:13:51 +00:00
struct logical_volume * find_temporary_mirror(const struct logical_volume *lv);
2007-12-20 18:55:46 +00:00
uint32_t lv_mirror_count(const struct logical_volume *lv);
uint32_t adjusted_mirror_region_size(struct cmd_context *cmd,
uint32_t extent_size, uint32_t extents,
uint32_t region_size, int internal, int clustered);
int remove_mirrors_from_segments(struct logical_volume *lv,
uint32_t new_mirrors, uint64_t status_mask);
int add_mirrors_to_segments(struct cmd_context *cmd, struct logical_volume *lv,
uint32_t mirrors, uint32_t region_size,
struct dm_list *allocatable_pvs, alloc_policy_t alloc);
2007-12-20 18:55:46 +00:00
int remove_mirror_images(struct logical_volume *lv, uint32_t num_mirrors,
int (*is_removable)(struct logical_volume *, void *),
void *removable_baton, unsigned remove_log);
int add_mirror_images(struct cmd_context *cmd, struct logical_volume *lv,
uint32_t mirrors, uint32_t stripes, uint32_t stripe_size, uint32_t region_size,
struct dm_list *allocatable_pvs, alloc_policy_t alloc,
uint32_t log_count);
struct logical_volume *detach_mirror_log(struct lv_segment *mirrored_seg);
int attach_mirror_log(struct lv_segment *seg, struct logical_volume *log_lv);
int remove_mirror_log(struct cmd_context *cmd, struct logical_volume *lv,
struct dm_list *removable_pvs, int force);
struct logical_volume *prepare_mirror_log(struct logical_volume *lv,
int in_sync, uint32_t region_size,
struct dm_list *allocatable_pvs,
alloc_policy_t alloc);
int add_mirror_log(struct cmd_context *cmd, struct logical_volume *lv,
uint32_t log_count, uint32_t region_size,
struct dm_list *allocatable_pvs, alloc_policy_t alloc);
#if 0
/* FIXME: reconfigure_mirror_images: remove this code? */
int reconfigure_mirror_images(struct lv_segment *mirrored_seg, uint32_t num_mirrors,
struct dm_list *removable_pvs, unsigned remove_log);
#endif
2007-12-20 18:55:46 +00:00
int collapse_mirrored_lv(struct logical_volume *lv);
int shift_mirror_images(struct lv_segment *mirrored_seg, unsigned mimage);
/* ++ metadata/raid_manip.c */
struct lv_status_raid {
struct dm_pool *mem;
struct dm_status_raid *raid;
dm_percent_t in_sync;
};
int lv_is_raid_with_tracking(const struct logical_volume *lv);
uint32_t lv_raid_image_count(const struct logical_volume *lv);
int lv_raid_change_image_count(struct logical_volume *lv,
int yes,
lvconvert: add infrastructure for RaidLV reshaping support In order to support striped raid5/6/10 LV reshaping (change of LV type, stripesize or number of legs), this patch introduces infrastructure prerequisites to be used by raid_manip.c extensions in followup patches. This base is needed for allocation of out-of-place reshape space required by the MD raid personalities to avoid writing over data in-place when reading off the current RAID layout or number of legs and writing out the new layout or to a different number of legs (i.e. restripe) Changes: - add members reshape_len to 'struct lv_segment' to store out-of-place reshape length per component rimage - add member data_copies to struct lv_segment to support more than 2 raid10 data copies - make alloc_lv_segment() aware of both reshape_len and data_copies - adjust all alloc_lv_segment() callers to the new API - add functions to retrieve the current data offset (needed for out-of-place reshaping space allocation) and the devices count from the kernel - make libdm deptree code aware of reshape_len - add LV flags for disk add/remove reshaping - support import/export of the new 'struct lv_segment' members - enhance lv_extend/_lv_reduce to cope with reshape_len - add seg_is_*/segtype_is_* macros related to reshaping - add target version check for reshaping - grow rebuilds/writemostly bitmaps to 246 bit to support kernel maximal - enhance libdm deptree code to support data_offset (out-of-place reshaping) and delta_disk (legs add/remove reshaping) target arguments Related: rhbz834579 Related: rhbz1191935 Related: rhbz1191978
2017-02-24 00:50:00 +01:00
uint32_t new_count,
uint32_t new_region_size,
lvconvert: add infrastructure for RaidLV reshaping support In order to support striped raid5/6/10 LV reshaping (change of LV type, stripesize or number of legs), this patch introduces infrastructure prerequisites to be used by raid_manip.c extensions in followup patches. This base is needed for allocation of out-of-place reshape space required by the MD raid personalities to avoid writing over data in-place when reading off the current RAID layout or number of legs and writing out the new layout or to a different number of legs (i.e. restripe) Changes: - add members reshape_len to 'struct lv_segment' to store out-of-place reshape length per component rimage - add member data_copies to struct lv_segment to support more than 2 raid10 data copies - make alloc_lv_segment() aware of both reshape_len and data_copies - adjust all alloc_lv_segment() callers to the new API - add functions to retrieve the current data offset (needed for out-of-place reshaping space allocation) and the devices count from the kernel - make libdm deptree code aware of reshape_len - add LV flags for disk add/remove reshaping - support import/export of the new 'struct lv_segment' members - enhance lv_extend/_lv_reduce to cope with reshape_len - add seg_is_*/segtype_is_* macros related to reshaping - add target version check for reshaping - grow rebuilds/writemostly bitmaps to 246 bit to support kernel maximal - enhance libdm deptree code to support data_offset (out-of-place reshaping) and delta_disk (legs add/remove reshaping) target arguments Related: rhbz834579 Related: rhbz1191935 Related: rhbz1191978
2017-02-24 00:50:00 +01:00
struct dm_list *allocate_pvs);
int lv_raid_split(struct logical_volume *lv, int yes, const char *split_name,
uint32_t new_count, struct dm_list *splittable_pvs);
Add the ability to split an image from the mirror and track changes. ~> lvconvert --splitmirrors 1 --trackchanges vg/lv The '--trackchanges' option allows a user the ability to use an image of a RAID1 array for the purposes of temporary read-only access. The image can be merged back into the array at a later time and only the blocks that have changed in the array since the split will be resync'ed. This operation can be thought of as a partial split. The image is never completely extracted from the array, in that the array reserves the position the device occupied and tracks the differences between the array and the split image via a bitmap. The image itself is rendered read-only and the name (<LV>_rimage_*) cannot be changed. The user can complete the split (permanently splitting the image from the array) by re-issuing the 'lvconvert' command without the '--trackchanges' argument and specifying the '--name' argument. ~> lvconvert --splitmirrors 1 --name my_split vg/lv Merging the tracked image back into the array is done with the '--merge' option (included in a follow-on patch). ~> lvconvert --merge vg/lv_rimage_<n> The internal mechanics of this are relatively simple. The 'raid' device- mapper target allows for the specification of an empty slot in an array via '- -'. This is what will be used if a partial activation of an array is ever required. (It would also be possible to use 'error' targets in place of the '- -'.) If a RAID image is found to be both read-only and visible, then it is considered separate from the array and '- -' is used to hold it's position in the array. So, all that needs to be done to temporarily split an image from the array /and/ cause the kernel target's bitmap to track (aka "mark") changes made is to make the specified image visible and read-only. To merge the device back into the array, the image needs to be returned to the read/write state of the top-level LV and made invisible.
2011-08-18 19:38:26 +00:00
int lv_raid_split_and_track(struct logical_volume *lv,
int yes,
Add the ability to split an image from the mirror and track changes. ~> lvconvert --splitmirrors 1 --trackchanges vg/lv The '--trackchanges' option allows a user the ability to use an image of a RAID1 array for the purposes of temporary read-only access. The image can be merged back into the array at a later time and only the blocks that have changed in the array since the split will be resync'ed. This operation can be thought of as a partial split. The image is never completely extracted from the array, in that the array reserves the position the device occupied and tracks the differences between the array and the split image via a bitmap. The image itself is rendered read-only and the name (<LV>_rimage_*) cannot be changed. The user can complete the split (permanently splitting the image from the array) by re-issuing the 'lvconvert' command without the '--trackchanges' argument and specifying the '--name' argument. ~> lvconvert --splitmirrors 1 --name my_split vg/lv Merging the tracked image back into the array is done with the '--merge' option (included in a follow-on patch). ~> lvconvert --merge vg/lv_rimage_<n> The internal mechanics of this are relatively simple. The 'raid' device- mapper target allows for the specification of an empty slot in an array via '- -'. This is what will be used if a partial activation of an array is ever required. (It would also be possible to use 'error' targets in place of the '- -'.) If a RAID image is found to be both read-only and visible, then it is considered separate from the array and '- -' is used to hold it's position in the array. So, all that needs to be done to temporarily split an image from the array /and/ cause the kernel target's bitmap to track (aka "mark") changes made is to make the specified image visible and read-only. To merge the device back into the array, the image needs to be returned to the read/write state of the top-level LV and made invisible.
2011-08-18 19:38:26 +00:00
struct dm_list *splittable_pvs);
int lv_raid_merge(struct logical_volume *image_lv);
int lv_raid_convert(struct logical_volume *lv,
const struct segment_type *new_segtype,
int yes, int force,
const unsigned new_stripes,
const unsigned new_stripe_size_supplied,
const unsigned new_stripe_size,
const uint32_t new_region_size,
struct dm_list *allocate_pvs);
int lv_raid_rebuild(struct logical_volume *lv, struct dm_list *rebuild_pvs);
int lv_raid_replace(struct logical_volume *lv, int force,
struct dm_list *remove_pvs, struct dm_list *allocate_pvs);
int lv_raid_remove_missing(struct logical_volume *lv);
int partial_raid_lv_supports_degraded_activation(const struct logical_volume *clv);
uint32_t raid_rmeta_extents_delta(struct cmd_context *cmd,
uint32_t rimage_extents_cur, uint32_t rimage_extents_new,
uint32_t region_size, uint32_t extent_size);
uint32_t raid_rimage_extents(const struct segment_type *segtype,
uint32_t extents, uint32_t stripes, uint32_t data_copies);
uint32_t raid_ensure_min_region_size(const struct logical_volume *lv, uint64_t raid_size, uint32_t region_size);
int lv_raid_change_region_size(struct logical_volume *lv,
int yes, int force, uint32_t new_region_size);
int lv_raid_in_sync(const struct logical_volume *lv);
lvconvert: add infrastructure for RaidLV reshaping support In order to support striped raid5/6/10 LV reshaping (change of LV type, stripesize or number of legs), this patch introduces infrastructure prerequisites to be used by raid_manip.c extensions in followup patches. This base is needed for allocation of out-of-place reshape space required by the MD raid personalities to avoid writing over data in-place when reading off the current RAID layout or number of legs and writing out the new layout or to a different number of legs (i.e. restripe) Changes: - add members reshape_len to 'struct lv_segment' to store out-of-place reshape length per component rimage - add member data_copies to struct lv_segment to support more than 2 raid10 data copies - make alloc_lv_segment() aware of both reshape_len and data_copies - adjust all alloc_lv_segment() callers to the new API - add functions to retrieve the current data offset (needed for out-of-place reshaping space allocation) and the devices count from the kernel - make libdm deptree code aware of reshape_len - add LV flags for disk add/remove reshaping - support import/export of the new 'struct lv_segment' members - enhance lv_extend/_lv_reduce to cope with reshape_len - add seg_is_*/segtype_is_* macros related to reshaping - add target version check for reshaping - grow rebuilds/writemostly bitmaps to 246 bit to support kernel maximal - enhance libdm deptree code to support data_offset (out-of-place reshaping) and delta_disk (legs add/remove reshaping) target arguments Related: rhbz834579 Related: rhbz1191935 Related: rhbz1191978
2017-02-24 00:50:00 +01:00
uint32_t lv_raid_data_copies(const struct segment_type *segtype, uint32_t area_count);
int lv_raid_free_reshape_space(const struct logical_volume *lv);
int lv_raid_clear_lv(struct logical_volume *lv, int commit);
int lv_raid_has_visible_sublvs(const struct logical_volume *lv);
/* -- metadata/raid_manip.c */
/* ++ metadata/cache_manip.c */
struct lv_status_cache {
struct dm_pool *mem;
struct dm_status_cache *cache;
dm_percent_t data_usage;
dm_percent_t metadata_usage;
dm_percent_t dirty_usage;
};
const char *cache_mode_num_to_str(cache_mode_t mode);
const char *display_cache_mode(const struct lv_segment *seg);
Allow dm-cache cache device to be standard LV If a single, standard LV is specified as the cache, use it directly instead of converting it into a cache-pool object with two separate LVs (for data and metadata). With a single LV as the cache, lvm will use blocks at the beginning for metadata, and the rest for data. Separate dm linear devices are set up to point at the metadata and data areas of the LV. These dm devs are given to the dm-cache target to use. The single LV cache cannot be resized without recreating it. If the --poolmetadata option is used to specify an LV for metadata, then a cache pool will be created (with separate LVs for data and metadata.) Usage: $ lvcreate -n main -L 128M vg /dev/loop0 $ lvcreate -n fast -L 64M vg /dev/loop1 $ lvs -a vg LV VG Attr LSize Type Devices main vg -wi-a----- 128.00m linear /dev/loop0(0) fast vg -wi-a----- 64.00m linear /dev/loop1(0) $ lvconvert --type cache --cachepool fast vg/main $ lvs -a vg LV VG Attr LSize Origin Pool Type Devices [fast] vg Cwi---C--- 64.00m linear /dev/loop1(0) main vg Cwi---C--- 128.00m [main_corig] [fast] cache main_corig(0) [main_corig] vg owi---C--- 128.00m linear /dev/loop0(0) $ lvchange -ay vg/main $ dmsetup ls vg-fast_cdata (253:4) vg-fast_cmeta (253:5) vg-main_corig (253:6) vg-main (253:24) vg-fast (253:3) $ dmsetup table vg-fast_cdata: 0 98304 linear 253:3 32768 vg-fast_cmeta: 0 32768 linear 253:3 0 vg-main_corig: 0 262144 linear 7:0 2048 vg-main: 0 262144 cache 253:5 253:4 253:6 128 2 metadata2 writethrough mq 0 vg-fast: 0 131072 linear 7:1 2048 $ lvchange -an vg/min $ lvconvert --splitcache vg/main $ lvs -a vg LV VG Attr LSize Type Devices fast vg -wi------- 64.00m linear /dev/loop1(0) main vg -wi------- 128.00m linear /dev/loop0(0)
2018-08-17 15:45:52 -05:00
const char *get_cache_mode_name(const struct lv_segment *seg);
int set_cache_mode(cache_mode_t *mode, const char *cache_mode);
int cache_set_cache_mode(struct lv_segment *seg, cache_mode_t mode);
int cache_set_metadata_format(struct lv_segment *seg, cache_metadata_format_t format);
int cache_set_policy(struct lv_segment *seg, const char *name,
const struct dm_config_tree *settings);
int cache_set_params(struct lv_segment *seg,
2017-03-09 15:54:30 +01:00
uint32_t chunk_size,
cache_metadata_format_t format,
cache_mode_t mode,
const char *policy_name,
2017-03-09 15:54:30 +01:00
const struct dm_config_tree *policy_settings);
int cache_vol_set_params(struct cmd_context *cmd,
Allow dm-cache cache device to be standard LV If a single, standard LV is specified as the cache, use it directly instead of converting it into a cache-pool object with two separate LVs (for data and metadata). With a single LV as the cache, lvm will use blocks at the beginning for metadata, and the rest for data. Separate dm linear devices are set up to point at the metadata and data areas of the LV. These dm devs are given to the dm-cache target to use. The single LV cache cannot be resized without recreating it. If the --poolmetadata option is used to specify an LV for metadata, then a cache pool will be created (with separate LVs for data and metadata.) Usage: $ lvcreate -n main -L 128M vg /dev/loop0 $ lvcreate -n fast -L 64M vg /dev/loop1 $ lvs -a vg LV VG Attr LSize Type Devices main vg -wi-a----- 128.00m linear /dev/loop0(0) fast vg -wi-a----- 64.00m linear /dev/loop1(0) $ lvconvert --type cache --cachepool fast vg/main $ lvs -a vg LV VG Attr LSize Origin Pool Type Devices [fast] vg Cwi---C--- 64.00m linear /dev/loop1(0) main vg Cwi---C--- 128.00m [main_corig] [fast] cache main_corig(0) [main_corig] vg owi---C--- 128.00m linear /dev/loop0(0) $ lvchange -ay vg/main $ dmsetup ls vg-fast_cdata (253:4) vg-fast_cmeta (253:5) vg-main_corig (253:6) vg-main (253:24) vg-fast (253:3) $ dmsetup table vg-fast_cdata: 0 98304 linear 253:3 32768 vg-fast_cmeta: 0 32768 linear 253:3 0 vg-main_corig: 0 262144 linear 7:0 2048 vg-main: 0 262144 cache 253:5 253:4 253:6 128 2 metadata2 writethrough mq 0 vg-fast: 0 131072 linear 7:1 2048 $ lvchange -an vg/min $ lvconvert --splitcache vg/main $ lvs -a vg LV VG Attr LSize Type Devices fast vg -wi------- 64.00m linear /dev/loop1(0) main vg -wi------- 128.00m linear /dev/loop0(0)
2018-08-17 15:45:52 -05:00
struct logical_volume *cache_lv,
struct logical_volume *pool_lv,
uint64_t poolmetadatasize,
uint32_t chunk_size,
cache_metadata_format_t format,
cache_mode_t mode,
const char *policy,
const struct dm_config_tree *settings);
void cache_check_for_warns(const struct lv_segment *seg);
int update_cache_pool_params(struct cmd_context *cmd,
struct profile *profile,
uint32_t extent_size,
const struct segment_type *segtype,
unsigned attr,
uint32_t pool_data_extents,
uint32_t *pool_metadata_extents,
struct logical_volume *metadata_lv,
int *chunk_size_calc_method, uint32_t *chunk_size);
int validate_lv_cache_chunk_size(struct logical_volume *pool_lv, uint32_t chunk_size);
int validate_lv_cache_create_pool(const struct logical_volume *pool_lv);
int validate_lv_cache_create_origin(const struct logical_volume *origin_lv);
struct logical_volume *lv_cache_create(struct logical_volume *pool_lv,
struct logical_volume *origin_lv);
int lv_cache_wait_for_clean(struct logical_volume *cache_lv, int *is_clean);
int lv_cache_remove(struct logical_volume *cache_lv);
int lv_detach_writecache_cachevol(struct logical_volume *cache_lv, int noflush);
int wipe_cache_pool(struct logical_volume *cache_pool_lv);
/* -- metadata/cache_manip.c */
/* ++ metadata/vdo_manip.c */
struct lv_status_vdo {
struct dm_pool *mem;
struct dm_vdo_status *vdo;
uint64_t data_blocks_used; /* grabbed from /sys/kvdo */
uint64_t logical_blocks_used; /* grabbed from /sys/kvdo */
dm_percent_t usage;
dm_percent_t saving;
dm_percent_t data_usage;
};
const char *get_vdo_compression_state_name(enum dm_vdo_compression_state state);
const char *get_vdo_index_state_name(enum dm_vdo_index_state state);
const char *get_vdo_operating_mode_name(enum dm_vdo_operating_mode mode);
const char *get_vdo_write_policy_name(enum dm_vdo_write_policy policy);
uint64_t get_vdo_pool_virtual_size(const struct lv_segment *vdo_pool_seg);
int update_vdo_pool_virtual_size(struct lv_segment *vdo_pool_seg);
int parse_vdo_pool_status(struct dm_pool *mem, const struct logical_volume *vdo_pool_lv,
const char *params, const struct dm_info *dminfo,
struct lv_status_vdo *status);
struct logical_volume *convert_vdo_pool_lv(struct logical_volume *data_lv,
const struct dm_vdo_target_params *vtp,
uint32_t *virtual_extents,
int format,
uint64_t vdo_pool_header_size);
int set_vdo_write_policy(enum dm_vdo_write_policy *vwp, const char *policy);
int fill_vdo_target_params(struct cmd_context *cmd,
struct dm_vdo_target_params *vtp,
uint64_t *vdo_pool_header_size,
struct profile *profile);
/* -- metadata/vdo_manip.c */
struct logical_volume *find_pvmove_lv(struct volume_group *vg,
struct device *dev, uint64_t lv_type);
const struct logical_volume *find_pvmove_lv_in_lv(const struct logical_volume *lv);
const char *get_pvmove_pvname_from_lv(const struct logical_volume *lv);
const char *get_pvmove_pvname_from_lv_mirr(const struct logical_volume *lv_mirr);
struct dm_list *lvs_using_lv(struct cmd_context *cmd, struct volume_group *vg,
struct logical_volume *lv);
uint32_t find_free_lvnum(struct logical_volume *lv);
dm_percent_t copy_percent(const struct logical_volume *lv);
char *generate_lv_name(struct volume_group *vg, const char *format,
char *buffer, size_t len);
char *top_level_lv_name(struct volume_group *vg, const char *lv_name);
struct generic_logical_volume *get_or_create_glv(struct dm_pool *mem, struct logical_volume *lv, int *glv_created);
struct glv_list *get_or_create_glvl(struct dm_pool *mem, struct logical_volume *lv, int *glv_created);
/*
* Begin skeleton for external LVM library
*/
int pv_change_metadataignore(struct physical_volume *pv, uint32_t mda_ignored);
int vg_flag_write_locked(struct volume_group *vg);
int vg_check_write_mode(struct volume_group *vg);
#define vg_is_clustered(vg) ((vg_status((vg)) & CLUSTERED) ? 1 : 0)
#define vg_is_exported(vg) ((vg_status((vg)) & EXPORTED_VG) ? 1 : 0)
#define vg_is_resizeable(vg) ((vg_status((vg)) & RESIZEABLE_VG) ? 1 : 0)
int lv_has_unknown_segments(const struct logical_volume *lv);
int vg_has_unknown_segments(const struct volume_group *vg);
int vg_mark_partial_lvs(struct volume_group *vg, int clear);
struct vgcreate_params {
const char *vg_name;
uint32_t extent_size;
size_t max_pv;
size_t max_lv;
alloc_policy_t alloc;
int clustered; /* FIXME: put this into a 'status' variable instead? */
uint32_t vgmetadatacopies;
const char *system_id;
2015-03-05 14:00:44 -06:00
const char *lock_type;
const char *lock_args;
};
int validate_major_minor(const struct cmd_context *cmd,
const struct format_type *fmt,
int32_t major, int32_t minor);
int vgcreate_params_validate(struct cmd_context *cmd,
struct vgcreate_params *vp);
int validate_vg_rename_params(struct cmd_context *cmd,
const char *vg_name_old,
const char *vg_name_new);
2015-03-05 14:00:44 -06:00
int is_lockd_type(const char *lock_type);
int vg_is_shared(const struct volume_group *vg);
2015-03-05 14:00:44 -06:00
int is_system_id_allowed(struct cmd_context *cmd, const char *system_id);
int vg_strip_outdated_historical_lvs(struct volume_group *vg);
int lv_on_pmem(struct logical_volume *lv);
int vg_is_foreign(struct volume_group *vg);
void vg_write_commit_bad_mdas(struct cmd_context *cmd, struct volume_group *vg);
2020-01-14 14:12:20 -06:00
struct dm_list *create_pv_list(struct dm_pool *mem, struct volume_group *vg, int argc,
char **argv, int allocatable_only);
struct dm_list *clone_pv_list(struct dm_pool *mem, struct dm_list *pvsl);
Allow dm-integrity to be used for raid images dm-integrity stores checksums of the data written to an LV, and returns an error if data read from the LV does not match the previously saved checksum. When used on raid images, dm-raid will correct the error by reading the block from another image, and the device user sees no error. The integrity metadata (checksums) are stored on an internal LV allocated by lvm for each linear image. The internal LV is allocated on the same PV as the image. Create a raid LV with an integrity layer over each raid image (for raid levels 1,4,5,6,10): lvcreate --type raidN --raidintegrity y [options] Add an integrity layer to images of an existing raid LV: lvconvert --raidintegrity y LV Remove the integrity layer from images of a raid LV: lvconvert --raidintegrity n LV Settings Use --raidintegritymode journal|bitmap (journal is default) to configure the method used by dm-integrity to ensure crash consistency. Initialization When integrity is added to an LV, the kernel needs to initialize the integrity metadata/checksums for all blocks in the LV. The data corruption checking performed by dm-integrity will only operate on areas of the LV that are already initialized. The progress of integrity initialization is reported by the "syncpercent" LV reporting field (and under the Cpy%Sync lvs column.) Example: create a raid1 LV with integrity: $ lvcreate --type raid1 -m1 --raidintegrity y -n rr -L1G foo Creating integrity metadata LV rr_rimage_0_imeta with size 12.00 MiB. Logical volume "rr_rimage_0_imeta" created. Creating integrity metadata LV rr_rimage_1_imeta with size 12.00 MiB. Logical volume "rr_rimage_1_imeta" created. Logical volume "rr" created. $ lvs -a foo LV VG Attr LSize Origin Cpy%Sync rr foo rwi-a-r--- 1.00g 4.93 [rr_rimage_0] foo gwi-aor--- 1.00g [rr_rimage_0_iorig] 41.02 [rr_rimage_0_imeta] foo ewi-ao---- 12.00m [rr_rimage_0_iorig] foo -wi-ao---- 1.00g [rr_rimage_1] foo gwi-aor--- 1.00g [rr_rimage_1_iorig] 39.45 [rr_rimage_1_imeta] foo ewi-ao---- 12.00m [rr_rimage_1_iorig] foo -wi-ao---- 1.00g [rr_rmeta_0] foo ewi-aor--- 4.00m [rr_rmeta_1] foo ewi-aor--- 4.00m
2019-11-20 16:07:27 -06:00
int lv_add_integrity_to_raid(struct logical_volume *lv, struct integrity_settings *settings, struct dm_list *pvh,
struct logical_volume *lv_imeta_0);
int lv_remove_integrity_from_raid(struct logical_volume *lv);
void lv_clear_integrity_recalculate_metadata(struct logical_volume *lv);
int lv_has_integrity_recalculate_metadata(struct logical_volume *lv);
int lv_raid_has_integrity(struct logical_volume *lv);
int lv_extend_integrity_in_raid(struct logical_volume *lv, struct dm_list *pvh);
int lv_get_raid_integrity_settings(struct logical_volume *lv, struct integrity_settings **isettings);
int integrity_mode_set(const char *mode, struct integrity_settings *settings);
int lv_integrity_mismatches(struct cmd_context *cmd, const struct logical_volume *lv, uint64_t *mismatches);
int lv_raid_integrity_total_mismatches(struct cmd_context *cmd, const struct logical_volume *lv, uint64_t *mismatches);
Allow dm-integrity to be used for raid images dm-integrity stores checksums of the data written to an LV, and returns an error if data read from the LV does not match the previously saved checksum. When used on raid images, dm-raid will correct the error by reading the block from another image, and the device user sees no error. The integrity metadata (checksums) are stored on an internal LV allocated by lvm for each linear image. The internal LV is allocated on the same PV as the image. Create a raid LV with an integrity layer over each raid image (for raid levels 1,4,5,6,10): lvcreate --type raidN --raidintegrity y [options] Add an integrity layer to images of an existing raid LV: lvconvert --raidintegrity y LV Remove the integrity layer from images of a raid LV: lvconvert --raidintegrity n LV Settings Use --raidintegritymode journal|bitmap (journal is default) to configure the method used by dm-integrity to ensure crash consistency. Initialization When integrity is added to an LV, the kernel needs to initialize the integrity metadata/checksums for all blocks in the LV. The data corruption checking performed by dm-integrity will only operate on areas of the LV that are already initialized. The progress of integrity initialization is reported by the "syncpercent" LV reporting field (and under the Cpy%Sync lvs column.) Example: create a raid1 LV with integrity: $ lvcreate --type raid1 -m1 --raidintegrity y -n rr -L1G foo Creating integrity metadata LV rr_rimage_0_imeta with size 12.00 MiB. Logical volume "rr_rimage_0_imeta" created. Creating integrity metadata LV rr_rimage_1_imeta with size 12.00 MiB. Logical volume "rr_rimage_1_imeta" created. Logical volume "rr" created. $ lvs -a foo LV VG Attr LSize Origin Cpy%Sync rr foo rwi-a-r--- 1.00g 4.93 [rr_rimage_0] foo gwi-aor--- 1.00g [rr_rimage_0_iorig] 41.02 [rr_rimage_0_imeta] foo ewi-ao---- 12.00m [rr_rimage_0_iorig] foo -wi-ao---- 1.00g [rr_rimage_1] foo gwi-aor--- 1.00g [rr_rimage_1_iorig] 39.45 [rr_rimage_1_imeta] foo ewi-ao---- 12.00m [rr_rimage_1_iorig] foo -wi-ao---- 1.00g [rr_rmeta_0] foo ewi-aor--- 4.00m [rr_rmeta_1] foo ewi-aor--- 4.00m
2019-11-20 16:07:27 -06:00
#endif