mirror of git://sourceware.org/git/lvm2.git
Add policy based automated repair of RAID logical volumes
The RAID plug-in for dmeventd now calls 'lvconvert --repair' to address failures of devices in a RAID logical volume. The action taken can be either to "warn" the user or to "allocate" a new device from any spares that may be available in the volume group. The action is designated by setting 'raid_fault_policy' in lvm.conf - the default being "warn".
parent 707c49ab77
commit d098140177
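
When dmeventd is monitoring a RAID logical volume and receives a device
failure event, the plugin runs the LVM command assembled in run_repair()
below; with hypothetical names vg/lv this amounts to:

    lvconvert --config devices{ignore_suspended_devices=1} --repair --use-policies vg/lv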

@@ -1,5 +1,6 @@
Version 2.02.89 -
==================================
Add policy based automated repair of RAID logical volumes
Don't allow two images to be split and tracked from a RAID LV at one time
Don't allow size change of RAID LV that is tracking changes for a split image
Don't allow size change of RAID sub-LVs independently

@@ -24,6 +24,41 @@
/* FIXME Replace most syslogs with log_error() style messages and add complete context. */
/* FIXME Reformat to 80 char lines. */

/*
 * run_repair is a close copy to
 * plugins/mirror/dmeventd_mirror.c:_remove_failed_devices()
 */
static int run_repair(const char *device)
{
        int r;
#define CMD_SIZE 256 /* FIXME Use system restriction */
        char cmd_str[CMD_SIZE];
        char *vg = NULL, *lv = NULL, *layer = NULL;

        if (strlen(device) > 200) /* FIXME Use real restriction */
                return -1;

        if (!dm_split_lvm_name(dmeventd_lvm2_pool(), device, &vg, &lv, &layer)) {
                syslog(LOG_ERR, "Unable to determine VG name from %s.",
                       device);
                return -1;
        }

        /* FIXME Is any sanity-checking required on %s? */
        if (CMD_SIZE <= snprintf(cmd_str, CMD_SIZE, "lvconvert --config devices{ignore_suspended_devices=1} --repair --use-policies %s/%s", vg, lv)) {
                /* this error should be caught above, but doesn't hurt to check again */
                syslog(LOG_ERR, "Unable to form LVM command: Device name too long.");
                return -1;
        }

        r = dmeventd_lvm2_run(cmd_str);

        if (r != ECMD_PROCESSED)
                syslog(LOG_INFO, "Repair of RAID LV %s/%s failed.", vg, lv);

        return (r == ECMD_PROCESSED) ? 0 : -1;
}

static int _process_raid_event(char *params, const char *device)
{
        int i, n, failure = 0;

@@ -71,7 +106,7 @@ static int _process_raid_event(char *params, const char *device)
                        break;
                }
                if (failure)
                        return 0; /* Don't bother parsing rest of status */
                        return run_repair(device);
        }

        p = strstr(resync_ratio, "/");

@@ -522,9 +522,31 @@ activation {
# "auto" - Use default value chosen by kernel.
readahead = "auto"

# 'raid_fault_policy' defines how a device failure in a RAID logical
# volume is handled. This includes logical volumes that have the following
# segment types: raid1, raid4, raid5*, and raid6*.
#
# In the event of a failure, the following policies will determine what
# actions are performed during the automated response to failures (when
# dmeventd is monitoring the RAID logical volume) and when 'lvconvert' is
# called manually with the options '--repair' and '--use-policies'.
#
# "warn" - Use the system log to warn the user that a device in the RAID
# logical volume has failed. It is left to the user to run
# 'lvconvert --repair' manually to remove or replace the failed
# device. As long as the number of failed devices does not
# exceed the redundancy of the logical volume (1 device for
# raid4/5, 2 for raid6, etc) the logical volume will remain
# usable.
#
# "allocate" - Attempt to use any extra physical volumes in the volume
# group as spares and replace faulty devices.
#
raid_fault_policy = "warn"
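
# For example, to have dmeventd attempt an automatic rebuild onto spare
# physical volumes instead of only logging a warning, this could be set to:
#     raid_fault_policy = "allocate"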

# 'mirror_image_fault_policy' and 'mirror_log_fault_policy' define
# how a device failure affecting a mirror is handled.
# A mirror is composed of mirror images (copies) and a log.
# how a device failure affecting a mirror (of "mirror" segment type) is
# handled. A mirror is composed of mirror images (copies) and a log.
# A disk log ensures that a mirror does not need to be re-synced
# (all copies made the same) every time a machine reboots or crashes.
#

@@ -380,7 +380,7 @@
until all possible changes have been made. This reduces the likelihood of bad
intermediate stages being left due to a failure of operation or machine crash.

RAID1 '--splitmirrors', '--trackchanges', and '--merge' operations
-----------------------------------------------------------------
------------------------------------------------------------------
This suite of operations is only available to the "raid1" segment type.

Splitting an image from a RAID1 array is almost identical to the removal of

@@ -404,3 +404,72 @@
set as "hidden" again. Recycling the array (suspend/resume) restores the sub-LV
to its position in the array and begins the process of sync'ing the changes that
were made since the time it was split from the array.

RAID device replacement with '--replace'
----------------------------------------
This option is available to all RAID segment types.

The '--replace' option can be used to remove a particular device from a RAID
logical volume and replace it with a different one in one action (CLI command).
The device to be removed is specified as the argument to the '--replace'
option. This option can be specified more than once in a single command,
allowing multiple devices to be replaced at the same time - provided the RAID
logical volume has the necessary redundancy to allow the action. The devices
to be used as replacements can also be specified in the command; similar to the
way allocatable devices are specified during an up-convert.

Example> lvconvert --replace /dev/sdd1 --replace /dev/sde1 vg/lv /dev/sd[bc]1

RAID '--repair'
---------------
This 'lvconvert' option is available to all RAID segment types and is described
under "RAID Fault Handling".


RAID Fault Handling
===================
RAID is not like traditional LVM mirroring (i.e. the "mirror" segment type).
LVM mirroring required failed devices to be removed or the logical volume would
simply hang. RAID arrays can keep on running with failed devices. In fact, for
RAID types other than RAID1 removing a device would mean substituting an error
target or converting to a lower level RAID (e.g. RAID6 -> RAID5, or RAID4/5 to
RAID0). Therefore, rather than removing a failed device unconditionally, the
user has a couple of options to choose from.

The automated response to a device failure is handled according to the user's
preference defined in lvm.conf:activation.raid_fault_policy. The options are:
# "warn" - Use the system log to warn the user that a device in the RAID
# logical volume has failed. It is left to the user to run
# 'lvconvert --repair' manually to remove or replace the failed
# device. As long as the number of failed devices does not
# exceed the redundancy of the logical volume (1 device for
# raid4/5, 2 for raid6, etc) the logical volume will remain
# usable.
#
# "remove" - NOT CURRENTLY IMPLEMENTED OR DOCUMENTED IN example.conf.in.
# Remove the failed device and reduce the RAID logical volume
# accordingly. If a single device dies in a 3-way mirror,
# remove it and reduce the mirror to 2-way. If a single device
# dies in a RAID 4/5 logical volume, reshape it to a striped
# volume, etc - RAID 6 -> RAID 4/5 -> RAID 0. If devices
# cannot be removed for lack of redundancy, fail.
# THIS OPTION CANNOT YET BE IMPLEMENTED BECAUSE RESHAPE IS NOT
# YET SUPPORTED IN linux/drivers/md/dm-raid.c. The superblock
# does not yet hold enough information to support reshaping.
#
# "allocate" - Attempt to use any extra physical volumes in the volume
# group as spares and replace faulty devices.

If manual intervention is taken, either in response to the automated solution's
"warn" mode or simply because dmeventd hadn't run, then the user can call
'lvconvert --repair vg/lv' and follow the prompts. They will be prompted
whether or not to replace the device and cause a full recovery of the failed
device.
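
A typical manual invocation might look like this (hypothetical VG and LV
names):

Example> lvconvert --repair vg/lv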

If replacement is chosen via the manual method or "allocate" is the policy taken
by the automated response, then 'lvconvert --replace' is the mechanism used to
attempt the replacement of the failed device.

'vgreduce --removemissing' is ineffectual at repairing RAID logical volumes. It
will remove the failed device, but the RAID logical volume will simply continue
to operate with an <unknown> sub-LV. The user should clear the failed device
with 'lvconvert --repair'.

@@ -15,6 +15,12 @@
from (e.g. a power failure, intermittent network outage, block
relocation, etc). The policies for handling both types of failures
are described herein.

Users need to be aware that there are two implementations of RAID1 in LVM.
The first is defined by the "mirror" segment type. The second is defined by
the "raid1" segment type. Which of these implementations is used by default
for LVM operations is determined by the 'mirror_segtype_default' setting in
lvm.conf.
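
For example, to make newly created mirrored logical volumes default to the
"raid1" implementation (a minimal lvm.conf sketch; the other accepted value
is "mirror"):

    mirror_segtype_default = "raid1"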

Available Operations During a Device Failure
--------------------------------------------
When there is a device failure, LVM behaves somewhat differently because

@@ -51,30 +57,36 @@ are as follows:
a linear, stripe, or snapshot device is located on the failed device
the command will not proceed without a '--force' option. The result
of using the '--force' option is the entire removal and complete
loss of the non-redundant logical volume. Once this operation is
complete, the volume group will again have a complete and consistent
view of the devices it contains. Thus, all operations will be
permitted - including creation, conversion, and resizing operations.
loss of the non-redundant logical volume. If an image or metadata area
of a RAID logical volume is on the failed device, the sub-LV affected is
replaced with an error target device - appearing as <unknown> in 'lvs'
output. RAID logical volumes cannot be completely repaired by vgreduce -
'lvconvert --repair' (listed below) must be used. Once this operation is
complete on volume groups not containing RAID logical volumes, the volume
group will again have a complete and consistent view of the devices it
contains. Thus, all operations will be permitted - including creation,
conversion, and resizing operations. The currently preferred method is
to call 'lvconvert --repair' on the individual logical volumes to repair
them, followed by 'vgreduce --removemissing' to remove the failed physical
volume's representation from the volume group.

- 'lvconvert --repair <VG/LV>': This action is designed specifically
to operate on mirrored logical volumes. It is used on logical volumes
individually and does not remove the faulty device from the volume
group. If, for example, a failed device happened to contain the
images of four distinct mirrors, it would be necessary to run
'lvconvert --repair' on each of them. The ultimate result is to leave
the faulty device in the volume group, but have no logical volumes
referencing it. In addition to removing mirror images that reside
on failed devices, 'lvconvert --repair' can also replace the failed
device if there are spare devices available in the volume group. The
user is prompted whether to simply remove the failed portions of the
mirror or to also allocate a replacement, if run from the command-line.
Optionally, the '--use-policies' flag can be specified which will
cause the operation not to prompt the user, but instead respect
to operate on individual logical volumes. If, for example, a failed
device happened to contain the images of four distinct mirrors, it would
be necessary to run 'lvconvert --repair' on each of them. The ultimate
result is to leave the faulty device in the volume group, but have no logical
volumes referencing it. (This allows 'vgreduce --removemissing' to
remove the physical volumes cleanly.) In addition to removing mirror or
RAID images that reside on failed devices, 'lvconvert --repair' can also
replace the failed device if there are spare devices available in the
volume group. The user is prompted whether to simply remove the failed
portions of the mirror or to also allocate a replacement, if run from the
command-line. Optionally, the '--use-policies' flag can be specified which
will cause the operation not to prompt the user, but instead respect
the policies outlined in the LVM configuration file - usually,
/etc/lvm/lvm.conf. Once this operation is complete, mirrored logical
volumes will be consistent and I/O will be allowed to continue.
However, the volume group will still be inconsistent - due to the
referenced-but-missing device/PV - and operations will still be
/etc/lvm/lvm.conf. Once this operation is complete, the logical volumes
will be consistent. However, the volume group will still be inconsistent -
due to the referenced-but-missing device/PV - and operations will still be
restricted to the aforementioned actions until either the device is
restored or 'vgreduce --removemissing' is run.
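
For instance (hypothetical names), once every affected logical volume has been
repaired, the missing physical volume can then be dropped from the volume
group:

Example> vgreduce --removemissing vg

The non-interactive form of the repair, which respects the lvm.conf policies
and is what dmeventd invokes, is:

Example> lvconvert --repair --use-policies vg/lv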

@@ -98,13 +110,15 @@ following possible exceptions exist:
Automated Target Response to Failures:
--------------------------------------
The only LVM target type (i.e. "personality") that has an automated
response to failures is a mirrored logical volume. The other target
The only LVM target types (i.e. "personalities") that have an automated
response to failures are the mirror and RAID logical volumes. The other target
types (linear, stripe, snapshot, etc) will simply propagate the failure.
[A snapshot becomes invalid if its underlying device fails, but the
origin will remain valid - presuming the origin device has not failed.]
There are three types of errors that a mirror can suffer - read, write,
and resynchronization errors. Each is described in depth below.

Starting with the "mirror" segment type, there are three types of errors that
a mirror can suffer - read, write, and resynchronization errors. Each is
described in depth below.

Mirror read failures:
If a mirror is 'in-sync' (i.e. all images have been initialized and

@@ -184,38 +198,5 @@ command are set in the LVM configuration file. They are:
choice of when to incur the extra performance costs of replacing
the failed image.

TODO...
The appropriate time to take permanent corrective action on a mirror
should be driven by policy. There should be a directive that takes
a time or percentage argument. Something like the following:
- mirror_fault_policy_WHEN = "10sec"/"10%"
A time value would signal the amount of time to wait for transient
failures to resolve themselves. The percentage value would signal the
amount a mirror could become out-of-sync before the faulty device is
removed.

A mirror cannot be used unless /some/ corrective action is taken,
however. One option is to replace the failed mirror image with an
error target, forgo the use of 'handle_errors', and simply let the
out-of-sync regions accumulate and be tracked by the log. Mirrors
that have more than 2 images would have to "stack" to perform the
tracking, as each failed image would have to be associated with a
log. If the failure is transient, the device would replace the
error target that was holding its spot and the log that was tracking
the deltas would be used to quickly restore the portions that changed.

One unresolved issue with the above scheme is how to know which
regions of the mirror are out-of-sync when a problem occurs. When
a write failure occurs in the kernel, the log will contain those
regions that are not in-sync. If the log is a disk log, that log
could continue to be used to track differences. However, if the
log was a core log - or if the log device failed at the same time
as an image device - there would be no way to determine which
regions are out-of-sync to begin with as we start to track the
deltas for the failed image. I don't have a solution for this
problem other than to only be able to handle errors in this way
if conditions are right. These issues will have to be ironed out
before proceeding. This could be another case where it is better
to handle failures in the kernel by allowing the kernel to store
updates in various metadata areas.
...TODO
RAID logical volume device failures are handled differently from the "mirror"
segment type. Discussion of this can be found in lvm2-raid.txt.

@@ -55,6 +55,7 @@
#define DEFAULT_MIRROR_LOG_FAULT_POLICY "allocate"
#define DEFAULT_MIRROR_IMAGE_FAULT_POLICY "remove"
#define DEFAULT_MIRROR_MAX_IMAGES 8 /* limited by kernel DM_KCOPYD_MAX_REGIONS */
#define DEFAULT_RAID_FAULT_POLICY "warn"
#define DEFAULT_DMEVENTD_RAID_LIB "libdevmapper-event-lvm2raid.so"
#define DEFAULT_DMEVENTD_MIRROR_LIB "libdevmapper-event-lvm2mirror.so"
#define DEFAULT_DMEVENTD_SNAPSHOT_LIB "libdevmapper-event-lvm2snapshot.so"

@@ -1424,9 +1424,44 @@ static int is_valid_raid_conversion(const struct segment_type *from_segtype,
        return 1;
}

static void _lvconvert_raid_repair_ask(struct cmd_context *cmd, int *replace_dev)
{
        const char *dev_policy = NULL;

        int force = arg_count(cmd, force_ARG);
        int yes = arg_count(cmd, yes_ARG);

        *replace_dev = 0;

        if (arg_count(cmd, use_policies_ARG)) {
                dev_policy = find_config_tree_str(cmd, "activation/raid_fault_policy", DEFAULT_RAID_FAULT_POLICY);

                if (!strcmp(dev_policy, "allocate") ||
                    !strcmp(dev_policy, "replace"))
                        *replace_dev = 1;
                /* else if (!strcmp(dev_policy, "anything_else")) -- ignore */
                return;
        }

        if (yes) {
                *replace_dev = 1;
                return;
        }

        if (force != PROMPT)
                return;

        if (yes_no_prompt("Attempt to replace failed RAID images "
                          "(requires full device resync)? [y/n]: ") == 'y') {
                *replace_dev = 1;
        }
}

static int lvconvert_raid(struct logical_volume *lv, struct lvconvert_params *lp)
{
        int replace = 0;
        int image_count;
        struct dm_list *failed_pvs;
        struct cmd_context *cmd = lv->vg->cmd;
        struct lv_segment *seg = first_seg(lv);

@@ -1485,6 +1520,25 @@ static int lvconvert_raid(struct logical_volume *lv, struct lvconvert_params *lp)
        if (arg_count(cmd, replace_ARG))
                return lv_raid_replace(lv, lp->replace_pvh, lp->pvh);

        if (arg_count(cmd, repair_ARG)) {
                _lvconvert_raid_repair_ask(cmd, &replace);

                if (replace) {
                        if (!(failed_pvs = _failed_pv_list(lv->vg))) {
                                stack;
                                return ECMD_FAILED;
                        }
                        return lv_raid_replace(lv, failed_pvs, lp->pvh);
                }

                /* "warn" if policy not set to replace */
                if (arg_count(cmd, use_policies_ARG))
                        log_error("Issue 'lvconvert --repair %s/%s' to "
                                  "replace failed device",
                                  lv->vg->name, lv->name);
                return 1;
        }

        log_error("Conversion operation not yet supported.");
        return 0;
}