mirror of git://sourceware.org/git/lvm2.git
Add policy based automated repair of RAID logical volumes
The RAID plug-in for dmeventd now calls 'lvconvert --repair' to address failures of devices in a RAID logical volume. The action taken can be either to "warn" the user or to "allocate" a new device from any spares that may be available in the volume group. The action is designated by setting 'raid_fault_policy' in lvm.conf - the default being "warn".
parent 707c49ab77
commit d098140177
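
When dmeventd is monitoring a RAID logical volume and receives a device
failure event, the plugin runs the LVM command assembled in run_repair()
below; with hypothetical names vg/lv this amounts to:

    lvconvert --config devices{ignore_suspended_devices=1} --repair --use-policies vg/lv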

@@ -1,5 +1,6 @@
Version 2.02.89 -
==================================
Add policy based automated repair of RAID logical volumes
Don't allow two images to be split and tracked from a RAID LV at one time
Don't allow size change of RAID LV that is tracking changes for a split image
Don't allow size change of RAID sub-LVs independently

@@ -24,6 +24,41 @@
/* FIXME Replace most syslogs with log_error() style messages and add complete context. */
/* FIXME Reformat to 80 char lines. */

/*
 * run_repair is a close copy to
 * plugins/mirror/dmeventd_mirror.c:_remove_failed_devices()
 */
static int run_repair(const char *device)
{
        int r;
#define CMD_SIZE 256 /* FIXME Use system restriction */
        char cmd_str[CMD_SIZE];
        char *vg = NULL, *lv = NULL, *layer = NULL;

        if (strlen(device) > 200) /* FIXME Use real restriction */
                return -1;

        if (!dm_split_lvm_name(dmeventd_lvm2_pool(), device, &vg, &lv, &layer)) {
                syslog(LOG_ERR, "Unable to determine VG name from %s.",
                       device);
                return -1;
        }

        /* FIXME Is any sanity-checking required on %s? */
        if (CMD_SIZE <= snprintf(cmd_str, CMD_SIZE, "lvconvert --config devices{ignore_suspended_devices=1} --repair --use-policies %s/%s", vg, lv)) {
                /* this error should be caught above, but doesn't hurt to check again */
                syslog(LOG_ERR, "Unable to form LVM command: Device name too long.");
                return -1;
        }

        r = dmeventd_lvm2_run(cmd_str);

        if (r != ECMD_PROCESSED)
                syslog(LOG_INFO, "Repair of RAID LV %s/%s failed.", vg, lv);

        return (r == ECMD_PROCESSED) ? 0 : -1;
}

static int _process_raid_event(char *params, const char *device)
{
        int i, n, failure = 0;

@@ -71,7 +106,7 @@ static int _process_raid_event(char *params, const char *device)
                        break;
                }
                if (failure)
                        return 0; /* Don't bother parsing rest of status */
                        return run_repair(device);
        }

        p = strstr(resync_ratio, "/");

@@ -522,9 +522,31 @@ activation {
# "auto" - Use default value chosen by kernel.
readahead = "auto"

# 'raid_fault_policy' defines how a device failure in a RAID logical
# volume is handled. This includes logical volumes that have the following
# segment types: raid1, raid4, raid5*, and raid6*.
#
# In the event of a failure, the following policies will determine what
# actions are performed during the automated response to failures (when
# dmeventd is monitoring the RAID logical volume) and when 'lvconvert' is
# called manually with the options '--repair' and '--use-policies'.
#
# "warn" - Use the system log to warn the user that a device in the RAID
# logical volume has failed. It is left to the user to run
# 'lvconvert --repair' manually to remove or replace the failed
# device. As long as the number of failed devices does not
# exceed the redundancy of the logical volume (1 device for
# raid4/5, 2 for raid6, etc) the logical volume will remain
# usable.
#
# "allocate" - Attempt to use any extra physical volumes in the volume
# group as spares and replace faulty devices.
#
raid_fault_policy = "warn"
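
# For example, to have dmeventd attempt an automatic rebuild onto spare
# physical volumes instead of only logging a warning, this could be set to:
#     raid_fault_policy = "allocate"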

# 'mirror_image_fault_policy' and 'mirror_log_fault_policy' define
# how a device failure affecting a mirror is handled.
# A mirror is composed of mirror images (copies) and a log.
# how a device failure affecting a mirror (of "mirror" segment type) is
# handled. A mirror is composed of mirror images (copies) and a log.
# A disk log ensures that a mirror does not need to be re-synced
# (all copies made the same) every time a machine reboots or crashes.
#

@@ -380,7 +380,7 @@
until all possible changes have been made. This reduces the likelihood of bad
intermediate stages being left due to a failure of operation or machine crash.

RAID1 '--splitmirrors', '--trackchanges', and '--merge' operations
-----------------------------------------------------------------
------------------------------------------------------------------
This suite of operations is only available to the "raid1" segment type.

Splitting an image from a RAID1 array is almost identical to the removal of

@@ -404,3 +404,72 @@
set as "hidden" again. Recycling the array (suspend/resume) restores the sub-LV
to its position in the array and begins the process of sync'ing the changes that
were made since the time it was split from the array.

RAID device replacement with '--replace'
----------------------------------------
This option is available to all RAID segment types.

The '--replace' option can be used to remove a particular device from a RAID
logical volume and replace it with a different one in one action (CLI command).
The device to be removed is specified as the argument to the '--replace'
option. This option can be specified more than once in a single command,
allowing multiple devices to be replaced at the same time - provided the RAID
logical volume has the necessary redundancy to allow the action. The devices
to be used as replacements can also be specified in the command; similar to the
way allocatable devices are specified during an up-convert.

Example> lvconvert --replace /dev/sdd1 --replace /dev/sde1 vg/lv /dev/sd[bc]1

RAID '--repair'
---------------
This 'lvconvert' option is available to all RAID segment types and is described
under "RAID Fault Handling".


RAID Fault Handling
===================
RAID is not like traditional LVM mirroring (i.e. the "mirror" segment type).
LVM mirroring required failed devices to be removed or the logical volume would
simply hang. RAID arrays can keep on running with failed devices. In fact, for
RAID types other than RAID1 removing a device would mean substituting an error
target or converting to a lower level RAID (e.g. RAID6 -> RAID5, or RAID4/5 to
RAID0). Therefore, rather than removing a failed device unconditionally, the
user has a couple of options to choose from.

The automated response to a device failure is handled according to the user's
preference defined in lvm.conf:activation.raid_fault_policy. The options are:
# "warn" - Use the system log to warn the user that a device in the RAID
# logical volume has failed. It is left to the user to run
# 'lvconvert --repair' manually to remove or replace the failed
# device. As long as the number of failed devices does not
# exceed the redundancy of the logical volume (1 device for
# raid4/5, 2 for raid6, etc) the logical volume will remain
# usable.
#
# "remove" - NOT CURRENTLY IMPLEMENTED OR DOCUMENTED IN example.conf.in.
# Remove the failed device and reduce the RAID logical volume
# accordingly. If a single device dies in a 3-way mirror,
# remove it and reduce the mirror to 2-way. If a single device
# dies in a RAID 4/5 logical volume, reshape it to a striped
# volume, etc - RAID 6 -> RAID 4/5 -> RAID 0. If devices
# cannot be removed for lack of redundancy, fail.
# THIS OPTION CANNOT YET BE IMPLEMENTED BECAUSE RESHAPE IS NOT
# YET SUPPORTED IN linux/drivers/md/dm-raid.c. The superblock
# does not yet hold enough information to support reshaping.
#
# "allocate" - Attempt to use any extra physical volumes in the volume
# group as spares and replace faulty devices.

If manual intervention is taken, either in response to the automated solution's
"warn" mode or simply because dmeventd hadn't run, then the user can call
'lvconvert --repair vg/lv' and follow the prompts. They will be prompted
whether or not to replace the device and cause a full recovery of the failed
device.
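
A typical manual invocation might look like this (hypothetical VG and LV
names):

Example> lvconvert --repair vg/lv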

If replacement is chosen via the manual method or "allocate" is the policy taken
by the automated response, then 'lvconvert --replace' is the mechanism used to
attempt the replacement of the failed device.

'vgreduce --removemissing' is ineffectual at repairing RAID logical volumes. It
will remove the failed device, but the RAID logical volume will simply continue
to operate with an <unknown> sub-LV. The user should clear the failed device
with 'lvconvert --repair'.

@@ -15,6 +15,12 @@
from (e.g. a power failure, intermittent network outage, block
relocation, etc). The policies for handling both types of failures
are described herein.

Users need to be aware that there are two implementations of RAID1 in LVM.
The first is defined by the "mirror" segment type. The second is defined by
the "raid1" segment type. Which of these implementations is used by default
for LVM operations is determined by the 'mirror_segtype_default' setting in
lvm.conf.
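
For example, to make newly created mirrored logical volumes default to the
"raid1" implementation (a minimal lvm.conf sketch; the other accepted value
is "mirror"):

    mirror_segtype_default = "raid1"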

Available Operations During a Device Failure
--------------------------------------------
When there is a device failure, LVM behaves somewhat differently because

@@ -51,30 +57,36 @@ are as follows:
a linear, stripe, or snapshot device is located on the failed device
the command will not proceed without a '--force' option. The result
of using the '--force' option is the entire removal and complete
loss of the non-redundant logical volume. Once this operation is
complete, the volume group will again have a complete and consistent
view of the devices it contains. Thus, all operations will be
permitted - including creation, conversion, and resizing operations.
loss of the non-redundant logical volume. If an image or metadata area
of a RAID logical volume is on the failed device, the sub-LV affected is
replaced with an error target device - appearing as <unknown> in 'lvs'
output. RAID logical volumes cannot be completely repaired by vgreduce -
'lvconvert --repair' (listed below) must be used. Once this operation is
complete on volume groups not containing RAID logical volumes, the volume
group will again have a complete and consistent view of the devices it
contains. Thus, all operations will be permitted - including creation,
conversion, and resizing operations. The currently preferred method is
to call 'lvconvert --repair' on the individual logical volumes to repair
them, followed by 'vgreduce --removemissing' to remove the failed physical
volume's representation from the volume group.

- 'lvconvert --repair <VG/LV>': This action is designed specifically
to operate on mirrored logical volumes. It is used on logical volumes
individually and does not remove the faulty device from the volume
group. If, for example, a failed device happened to contain the
images of four distinct mirrors, it would be necessary to run
'lvconvert --repair' on each of them. The ultimate result is to leave
the faulty device in the volume group, but have no logical volumes
referencing it. In addition to removing mirror images that reside
on failed devices, 'lvconvert --repair' can also replace the failed
device if there are spare devices available in the volume group. The
user is prompted whether to simply remove the failed portions of the
mirror or to also allocate a replacement, if run from the command-line.
Optionally, the '--use-policies' flag can be specified which will
cause the operation not to prompt the user, but instead respect
to operate on individual logical volumes. If, for example, a failed
device happened to contain the images of four distinct mirrors, it would
be necessary to run 'lvconvert --repair' on each of them. The ultimate
result is to leave the faulty device in the volume group, but have no logical
volumes referencing it. (This allows 'vgreduce --removemissing' to
remove the physical volumes cleanly.) In addition to removing mirror or
RAID images that reside on failed devices, 'lvconvert --repair' can also
replace the failed device if there are spare devices available in the
volume group. The user is prompted whether to simply remove the failed
portions of the mirror or to also allocate a replacement, if run from the
command-line. Optionally, the '--use-policies' flag can be specified which
will cause the operation not to prompt the user, but instead respect
the policies outlined in the LVM configuration file - usually,
/etc/lvm/lvm.conf. Once this operation is complete, mirrored logical
volumes will be consistent and I/O will be allowed to continue.
However, the volume group will still be inconsistent - due to the
referenced-but-missing device/PV - and operations will still be
/etc/lvm/lvm.conf. Once this operation is complete, the logical volumes
will be consistent. However, the volume group will still be inconsistent -
due to the referenced-but-missing device/PV - and operations will still be
restricted to the aforementioned actions until either the device is
restored or 'vgreduce --removemissing' is run.
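
For instance (hypothetical names), once every affected logical volume has been
repaired, the missing physical volume can then be dropped from the volume
group:

Example> vgreduce --removemissing vg

The non-interactive form of the repair, which respects the lvm.conf policies
and is what dmeventd invokes, is:

Example> lvconvert --repair --use-policies vg/lv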

@@ -98,13 +110,15 @@ following possible exceptions exist:
Automated Target Response to Failures:
--------------------------------------
The only LVM target type (i.e. "personality") that has an automated
response to failures is a mirrored logical volume. The other target
The only LVM target types (i.e. "personalities") that have an automated
response to failures are the mirror and RAID logical volumes. The other target
types (linear, stripe, snapshot, etc) will simply propagate the failure.
[A snapshot becomes invalid if its underlying device fails, but the
origin will remain valid - presuming the origin device has not failed.]
There are three types of errors that a mirror can suffer - read, write,
and resynchronization errors. Each is described in depth below.

Starting with the "mirror" segment type, there are three types of errors that
a mirror can suffer - read, write, and resynchronization errors. Each is
described in depth below.

Mirror read failures:
If a mirror is 'in-sync' (i.e. all images have been initialized and

@@ -184,38 +198,5 @@ command are set in the LVM configuration file. They are:
choice of when to incur the extra performance costs of replacing
the failed image.

TODO...
The appropriate time to take permanent corrective action on a mirror
should be driven by policy. There should be a directive that takes
a time or percentage argument. Something like the following:
- mirror_fault_policy_WHEN = "10sec"/"10%"
A time value would signal the amount of time to wait for transient
failures to resolve themselves. The percentage value would signal the
amount a mirror could become out-of-sync before the faulty device is
removed.

A mirror cannot be used unless /some/ corrective action is taken,
however. One option is to replace the failed mirror image with an
error target, forgo the use of 'handle_errors', and simply let the
out-of-sync regions accumulate and be tracked by the log. Mirrors
that have more than 2 images would have to "stack" to perform the
tracking, as each failed image would have to be associated with a
log. If the failure is transient, the device would replace the
error target that was holding its spot and the log that was tracking
the deltas would be used to quickly restore the portions that changed.

One unresolved issue with the above scheme is how to know which
regions of the mirror are out-of-sync when a problem occurs. When
a write failure occurs in the kernel, the log will contain those
regions that are not in-sync. If the log is a disk log, that log
could continue to be used to track differences. However, if the
log was a core log - or if the log device failed at the same time
as an image device - there would be no way to determine which
regions are out-of-sync to begin with as we start to track the
deltas for the failed image. I don't have a solution for this
problem other than to only be able to handle errors in this way
if conditions are right. These issues will have to be ironed out
before proceeding. This could be another case where it is better
to handle failures in the kernel by allowing the kernel to store
updates in various metadata areas.
...TODO
RAID logical volume device failures are handled differently from the "mirror"
segment type. Discussion of this can be found in lvm2-raid.txt.

@@ -55,6 +55,7 @@
#define DEFAULT_MIRROR_LOG_FAULT_POLICY "allocate"
#define DEFAULT_MIRROR_IMAGE_FAULT_POLICY "remove"
#define DEFAULT_MIRROR_MAX_IMAGES 8 /* limited by kernel DM_KCOPYD_MAX_REGIONS */
#define DEFAULT_RAID_FAULT_POLICY "warn"
#define DEFAULT_DMEVENTD_RAID_LIB "libdevmapper-event-lvm2raid.so"
#define DEFAULT_DMEVENTD_MIRROR_LIB "libdevmapper-event-lvm2mirror.so"
#define DEFAULT_DMEVENTD_SNAPSHOT_LIB "libdevmapper-event-lvm2snapshot.so"

@@ -1424,9 +1424,44 @@ static int is_valid_raid_conversion(const struct segment_type *from_segtype,
        return 1;
}

static void _lvconvert_raid_repair_ask(struct cmd_context *cmd, int *replace_dev)
{
        const char *dev_policy = NULL;

        int force = arg_count(cmd, force_ARG);
        int yes = arg_count(cmd, yes_ARG);

        *replace_dev = 0;

        if (arg_count(cmd, use_policies_ARG)) {
                dev_policy = find_config_tree_str(cmd, "activation/raid_fault_policy", DEFAULT_RAID_FAULT_POLICY);

                if (!strcmp(dev_policy, "allocate") ||
                    !strcmp(dev_policy, "replace"))
                        *replace_dev = 1;
                /* else if (!strcmp(dev_policy, "anything_else")) -- ignore */
                return;
        }

        if (yes) {
                *replace_dev = 1;
                return;
        }

        if (force != PROMPT)
                return;

        if (yes_no_prompt("Attempt to replace failed RAID images "
                          "(requires full device resync)? [y/n]: ") == 'y') {
                *replace_dev = 1;
        }
}

static int lvconvert_raid(struct logical_volume *lv, struct lvconvert_params *lp)
{
        int replace = 0;
        int image_count;
        struct dm_list *failed_pvs;
        struct cmd_context *cmd = lv->vg->cmd;
        struct lv_segment *seg = first_seg(lv);

@@ -1485,6 +1520,25 @@ static int lvconvert_raid(struct logical_volume *lv, struct lvconvert_params *lp)
        if (arg_count(cmd, replace_ARG))
                return lv_raid_replace(lv, lp->replace_pvh, lp->pvh);

        if (arg_count(cmd, repair_ARG)) {
                _lvconvert_raid_repair_ask(cmd, &replace);

                if (replace) {
                        if (!(failed_pvs = _failed_pv_list(lv->vg))) {
                                stack;
                                return ECMD_FAILED;
                        }
                        return lv_raid_replace(lv, failed_pvs, lp->pvh);
                }

                /* "warn" if policy not set to replace */
                if (arg_count(cmd, use_policies_ARG))
                        log_error("Issue 'lvconvert --repair %s/%s' to "
                                  "replace failed device",
                                  lv->vg->name, lv->name);
                return 1;
        }

        log_error("Conversion operation not yet supported.");
        return 0;
}