mirror of
git://sourceware.org/git/lvm2.git
synced 2025-01-07 21:18:59 +03:00
6f355c6736
Followup patch mentioned in previous commit fe577f84cbf6bd7be76b457c9d54d0e54e57f93d. Problem: even though dead raid component devices are detected, the raid plugin is bailing out thus preventing a repair attempt. Rationale: in case of component device errors, the MD resynchronization thread runs in parallel with the thrown event being processed by the raid plugin. The plugin retrieves the raid device status but that still reflects insync regions as 0 (when it should already be total regions) because the MD thread didn't update it yet. Solution: Remove the insync regions check but keep the informational message "waiting for resynchronization" and let lvconvert carry out its pre-repair checks and optionally carry out a repair attempt. Related: https://bugzilla.redhat.com/show_bug.cgi?id=1751887 Related: https://bugzilla.redhat.com/show_bug.cgi?id=1560739 Related: https://bugzilla.redhat.com/show_bug.cgi?id=1468590 Related: https://bugzilla.redhat.com/show_bug.cgi?id=1654860 Related: https://bugzilla.redhat.com/show_bug.cgi?id=1729303 Related: https://bugzilla.redhat.com/show_bug.cgi?id=1741016
190 lines
5.0 KiB
C
190 lines
5.0 KiB
C
/*
|
|
* Copyright (C) 2005-2017 Red Hat, Inc. All rights reserved.
|
|
*
|
|
* This file is part of LVM2.
|
|
*
|
|
* This copyrighted material is made available to anyone wishing to use,
|
|
* modify, copy, or redistribute it subject to the terms and conditions
|
|
* of the GNU Lesser General Public License v.2.1.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with this program; if not, write to the Free Software Foundation,
|
|
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "lib/misc/lib.h"
|
|
#include "daemons/dmeventd/plugins/lvm2/dmeventd_lvm.h"
|
|
#include "daemons/dmeventd/libdevmapper-event.h"
|
|
#include "lib/config/defaults.h"
|
|
|
|
/* Hold enough elements for the maximum number of RAID images */
|
|
#define RAID_DEVS_ELEMS ((DEFAULT_RAID_MAX_IMAGES + 63) / 64) /* image count rounded up to whole 64-bit words */
|
|
|
|
struct dso_state {
|
|
struct dm_pool *mem;
|
|
char cmd_lvconvert[512];
|
|
uint64_t raid_devs[RAID_DEVS_ELEMS];
|
|
int failed;
|
|
int warned;
|
|
};
|
|
|
|
DM_EVENT_LOG_FN("raid") /* NOTE(review): presumably instantiates the logging backend used by the log_*() calls below, tagged "raid" - confirm against the macro definition */
|
|
|
|
/* FIXME Reformat to 80 char lines. */
|
|
|
|
static int _process_raid_event(struct dso_state *state, char *params, const char *device)
|
|
{
|
|
struct dm_status_raid *status;
|
|
const char *d;
|
|
int dead = 0, r = 1;
|
|
uint32_t dev;
|
|
|
|
if (!dm_get_status_raid(state->mem, params, &status)) {
|
|
log_error("Failed to process status line for %s.", device);
|
|
return 0;
|
|
}
|
|
|
|
d = status->dev_health;
|
|
while ((d = strchr(d, 'D'))) {
|
|
dev = (uint32_t)(d - status->dev_health);
|
|
|
|
if (!(state->raid_devs[dev / 64] & (UINT64_C(1) << (dev % 64)))) {
|
|
state->raid_devs[dev / 64] |= (UINT64_C(1) << (dev % 64));
|
|
log_warn("WARNING: Device #%u of %s array, %s, has failed.",
|
|
dev, status->raid_type, device);
|
|
}
|
|
|
|
d++;
|
|
dead = 1;
|
|
}
|
|
|
|
/*
|
|
* if we are converting from non-RAID to RAID (e.g. linear -> raid1)
|
|
* and too many original devices die, such that we cannot continue
|
|
* the "recover" operation, the sync action will go to "idle", the
|
|
* unsynced devs will remain at 'a', and the original devices will
|
|
* NOT SWITCH TO 'D', but will remain at 'A' - hoping to be revived.
|
|
*
|
|
* This is simply the way the kernel works...
|
|
*/
|
|
if (!strcmp(status->sync_action, "idle") &&
|
|
(status->dev_health[0] == 'a') &&
|
|
(status->insync_regions < status->total_regions)) {
|
|
log_error("Primary sources for new RAID, %s, have failed.",
|
|
device);
|
|
dead = 1; /* run it through LVM repair */
|
|
}
|
|
|
|
if (dead) {
|
|
/*
|
|
* Use the first event to run a repair ignoring any additonal ones.
|
|
*
|
|
* We presume lvconvert to do pre-repair
|
|
* checks to avoid bloat in this plugin.
|
|
*/
|
|
if (!state->warned && status->insync_regions < status->total_regions) {
|
|
state->warned = 1;
|
|
log_warn("WARNING: waiting for resynchronization to finish "
|
|
"before initiating repair on RAID device %s.", device);
|
|
/* Fall through to allow lvconvert to run. */
|
|
}
|
|
|
|
if (state->failed)
|
|
goto out; /* already reported */
|
|
|
|
state->failed = 1;
|
|
|
|
/* if repair goes OK, report success even if lvscan has failed */
|
|
if (!dmeventd_lvm2_run_with_lock(state->cmd_lvconvert)) {
|
|
log_error("Repair of RAID device %s failed.", device);
|
|
r = 0;
|
|
}
|
|
} else {
|
|
state->failed = 0;
|
|
if (status->insync_regions == status->total_regions)
|
|
memset(&state->raid_devs, 0, sizeof(state->raid_devs));
|
|
log_info("%s array, %s, is %s in-sync.",
|
|
status->raid_type, device,
|
|
(status->insync_regions == status->total_regions) ? "now" : "not");
|
|
}
|
|
out:
|
|
dm_pool_free(state->mem, status);
|
|
|
|
return r;
|
|
}
|
|
|
|
void process_event(struct dm_task *dmt,
|
|
enum dm_event_mask event __attribute__((unused)),
|
|
void **user)
|
|
{
|
|
struct dso_state *state = *user;
|
|
void *next = NULL;
|
|
uint64_t start, length;
|
|
char *target_type = NULL;
|
|
char *params;
|
|
const char *device = dm_task_get_name(dmt);
|
|
|
|
do {
|
|
next = dm_get_next_target(dmt, next, &start, &length,
|
|
&target_type, ¶ms);
|
|
|
|
if (!target_type) {
|
|
log_info("%s mapping lost.", device);
|
|
continue;
|
|
}
|
|
|
|
if (strcmp(target_type, "raid")) {
|
|
log_info("%s has non-raid portion.", device);
|
|
continue;
|
|
}
|
|
|
|
if (!_process_raid_event(state, params, device))
|
|
log_error("Failed to process event for %s.",
|
|
device);
|
|
} while (next);
|
|
}
|
|
|
|
int register_device(const char *device,
|
|
const char *uuid __attribute__((unused)),
|
|
int major __attribute__((unused)),
|
|
int minor __attribute__((unused)),
|
|
void **user)
|
|
{
|
|
struct dso_state *state;
|
|
|
|
if (!dmeventd_lvm2_init_with_pool("raid_state", state))
|
|
goto_bad;
|
|
|
|
if (!dmeventd_lvm2_command(state->mem, state->cmd_lvconvert, sizeof(state->cmd_lvconvert),
|
|
"lvconvert --repair --use-policies", device))
|
|
goto_bad;
|
|
|
|
*user = state;
|
|
|
|
log_info("Monitoring RAID device %s for events.", device);
|
|
|
|
return 1;
|
|
bad:
|
|
log_error("Failed to monitor RAID %s.", device);
|
|
|
|
if (state)
|
|
dmeventd_lvm2_exit_with_pool(state);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Stop monitoring a RAID device.
 *
 * Passes the state stored in *user by register_device() to
 * dmeventd_lvm2_exit_with_pool() and logs that monitoring has ended.
 *
 * device: device name; only used in the log message.
 * uuid, major, minor: unused.
 * user:   holds the struct dso_state pointer to tear down.
 *
 * Always returns 1.
 */
int unregister_device(const char *device,
		      const char *uuid __attribute__((unused)),
		      int major __attribute__((unused)),
		      int minor __attribute__((unused)),
		      void **user)
{
	struct dso_state *rs = *user;

	dmeventd_lvm2_exit_with_pool(rs);

	log_info("No longer monitoring RAID device %s for events.",
		 device);

	return 1;
}
|