e94987db2e
The DLM informs us in case of node failure with the DLM slot number. cluster_info->recovery_map sets the bit corresponding to the slot number and wakes up the recovery thread. The recovery thread: 1. Derives the slot number from the recovery_map 2. Locks the bitmap corresponding to the slot 3. Copies the set bits to the node-local bitmap Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
449 lines
11 KiB
C
449 lines
11 KiB
C
/*
|
|
* Copyright (C) 2015, SUSE
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2, or (at your option)
|
|
* any later version.
|
|
*
|
|
*/
|
|
|
|
|
|
#include <linux/module.h>
|
|
#include <linux/dlm.h>
|
|
#include <linux/sched.h>
|
|
#include "md.h"
|
|
#include "bitmap.h"
|
|
#include "md-cluster.h"
|
|
|
|
/*
 * Size in bytes of the lock value block: used both for the per-resource
 * LVB allocation and as the LVB length given to dlm_new_lockspace().
 */
#define LVB_SIZE	64
|
|
|
|
struct dlm_lock_resource {
|
|
dlm_lockspace_t *ls;
|
|
struct dlm_lksb lksb;
|
|
char *name; /* lock name. */
|
|
uint32_t flags; /* flags to pass to dlm_lock() */
|
|
struct completion completion; /* completion for synchronized locking */
|
|
void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
|
|
struct mddev *mddev; /* pointing back to mddev. */
|
|
};
|
|
|
|
struct suspend_info {
|
|
int slot;
|
|
sector_t lo;
|
|
sector_t hi;
|
|
struct list_head list;
|
|
};
|
|
|
|
struct resync_info {
|
|
__le64 lo;
|
|
__le64 hi;
|
|
};
|
|
|
|
struct md_cluster_info {
|
|
/* dlm lock space and resources for clustered raid. */
|
|
dlm_lockspace_t *lockspace;
|
|
int slot_number;
|
|
struct completion completion;
|
|
struct dlm_lock_resource *sb_lock;
|
|
struct mutex sb_mutex;
|
|
struct dlm_lock_resource *bitmap_lockres;
|
|
struct list_head suspend_list;
|
|
spinlock_t suspend_lock;
|
|
struct md_thread *recovery_thread;
|
|
unsigned long recovery_map;
|
|
};
|
|
|
|
static void sync_ast(void *arg)
|
|
{
|
|
struct dlm_lock_resource *res;
|
|
|
|
res = (struct dlm_lock_resource *) arg;
|
|
complete(&res->completion);
|
|
}
|
|
|
|
static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
|
|
{
|
|
int ret = 0;
|
|
|
|
init_completion(&res->completion);
|
|
ret = dlm_lock(res->ls, mode, &res->lksb,
|
|
res->flags, res->name, strlen(res->name),
|
|
0, sync_ast, res, res->bast);
|
|
if (ret)
|
|
return ret;
|
|
wait_for_completion(&res->completion);
|
|
return res->lksb.sb_status;
|
|
}
|
|
|
|
static int dlm_unlock_sync(struct dlm_lock_resource *res)
|
|
{
|
|
return dlm_lock_sync(res, DLM_LOCK_NL);
|
|
}
|
|
|
|
static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
|
|
char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
|
|
{
|
|
struct dlm_lock_resource *res = NULL;
|
|
int ret, namelen;
|
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
|
|
|
res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
|
|
if (!res)
|
|
return NULL;
|
|
res->ls = cinfo->lockspace;
|
|
res->mddev = mddev;
|
|
namelen = strlen(name);
|
|
res->name = kzalloc(namelen + 1, GFP_KERNEL);
|
|
if (!res->name) {
|
|
pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
|
|
goto out_err;
|
|
}
|
|
strlcpy(res->name, name, namelen + 1);
|
|
if (with_lvb) {
|
|
res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
|
|
if (!res->lksb.sb_lvbptr) {
|
|
pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
|
|
goto out_err;
|
|
}
|
|
res->flags = DLM_LKF_VALBLK;
|
|
}
|
|
|
|
if (bastfn)
|
|
res->bast = bastfn;
|
|
|
|
res->flags |= DLM_LKF_EXPEDITE;
|
|
|
|
ret = dlm_lock_sync(res, DLM_LOCK_NL);
|
|
if (ret) {
|
|
pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
|
|
goto out_err;
|
|
}
|
|
res->flags &= ~DLM_LKF_EXPEDITE;
|
|
res->flags |= DLM_LKF_CONVERT;
|
|
|
|
return res;
|
|
out_err:
|
|
kfree(res->lksb.sb_lvbptr);
|
|
kfree(res->name);
|
|
kfree(res);
|
|
return NULL;
|
|
}
|
|
|
|
static void lockres_free(struct dlm_lock_resource *res)
|
|
{
|
|
if (!res)
|
|
return;
|
|
|
|
init_completion(&res->completion);
|
|
dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
|
|
wait_for_completion(&res->completion);
|
|
|
|
kfree(res->name);
|
|
kfree(res->lksb.sb_lvbptr);
|
|
kfree(res);
|
|
}
|
|
|
|
/*
 * pretty_uuid() - format the 16 bytes at @src as lower-case hex in @dest,
 * grouped 8-4-4-4-12 with '-' separators.
 *
 * @dest must have room for 37 bytes (36 characters plus the terminating
 * NUL). Returns @dest.
 */
static char *pretty_uuid(char *dest, char *src)
{
	char *out = dest;
	int idx;

	for (idx = 0; idx < 16; idx++) {
		switch (idx) {
		case 4:
		case 6:
		case 8:
		case 10:
			/* group boundary */
			out += sprintf(out, "-");
			break;
		default:
			break;
		}
		out += sprintf(out, "%02x", (__u8)src[idx]);
	}
	return dest;
}
|
|
|
|
/*
 * Store the resync range lo..hi, converted to little-endian, in the LVB of
 * @lockres. Only the local buffer is written here; no DLM call is made.
 * @mddev is unused.
 */
static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
		sector_t lo, sector_t hi)
{
	struct resync_info *lvb = (struct resync_info *)lockres->lksb.sb_lvbptr;

	lvb->hi = cpu_to_le64(hi);
	lvb->lo = cpu_to_le64(lo);
}
|
|
|
|
static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
|
|
{
|
|
struct resync_info ri;
|
|
struct suspend_info *s = NULL;
|
|
sector_t hi = 0;
|
|
|
|
dlm_lock_sync(lockres, DLM_LOCK_CR);
|
|
memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
|
|
hi = le64_to_cpu(ri.hi);
|
|
if (ri.hi > 0) {
|
|
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
|
|
if (!s)
|
|
goto out;
|
|
s->hi = hi;
|
|
s->lo = le64_to_cpu(ri.lo);
|
|
}
|
|
dlm_unlock_sync(lockres);
|
|
out:
|
|
return s;
|
|
}
|
|
|
|
void recover_bitmaps(struct md_thread *thread)
|
|
{
|
|
struct mddev *mddev = thread->mddev;
|
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
|
struct dlm_lock_resource *bm_lockres;
|
|
char str[64];
|
|
int slot, ret;
|
|
struct suspend_info *s, *tmp;
|
|
sector_t lo, hi;
|
|
|
|
while (cinfo->recovery_map) {
|
|
slot = fls64((u64)cinfo->recovery_map) - 1;
|
|
|
|
/* Clear suspend_area associated with the bitmap */
|
|
spin_lock_irq(&cinfo->suspend_lock);
|
|
list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
|
|
if (slot == s->slot) {
|
|
list_del(&s->list);
|
|
kfree(s);
|
|
}
|
|
spin_unlock_irq(&cinfo->suspend_lock);
|
|
|
|
snprintf(str, 64, "bitmap%04d", slot);
|
|
bm_lockres = lockres_init(mddev, str, NULL, 1);
|
|
if (!bm_lockres) {
|
|
pr_err("md-cluster: Cannot initialize bitmaps\n");
|
|
goto clear_bit;
|
|
}
|
|
|
|
ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
|
|
if (ret) {
|
|
pr_err("md-cluster: Could not DLM lock %s: %d\n",
|
|
str, ret);
|
|
goto clear_bit;
|
|
}
|
|
ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi);
|
|
if (ret)
|
|
pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
|
|
dlm_unlock_sync(bm_lockres);
|
|
clear_bit:
|
|
clear_bit(slot, &cinfo->recovery_map);
|
|
}
|
|
}
|
|
|
|
/* DLM lockspace recover_prep callback: intentionally does nothing. */
static void recover_prep(void *arg)
{
}
|
|
|
|
static void recover_slot(void *arg, struct dlm_slot *slot)
|
|
{
|
|
struct mddev *mddev = arg;
|
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
|
|
|
pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
|
|
mddev->bitmap_info.cluster_name,
|
|
slot->nodeid, slot->slot,
|
|
cinfo->slot_number);
|
|
set_bit(slot->slot - 1, &cinfo->recovery_map);
|
|
if (!cinfo->recovery_thread) {
|
|
cinfo->recovery_thread = md_register_thread(recover_bitmaps,
|
|
mddev, "recover");
|
|
if (!cinfo->recovery_thread) {
|
|
pr_warn("md-cluster: Could not create recovery thread\n");
|
|
return;
|
|
}
|
|
}
|
|
md_wakeup_thread(cinfo->recovery_thread);
|
|
}
|
|
|
|
static void recover_done(void *arg, struct dlm_slot *slots,
|
|
int num_slots, int our_slot,
|
|
uint32_t generation)
|
|
{
|
|
struct mddev *mddev = arg;
|
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
|
|
|
cinfo->slot_number = our_slot;
|
|
complete(&cinfo->completion);
|
|
}
|
|
|
|
static const struct dlm_lockspace_ops md_ls_ops = {
|
|
.recover_prep = recover_prep,
|
|
.recover_slot = recover_slot,
|
|
.recover_done = recover_done,
|
|
};
|
|
|
|
static int gather_all_resync_info(struct mddev *mddev, int total_slots)
|
|
{
|
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
|
int i, ret = 0;
|
|
struct dlm_lock_resource *bm_lockres;
|
|
struct suspend_info *s;
|
|
char str[64];
|
|
|
|
|
|
for (i = 0; i < total_slots; i++) {
|
|
memset(str, '\0', 64);
|
|
snprintf(str, 64, "bitmap%04d", i);
|
|
bm_lockres = lockres_init(mddev, str, NULL, 1);
|
|
if (!bm_lockres)
|
|
return -ENOMEM;
|
|
if (i == (cinfo->slot_number - 1))
|
|
continue;
|
|
|
|
bm_lockres->flags |= DLM_LKF_NOQUEUE;
|
|
ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
|
|
if (ret == -EAGAIN) {
|
|
memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE);
|
|
s = read_resync_info(mddev, bm_lockres);
|
|
if (s) {
|
|
pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
|
|
__func__, __LINE__,
|
|
(unsigned long long) s->lo,
|
|
(unsigned long long) s->hi, i);
|
|
spin_lock_irq(&cinfo->suspend_lock);
|
|
s->slot = i;
|
|
list_add(&s->list, &cinfo->suspend_list);
|
|
spin_unlock_irq(&cinfo->suspend_lock);
|
|
}
|
|
ret = 0;
|
|
lockres_free(bm_lockres);
|
|
continue;
|
|
}
|
|
if (ret)
|
|
goto out;
|
|
/* TODO: Read the disk bitmap sb and check if it needs recovery */
|
|
dlm_unlock_sync(bm_lockres);
|
|
lockres_free(bm_lockres);
|
|
}
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
static int join(struct mddev *mddev, int nodes)
|
|
{
|
|
struct md_cluster_info *cinfo;
|
|
int ret, ops_rv;
|
|
char str[64];
|
|
|
|
if (!try_module_get(THIS_MODULE))
|
|
return -ENOENT;
|
|
|
|
cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
|
|
if (!cinfo)
|
|
return -ENOMEM;
|
|
|
|
init_completion(&cinfo->completion);
|
|
|
|
mutex_init(&cinfo->sb_mutex);
|
|
mddev->cluster_info = cinfo;
|
|
|
|
memset(str, 0, 64);
|
|
pretty_uuid(str, mddev->uuid);
|
|
ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
|
|
DLM_LSFL_FS, LVB_SIZE,
|
|
&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
|
|
if (ret)
|
|
goto err;
|
|
wait_for_completion(&cinfo->completion);
|
|
if (nodes <= cinfo->slot_number) {
|
|
pr_err("md-cluster: Slot allotted(%d) greater than available slots(%d)", cinfo->slot_number - 1,
|
|
nodes);
|
|
ret = -ERANGE;
|
|
goto err;
|
|
}
|
|
cinfo->sb_lock = lockres_init(mddev, "cmd-super",
|
|
NULL, 0);
|
|
if (!cinfo->sb_lock) {
|
|
ret = -ENOMEM;
|
|
goto err;
|
|
}
|
|
|
|
pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
|
|
snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
|
|
cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
|
|
if (!cinfo->bitmap_lockres)
|
|
goto err;
|
|
if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
|
|
pr_err("Failed to get bitmap lock\n");
|
|
ret = -EINVAL;
|
|
goto err;
|
|
}
|
|
|
|
INIT_LIST_HEAD(&cinfo->suspend_list);
|
|
spin_lock_init(&cinfo->suspend_lock);
|
|
|
|
ret = gather_all_resync_info(mddev, nodes);
|
|
if (ret)
|
|
goto err;
|
|
|
|
return 0;
|
|
err:
|
|
lockres_free(cinfo->bitmap_lockres);
|
|
lockres_free(cinfo->sb_lock);
|
|
if (cinfo->lockspace)
|
|
dlm_release_lockspace(cinfo->lockspace, 2);
|
|
mddev->cluster_info = NULL;
|
|
kfree(cinfo);
|
|
module_put(THIS_MODULE);
|
|
return ret;
|
|
}
|
|
|
|
static int leave(struct mddev *mddev)
|
|
{
|
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
|
|
|
if (!cinfo)
|
|
return 0;
|
|
md_unregister_thread(&cinfo->recovery_thread);
|
|
lockres_free(cinfo->sb_lock);
|
|
lockres_free(cinfo->bitmap_lockres);
|
|
dlm_release_lockspace(cinfo->lockspace, 2);
|
|
return 0;
|
|
}
|
|
|
|
/* slot_number(): Returns the MD slot number to use
|
|
* DLM starts the slot numbers from 1, wheras cluster-md
|
|
* wants the number to be from zero, so we deduct one
|
|
*/
|
|
static int slot_number(struct mddev *mddev)
|
|
{
|
|
struct md_cluster_info *cinfo = mddev->cluster_info;
|
|
|
|
return cinfo->slot_number - 1;
|
|
}
|
|
|
|
/*
 * resync_info_update() - publish the resync range lo..hi of this node.
 *
 * Writes the range into the LVB of our own bitmap lock and then requests
 * the lock in PW mode again. The result of that request is not checked.
 */
static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct dlm_lock_resource *own_bitmap = cinfo->bitmap_lockres;

	add_resync_info(mddev, own_bitmap, lo, hi);
	/* Re-acquire the lock to refresh LVB */
	dlm_lock_sync(own_bitmap, DLM_LOCK_PW);
}
|
|
|
|
static struct md_cluster_operations cluster_ops = {
|
|
.join = join,
|
|
.leave = leave,
|
|
.slot_number = slot_number,
|
|
.resync_info_update = resync_info_update,
|
|
};
|
|
|
|
/*
 * Module init: log the experimental-status warning and register
 * cluster_ops with the md core. Always returns 0.
 */
static int __init cluster_init(void)
{
	pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
	pr_info("Registering Cluster MD functions\n");

	register_md_cluster_operations(&cluster_ops, THIS_MODULE);

	return 0;
}
|
|
|
|
/* Module exit: unregister the cluster operations from the md core. */
static void cluster_exit(void)
{
	unregister_md_cluster_operations();
}
|
|
|
|
module_init(cluster_init);
|
|
module_exit(cluster_exit);
|
|
MODULE_LICENSE("GPL");
|
|
MODULE_DESCRIPTION("Clustering support for MD");
|