staging: ramster: cluster/messaging foundation
Copy cluster subdirectory from ocfs2. These files implement the basic cluster discovery, mapping, heartbeat / keepalive, and messaging ("o2net") that ramster requires for internode communication. Note: there are NO ramster-specific changes yet; this commit does NOT pass checkpatch since the copied source files do not. (Why copy? This particular part of ocfs2 has never been broken out for non-ocfs2 use before, some (small) changes are required for ramster to use that code, and ramster is currently incompatible with real ocfs2 anyway (requires !CONFIG_OCFS2_FS). Before ramster can be promoted out of staging, we will need to work with the ocfs2 maintainers to see if the code interdependencies can be merged, but for now, for staging, this seemed to be an expedient way to make use of the ocfs2 core cluster code while still incorporating necessary changes for ramster.) Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
parent
32de21f75d
commit
1135ca9c86
4
drivers/staging/ramster/cluster/Makefile
Normal file
4
drivers/staging/ramster/cluster/Makefile
Normal file
@ -0,0 +1,4 @@
|
||||
obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
|
||||
|
||||
ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
|
||||
quorum.o tcp.o netdebug.o ver.o
|
2678
drivers/staging/ramster/cluster/heartbeat.c
Normal file
2678
drivers/staging/ramster/cluster/heartbeat.c
Normal file
File diff suppressed because it is too large
Load Diff
89
drivers/staging/ramster/cluster/heartbeat.h
Normal file
89
drivers/staging/ramster/cluster/heartbeat.h
Normal file
@ -0,0 +1,89 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* heartbeat.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_HEARTBEAT_H
|
||||
#define O2CLUSTER_HEARTBEAT_H
|
||||
|
||||
#include "ocfs2_heartbeat.h"
|
||||
|
||||
#define O2HB_REGION_TIMEOUT_MS 2000
|
||||
|
||||
#define O2HB_MAX_REGION_NAME_LEN 32
|
||||
|
||||
/* number of changes to be seen as live */
|
||||
#define O2HB_LIVE_THRESHOLD 2
|
||||
/* number of equal samples to be seen as dead */
|
||||
extern unsigned int o2hb_dead_threshold;
|
||||
#define O2HB_DEFAULT_DEAD_THRESHOLD 31
|
||||
/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
|
||||
#define O2HB_MIN_DEAD_THRESHOLD 2
|
||||
#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
|
||||
|
||||
#define O2HB_CB_MAGIC 0x51d1e4ec
|
||||
|
||||
/* callback stuff */
|
||||
/*
 * Kinds of heartbeat callback.  O2HB_NUM_CB is last so that it equals the
 * number of real kinds; it is a count, not a kind of its own.
 */
enum o2hb_callback_type {
|
||||
O2HB_NODE_DOWN_CB = 0,
|
||||
O2HB_NODE_UP_CB,
|
||||
O2HB_NUM_CB
|
||||
};
|
||||
|
||||
struct o2nm_node;
|
||||
typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *);
|
||||
|
||||
struct o2hb_callback_func {
|
||||
u32 hc_magic;
|
||||
struct list_head hc_item;
|
||||
o2hb_cb_func *hc_func;
|
||||
void *hc_data;
|
||||
int hc_priority;
|
||||
enum o2hb_callback_type hc_type;
|
||||
};
|
||||
|
||||
struct config_group *o2hb_alloc_hb_set(void);
|
||||
void o2hb_free_hb_set(struct config_group *group);
|
||||
|
||||
void o2hb_setup_callback(struct o2hb_callback_func *hc,
|
||||
enum o2hb_callback_type type,
|
||||
o2hb_cb_func *func,
|
||||
void *data,
|
||||
int priority);
|
||||
int o2hb_register_callback(const char *region_uuid,
|
||||
struct o2hb_callback_func *hc);
|
||||
void o2hb_unregister_callback(const char *region_uuid,
|
||||
struct o2hb_callback_func *hc);
|
||||
void o2hb_fill_node_map(unsigned long *map,
|
||||
unsigned bytes);
|
||||
void o2hb_exit(void);
|
||||
int o2hb_init(void);
|
||||
int o2hb_check_node_heartbeating(u8 node_num);
|
||||
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
|
||||
int o2hb_check_local_node_heartbeating(void);
|
||||
void o2hb_stop_all_regions(void);
|
||||
int o2hb_get_all_regions(char *region_uuids, u8 numregions);
|
||||
int o2hb_global_heartbeat_active(void);
|
||||
|
||||
#endif /* O2CLUSTER_HEARTBEAT_H */
|
155
drivers/staging/ramster/cluster/masklog.c
Normal file
155
drivers/staging/ramster/cluster/masklog.c
Normal file
@ -0,0 +1,155 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2004, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/string.h>
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
#include "masklog.h"
|
||||
|
||||
struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
|
||||
EXPORT_SYMBOL_GPL(mlog_and_bits);
|
||||
struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0);
|
||||
EXPORT_SYMBOL_GPL(mlog_not_bits);
|
||||
|
||||
/*
 * Write the textual state of @mask into @buf.
 *
 * The result is "allow" when the bits are present in mlog_and_bits, "deny"
 * when they are present in mlog_not_bits, and "off" otherwise, followed by a
 * newline.  Output is bounded by PAGE_SIZE; the snprintf() result is returned.
 */
static ssize_t mlog_mask_show(u64 mask, char *buf)
{
	const char *verdict = "off";

	if (__mlog_test_u64(mask, mlog_and_bits))
		verdict = "allow";
	else if (__mlog_test_u64(mask, mlog_not_bits))
		verdict = "deny";

	return snprintf(buf, PAGE_SIZE, "%s\n", verdict);
}
|
||||
|
||||
/*
 * Apply a textual command to the log bits selected by @mask.
 *
 * @buf is matched case-insensitively by prefix:
 *   "allow" - set the bits in mlog_and_bits, clear them in mlog_not_bits
 *   "deny"  - set the bits in mlog_not_bits, clear them in mlog_and_bits
 *   "off"   - clear the bits in both
 *
 * Returns @count on success, or -EINVAL when @buf starts with none of the
 * three keywords.
 */
static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
{
	if (strnicmp(buf, "allow", 5) == 0) {
		__mlog_set_u64(mask, mlog_and_bits);
		__mlog_clear_u64(mask, mlog_not_bits);
		return count;
	}

	if (strnicmp(buf, "deny", 4) == 0) {
		__mlog_set_u64(mask, mlog_not_bits);
		__mlog_clear_u64(mask, mlog_and_bits);
		return count;
	}

	if (strnicmp(buf, "off", 3) == 0) {
		__mlog_clear_u64(mask, mlog_not_bits);
		__mlog_clear_u64(mask, mlog_and_bits);
		return count;
	}

	return -EINVAL;
}
|
||||
|
||||
struct mlog_attribute {
|
||||
struct attribute attr;
|
||||
u64 mask;
|
||||
};
|
||||
|
||||
#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr)
|
||||
|
||||
#define define_mask(_name) { \
|
||||
.attr = { \
|
||||
.name = #_name, \
|
||||
.mode = S_IRUGO | S_IWUSR, \
|
||||
}, \
|
||||
.mask = ML_##_name, \
|
||||
}
|
||||
|
||||
/*
 * One sysfs attribute per log mask.  The array is sized MLOG_MAX_BITS but only
 * the leading entries are initialised; the remaining slots are zero-filled and
 * mlog_sys_init() treats the first entry whose mode is 0 as the end marker.
 */
static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
|
||||
define_mask(TCP),
|
||||
define_mask(MSG),
|
||||
define_mask(SOCKET),
|
||||
define_mask(HEARTBEAT),
|
||||
define_mask(HB_BIO),
|
||||
define_mask(DLMFS),
|
||||
define_mask(DLM),
|
||||
define_mask(DLM_DOMAIN),
|
||||
define_mask(DLM_THREAD),
|
||||
define_mask(DLM_MASTER),
|
||||
define_mask(DLM_RECOVERY),
|
||||
define_mask(DLM_GLUE),
|
||||
define_mask(VOTE),
|
||||
define_mask(CONN),
|
||||
define_mask(QUORUM),
|
||||
define_mask(BASTS),
|
||||
define_mask(CLUSTER),
|
||||
define_mask(ERROR),
|
||||
define_mask(NOTICE),
|
||||
define_mask(KTHREAD),
|
||||
};
|
||||
|
||||
static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
|
||||
|
||||
static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
|
||||
|
||||
return mlog_mask_show(mlog_attr->mask, buf);
|
||||
}
|
||||
|
||||
/*
 * sysfs ->store handler: recover the mlog_attribute that embeds @attr and
 * hand the written text to mlog_mask_store() for that attribute's mask.
 */
static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
			  const char *buf, size_t count)
{
	return mlog_mask_store(to_mlog_attr(attr)->mask, buf, count);
}
|
||||
|
||||
static const struct sysfs_ops mlog_attr_ops = {
|
||||
.show = mlog_show,
|
||||
.store = mlog_store,
|
||||
};
|
||||
|
||||
static struct kobj_type mlog_ktype = {
|
||||
.default_attrs = mlog_attr_ptrs,
|
||||
.sysfs_ops = &mlog_attr_ops,
|
||||
};
|
||||
|
||||
static struct kset mlog_kset = {
|
||||
.kobj = {.ktype = &mlog_ktype},
|
||||
};
|
||||
|
||||
int mlog_sys_init(struct kset *o2cb_kset)
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
while (mlog_attrs[i].attr.mode) {
|
||||
mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
|
||||
i++;
|
||||
}
|
||||
mlog_attr_ptrs[i] = NULL;
|
||||
|
||||
kobject_set_name(&mlog_kset.kobj, "logmask");
|
||||
mlog_kset.kobj.kset = o2cb_kset;
|
||||
return kset_register(&mlog_kset);
|
||||
}
|
||||
|
||||
/* Unregister the kset that mlog_sys_init() registered. */
void mlog_sys_shutdown(void)
|
||||
{
|
||||
kset_unregister(&mlog_kset);
|
||||
}
|
219
drivers/staging/ramster/cluster/masklog.h
Normal file
219
drivers/staging/ramster/cluster/masklog.h
Normal file
@ -0,0 +1,219 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_MASKLOG_H
|
||||
#define O2CLUSTER_MASKLOG_H
|
||||
|
||||
/*
|
||||
* For now this is a trivial wrapper around printk() that gives the critical
|
||||
* ability to enable sets of debugging output at run-time. In the future this
|
||||
* will almost certainly be redirected to relayfs so that it can pay a
|
||||
* substantially lower heisenberg tax.
|
||||
*
|
||||
* Callers associate the message with a bitmask and a global bitmask is
|
||||
* maintained with help from sysfs. If any of the bits match the message is
|
||||
* output.
|
||||
*
|
||||
* We must have efficient bit tests on i386 and it seems gcc still emits crazy
|
||||
* code for the 64bit compare. It emits very good code for the dual unsigned
|
||||
* long tests, though, completely avoiding tests that can never pass if the
|
||||
* caller gives a constant bitmask that fills one of the longs with all 0s. So
|
||||
* the desire is to have almost all of the calls decided on by comparing just
|
||||
* one of the longs. This leads to having infrequently given bits that are
|
||||
* frequently matched in the high bits.
|
||||
*
|
||||
* _ERROR and _NOTICE are used for messages that always go to the console and
|
||||
* have appropriate KERN_ prefixes. We wrap these in our function instead of
|
||||
* just calling printk() so that this can eventually make its way through
|
||||
* relayfs along with the debugging messages. Everything else gets KERN_INFO.
|
||||
* The inline tests and macro dance give GCC the opportunity to quite cleverly
|
||||
* only emit the appropriate printk() when the caller passes in a constant
|
||||
* mask, as is almost always the case.
|
||||
*
|
||||
* All this bitmask nonsense is managed from the files under
|
||||
* /sys/fs/o2cb/logmask/. Reading the files gives a straightforward
|
||||
* indication of which bits are allowed (allow) or denied (off/deny).
|
||||
* ENTRY deny
|
||||
* EXIT deny
|
||||
* TCP off
|
||||
* MSG off
|
||||
* SOCKET off
|
||||
* ERROR allow
|
||||
* NOTICE allow
|
||||
*
|
||||
* Writing changes the state of a given bit and requires a strictly formatted
|
||||
* single write() call:
|
||||
*
|
||||
* write(fd, "allow", 5);
|
||||
*
|
||||
* Echoing allow/deny/off string into the logmask files can flip the bits
|
||||
* on or off as expected; here is the bash script for example:
|
||||
*
|
||||
* log_mask="/sys/fs/o2cb/logmask"
|
||||
* for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do
|
||||
* echo allow >"$log_mask"/"$node"
|
||||
* done
|
||||
*
|
||||
* The debugfs.ocfs2 tool can also flip the bits with the -l option:
|
||||
*
|
||||
* debugfs.ocfs2 -l TCP allow
|
||||
*/
|
||||
|
||||
/* for task_struct */
|
||||
#include <linux/sched.h>
|
||||
|
||||
/* bits that are frequently given and infrequently matched in the low word */
|
||||
/* NOTE: If you add a flag, you need to also update masklog.c! */
|
||||
#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */
|
||||
#define ML_MSG 0x0000000000000002ULL /* net network messages */
|
||||
#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */
|
||||
#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */
|
||||
#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */
|
||||
#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */
|
||||
#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */
|
||||
#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */
|
||||
#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */
|
||||
#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */
|
||||
#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm master functions */
|
||||
#define ML_DLM_GLUE 0x0000000000000800ULL /* ocfs2 dlm glue layer */
|
||||
#define ML_VOTE 0x0000000000001000ULL /* ocfs2 node messaging */
|
||||
#define ML_CONN 0x0000000000002000ULL /* net connection management */
|
||||
#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */
|
||||
#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */
|
||||
#define ML_CLUSTER 0x0000000000010000ULL /* cluster stack */
|
||||
|
||||
/* bits that are infrequently given and frequently matched in the high word */
|
||||
#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */
|
||||
#define ML_NOTICE 0x2000000000000000ULL /* sent to KERN_NOTICE */
|
||||
#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */
|
||||
|
||||
#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
|
||||
#ifndef MLOG_MASK_PREFIX
|
||||
#define MLOG_MASK_PREFIX 0
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When logging is disabled, force the bit test to 0 for anything other
|
||||
* than errors and notices, allowing gcc to remove the code completely.
|
||||
* When enabled, allow all masks.
|
||||
*/
|
||||
#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
|
||||
#define ML_ALLOWED_BITS ~0
|
||||
#else
|
||||
#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE)
|
||||
#endif
|
||||
|
||||
#define MLOG_MAX_BITS 64
|
||||
|
||||
struct mlog_bits {
|
||||
unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG];
|
||||
};
|
||||
|
||||
extern struct mlog_bits mlog_and_bits, mlog_not_bits;
|
||||
|
||||
#if BITS_PER_LONG == 32
|
||||
|
||||
#define __mlog_test_u64(mask, bits) \
|
||||
( (u32)(mask & 0xffffffff) & bits.words[0] || \
|
||||
((u64)(mask) >> 32) & bits.words[1] )
|
||||
#define __mlog_set_u64(mask, bits) do { \
|
||||
bits.words[0] |= (u32)(mask & 0xffffffff); \
|
||||
bits.words[1] |= (u64)(mask) >> 32; \
|
||||
} while (0)
|
||||
#define __mlog_clear_u64(mask, bits) do { \
|
||||
bits.words[0] &= ~((u32)(mask & 0xffffffff)); \
|
||||
bits.words[1] &= ~((u64)(mask) >> 32); \
|
||||
} while (0)
|
||||
#define MLOG_BITS_RHS(mask) { \
|
||||
{ \
|
||||
[0] = (u32)(mask & 0xffffffff), \
|
||||
[1] = (u64)(mask) >> 32, \
|
||||
} \
|
||||
}
|
||||
|
||||
#else /* 32bit long above, 64bit long below */
|
||||
|
||||
#define __mlog_test_u64(mask, bits) ((mask) & bits.words[0])
|
||||
#define __mlog_set_u64(mask, bits) do { \
|
||||
bits.words[0] |= (mask); \
|
||||
} while (0)
|
||||
#define __mlog_clear_u64(mask, bits) do { \
|
||||
bits.words[0] &= ~(mask); \
|
||||
} while (0)
|
||||
#define MLOG_BITS_RHS(mask) { { (mask) } }
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* smp_processor_id() "helpfully" screams when called outside preemptible
|
||||
* regions in current kernels. sles doesn't have the variants that don't
|
||||
* scream. just do this instead of trying to guess which we're building
|
||||
* against.. *sigh*.
|
||||
*/
|
||||
#define __mlog_cpu_guess ({ \
|
||||
unsigned long _cpu = get_cpu(); \
|
||||
put_cpu(); \
|
||||
_cpu; \
|
||||
})
|
||||
|
||||
/* In the following two macros, the whitespace after the ',' just
|
||||
* before ##args is intentional. Otherwise, gcc 2.95 will eat the
|
||||
* previous token if args expands to nothing.
|
||||
*/
|
||||
#define __mlog_printk(level, fmt, args...) \
|
||||
printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
|
||||
task_pid_nr(current), __mlog_cpu_guess, \
|
||||
__PRETTY_FUNCTION__, __LINE__ , ##args)
|
||||
|
||||
#define mlog(mask, fmt, args...) do { \
|
||||
u64 __m = MLOG_MASK_PREFIX | (mask); \
|
||||
if ((__m & ML_ALLOWED_BITS) && \
|
||||
__mlog_test_u64(__m, mlog_and_bits) && \
|
||||
!__mlog_test_u64(__m, mlog_not_bits)) { \
|
||||
if (__m & ML_ERROR) \
|
||||
__mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \
|
||||
else if (__m & ML_NOTICE) \
|
||||
__mlog_printk(KERN_NOTICE, fmt , ##args); \
|
||||
else __mlog_printk(KERN_INFO, fmt , ##args); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define mlog_errno(st) do { \
|
||||
int _st = (st); \
|
||||
if (_st != -ERESTARTSYS && _st != -EINTR && \
|
||||
_st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \
|
||||
mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
|
||||
} while (0)
|
||||
|
||||
#define mlog_bug_on_msg(cond, fmt, args...) do { \
|
||||
if (cond) { \
|
||||
mlog(ML_ERROR, "bug expression: " #cond "\n"); \
|
||||
mlog(ML_ERROR, fmt, ##args); \
|
||||
BUG(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/sysfs.h>
|
||||
int mlog_sys_init(struct kset *o2cb_subsys);
|
||||
void mlog_sys_shutdown(void);
|
||||
|
||||
#endif /* O2CLUSTER_MASKLOG_H */
|
579
drivers/staging/ramster/cluster/netdebug.c
Normal file
579
drivers/staging/ramster/cluster/netdebug.c
Normal file
@ -0,0 +1,579 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* netdebug.c
|
||||
*
|
||||
* debug functionality for o2net
|
||||
*
|
||||
* Copyright (C) 2005, 2008 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/idr.h>
|
||||
#include <linux/kref.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/debugfs.h>
|
||||
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
#include "tcp.h"
|
||||
#include "nodemanager.h"
|
||||
#define MLOG_MASK_PREFIX ML_TCP
|
||||
#include "masklog.h"
|
||||
|
||||
#include "tcp_internal.h"
|
||||
|
||||
#define O2NET_DEBUG_DIR "o2net"
|
||||
#define SC_DEBUG_NAME "sock_containers"
|
||||
#define NST_DEBUG_NAME "send_tracking"
|
||||
#define STATS_DEBUG_NAME "stats"
|
||||
#define NODES_DEBUG_NAME "connected_nodes"
|
||||
|
||||
#define SHOW_SOCK_CONTAINERS 0
|
||||
#define SHOW_SOCK_STATS 1
|
||||
|
||||
static struct dentry *o2net_dentry;
|
||||
static struct dentry *sc_dentry;
|
||||
static struct dentry *nst_dentry;
|
||||
static struct dentry *stats_dentry;
|
||||
static struct dentry *nodes_dentry;
|
||||
|
||||
static DEFINE_SPINLOCK(o2net_debug_lock);
|
||||
|
||||
static LIST_HEAD(sock_containers);
|
||||
static LIST_HEAD(send_tracking);
|
||||
|
||||
void o2net_debug_add_nst(struct o2net_send_tracking *nst)
|
||||
{
|
||||
spin_lock(&o2net_debug_lock);
|
||||
list_add(&nst->st_net_debug_item, &send_tracking);
|
||||
spin_unlock(&o2net_debug_lock);
|
||||
}
|
||||
|
||||
void o2net_debug_del_nst(struct o2net_send_tracking *nst)
|
||||
{
|
||||
spin_lock(&o2net_debug_lock);
|
||||
if (!list_empty(&nst->st_net_debug_item))
|
||||
list_del_init(&nst->st_net_debug_item);
|
||||
spin_unlock(&o2net_debug_lock);
|
||||
}
|
||||
|
||||
static struct o2net_send_tracking
|
||||
*next_nst(struct o2net_send_tracking *nst_start)
|
||||
{
|
||||
struct o2net_send_tracking *nst, *ret = NULL;
|
||||
|
||||
assert_spin_locked(&o2net_debug_lock);
|
||||
|
||||
list_for_each_entry(nst, &nst_start->st_net_debug_item,
|
||||
st_net_debug_item) {
|
||||
/* discover the head of the list */
|
||||
if (&nst->st_net_debug_item == &send_tracking)
|
||||
break;
|
||||
|
||||
/* use st_task to detect real nsts in the list */
|
||||
if (nst->st_task != NULL) {
|
||||
ret = nst;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * seq_file ->start: return the first real entry after this open file's
 * placeholder (stored in seq->private), or NULL when there is none.
 * The lookup runs under o2net_debug_lock; the lock is dropped on return.
 */
static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct o2net_send_tracking *cursor = seq->private;
	struct o2net_send_tracking *found;

	spin_lock(&o2net_debug_lock);
	found = next_nst(cursor);
	spin_unlock(&o2net_debug_lock);

	return found;
}
|
||||
|
||||
/*
 * seq_file ->next: advance this open file's placeholder past the entry that
 * follows it.  The placeholder is always unlinked; it is re-inserted directly
 * after that entry when one exists.  The return value only signals whether
 * iteration continues (non-NULL) or is finished (NULL).
 */
static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct o2net_send_tracking *cursor = seq->private;
	struct o2net_send_tracking *following;

	spin_lock(&o2net_debug_lock);

	/* look up the successor before the cursor is taken off the list */
	following = next_nst(cursor);
	list_del_init(&cursor->st_net_debug_item);
	if (following != NULL)
		list_add(&cursor->st_net_debug_item,
			 &following->st_net_debug_item);

	spin_unlock(&o2net_debug_lock);

	return following;
}
|
||||
|
||||
static int nst_seq_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct o2net_send_tracking *nst, *dummy_nst = seq->private;
|
||||
ktime_t now;
|
||||
s64 sock, send, status;
|
||||
|
||||
spin_lock(&o2net_debug_lock);
|
||||
nst = next_nst(dummy_nst);
|
||||
if (!nst)
|
||||
goto out;
|
||||
|
||||
now = ktime_get();
|
||||
sock = ktime_to_us(ktime_sub(now, nst->st_sock_time));
|
||||
send = ktime_to_us(ktime_sub(now, nst->st_send_time));
|
||||
status = ktime_to_us(ktime_sub(now, nst->st_status_time));
|
||||
|
||||
/* get_task_comm isn't exported. oh well. */
|
||||
seq_printf(seq, "%p:\n"
|
||||
" pid: %lu\n"
|
||||
" tgid: %lu\n"
|
||||
" process name: %s\n"
|
||||
" node: %u\n"
|
||||
" sc: %p\n"
|
||||
" message id: %d\n"
|
||||
" message type: %u\n"
|
||||
" message key: 0x%08x\n"
|
||||
" sock acquiry: %lld usecs ago\n"
|
||||
" send start: %lld usecs ago\n"
|
||||
" wait start: %lld usecs ago\n",
|
||||
nst, (unsigned long)task_pid_nr(nst->st_task),
|
||||
(unsigned long)nst->st_task->tgid,
|
||||
nst->st_task->comm, nst->st_node,
|
||||
nst->st_sc, nst->st_id, nst->st_msg_type,
|
||||
nst->st_msg_key,
|
||||
(long long)sock,
|
||||
(long long)send,
|
||||
(long long)status);
|
||||
|
||||
out:
|
||||
spin_unlock(&o2net_debug_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * seq_file ->stop: nothing to release, because nst_seq_start() drops
 * o2net_debug_lock before it returns.
 */
static void nst_seq_stop(struct seq_file *seq, void *v)
|
||||
{
|
||||
}
|
||||
|
||||
static const struct seq_operations nst_seq_ops = {
|
||||
.start = nst_seq_start,
|
||||
.next = nst_seq_next,
|
||||
.stop = nst_seq_stop,
|
||||
.show = nst_seq_show,
|
||||
};
|
||||
|
||||
static int nst_fop_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct o2net_send_tracking *dummy_nst;
|
||||
struct seq_file *seq;
|
||||
int ret;
|
||||
|
||||
dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
|
||||
if (dummy_nst == NULL) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
dummy_nst->st_task = NULL;
|
||||
|
||||
ret = seq_open(file, &nst_seq_ops);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
seq = file->private_data;
|
||||
seq->private = dummy_nst;
|
||||
o2net_debug_add_nst(dummy_nst);
|
||||
|
||||
dummy_nst = NULL;
|
||||
|
||||
out:
|
||||
kfree(dummy_nst);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int nst_fop_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct seq_file *seq = file->private_data;
|
||||
struct o2net_send_tracking *dummy_nst = seq->private;
|
||||
|
||||
o2net_debug_del_nst(dummy_nst);
|
||||
return seq_release_private(inode, file);
|
||||
}
|
||||
|
||||
static const struct file_operations nst_seq_fops = {
|
||||
.open = nst_fop_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = nst_fop_release,
|
||||
};
|
||||
|
||||
void o2net_debug_add_sc(struct o2net_sock_container *sc)
|
||||
{
|
||||
spin_lock(&o2net_debug_lock);
|
||||
list_add(&sc->sc_net_debug_item, &sock_containers);
|
||||
spin_unlock(&o2net_debug_lock);
|
||||
}
|
||||
|
||||
void o2net_debug_del_sc(struct o2net_sock_container *sc)
|
||||
{
|
||||
spin_lock(&o2net_debug_lock);
|
||||
list_del_init(&sc->sc_net_debug_item);
|
||||
spin_unlock(&o2net_debug_lock);
|
||||
}
|
||||
|
||||
/* Per-open state for the sock-container seq_file iterators (kept in seq->private). */
struct o2net_sock_debug {
|
||||
/* SHOW_SOCK_CONTAINERS or SHOW_SOCK_STATS; selects what sc_seq_show() prints */
int dbg_ctxt;
|
||||
/* placeholder container that sc_common_open() links onto sock_containers as a cursor */
struct o2net_sock_container *dbg_sock;
|
||||
};
|
||||
|
||||
static struct o2net_sock_container
|
||||
*next_sc(struct o2net_sock_container *sc_start)
|
||||
{
|
||||
struct o2net_sock_container *sc, *ret = NULL;
|
||||
|
||||
assert_spin_locked(&o2net_debug_lock);
|
||||
|
||||
list_for_each_entry(sc, &sc_start->sc_net_debug_item,
|
||||
sc_net_debug_item) {
|
||||
/* discover the head of the list miscast as a sc */
|
||||
if (&sc->sc_net_debug_item == &sock_containers)
|
||||
break;
|
||||
|
||||
/* use sc_page to detect real scs in the list */
|
||||
if (sc->sc_page != NULL) {
|
||||
ret = sc;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * seq_file ->start: return the first real sock container after this open
 * file's placeholder (sd->dbg_sock), or NULL when there is none.  The lookup
 * runs under o2net_debug_lock; the lock is dropped on return.
 */
static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct o2net_sock_debug *sd = seq->private;
	struct o2net_sock_container *cursor = sd->dbg_sock;
	struct o2net_sock_container *found;

	spin_lock(&o2net_debug_lock);
	found = next_sc(cursor);
	spin_unlock(&o2net_debug_lock);

	return found;
}
|
||||
|
||||
/*
 * seq_file ->next: advance this open file's placeholder past the container
 * that follows it.  The placeholder is always unlinked; it is re-inserted
 * directly after that container when one exists.  The return value only
 * signals whether iteration continues (non-NULL) or is finished (NULL).
 */
static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct o2net_sock_debug *sd = seq->private;
	struct o2net_sock_container *cursor = sd->dbg_sock;
	struct o2net_sock_container *following;

	spin_lock(&o2net_debug_lock);

	/* look up the successor before the cursor is taken off the list */
	following = next_sc(cursor);
	list_del_init(&cursor->sc_net_debug_item);
	if (following != NULL)
		list_add(&cursor->sc_net_debug_item,
			 &following->sc_net_debug_item);

	spin_unlock(&o2net_debug_lock);

	return following;
}
|
||||
|
||||
#ifdef CONFIG_OCFS2_FS_STATS
|
||||
# define sc_send_count(_s) ((_s)->sc_send_count)
|
||||
# define sc_recv_count(_s) ((_s)->sc_recv_count)
|
||||
# define sc_tv_acquiry_total_ns(_s) (ktime_to_ns((_s)->sc_tv_acquiry_total))
|
||||
# define sc_tv_send_total_ns(_s) (ktime_to_ns((_s)->sc_tv_send_total))
|
||||
# define sc_tv_status_total_ns(_s) (ktime_to_ns((_s)->sc_tv_status_total))
|
||||
# define sc_tv_process_total_ns(_s) (ktime_to_ns((_s)->sc_tv_process_total))
|
||||
#else
|
||||
# define sc_send_count(_s) (0U)
|
||||
# define sc_recv_count(_s) (0U)
|
||||
# define sc_tv_acquiry_total_ns(_s) (0LL)
|
||||
# define sc_tv_send_total_ns(_s) (0LL)
|
||||
# define sc_tv_status_total_ns(_s) (0LL)
|
||||
# define sc_tv_process_total_ns(_s) (0LL)
|
||||
#endif
|
||||
|
||||
/* So that debugfs.ocfs2 can determine which format is being used */
|
||||
#define O2NET_STATS_STR_VERSION 1
|
||||
static void sc_show_sock_stats(struct seq_file *seq,
|
||||
struct o2net_sock_container *sc)
|
||||
{
|
||||
if (!sc)
|
||||
return;
|
||||
|
||||
seq_printf(seq, "%d,%u,%lu,%lld,%lld,%lld,%lu,%lld\n", O2NET_STATS_STR_VERSION,
|
||||
sc->sc_node->nd_num, (unsigned long)sc_send_count(sc),
|
||||
(long long)sc_tv_acquiry_total_ns(sc),
|
||||
(long long)sc_tv_send_total_ns(sc),
|
||||
(long long)sc_tv_status_total_ns(sc),
|
||||
(unsigned long)sc_recv_count(sc),
|
||||
(long long)sc_tv_process_total_ns(sc));
|
||||
}
|
||||
|
||||
static void sc_show_sock_container(struct seq_file *seq,
|
||||
struct o2net_sock_container *sc)
|
||||
{
|
||||
struct inet_sock *inet = NULL;
|
||||
__be32 saddr = 0, daddr = 0;
|
||||
__be16 sport = 0, dport = 0;
|
||||
|
||||
if (!sc)
|
||||
return;
|
||||
|
||||
if (sc->sc_sock) {
|
||||
inet = inet_sk(sc->sc_sock->sk);
|
||||
/* the stack's structs aren't sparse endian clean */
|
||||
saddr = (__force __be32)inet->inet_saddr;
|
||||
daddr = (__force __be32)inet->inet_daddr;
|
||||
sport = (__force __be16)inet->inet_sport;
|
||||
dport = (__force __be16)inet->inet_dport;
|
||||
}
|
||||
|
||||
/* XXX sigh, inet-> doesn't have sparse annotation so any
|
||||
* use of it here generates a warning with -Wbitwise */
|
||||
seq_printf(seq, "%p:\n"
|
||||
" krefs: %d\n"
|
||||
" sock: %pI4:%u -> "
|
||||
"%pI4:%u\n"
|
||||
" remote node: %s\n"
|
||||
" page off: %zu\n"
|
||||
" handshake ok: %u\n"
|
||||
" timer: %lld usecs\n"
|
||||
" data ready: %lld usecs\n"
|
||||
" advance start: %lld usecs\n"
|
||||
" advance stop: %lld usecs\n"
|
||||
" func start: %lld usecs\n"
|
||||
" func stop: %lld usecs\n"
|
||||
" func key: 0x%08x\n"
|
||||
" func type: %u\n",
|
||||
sc,
|
||||
atomic_read(&sc->sc_kref.refcount),
|
||||
&saddr, inet ? ntohs(sport) : 0,
|
||||
&daddr, inet ? ntohs(dport) : 0,
|
||||
sc->sc_node->nd_name,
|
||||
sc->sc_page_off,
|
||||
sc->sc_handshake_ok,
|
||||
(long long)ktime_to_us(sc->sc_tv_timer),
|
||||
(long long)ktime_to_us(sc->sc_tv_data_ready),
|
||||
(long long)ktime_to_us(sc->sc_tv_advance_start),
|
||||
(long long)ktime_to_us(sc->sc_tv_advance_stop),
|
||||
(long long)ktime_to_us(sc->sc_tv_func_start),
|
||||
(long long)ktime_to_us(sc->sc_tv_func_stop),
|
||||
sc->sc_msg_key,
|
||||
sc->sc_msg_type);
|
||||
}
|
||||
|
||||
static int sc_seq_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct o2net_sock_debug *sd = seq->private;
|
||||
struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
|
||||
|
||||
spin_lock(&o2net_debug_lock);
|
||||
sc = next_sc(dummy_sc);
|
||||
|
||||
if (sc) {
|
||||
if (sd->dbg_ctxt == SHOW_SOCK_CONTAINERS)
|
||||
sc_show_sock_container(seq, sc);
|
||||
else
|
||||
sc_show_sock_stats(seq, sc);
|
||||
}
|
||||
|
||||
spin_unlock(&o2net_debug_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* seq_file ->stop: intentionally empty, this function releases nothing. */
static void sc_seq_stop(struct seq_file *seq, void *v)
{
}
|
||||
|
||||
static const struct seq_operations sc_seq_ops = {
|
||||
.start = sc_seq_start,
|
||||
.next = sc_seq_next,
|
||||
.stop = sc_seq_stop,
|
||||
.show = sc_seq_show,
|
||||
};
|
||||
|
||||
static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
|
||||
{
|
||||
struct o2net_sock_container *dummy_sc;
|
||||
struct seq_file *seq;
|
||||
int ret;
|
||||
|
||||
dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
|
||||
if (dummy_sc == NULL) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
dummy_sc->sc_page = NULL;
|
||||
|
||||
ret = seq_open(file, &sc_seq_ops);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
seq = file->private_data;
|
||||
seq->private = sd;
|
||||
sd->dbg_sock = dummy_sc;
|
||||
o2net_debug_add_sc(dummy_sc);
|
||||
|
||||
dummy_sc = NULL;
|
||||
|
||||
out:
|
||||
kfree(dummy_sc);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int sc_fop_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct seq_file *seq = file->private_data;
|
||||
struct o2net_sock_debug *sd = seq->private;
|
||||
struct o2net_sock_container *dummy_sc = sd->dbg_sock;
|
||||
|
||||
o2net_debug_del_sc(dummy_sc);
|
||||
return seq_release_private(inode, file);
|
||||
}
|
||||
|
||||
static int stats_fop_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct o2net_sock_debug *sd;
|
||||
|
||||
sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
|
||||
if (sd == NULL)
|
||||
return -ENOMEM;
|
||||
|
||||
sd->dbg_ctxt = SHOW_SOCK_STATS;
|
||||
sd->dbg_sock = NULL;
|
||||
|
||||
return sc_common_open(file, sd);
|
||||
}
|
||||
|
||||
static const struct file_operations stats_seq_fops = {
|
||||
.open = stats_fop_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = sc_fop_release,
|
||||
};
|
||||
|
||||
static int sc_fop_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct o2net_sock_debug *sd;
|
||||
|
||||
sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
|
||||
if (sd == NULL)
|
||||
return -ENOMEM;
|
||||
|
||||
sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
|
||||
sd->dbg_sock = NULL;
|
||||
|
||||
return sc_common_open(file, sd);
|
||||
}
|
||||
|
||||
static const struct file_operations sc_seq_fops = {
|
||||
.open = sc_fop_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = sc_fop_release,
|
||||
};
|
||||
|
||||
static int o2net_fill_bitmap(char *buf, int len)
|
||||
{
|
||||
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
int i = -1, out = 0;
|
||||
|
||||
o2net_fill_node_map(map, sizeof(map));
|
||||
|
||||
while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
|
||||
out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
|
||||
out += snprintf(buf + out, PAGE_SIZE - out, "\n");
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
static int nodes_fop_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
char *buf;
|
||||
|
||||
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
|
||||
|
||||
file->private_data = buf;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int o2net_debug_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
kfree(file->private_data);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Read handler: copy from the buffer in file->private_data to user space,
 * using the inode size as the number of valid bytes.
 */
static ssize_t o2net_debug_read(struct file *file, char __user *buf,
				size_t nbytes, loff_t *ppos)
{
	loff_t valid = i_size_read(file->f_mapping->host);

	return simple_read_from_buffer(buf, nbytes, ppos,
				       file->private_data, valid);
}
|
||||
|
||||
static const struct file_operations nodes_fops = {
|
||||
.open = nodes_fop_open,
|
||||
.release = o2net_debug_release,
|
||||
.read = o2net_debug_read,
|
||||
.llseek = generic_file_llseek,
|
||||
};
|
||||
|
||||
void o2net_debugfs_exit(void)
|
||||
{
|
||||
debugfs_remove(nodes_dentry);
|
||||
debugfs_remove(stats_dentry);
|
||||
debugfs_remove(sc_dentry);
|
||||
debugfs_remove(nst_dentry);
|
||||
debugfs_remove(o2net_dentry);
|
||||
}
|
||||
|
||||
int o2net_debugfs_init(void)
|
||||
{
|
||||
mode_t mode = S_IFREG|S_IRUSR;
|
||||
|
||||
o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
|
||||
if (o2net_dentry)
|
||||
nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
|
||||
o2net_dentry, NULL, &nst_seq_fops);
|
||||
if (nst_dentry)
|
||||
sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
|
||||
o2net_dentry, NULL, &sc_seq_fops);
|
||||
if (sc_dentry)
|
||||
stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
|
||||
o2net_dentry, NULL, &stats_seq_fops);
|
||||
if (stats_dentry)
|
||||
nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
|
||||
o2net_dentry, NULL, &nodes_fops);
|
||||
if (nodes_dentry)
|
||||
return 0;
|
||||
|
||||
o2net_debugfs_exit();
|
||||
mlog_errno(-ENOMEM);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_DEBUG_FS */
|
989
drivers/staging/ramster/cluster/nodemanager.c
Normal file
989
drivers/staging/ramster/cluster/nodemanager.c
Normal file
@ -0,0 +1,989 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2004, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/slab.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/configfs.h>
|
||||
|
||||
#include "tcp.h"
|
||||
#include "nodemanager.h"
|
||||
#include "heartbeat.h"
|
||||
#include "masklog.h"
|
||||
#include "sys.h"
|
||||
#include "ver.h"
|
||||
|
||||
/* for now we operate under the assertion that there can be only one
|
||||
* cluster active at a time. Changing this will require trickling
|
||||
* cluster references throughout where nodes are looked up */
|
||||
struct o2nm_cluster *o2nm_single_cluster = NULL;
|
||||
|
||||
char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
|
||||
"reset", /* O2NM_FENCE_RESET */
|
||||
"panic", /* O2NM_FENCE_PANIC */
|
||||
};
|
||||
|
||||
/*
 * Look up a node by number in the single active cluster.  On success a
 * reference is taken on the node's config item before the read lock is
 * dropped.  Returns NULL when no cluster exists, the number is out of range
 * or the slot is empty.
 */
struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
{
	struct o2nm_node *found;

	if (o2nm_single_cluster == NULL || node_num >= O2NM_MAX_NODES)
		return NULL;

	read_lock(&o2nm_single_cluster->cl_nodes_lock);
	found = o2nm_single_cluster->cl_nodes[node_num];
	if (found != NULL)
		config_item_get(&found->nd_item);
	read_unlock(&o2nm_single_cluster->cl_nodes_lock);

	return found;
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_get_node_by_num);
|
||||
|
||||
int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
|
||||
{
|
||||
struct o2nm_cluster *cluster = o2nm_single_cluster;
|
||||
|
||||
BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap)));
|
||||
|
||||
if (cluster == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
read_lock(&cluster->cl_nodes_lock);
|
||||
memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
|
||||
read_unlock(&cluster->cl_nodes_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_configured_node_map);
|
||||
|
||||
static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
|
||||
__be32 ip_needle,
|
||||
struct rb_node ***ret_p,
|
||||
struct rb_node **ret_parent)
|
||||
{
|
||||
struct rb_node **p = &cluster->cl_node_ip_tree.rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct o2nm_node *node, *ret = NULL;
|
||||
|
||||
while (*p) {
|
||||
int cmp;
|
||||
|
||||
parent = *p;
|
||||
node = rb_entry(parent, struct o2nm_node, nd_ip_node);
|
||||
|
||||
cmp = memcmp(&ip_needle, &node->nd_ipv4_address,
|
||||
sizeof(ip_needle));
|
||||
if (cmp < 0)
|
||||
p = &(*p)->rb_left;
|
||||
else if (cmp > 0)
|
||||
p = &(*p)->rb_right;
|
||||
else {
|
||||
ret = node;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ret_p != NULL)
|
||||
*ret_p = p;
|
||||
if (ret_parent != NULL)
|
||||
*ret_parent = parent;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Look up a node by IPv4 address in the single active cluster.  On success
 * a reference is taken on the node's config item before the read lock is
 * dropped.  Returns NULL when no cluster exists or no node matches.
 */
struct o2nm_node *o2nm_get_node_by_ip(__be32 addr)
{
	struct o2nm_cluster *cl = o2nm_single_cluster;
	struct o2nm_node *found;

	if (!cl)
		return NULL;

	read_lock(&cl->cl_nodes_lock);
	found = o2nm_node_ip_tree_lookup(cl, addr, NULL, NULL);
	if (found != NULL)
		config_item_get(&found->nd_item);
	read_unlock(&cl->cl_nodes_lock);

	return found;
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip);
|
||||
|
||||
void o2nm_node_put(struct o2nm_node *node)
|
||||
{
|
||||
config_item_put(&node->nd_item);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_node_put);
|
||||
|
||||
void o2nm_node_get(struct o2nm_node *node)
|
||||
{
|
||||
config_item_get(&node->nd_item);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_node_get);
|
||||
|
||||
u8 o2nm_this_node(void)
|
||||
{
|
||||
u8 node_num = O2NM_MAX_NODES;
|
||||
|
||||
if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local)
|
||||
node_num = o2nm_single_cluster->cl_local_node;
|
||||
|
||||
return node_num;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_this_node);
|
||||
|
||||
/* node configfs bits */
|
||||
|
||||
static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item)
|
||||
{
|
||||
return item ?
|
||||
container_of(to_config_group(item), struct o2nm_cluster,
|
||||
cl_group)
|
||||
: NULL;
|
||||
}
|
||||
|
||||
static struct o2nm_node *to_o2nm_node(struct config_item *item)
|
||||
{
|
||||
return item ? container_of(item, struct o2nm_node, nd_item) : NULL;
|
||||
}
|
||||
|
||||
/* configfs ->release for a node item: free the enclosing o2nm_node. */
static void o2nm_node_release(struct config_item *item)
{
	kfree(to_o2nm_node(item));
}
|
||||
|
||||
static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
|
||||
{
|
||||
return sprintf(page, "%d\n", node->nd_num);
|
||||
}
|
||||
|
||||
static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
|
||||
{
|
||||
/* through the first node_set .parent
|
||||
* mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
|
||||
return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
|
||||
}
|
||||
|
||||
/*
 * Per-node attribute indices.  Used both as slots in o2nm_node_attrs[] and
 * as bit numbers in o2nm_node.nd_set_attributes.
 */
enum {
	O2NM_NODE_ATTR_NUM = 0,
	O2NM_NODE_ATTR_PORT = 1,
	O2NM_NODE_ATTR_ADDRESS = 2,
	O2NM_NODE_ATTR_LOCAL = 3,
};
|
||||
|
||||
static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
|
||||
size_t count)
|
||||
{
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
|
||||
unsigned long tmp;
|
||||
char *p = (char *)page;
|
||||
|
||||
tmp = simple_strtoul(p, &p, 0);
|
||||
if (!p || (*p && (*p != '\n')))
|
||||
return -EINVAL;
|
||||
|
||||
if (tmp >= O2NM_MAX_NODES)
|
||||
return -ERANGE;
|
||||
|
||||
/* once we're in the cl_nodes tree networking can look us up by
|
||||
* node number and try to use our address and port attributes
|
||||
* to connect to this node.. make sure that they've been set
|
||||
* before writing the node attribute? */
|
||||
if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
|
||||
!test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
|
||||
return -EINVAL; /* XXX */
|
||||
|
||||
write_lock(&cluster->cl_nodes_lock);
|
||||
if (cluster->cl_nodes[tmp])
|
||||
p = NULL;
|
||||
else {
|
||||
cluster->cl_nodes[tmp] = node;
|
||||
node->nd_num = tmp;
|
||||
set_bit(tmp, cluster->cl_nodes_bitmap);
|
||||
}
|
||||
write_unlock(&cluster->cl_nodes_lock);
|
||||
if (p == NULL)
|
||||
return -EEXIST;
|
||||
|
||||
return count;
|
||||
}
|
||||
static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
|
||||
{
|
||||
return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
|
||||
}
|
||||
|
||||
/*
 * Store handler for "ipv4_port": parse a port number and save it in network
 * byte order.
 *
 * Returns @count on success, -EINVAL on malformed input or port 0, and
 * -ERANGE when the value is >= (u16)-1.
 */
static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
					 const char *page, size_t count)
{
	unsigned long port;
	char *end = (char *)page;

	port = simple_strtoul(end, &end, 0);
	if (!end || (*end && (*end != '\n')))
		return -EINVAL;

	if (port == 0)
		return -EINVAL;
	if (port >= (u16)-1)
		return -ERANGE;

	node->nd_ipv4_port = htons(port);

	return count;
}
|
||||
|
||||
static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
|
||||
{
|
||||
return sprintf(page, "%pI4\n", &node->nd_ipv4_address);
|
||||
}
|
||||
|
||||
static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
|
||||
const char *page,
|
||||
size_t count)
|
||||
{
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
|
||||
int ret, i;
|
||||
struct rb_node **p, *parent;
|
||||
unsigned int octets[4];
|
||||
__be32 ipv4_addr = 0;
|
||||
|
||||
ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2],
|
||||
&octets[1], &octets[0]);
|
||||
if (ret != 4)
|
||||
return -EINVAL;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(octets); i++) {
|
||||
if (octets[i] > 255)
|
||||
return -ERANGE;
|
||||
be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
write_lock(&cluster->cl_nodes_lock);
|
||||
if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
|
||||
ret = -EEXIST;
|
||||
else {
|
||||
rb_link_node(&node->nd_ip_node, parent, p);
|
||||
rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
|
||||
}
|
||||
write_unlock(&cluster->cl_nodes_lock);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr));
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
|
||||
{
|
||||
return sprintf(page, "%d\n", node->nd_local);
|
||||
}
|
||||
|
||||
static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
|
||||
size_t count)
|
||||
{
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
|
||||
unsigned long tmp;
|
||||
char *p = (char *)page;
|
||||
ssize_t ret;
|
||||
|
||||
tmp = simple_strtoul(p, &p, 0);
|
||||
if (!p || (*p && (*p != '\n')))
|
||||
return -EINVAL;
|
||||
|
||||
tmp = !!tmp; /* boolean of whether this node wants to be local */
|
||||
|
||||
/* setting local turns on networking rx for now so we require having
|
||||
* set everything else first */
|
||||
if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
|
||||
!test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) ||
|
||||
!test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
|
||||
return -EINVAL; /* XXX */
|
||||
|
||||
/* the only failure case is trying to set a new local node
|
||||
* when a different one is already set */
|
||||
if (tmp && tmp == cluster->cl_has_local &&
|
||||
cluster->cl_local_node != node->nd_num)
|
||||
return -EBUSY;
|
||||
|
||||
/* bring up the rx thread if we're setting the new local node. */
|
||||
if (tmp && !cluster->cl_has_local) {
|
||||
ret = o2net_start_listening(node);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!tmp && cluster->cl_has_local &&
|
||||
cluster->cl_local_node == node->nd_num) {
|
||||
o2net_stop_listening(node);
|
||||
cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
|
||||
}
|
||||
|
||||
node->nd_local = tmp;
|
||||
if (node->nd_local) {
|
||||
cluster->cl_has_local = tmp;
|
||||
cluster->cl_local_node = node->nd_num;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
struct o2nm_node_attribute {
|
||||
struct configfs_attribute attr;
|
||||
ssize_t (*show)(struct o2nm_node *, char *);
|
||||
ssize_t (*store)(struct o2nm_node *, const char *, size_t);
|
||||
};
|
||||
|
||||
static struct o2nm_node_attribute o2nm_node_attr_num = {
|
||||
.attr = { .ca_owner = THIS_MODULE,
|
||||
.ca_name = "num",
|
||||
.ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = o2nm_node_num_read,
|
||||
.store = o2nm_node_num_write,
|
||||
};
|
||||
|
||||
static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
|
||||
.attr = { .ca_owner = THIS_MODULE,
|
||||
.ca_name = "ipv4_port",
|
||||
.ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = o2nm_node_ipv4_port_read,
|
||||
.store = o2nm_node_ipv4_port_write,
|
||||
};
|
||||
|
||||
static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
|
||||
.attr = { .ca_owner = THIS_MODULE,
|
||||
.ca_name = "ipv4_address",
|
||||
.ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = o2nm_node_ipv4_address_read,
|
||||
.store = o2nm_node_ipv4_address_write,
|
||||
};
|
||||
|
||||
static struct o2nm_node_attribute o2nm_node_attr_local = {
|
||||
.attr = { .ca_owner = THIS_MODULE,
|
||||
.ca_name = "local",
|
||||
.ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = o2nm_node_local_read,
|
||||
.store = o2nm_node_local_write,
|
||||
};
|
||||
|
||||
static struct configfs_attribute *o2nm_node_attrs[] = {
|
||||
[O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
|
||||
[O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
|
||||
[O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
|
||||
[O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static int o2nm_attr_index(struct configfs_attribute *attr)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
|
||||
if (attr == o2nm_node_attrs[i])
|
||||
return i;
|
||||
}
|
||||
BUG();
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t o2nm_node_show(struct config_item *item,
|
||||
struct configfs_attribute *attr,
|
||||
char *page)
|
||||
{
|
||||
struct o2nm_node *node = to_o2nm_node(item);
|
||||
struct o2nm_node_attribute *o2nm_node_attr =
|
||||
container_of(attr, struct o2nm_node_attribute, attr);
|
||||
ssize_t ret = 0;
|
||||
|
||||
if (o2nm_node_attr->show)
|
||||
ret = o2nm_node_attr->show(node, page);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * configfs ->store_attribute for node items: dispatch to the attribute's
 * typed store handler and, on a complete successful store, record the
 * attribute as set in nd_set_attributes.  An attribute that is already set
 * cannot be written again.
 *
 * Returns the handler's result, -EINVAL when the attribute has no store
 * handler, or -EBUSY when the attribute was already set.
 */
static ssize_t o2nm_node_store(struct config_item *item,
			       struct configfs_attribute *attr,
			       const char *page, size_t count)
{
	struct o2nm_node *node = to_o2nm_node(item);
	struct o2nm_node_attribute *o2nm_node_attr =
		container_of(attr, struct o2nm_node_attribute, attr);
	ssize_t ret;
	int attr_index = o2nm_attr_index(attr);

	if (o2nm_node_attr->store == NULL)
		return -EINVAL;

	if (test_bit(attr_index, &node->nd_set_attributes))
		return -EBUSY;

	ret = o2nm_node_attr->store(node, page, count);

	/*
	 * Check the sign explicitly: "ret < count" alone compares ssize_t
	 * against size_t, which converts a negative errno to a huge unsigned
	 * value, so a failed store used to fall through and mark the
	 * attribute as set, making every later write fail with -EBUSY.
	 */
	if (ret < 0 || (size_t)ret < count)
		return ret;

	set_bit(attr_index, &node->nd_set_attributes);

	return ret;
}
|
||||
|
||||
static struct configfs_item_operations o2nm_node_item_ops = {
|
||||
.release = o2nm_node_release,
|
||||
.show_attribute = o2nm_node_show,
|
||||
.store_attribute = o2nm_node_store,
|
||||
};
|
||||
|
||||
static struct config_item_type o2nm_node_type = {
|
||||
.ct_item_ops = &o2nm_node_item_ops,
|
||||
.ct_attrs = o2nm_node_attrs,
|
||||
.ct_owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
/* node set */
|
||||
|
||||
struct o2nm_node_group {
|
||||
struct config_group ns_group;
|
||||
/* some stuff? */
|
||||
};
|
||||
|
||||
#if 0
|
||||
static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
|
||||
{
|
||||
return group ?
|
||||
container_of(group, struct o2nm_node_group, ns_group)
|
||||
: NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
struct o2nm_cluster_attribute {
|
||||
struct configfs_attribute attr;
|
||||
ssize_t (*show)(struct o2nm_cluster *, char *);
|
||||
ssize_t (*store)(struct o2nm_cluster *, const char *, size_t);
|
||||
};
|
||||
|
||||
/*
 * Shared parser for the numeric cluster attributes: parse an unsigned value
 * from @page into @val.
 *
 * Returns @count on success, -EINVAL on malformed input or zero, and
 * -ERANGE when the value is >= (u32)-1.  @val is written only on success.
 */
static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
				       unsigned int *val)
{
	unsigned long parsed;
	char *end = (char *)page;

	parsed = simple_strtoul(end, &end, 0);
	if (!end || (*end && (*end != '\n')))
		return -EINVAL;

	if (parsed == 0)
		return -EINVAL;
	if (parsed >= (u32)-1)
		return -ERANGE;

	*val = parsed;

	return count;
}
|
||||
|
||||
static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
|
||||
struct o2nm_cluster *cluster, char *page)
|
||||
{
|
||||
return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
|
||||
}
|
||||
|
||||
/*
 * Store handler for "idle_timeout_ms".  The value cannot change while peers
 * are connected and must be larger than the keepalive delay.
 *
 * Returns the parser's result on success or parse failure, -EINVAL when the
 * new value is rejected.
 */
static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
	struct o2nm_cluster *cluster, const char *page, size_t count)
{
	ssize_t ret;
	unsigned int val;

	ret = o2nm_cluster_attr_write(page, count, &val);
	if (ret <= 0)
		return ret;

	if (cluster->cl_idle_timeout_ms != val
	    && o2net_num_connected_peers()) {
		mlog(ML_NOTICE,
		     "o2net: cannot change idle timeout after "
		     "the first peer has agreed to it."
		     "  %d connected peers\n",
		     o2net_num_connected_peers());
		return -EINVAL;
	}

	if (val <= cluster->cl_keepalive_delay_ms) {
		mlog(ML_NOTICE, "o2net: idle timeout must be larger "
		     "than keepalive delay\n");
		return -EINVAL;
	}

	cluster->cl_idle_timeout_ms = val;

	return ret;
}
|
||||
|
||||
static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read(
|
||||
struct o2nm_cluster *cluster, char *page)
|
||||
{
|
||||
return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
|
||||
}
|
||||
|
||||
/*
 * Store handler for "keepalive_delay_ms".  The value cannot change while
 * peers are connected and must be smaller than the idle timeout.
 *
 * Returns the parser's result on success or parse failure, -EINVAL when the
 * new value is rejected.
 */
static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
	struct o2nm_cluster *cluster, const char *page, size_t count)
{
	ssize_t ret;
	unsigned int val;

	ret = o2nm_cluster_attr_write(page, count, &val);
	if (ret <= 0)
		return ret;

	if (cluster->cl_keepalive_delay_ms != val
	    && o2net_num_connected_peers()) {
		mlog(ML_NOTICE,
		     "o2net: cannot change keepalive delay after"
		     " the first peer has agreed to it."
		     "  %d connected peers\n",
		     o2net_num_connected_peers());
		return -EINVAL;
	}

	if (val >= cluster->cl_idle_timeout_ms) {
		mlog(ML_NOTICE, "o2net: keepalive delay must be "
		     "smaller than idle timeout\n");
		return -EINVAL;
	}

	cluster->cl_keepalive_delay_ms = val;

	return ret;
}
|
||||
|
||||
static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read(
|
||||
struct o2nm_cluster *cluster, char *page)
|
||||
{
|
||||
return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms);
|
||||
}
|
||||
|
||||
/*
 * Store handler for "reconnect_delay_ms": the shared parser writes straight
 * into the cluster field; no extra constraints are applied here.
 */
static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
	struct o2nm_cluster *cluster, const char *page, size_t count)
{
	unsigned int *delay_ms = &cluster->cl_reconnect_delay_ms;

	return o2nm_cluster_attr_write(page, count, delay_ms);
}
|
||||
|
||||
static ssize_t o2nm_cluster_attr_fence_method_read(
|
||||
struct o2nm_cluster *cluster, char *page)
|
||||
{
|
||||
ssize_t ret = 0;
|
||||
|
||||
if (cluster)
|
||||
ret = sprintf(page, "%s\n",
|
||||
o2nm_fence_method_desc[cluster->cl_fence_method]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Store handler for "fence_method": accept one of the names in
 * o2nm_fence_method_desc[] (case-insensitive) followed by a newline and
 * select that method, logging when the method actually changes.
 *
 * Returns @count on success and -EINVAL for anything else, including an
 * empty write.
 */
static ssize_t o2nm_cluster_attr_fence_method_write(
	struct o2nm_cluster *cluster, const char *page, size_t count)
{
	unsigned int i;

	/*
	 * Reject an empty write before looking at the last byte:
	 * page[count - 1] with count == 0 would read before the buffer.
	 */
	if (count == 0 || page[count - 1] != '\n')
		return -EINVAL;

	for (i = 0; i < O2NM_FENCE_METHODS; ++i) {
		/* input must be exactly "<name>\n" */
		if (count != strlen(o2nm_fence_method_desc[i]) + 1)
			continue;
		if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
			continue;
		if (cluster->cl_fence_method != i) {
			printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
			       o2nm_fence_method_desc[i]);
			cluster->cl_fence_method = i;
		}
		return count;
	}

	return -EINVAL;
}
|
||||
|
||||
static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
|
||||
.attr = { .ca_owner = THIS_MODULE,
|
||||
.ca_name = "idle_timeout_ms",
|
||||
.ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = o2nm_cluster_attr_idle_timeout_ms_read,
|
||||
.store = o2nm_cluster_attr_idle_timeout_ms_write,
|
||||
};
|
||||
|
||||
static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = {
|
||||
.attr = { .ca_owner = THIS_MODULE,
|
||||
.ca_name = "keepalive_delay_ms",
|
||||
.ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = o2nm_cluster_attr_keepalive_delay_ms_read,
|
||||
.store = o2nm_cluster_attr_keepalive_delay_ms_write,
|
||||
};
|
||||
|
||||
static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
|
||||
.attr = { .ca_owner = THIS_MODULE,
|
||||
.ca_name = "reconnect_delay_ms",
|
||||
.ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = o2nm_cluster_attr_reconnect_delay_ms_read,
|
||||
.store = o2nm_cluster_attr_reconnect_delay_ms_write,
|
||||
};
|
||||
|
||||
static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
|
||||
.attr = { .ca_owner = THIS_MODULE,
|
||||
.ca_name = "fence_method",
|
||||
.ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = o2nm_cluster_attr_fence_method_read,
|
||||
.store = o2nm_cluster_attr_fence_method_write,
|
||||
};
|
||||
|
||||
static struct configfs_attribute *o2nm_cluster_attrs[] = {
|
||||
&o2nm_cluster_attr_idle_timeout_ms.attr,
|
||||
&o2nm_cluster_attr_keepalive_delay_ms.attr,
|
||||
&o2nm_cluster_attr_reconnect_delay_ms.attr,
|
||||
&o2nm_cluster_attr_fence_method.attr,
|
||||
NULL,
|
||||
};
|
||||
static ssize_t o2nm_cluster_show(struct config_item *item,
|
||||
struct configfs_attribute *attr,
|
||||
char *page)
|
||||
{
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
|
||||
struct o2nm_cluster_attribute *o2nm_cluster_attr =
|
||||
container_of(attr, struct o2nm_cluster_attribute, attr);
|
||||
ssize_t ret = 0;
|
||||
|
||||
if (o2nm_cluster_attr->show)
|
||||
ret = o2nm_cluster_attr->show(cluster, page);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * configfs ->store_attribute for cluster items: dispatch to the attribute's
 * typed store handler and return its result unchanged.  Returns -EINVAL
 * when the attribute has no store handler.
 */
static ssize_t o2nm_cluster_store(struct config_item *item,
				  struct configfs_attribute *attr,
				  const char *page, size_t count)
{
	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
	struct o2nm_cluster_attribute *cluster_attr =
		container_of(attr, struct o2nm_cluster_attribute, attr);

	if (cluster_attr->store == NULL)
		return -EINVAL;

	return cluster_attr->store(cluster, page, count);
}
|
||||
|
||||
static struct config_item *o2nm_node_group_make_item(struct config_group *group,
|
||||
const char *name)
|
||||
{
|
||||
struct o2nm_node *node = NULL;
|
||||
|
||||
if (strlen(name) > O2NM_MAX_NAME_LEN)
|
||||
return ERR_PTR(-ENAMETOOLONG);
|
||||
|
||||
node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL);
|
||||
if (node == NULL)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
|
||||
config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
|
||||
spin_lock_init(&node->nd_lock);
|
||||
|
||||
mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
|
||||
|
||||
return &node->nd_item;
|
||||
}
|
||||
|
||||
static void o2nm_node_group_drop_item(struct config_group *group,
|
||||
struct config_item *item)
|
||||
{
|
||||
struct o2nm_node *node = to_o2nm_node(item);
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
|
||||
|
||||
o2net_disconnect_node(node);
|
||||
|
||||
if (cluster->cl_has_local &&
|
||||
(cluster->cl_local_node == node->nd_num)) {
|
||||
cluster->cl_has_local = 0;
|
||||
cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
|
||||
o2net_stop_listening(node);
|
||||
}
|
||||
|
||||
/* XXX call into net to stop this node from trading messages */
|
||||
|
||||
write_lock(&cluster->cl_nodes_lock);
|
||||
|
||||
/* XXX sloppy */
|
||||
if (node->nd_ipv4_address)
|
||||
rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree);
|
||||
|
||||
/* nd_num might be 0 if the node number hasn't been set.. */
|
||||
if (cluster->cl_nodes[node->nd_num] == node) {
|
||||
cluster->cl_nodes[node->nd_num] = NULL;
|
||||
clear_bit(node->nd_num, cluster->cl_nodes_bitmap);
|
||||
}
|
||||
write_unlock(&cluster->cl_nodes_lock);
|
||||
|
||||
mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
|
||||
config_item_name(&node->nd_item));
|
||||
|
||||
config_item_put(item);
|
||||
}
|
||||
|
||||
static struct configfs_group_operations o2nm_node_group_group_ops = {
|
||||
.make_item = o2nm_node_group_make_item,
|
||||
.drop_item = o2nm_node_group_drop_item,
|
||||
};
|
||||
|
||||
static struct config_item_type o2nm_node_group_type = {
|
||||
.ct_group_ops = &o2nm_node_group_group_ops,
|
||||
.ct_owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
/* cluster */
|
||||
|
||||
static void o2nm_cluster_release(struct config_item *item)
|
||||
{
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
|
||||
|
||||
kfree(cluster->cl_group.default_groups);
|
||||
kfree(cluster);
|
||||
}
|
||||
|
||||
static struct configfs_item_operations o2nm_cluster_item_ops = {
|
||||
.release = o2nm_cluster_release,
|
||||
.show_attribute = o2nm_cluster_show,
|
||||
.store_attribute = o2nm_cluster_store,
|
||||
};
|
||||
|
||||
static struct config_item_type o2nm_cluster_type = {
|
||||
.ct_item_ops = &o2nm_cluster_item_ops,
|
||||
.ct_attrs = o2nm_cluster_attrs,
|
||||
.ct_owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
/* cluster set */
|
||||
|
||||
struct o2nm_cluster_group {
|
||||
struct configfs_subsystem cs_subsys;
|
||||
/* some stuff? */
|
||||
};
|
||||
|
||||
#if 0
|
||||
static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group)
|
||||
{
|
||||
return group ?
|
||||
container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys)
|
||||
: NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
|
||||
const char *name)
|
||||
{
|
||||
struct o2nm_cluster *cluster = NULL;
|
||||
struct o2nm_node_group *ns = NULL;
|
||||
struct config_group *o2hb_group = NULL, *ret = NULL;
|
||||
void *defs = NULL;
|
||||
|
||||
/* this runs under the parent dir's i_mutex; there can be only
|
||||
* one caller in here at a time */
|
||||
if (o2nm_single_cluster)
|
||||
return ERR_PTR(-ENOSPC);
|
||||
|
||||
cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
|
||||
ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
|
||||
defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
|
||||
o2hb_group = o2hb_alloc_hb_set();
|
||||
if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
|
||||
goto out;
|
||||
|
||||
config_group_init_type_name(&cluster->cl_group, name,
|
||||
&o2nm_cluster_type);
|
||||
config_group_init_type_name(&ns->ns_group, "node",
|
||||
&o2nm_node_group_type);
|
||||
|
||||
cluster->cl_group.default_groups = defs;
|
||||
cluster->cl_group.default_groups[0] = &ns->ns_group;
|
||||
cluster->cl_group.default_groups[1] = o2hb_group;
|
||||
cluster->cl_group.default_groups[2] = NULL;
|
||||
rwlock_init(&cluster->cl_nodes_lock);
|
||||
cluster->cl_node_ip_tree = RB_ROOT;
|
||||
cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
|
||||
cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
|
||||
cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
|
||||
cluster->cl_fence_method = O2NM_FENCE_RESET;
|
||||
|
||||
ret = &cluster->cl_group;
|
||||
o2nm_single_cluster = cluster;
|
||||
|
||||
out:
|
||||
if (ret == NULL) {
|
||||
kfree(cluster);
|
||||
kfree(ns);
|
||||
o2hb_free_hb_set(o2hb_group);
|
||||
kfree(defs);
|
||||
ret = ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
|
||||
{
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
|
||||
int i;
|
||||
struct config_item *killme;
|
||||
|
||||
BUG_ON(o2nm_single_cluster != cluster);
|
||||
o2nm_single_cluster = NULL;
|
||||
|
||||
for (i = 0; cluster->cl_group.default_groups[i]; i++) {
|
||||
killme = &cluster->cl_group.default_groups[i]->cg_item;
|
||||
cluster->cl_group.default_groups[i] = NULL;
|
||||
config_item_put(killme);
|
||||
}
|
||||
|
||||
config_item_put(item);
|
||||
}
|
||||
|
||||
static struct configfs_group_operations o2nm_cluster_group_group_ops = {
|
||||
.make_group = o2nm_cluster_group_make_group,
|
||||
.drop_item = o2nm_cluster_group_drop_item,
|
||||
};
|
||||
|
||||
static struct config_item_type o2nm_cluster_group_type = {
|
||||
.ct_group_ops = &o2nm_cluster_group_group_ops,
|
||||
.ct_owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static struct o2nm_cluster_group o2nm_cluster_group = {
|
||||
.cs_subsys = {
|
||||
.su_group = {
|
||||
.cg_item = {
|
||||
.ci_namebuf = "cluster",
|
||||
.ci_type = &o2nm_cluster_group_type,
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
int o2nm_depend_item(struct config_item *item)
|
||||
{
|
||||
return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
|
||||
}
|
||||
|
||||
void o2nm_undepend_item(struct config_item *item)
|
||||
{
|
||||
configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
|
||||
}
|
||||
|
||||
int o2nm_depend_this_node(void)
|
||||
{
|
||||
int ret = 0;
|
||||
struct o2nm_node *local_node;
|
||||
|
||||
local_node = o2nm_get_node_by_num(o2nm_this_node());
|
||||
if (!local_node) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = o2nm_depend_item(&local_node->nd_item);
|
||||
o2nm_node_put(local_node);
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
void o2nm_undepend_this_node(void)
|
||||
{
|
||||
struct o2nm_node *local_node;
|
||||
|
||||
local_node = o2nm_get_node_by_num(o2nm_this_node());
|
||||
BUG_ON(!local_node);
|
||||
|
||||
o2nm_undepend_item(&local_node->nd_item);
|
||||
o2nm_node_put(local_node);
|
||||
}
|
||||
|
||||
|
||||
/* module exit: undo the steps of init_o2nm() in reverse order */
static void __exit exit_o2nm(void)
{
	/* XXX sync with hb callbacks and shut down hb? */
	o2net_unregister_hb_callbacks();
	configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
	o2cb_sys_shutdown();

	/* networking first, heartbeat last */
	o2net_exit();
	o2hb_exit();
}
|
||||
|
||||
/*
 * Module init: bring up heartbeat, networking, the heartbeat callbacks,
 * the configfs subsystem and the sysfs interface, in that order.  If any
 * step fails, the steps already completed are undone in reverse order and
 * the failing step's error code is returned.  Returns 0 on success.
 */
static int __init init_o2nm(void)
{
	int ret;

	cluster_print_version();

	ret = o2hb_init();
	if (ret)
		return ret;

	ret = o2net_init();
	if (ret)
		goto out_o2hb;

	ret = o2net_register_hb_callbacks();
	if (ret)
		goto out_o2net;

	config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
	mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
	ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
	if (ret) {
		printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
		goto out_callbacks;
	}

	ret = o2cb_sys_init();
	if (ret)
		goto out_configfs;

	return 0;

out_configfs:
	configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
out_callbacks:
	o2net_unregister_hb_callbacks();
out_o2net:
	o2net_exit();
out_o2hb:
	o2hb_exit();
	return ret;
}
|
||||
|
||||
MODULE_AUTHOR("Oracle");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
module_init(init_o2nm)
|
||||
module_exit(exit_o2nm)
|
88
drivers/staging/ramster/cluster/nodemanager.h
Normal file
88
drivers/staging/ramster/cluster/nodemanager.h
Normal file
@ -0,0 +1,88 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* nodemanager.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_NODEMANAGER_H
|
||||
#define O2CLUSTER_NODEMANAGER_H
|
||||
|
||||
#include "ocfs2_nodemanager.h"
|
||||
|
||||
/* This totally doesn't belong here. */
|
||||
#include <linux/configfs.h>
|
||||
#include <linux/rbtree.h>
|
||||
|
||||
/* how a node fences itself; stored per cluster in cl_fence_method */
enum o2nm_fence_method {
	O2NM_FENCE_RESET	= 0,
	O2NM_FENCE_PANIC,
	O2NM_FENCE_METHODS,	/* Number of fence methods */
};
|
||||
|
||||
struct o2nm_node {
|
||||
spinlock_t nd_lock;
|
||||
struct config_item nd_item;
|
||||
char nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */
|
||||
__u8 nd_num;
|
||||
/* only one address per node, as attributes, for now. */
|
||||
__be32 nd_ipv4_address;
|
||||
__be16 nd_ipv4_port;
|
||||
struct rb_node nd_ip_node;
|
||||
/* there can be only one local node for now */
|
||||
int nd_local;
|
||||
|
||||
unsigned long nd_set_attributes;
|
||||
};
|
||||
|
||||
struct o2nm_cluster {
|
||||
struct config_group cl_group;
|
||||
unsigned cl_has_local:1;
|
||||
u8 cl_local_node;
|
||||
rwlock_t cl_nodes_lock;
|
||||
struct o2nm_node *cl_nodes[O2NM_MAX_NODES];
|
||||
struct rb_root cl_node_ip_tree;
|
||||
unsigned int cl_idle_timeout_ms;
|
||||
unsigned int cl_keepalive_delay_ms;
|
||||
unsigned int cl_reconnect_delay_ms;
|
||||
enum o2nm_fence_method cl_fence_method;
|
||||
|
||||
/* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
|
||||
unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
};
|
||||
|
||||
extern struct o2nm_cluster *o2nm_single_cluster;
|
||||
|
||||
u8 o2nm_this_node(void);
|
||||
|
||||
int o2nm_configured_node_map(unsigned long *map, unsigned bytes);
|
||||
struct o2nm_node *o2nm_get_node_by_num(u8 node_num);
|
||||
struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
|
||||
void o2nm_node_get(struct o2nm_node *node);
|
||||
void o2nm_node_put(struct o2nm_node *node);
|
||||
|
||||
int o2nm_depend_item(struct config_item *item);
|
||||
void o2nm_undepend_item(struct config_item *item);
|
||||
int o2nm_depend_this_node(void);
|
||||
void o2nm_undepend_this_node(void);
|
||||
|
||||
#endif /* O2CLUSTER_NODEMANAGER_H */
|
38
drivers/staging/ramster/cluster/ocfs2_heartbeat.h
Normal file
38
drivers/staging/ramster/cluster/ocfs2_heartbeat.h
Normal file
@ -0,0 +1,38 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* ocfs2_heartbeat.h
|
||||
*
|
||||
* On-disk structures for ocfs2_heartbeat
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef _OCFS2_HEARTBEAT_H
|
||||
#define _OCFS2_HEARTBEAT_H
|
||||
|
||||
struct o2hb_disk_heartbeat_block {
|
||||
__le64 hb_seq;
|
||||
__u8 hb_node;
|
||||
__u8 hb_pad1[3];
|
||||
__le32 hb_cksum;
|
||||
__le64 hb_generation;
|
||||
__le32 hb_dead_ms;
|
||||
};
|
||||
|
||||
#endif /* _OCFS2_HEARTBEAT_H */
|
45
drivers/staging/ramster/cluster/ocfs2_nodemanager.h
Normal file
45
drivers/staging/ramster/cluster/ocfs2_nodemanager.h
Normal file
@ -0,0 +1,45 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* ocfs2_nodemanager.h
|
||||
*
|
||||
* Header describing the interface between userspace and the kernel
|
||||
* for the ocfs2_nodemanager module.
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef _OCFS2_NODEMANAGER_H
|
||||
#define _OCFS2_NODEMANAGER_H
|
||||
|
||||
#define O2NM_API_VERSION 5
|
||||
|
||||
#define O2NM_MAX_NODES 255
|
||||
#define O2NM_INVALID_NODE_NUM 255
|
||||
|
||||
/* host name, group name, cluster name all 64 bytes */
|
||||
#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
|
||||
|
||||
/*
|
||||
* Maximum number of global heartbeat regions allowed.
|
||||
* **CAUTION** Changing this number will break dlm compatibility.
|
||||
*/
|
||||
#define O2NM_MAX_REGIONS 32
|
||||
|
||||
#endif /* _OCFS2_NODEMANAGER_H */
|
331
drivers/staging/ramster/cluster/quorum.c
Normal file
331
drivers/staging/ramster/cluster/quorum.c
Normal file
@ -0,0 +1,331 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
*
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*/
|
||||
|
||||
/* This quorum hack is only here until we transition to some more rational
|
||||
* approach that is driven from userspace. Honest. No foolin'.
|
||||
*
|
||||
* Imagine two nodes lose network connectivity to each other but they're still
|
||||
* up and operating in every other way. Presumably a network timeout indicates
|
||||
* that a node is broken and should be recovered. They can't both recover each
|
||||
* other and both carry on without serialising their access to the file system.
|
||||
* They need to decide who is authoritative. Now extend that problem to
|
||||
* arbitrary groups of nodes losing connectivity between each other.
|
||||
*
|
||||
* So we declare that a node which has given up on connecting to a majority
|
||||
* of nodes who are still heartbeating will fence itself.
|
||||
*
|
||||
* There are huge opportunities for races here. After we give up on a node's
|
||||
* connection we need to wait long enough to give heartbeat an opportunity
|
||||
* to declare the node as truly dead. We also need to be careful with the
|
||||
* race between when we see a node start heartbeating and when we connect
|
||||
* to it.
|
||||
*
|
||||
* So nodes that are in this transition put a hold on the quorum decision
|
||||
* with a counter. As they fall out of this transition they drop the count
|
||||
* and if they're the last, they fire off the decision.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/reboot.h>
|
||||
|
||||
#include "heartbeat.h"
|
||||
#include "nodemanager.h"
|
||||
#define MLOG_MASK_PREFIX ML_QUORUM
|
||||
#include "masklog.h"
|
||||
#include "quorum.h"
|
||||
|
||||
static struct o2quo_state {
|
||||
spinlock_t qs_lock;
|
||||
struct work_struct qs_work;
|
||||
int qs_pending;
|
||||
int qs_heartbeating;
|
||||
unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
int qs_connected;
|
||||
unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
int qs_holds;
|
||||
unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
} o2quo_state;
|
||||
|
||||
/* this is horribly heavy-handed. It should instead flip the file
|
||||
* system RO and call some userspace script. */
|
||||
static void o2quo_fence_self(void)
|
||||
{
|
||||
/* panic spins with interrupts enabled. with preempt
|
||||
* threads can still schedule, etc, etc */
|
||||
o2hb_stop_all_regions();
|
||||
|
||||
switch (o2nm_single_cluster->cl_fence_method) {
|
||||
case O2NM_FENCE_PANIC:
|
||||
panic("*** ocfs2 is very sorry to be fencing this system by "
|
||||
"panicing ***\n");
|
||||
break;
|
||||
default:
|
||||
WARN_ON(o2nm_single_cluster->cl_fence_method >=
|
||||
O2NM_FENCE_METHODS);
|
||||
case O2NM_FENCE_RESET:
|
||||
printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
|
||||
"system by restarting ***\n");
|
||||
emergency_restart();
|
||||
break;
|
||||
};
|
||||
}
|
||||
|
||||
/* Indicate that a timeout occurred on a heartbeat region write. The
|
||||
* other nodes in the cluster may consider us dead at that time so we
|
||||
* want to "fence" ourselves so that we don't scribble on the disk
|
||||
* after they think they've recovered us. This can't solve all
|
||||
* problems related to writeout after recovery but this hack can at
|
||||
* least close some of those gaps. When we have real fencing, this can
|
||||
* go away as our node would be fenced externally before other nodes
|
||||
* begin recovery. */
|
||||
void o2quo_disk_timeout(void)
{
	/* no quorum evaluation here: fence unconditionally */
	o2quo_fence_self();
}
|
||||
|
||||
static void o2quo_make_decision(struct work_struct *work)
|
||||
{
|
||||
int quorum;
|
||||
int lowest_hb, lowest_reachable = 0, fence = 0;
|
||||
struct o2quo_state *qs = &o2quo_state;
|
||||
|
||||
spin_lock(&qs->qs_lock);
|
||||
|
||||
lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
|
||||
if (lowest_hb != O2NM_MAX_NODES)
|
||||
lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
|
||||
|
||||
mlog(0, "heartbeating: %d, connected: %d, "
|
||||
"lowest: %d (%sreachable)\n", qs->qs_heartbeating,
|
||||
qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
|
||||
|
||||
if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
|
||||
qs->qs_heartbeating == 1)
|
||||
goto out;
|
||||
|
||||
if (qs->qs_heartbeating & 1) {
|
||||
/* the odd numbered cluster case is straight forward --
|
||||
* if we can't talk to the majority we're hosed */
|
||||
quorum = (qs->qs_heartbeating + 1)/2;
|
||||
if (qs->qs_connected < quorum) {
|
||||
mlog(ML_ERROR, "fencing this node because it is "
|
||||
"only connected to %u nodes and %u is needed "
|
||||
"to make a quorum out of %u heartbeating nodes\n",
|
||||
qs->qs_connected, quorum,
|
||||
qs->qs_heartbeating);
|
||||
fence = 1;
|
||||
}
|
||||
} else {
|
||||
/* the even numbered cluster adds the possibility of each half
|
||||
* of the cluster being able to talk amongst themselves.. in
|
||||
* that case we're hosed if we can't talk to the group that has
|
||||
* the lowest numbered node */
|
||||
quorum = qs->qs_heartbeating / 2;
|
||||
if (qs->qs_connected < quorum) {
|
||||
mlog(ML_ERROR, "fencing this node because it is "
|
||||
"only connected to %u nodes and %u is needed "
|
||||
"to make a quorum out of %u heartbeating nodes\n",
|
||||
qs->qs_connected, quorum,
|
||||
qs->qs_heartbeating);
|
||||
fence = 1;
|
||||
}
|
||||
else if ((qs->qs_connected == quorum) &&
|
||||
!lowest_reachable) {
|
||||
mlog(ML_ERROR, "fencing this node because it is "
|
||||
"connected to a half-quorum of %u out of %u "
|
||||
"nodes which doesn't include the lowest active "
|
||||
"node %u\n", quorum, qs->qs_heartbeating,
|
||||
lowest_hb);
|
||||
fence = 1;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
spin_unlock(&qs->qs_lock);
|
||||
if (fence)
|
||||
o2quo_fence_self();
|
||||
}
|
||||
|
||||
/*
 * Record a hold for @node unless one is already recorded.  The caller
 * must hold qs_lock.
 */
static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	/* already held: nothing to count */
	if (test_and_set_bit(node, qs->qs_hold_bm))
		return;

	qs->qs_holds++;
	mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
			"node %u\n", node);
	mlog(0, "node %u, %d total\n", node, qs->qs_holds);
}
|
||||
|
||||
/*
 * Drop the hold for @node if one is recorded.  When the last hold goes
 * away and a decision is pending, the pending flag is cleared and the
 * decision work is scheduled.  The caller must hold qs_lock.
 */
static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	/* no hold recorded for this node */
	if (!test_and_clear_bit(node, qs->qs_hold_bm))
		return;

	mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);

	qs->qs_holds--;
	if (qs->qs_holds == 0 && qs->qs_pending) {
		qs->qs_pending = 0;
		schedule_work(&qs->qs_work);
	}

	mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
			node, qs->qs_holds);
}
|
||||
|
||||
/* as a node comes up we delay the quorum decision until we know the fate of
|
||||
* the connection. the hold will be dropped in conn_up or hb_down. it might be
|
||||
* perpetuated by conn_err until hb_down. if we already have a conn, we might
|
||||
* be dropping a hold that conn_up got. */
|
||||
void o2quo_hb_up(u8 node)
{
	struct o2quo_state *state = &o2quo_state;

	spin_lock(&state->qs_lock);

	/* count the node and mark it in the heartbeat bitmap */
	state->qs_heartbeating++;
	mlog_bug_on_msg(state->qs_heartbeating == O2NM_MAX_NODES,
			"node %u\n", node);
	mlog_bug_on_msg(test_bit(node, state->qs_hb_bm), "node %u\n", node);
	set_bit(node, state->qs_hb_bm);

	mlog(0, "node %u, %d total\n", node, state->qs_heartbeating);

	/* connected already: release any hold; otherwise take one */
	if (test_bit(node, state->qs_conn_bm))
		o2quo_clear_hold(state, node);
	else
		o2quo_set_hold(state, node);

	spin_unlock(&state->qs_lock);
}
|
||||
|
||||
/* hb going down releases any holds we might have had due to this node from
|
||||
* conn_up, conn_err, or hb_up */
|
||||
void o2quo_hb_down(u8 node)
{
	struct o2quo_state *state = &o2quo_state;

	spin_lock(&state->qs_lock);

	/* uncount the node and clear it from the heartbeat bitmap */
	state->qs_heartbeating--;
	mlog_bug_on_msg(state->qs_heartbeating < 0,
			"node %u, %d heartbeating\n",
			node, state->qs_heartbeating);
	mlog_bug_on_msg(!test_bit(node, state->qs_hb_bm), "node %u\n", node);
	clear_bit(node, state->qs_hb_bm);

	mlog(0, "node %u, %d total\n", node, state->qs_heartbeating);

	o2quo_clear_hold(state, node);

	spin_unlock(&state->qs_lock);
}
|
||||
|
||||
/* this tells us that we've decided that the node is still heartbeating
|
||||
* even though we've lost its conn. it must only be called after conn_err
|
||||
* and indicates that we must now make a quorum decision in the future,
|
||||
* though we might be doing so after waiting for holds to drain. Here
|
||||
* we'll be dropping the hold from conn_err. */
|
||||
void o2quo_hb_still_up(u8 node)
{
	struct o2quo_state *state = &o2quo_state;

	spin_lock(&state->qs_lock);

	mlog(0, "node %u\n", node);

	/* mark the decision as wanted before dropping the hold, so that
	 * dropping the last hold schedules it */
	state->qs_pending = 1;
	o2quo_clear_hold(state, node);

	spin_unlock(&state->qs_lock);
}
|
||||
|
||||
/* This is analogous to hb_up. as a node's connection comes up we delay the
|
||||
* quorum decision until we see it heartbeating. the hold will be dropped in
|
||||
* hb_up or hb_down. it might be perpetuated by conn_err until hb_down. if
|
||||
* it's already heartbeating we might be dropping a hold that hb_up got.
|
||||
* */
|
||||
void o2quo_conn_up(u8 node)
{
	struct o2quo_state *state = &o2quo_state;

	spin_lock(&state->qs_lock);

	/* count the connection and mark it in the connection bitmap */
	state->qs_connected++;
	mlog_bug_on_msg(state->qs_connected == O2NM_MAX_NODES,
			"node %u\n", node);
	mlog_bug_on_msg(test_bit(node, state->qs_conn_bm), "node %u\n", node);
	set_bit(node, state->qs_conn_bm);

	mlog(0, "node %u, %d total\n", node, state->qs_connected);

	/* heartbeating already: release any hold; otherwise take one */
	if (test_bit(node, state->qs_hb_bm))
		o2quo_clear_hold(state, node);
	else
		o2quo_set_hold(state, node);

	spin_unlock(&state->qs_lock);
}
|
||||
|
||||
/* we've decided that we won't ever be connecting to the node again. if it's
|
||||
* still heartbeating we grab a hold that will delay decisions until either the
|
||||
* node stops heartbeating from hb_down or the caller decides that the node is
|
||||
* still up and calls still_up */
|
||||
void o2quo_conn_err(u8 node)
{
	struct o2quo_state *state = &o2quo_state;

	spin_lock(&state->qs_lock);

	/* only uncount a connection that was actually recorded */
	if (test_bit(node, state->qs_conn_bm)) {
		state->qs_connected--;
		mlog_bug_on_msg(state->qs_connected < 0,
				"node %u, connected %d\n",
				node, state->qs_connected);

		clear_bit(node, state->qs_conn_bm);
	}

	mlog(0, "node %u, %d total\n", node, state->qs_connected);

	/* still heartbeating: hold off the decision */
	if (test_bit(node, state->qs_hb_bm))
		o2quo_set_hold(state, node);

	spin_unlock(&state->qs_lock);
}
|
||||
|
||||
void o2quo_init(void)
|
||||
{
|
||||
struct o2quo_state *qs = &o2quo_state;
|
||||
|
||||
spin_lock_init(&qs->qs_lock);
|
||||
INIT_WORK(&qs->qs_work, o2quo_make_decision);
|
||||
}
|
||||
|
||||
void o2quo_exit(void)
|
||||
{
|
||||
struct o2quo_state *qs = &o2quo_state;
|
||||
|
||||
flush_work_sync(&qs->qs_work);
|
||||
}
|
36
drivers/staging/ramster/cluster/quorum.h
Normal file
36
drivers/staging/ramster/cluster/quorum.h
Normal file
@ -0,0 +1,36 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_QUORUM_H
|
||||
#define O2CLUSTER_QUORUM_H
|
||||
|
||||
void o2quo_init(void);
|
||||
void o2quo_exit(void);
|
||||
|
||||
void o2quo_hb_up(u8 node);
|
||||
void o2quo_hb_down(u8 node);
|
||||
void o2quo_hb_still_up(u8 node);
|
||||
void o2quo_conn_up(u8 node);
|
||||
void o2quo_conn_err(u8 node);
|
||||
void o2quo_disk_timeout(void);
|
||||
|
||||
#endif /* O2CLUSTER_QUORUM_H */
|
82
drivers/staging/ramster/cluster/sys.c
Normal file
82
drivers/staging/ramster/cluster/sys.c
Normal file
@ -0,0 +1,82 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* sys.c
|
||||
*
|
||||
* OCFS2 cluster sysfs interface
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation,
|
||||
* version 2 of the License.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/sysfs.h>
|
||||
#include <linux/fs.h>
|
||||
|
||||
#include "ocfs2_nodemanager.h"
|
||||
#include "masklog.h"
|
||||
#include "sys.h"
|
||||
|
||||
|
||||
static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
|
||||
}
|
||||
static struct kobj_attribute attr_version =
|
||||
__ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
|
||||
|
||||
static struct attribute *o2cb_attrs[] = {
|
||||
&attr_version.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute_group o2cb_attr_group = {
|
||||
.attrs = o2cb_attrs,
|
||||
};
|
||||
|
||||
/* the "o2cb" kset; created in o2cb_sys_init(), unregistered in o2cb_sys_shutdown() */
static struct kset *o2cb_kset;
|
||||
|
||||
void o2cb_sys_shutdown(void)
|
||||
{
|
||||
mlog_sys_shutdown();
|
||||
kset_unregister(o2cb_kset);
|
||||
}
|
||||
|
||||
int o2cb_sys_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj);
|
||||
if (!o2cb_kset)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
|
||||
if (ret)
|
||||
goto error;
|
||||
|
||||
ret = mlog_sys_init(o2cb_kset);
|
||||
if (ret)
|
||||
goto error;
|
||||
return 0;
|
||||
error:
|
||||
kset_unregister(o2cb_kset);
|
||||
return ret;
|
||||
}
|
33
drivers/staging/ramster/cluster/sys.h
Normal file
33
drivers/staging/ramster/cluster/sys.h
Normal file
@ -0,0 +1,33 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* sys.h
|
||||
*
|
||||
* Function prototypes for o2cb sysfs interface
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation,
|
||||
* version 2 of the License.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_SYS_H
|
||||
#define O2CLUSTER_SYS_H
|
||||
|
||||
void o2cb_sys_shutdown(void);
|
||||
int o2cb_sys_init(void);
|
||||
|
||||
#endif /* O2CLUSTER_SYS_H */
|
2150
drivers/staging/ramster/cluster/tcp.c
Normal file
2150
drivers/staging/ramster/cluster/tcp.c
Normal file
File diff suppressed because it is too large
Load Diff
154
drivers/staging/ramster/cluster/tcp.h
Normal file
154
drivers/staging/ramster/cluster/tcp.h
Normal file
@ -0,0 +1,154 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* tcp.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_TCP_H
|
||||
#define O2CLUSTER_TCP_H
|
||||
|
||||
#include <linux/socket.h>
|
||||
#ifdef __KERNEL__
|
||||
#include <net/sock.h>
|
||||
#include <linux/tcp.h>
|
||||
#else
|
||||
#include <sys/socket.h>
|
||||
#endif
|
||||
#include <linux/inet.h>
|
||||
#include <linux/in.h>
|
||||
|
||||
struct o2net_msg
|
||||
{
|
||||
__be16 magic;
|
||||
__be16 data_len;
|
||||
__be16 msg_type;
|
||||
__be16 pad1;
|
||||
__be32 sys_status;
|
||||
__be32 status;
|
||||
__be32 key;
|
||||
__be32 msg_num;
|
||||
__u8 buf[0];
|
||||
};
|
||||
|
||||
typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data,
|
||||
void **ret_data);
|
||||
typedef void (o2net_post_msg_handler_func)(int status, void *data,
|
||||
void *ret_data);
|
||||
|
||||
#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg))
|
||||
|
||||
/* same as hb delay, we're waiting for another node to recognize our hb */
|
||||
#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000
|
||||
|
||||
#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
|
||||
#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
|
||||
|
||||
|
||||
/* TODO: figure this out.... */
|
||||
/*
 * o2net_link_down() - classify a socket/error pair as "link is down".
 * @err:  result of a socket operation; negative values are -errno codes
 * @sock: socket to inspect, or NULL to decide on @err alone
 *
 * The socket state is checked first: if @sock is non-NULL and its TCP state
 * is neither TCP_ESTABLISHED nor TCP_CLOSE_WAIT the link is reported down
 * regardless of @err.  Otherwise only the specific negative error codes
 * listed in the switch below count as a dead link.
 *
 * Returns 1 if the link is considered down, 0 otherwise.
 */
static inline int o2net_link_down(int err, struct socket *sock)
{
	if (sock) {
		if (sock->sk->sk_state != TCP_ESTABLISHED &&
		    sock->sk->sk_state != TCP_CLOSE_WAIT)
			return 1;
	}

	/* success or a byte count: nothing to classify */
	if (err >= 0)
		return 0;

	switch (err) {
	/* NOTE(review): the original author marked these two as uncertain */
	case -ERESTARTSYS:
	case -EBADF:
	/* When the server has died, an ICMP port unreachable
	 * message prompts ECONNREFUSED. */
	case -ECONNREFUSED:
	case -ENOTCONN:
	case -ECONNRESET:
	case -EPIPE:
		return 1;
	}

	/* any other error is not treated as a link failure */
	return 0;
}
|
||||
|
||||
/*
 * Driver state constants.  Values are assigned implicitly, so
 * O2NET_DRIVER_UNINITED is 0 and O2NET_DRIVER_READY is 1.
 */
enum {
	O2NET_DRIVER_UNINITED,
	O2NET_DRIVER_READY,
};
|
||||
|
||||
int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
|
||||
u8 target_node, int *status);
|
||||
int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
|
||||
size_t veclen, u8 target_node, int *status);
|
||||
|
||||
int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
|
||||
o2net_msg_handler_func *func, void *data,
|
||||
o2net_post_msg_handler_func *post_func,
|
||||
struct list_head *unreg_list);
|
||||
void o2net_unregister_handler_list(struct list_head *list);
|
||||
|
||||
void o2net_fill_node_map(unsigned long *map, unsigned bytes);
|
||||
|
||||
struct o2nm_node;
|
||||
int o2net_register_hb_callbacks(void);
|
||||
void o2net_unregister_hb_callbacks(void);
|
||||
int o2net_start_listening(struct o2nm_node *node);
|
||||
void o2net_stop_listening(struct o2nm_node *node);
|
||||
void o2net_disconnect_node(struct o2nm_node *node);
|
||||
int o2net_num_connected_peers(void);
|
||||
|
||||
int o2net_init(void);
|
||||
void o2net_exit(void);
|
||||
|
||||
struct o2net_send_tracking;
|
||||
struct o2net_sock_container;
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
int o2net_debugfs_init(void);
|
||||
void o2net_debugfs_exit(void);
|
||||
void o2net_debug_add_nst(struct o2net_send_tracking *nst);
|
||||
void o2net_debug_del_nst(struct o2net_send_tracking *nst);
|
||||
void o2net_debug_add_sc(struct o2net_sock_container *sc);
|
||||
void o2net_debug_del_sc(struct o2net_sock_container *sc);
|
||||
#else
|
||||
/*
 * No-op replacements used when CONFIG_DEBUG_FS is not set, so callers can
 * invoke the debug hooks unconditionally.
 */

/* Always reports success (0); there is nothing to set up. */
static inline int o2net_debugfs_init(void)
{
	return 0;
}

/* Nothing to tear down. */
static inline void o2net_debugfs_exit(void)
{
}

/* Send-tracking hooks: intentionally empty. */
static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst)
{
}

static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst)
{
}

/* Socket-container hooks: intentionally empty. */
static inline void o2net_debug_add_sc(struct o2net_sock_container *sc)
{
}

static inline void o2net_debug_del_sc(struct o2net_sock_container *sc)
{
}
|
||||
#endif /* CONFIG_DEBUG_FS */
|
||||
|
||||
#endif /* O2CLUSTER_TCP_H */
|
242
drivers/staging/ramster/cluster/tcp_internal.h
Normal file
242
drivers/staging/ramster/cluster/tcp_internal.h
Normal file
@ -0,0 +1,242 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_TCP_INTERNAL_H
|
||||
#define O2CLUSTER_TCP_INTERNAL_H
|
||||
|
||||
#define O2NET_MSG_MAGIC ((u16)0xfa55)
|
||||
#define O2NET_MSG_STATUS_MAGIC ((u16)0xfa56)
|
||||
#define O2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57)
|
||||
#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
|
||||
|
||||
/* we're delaying our quorum decision so that heartbeat will have timed
|
||||
* out truly dead nodes by the time we come around to making decisions
|
||||
* on their number */
|
||||
#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
|
||||
|
||||
/*
|
||||
* This version number represents quite a lot, unfortunately. It not
|
||||
* only represents the raw network message protocol on the wire but also
|
||||
* locking semantics of the file system using the protocol. It should
|
||||
* be somewhere else, I'm sure, but right now it isn't.
|
||||
*
|
||||
* With version 11, we separate out the filesystem locking portion. The
|
||||
* filesystem now has a major.minor version it negotiates. Version 11
|
||||
* introduces this negotiation to the o2dlm protocol, and as such the
|
||||
* version here in tcp_internal.h should not need to be bumped for
|
||||
* filesystem locking changes.
|
||||
*
|
||||
* New in version 11:
|
||||
* - Negotiation of filesystem locking in the dlm join.
|
||||
*
|
||||
* New in version 10:
|
||||
* - Meta/data locks combined
|
||||
*
|
||||
* New in version 9:
|
||||
* - All votes removed
|
||||
*
|
||||
* New in version 8:
|
||||
* - Replace delete inode votes with a cluster lock
|
||||
*
|
||||
* New in version 7:
|
||||
* - DLM join domain includes the live nodemap
|
||||
*
|
||||
* New in version 6:
|
||||
* - DLM lockres remote refcount fixes.
|
||||
*
|
||||
* New in version 5:
|
||||
* - Network timeout checking protocol
|
||||
*
|
||||
* New in version 4:
|
||||
* - Remove i_generation from lock names for better stat performance.
|
||||
*
|
||||
* New in version 3:
|
||||
* - Replace dentry votes with a cluster lock
|
||||
*
|
||||
* New in version 2:
|
||||
* - full 64 bit i_size in the metadata lock lvbs
|
||||
* - introduction of "rw" lock and pushing meta/data locking down
|
||||
*/
|
||||
#define O2NET_PROTOCOL_VERSION 11ULL
|
||||
struct o2net_handshake {
|
||||
__be64 protocol_version;
|
||||
__be64 connector_id;
|
||||
__be32 o2hb_heartbeat_timeout_ms;
|
||||
__be32 o2net_idle_timeout_ms;
|
||||
__be32 o2net_keepalive_delay_ms;
|
||||
__be32 o2net_reconnect_delay_ms;
|
||||
};
|
||||
|
||||
struct o2net_node {
|
||||
/* this is never called from int/bh */
|
||||
spinlock_t nn_lock;
|
||||
|
||||
/* set the moment an sc is allocated and a connect is started */
|
||||
struct o2net_sock_container *nn_sc;
|
||||
/* _valid is only set after the handshake passes and tx can happen */
|
||||
unsigned nn_sc_valid:1;
|
||||
/* if this is set tx just returns it */
|
||||
int nn_persistent_error;
|
||||
/* It is only set to 1 after the idle time out. */
|
||||
atomic_t nn_timeout;
|
||||
|
||||
/* threads waiting for an sc to arrive wait on the wq for generation
|
||||
* to increase. it is increased when a connecting socket succeeds
|
||||
* or fails or when an accepted socket is attached. */
|
||||
wait_queue_head_t nn_sc_wq;
|
||||
|
||||
struct idr nn_status_idr;
|
||||
struct list_head nn_status_list;
|
||||
|
||||
/* connects are attempted from when heartbeat comes up until either hb
|
||||
* goes down, the node is unconfigured, no connect attempts succeed
|
||||
* before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work
|
||||
* is queued from set_nn_state both from hb up and from itself if a
|
||||
* connect attempt fails and so can be self-arming. shutdown is
|
||||
* careful to first mark the nn such that no connects will be attempted
|
||||
* before canceling delayed connect work and flushing the queue. */
|
||||
struct delayed_work nn_connect_work;
|
||||
unsigned long nn_last_connect_attempt;
|
||||
|
||||
/* this is queued as nodes come up and is canceled when a connection is
|
||||
* established. this expiring gives up on the node and errors out
|
||||
* transmits */
|
||||
struct delayed_work nn_connect_expired;
|
||||
|
||||
/* after we give up on a socket we wait a while before deciding
|
||||
* that it is still heartbeating and that we should do some
|
||||
* quorum work */
|
||||
struct delayed_work nn_still_up;
|
||||
};
|
||||
|
||||
struct o2net_sock_container {
|
||||
struct kref sc_kref;
|
||||
/* the next two are valid for the life time of the sc */
|
||||
struct socket *sc_sock;
|
||||
struct o2nm_node *sc_node;
|
||||
|
||||
/* all of these sc work structs hold refs on the sc while they are
|
||||
* queued. they should not be able to ref a freed sc. the teardown
|
||||
* race is with o2net_wq destruction in o2net_stop_listening() */
|
||||
|
||||
/* rx and connect work are generated from socket callbacks. sc
|
||||
* shutdown removes the callbacks and then flushes the work queue */
|
||||
struct work_struct sc_rx_work;
|
||||
struct work_struct sc_connect_work;
|
||||
/* shutdown work is triggered in two ways. the simple way is
|
||||
* for a code path calls ensure_shutdown which gets a lock, removes
|
||||
* the sc from the nn, and queues the work. in this case the
|
||||
* work is single-shot. the work is also queued from a sock
|
||||
* callback, though, and in this case the work will find the sc
|
||||
* still on the nn and will call ensure_shutdown itself.. this
|
||||
* ends up triggering the shutdown work again, though nothing
|
||||
* will be done in that second iteration. so work queue teardown
|
||||
* has to be careful to remove the sc from the nn before waiting
|
||||
* on the work queue so that the shutdown work doesn't remove the
|
||||
* sc and rearm itself.
|
||||
*/
|
||||
struct work_struct sc_shutdown_work;
|
||||
|
||||
struct timer_list sc_idle_timeout;
|
||||
struct delayed_work sc_keepalive_work;
|
||||
|
||||
unsigned sc_handshake_ok:1;
|
||||
|
||||
struct page *sc_page;
|
||||
size_t sc_page_off;
|
||||
|
||||
/* original handlers for the sockets */
|
||||
void (*sc_state_change)(struct sock *sk);
|
||||
void (*sc_data_ready)(struct sock *sk, int bytes);
|
||||
|
||||
u32 sc_msg_key;
|
||||
u16 sc_msg_type;
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
struct list_head sc_net_debug_item;
|
||||
ktime_t sc_tv_timer;
|
||||
ktime_t sc_tv_data_ready;
|
||||
ktime_t sc_tv_advance_start;
|
||||
ktime_t sc_tv_advance_stop;
|
||||
ktime_t sc_tv_func_start;
|
||||
ktime_t sc_tv_func_stop;
|
||||
#endif
|
||||
#ifdef CONFIG_OCFS2_FS_STATS
|
||||
ktime_t sc_tv_acquiry_total;
|
||||
ktime_t sc_tv_send_total;
|
||||
ktime_t sc_tv_status_total;
|
||||
u32 sc_send_count;
|
||||
u32 sc_recv_count;
|
||||
ktime_t sc_tv_process_total;
|
||||
#endif
|
||||
struct mutex sc_send_lock;
|
||||
};
|
||||
|
||||
struct o2net_msg_handler {
|
||||
struct rb_node nh_node;
|
||||
u32 nh_max_len;
|
||||
u32 nh_msg_type;
|
||||
u32 nh_key;
|
||||
o2net_msg_handler_func *nh_func;
|
||||
o2net_msg_handler_func *nh_func_data;
|
||||
o2net_post_msg_handler_func
|
||||
*nh_post_func;
|
||||
struct kref nh_kref;
|
||||
struct list_head nh_unregister_item;
|
||||
};
|
||||
|
||||
/*
 * o2net-level status codes.  O2NET_ERR_NONE is pinned to 0 and the rest
 * count up from it; O2NET_ERR_MAX is one past the last real code.
 */
enum o2net_system_error {
	O2NET_ERR_NONE = 0,
	O2NET_ERR_NO_HNDLR,
	O2NET_ERR_OVERFLOW,
	O2NET_ERR_DIED,
	O2NET_ERR_MAX
};
|
||||
|
||||
struct o2net_status_wait {
|
||||
enum o2net_system_error ns_sys_status;
|
||||
s32 ns_status;
|
||||
int ns_id;
|
||||
wait_queue_head_t ns_wq;
|
||||
struct list_head ns_node_item;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
/* just for state dumps */
|
||||
struct o2net_send_tracking {
|
||||
struct list_head st_net_debug_item;
|
||||
struct task_struct *st_task;
|
||||
struct o2net_sock_container *st_sc;
|
||||
u32 st_id;
|
||||
u32 st_msg_type;
|
||||
u32 st_msg_key;
|
||||
u8 st_node;
|
||||
ktime_t st_sock_time;
|
||||
ktime_t st_send_time;
|
||||
ktime_t st_status_time;
|
||||
};
|
||||
#else
|
||||
struct o2net_send_tracking {
|
||||
u32 dummy;
|
||||
};
|
||||
#endif /* CONFIG_DEBUG_FS */
|
||||
|
||||
#endif /* O2CLUSTER_TCP_INTERNAL_H */
|
42
drivers/staging/ramster/cluster/ver.c
Normal file
42
drivers/staging/ramster/cluster/ver.c
Normal file
@ -0,0 +1,42 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* ver.c
|
||||
*
|
||||
* version string
|
||||
*
|
||||
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include "ver.h"
|
||||
|
||||
#define CLUSTER_BUILD_VERSION "1.5.0"
|
||||
|
||||
#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
|
||||
|
||||
void cluster_print_version(void)
|
||||
{
|
||||
printk(KERN_INFO "%s\n", VERSION_STR);
|
||||
}
|
||||
|
||||
MODULE_DESCRIPTION(VERSION_STR);
|
||||
|
||||
MODULE_VERSION(CLUSTER_BUILD_VERSION);
|
31
drivers/staging/ramster/cluster/ver.h
Normal file
31
drivers/staging/ramster/cluster/ver.h
Normal file
@ -0,0 +1,31 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* ver.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_VER_H
|
||||
#define O2CLUSTER_VER_H
|
||||
|
||||
void cluster_print_version(void);
|
||||
|
||||
#endif /* O2CLUSTER_VER_H */
|
Loading…
Reference in New Issue
Block a user