features/bitrot: cleanup, v1

This is a short series of patches (with other cleanups) aimed at
cleaning up some of the incorrect assumptions taken in reconfigure()
leading to crashes when subvolumes are not fully initialized (as
reported here[1] on gluster-devel@). Furthermore, there is some
amount of code cleanup to handle disconnection and cleanup up data
structure (as part of subsequent patch).

[1] http://www.gluster.org/pipermail/gluster-devel/2015-June/045410.html

Change-Id: I68ac4bccfbac4bf02fcc31615bd7d2d191021132
BUG: 1231617
Signed-off-by: Venky Shankar <vshankar@redhat.com>
Reviewed-on: http://review.gluster.org/11147
Tested-by: NetBSD Build System <jenkins@build.gluster.org>
Reviewed-by: Raghavendra Bhat <raghavendra@redhat.com>
This commit is contained in:
Venky Shankar 2015-06-02 21:23:48 +05:30 committed by Raghavendra Bhat
parent 9b1305e549
commit 17b838ce18
3 changed files with 280 additions and 135 deletions

View File

@ -25,6 +25,18 @@
#define BR_HASH_CALC_READ_SIZE (128 * 1024)
typedef int32_t (br_child_handler)(xlator_t *, br_child_t *);
struct br_child_event {
xlator_t *this;
br_child_t *child;
br_child_handler *call;
struct list_head list;
};
static int
br_find_child_index (xlator_t *this, xlator_t *child)
{
@ -49,26 +61,6 @@ out:
return index;
}
static void
br_free_children (xlator_t *this)
{
br_private_t *priv = NULL;
int32_t i = 0;
br_child_t *child = NULL;
priv = this->private;
for (i = 0; i < priv->child_count; i++) {
child = &priv->children[i];
mem_pool_destroy (child->timer_pool);
list_del_init (&priv->children[i].list);
}
GF_FREE (priv->children);
priv->children = NULL;
}
br_child_t *
br_get_child_from_brick_path (xlator_t *this, char *brick_path)
{
@ -1085,6 +1077,16 @@ br_oneshot_signer (void *arg)
return NULL;
}
static void
br_set_child_state (br_child_t *child, br_child_state_t state)
{
LOCK (&child->lock);
{
_br_set_child_state (child, state);
}
UNLOCK (&child->lock);
}
/**
* At this point a thread is spawned to crawl the filesystem (in
* tortoise pace) to sign objects that were not signed in previous run(s).
@ -1172,7 +1174,12 @@ br_enact_scrubber (xlator_t *this, br_child_t *child)
goto error_return;
}
ret = br_fsscan_schedule (this, child, fsscan, fsscrub);
/* this needs to be serialized with reconfigure() */
pthread_mutex_lock (&priv->lock);
{
ret = br_fsscan_schedule (this, child, fsscan, fsscrub);
}
pthread_mutex_unlock (&priv->lock);
if (ret)
goto error_return;
@ -1197,6 +1204,30 @@ br_enact_scrubber (xlator_t *this, br_child_t *child)
return -1;
}
static int32_t
br_child_enaction (xlator_t *this, br_child_t *child, br_stub_init_t *stub)
{
int32_t ret = -1;
br_private_t *priv = this->private;
LOCK (&child->lock);
{
if (priv->iamscrubber)
ret = br_enact_scrubber (this, child);
else
ret = br_enact_signer (this, child, stub);
if (!ret) {
_br_set_child_state (child, BR_CHILD_STATE_CONNECTED);
gf_log (this->name, GF_LOG_INFO,
"Connected to brick %s..", child->brick_path);
}
}
UNLOCK (&child->lock);
return ret;
}
/**
* This routine fetches various attributes associated with a child which
* is basically a subvolume. Attributes include brick path and the stub
@ -1204,7 +1235,7 @@ br_enact_scrubber (xlator_t *this, br_child_t *child)
* by getxattr() on a virtual key. Depending on the configuration, the
* process either acts as a signer or a scrubber.
*/
static inline int32_t
int32_t
br_brick_connect (xlator_t *this, br_child_t *child)
{
int32_t ret = -1;
@ -1213,14 +1244,13 @@ br_brick_connect (xlator_t *this, br_child_t *child)
struct iatt parent = {0, };
br_stub_init_t *stub = NULL;
dict_t *xattr = NULL;
br_private_t *priv = NULL;
int op_errno = 0;
GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
GF_VALIDATE_OR_GOTO (this->name, child, out);
GF_VALIDATE_OR_GOTO (this->name, this->private, out);
priv = this->private;
br_set_child_state (child, BR_CHILD_STATE_INITIALIZING);
loc.inode = inode_ref (child->table->root);
gf_uuid_copy (loc.gfid, loc.inode->gfid);
@ -1257,20 +1287,15 @@ br_brick_connect (xlator_t *this, br_child_t *child)
child->tv.tv_sec = ntohl (stub->timebuf[0]);
child->tv.tv_usec = ntohl (stub->timebuf[1]);
if (priv->iamscrubber)
ret = br_enact_scrubber (this, child);
else
ret = br_enact_signer (this, child, stub);
if (!ret)
gf_msg (this->name, GF_LOG_INFO, 0, BRB_MSG_CONNECTED_TO_BRICK,
"Connected to brick %s..", child->brick_path);
ret = br_child_enaction (this, child, stub);
free_dict:
dict_unref (xattr);
wipeloc:
loc_wipe (&loc);
out:
if (ret)
br_set_child_state (child, BR_CHILD_STATE_CONNFAILED);
return ret;
}
@ -1285,7 +1310,8 @@ br_handle_events (void *arg)
int32_t ret = 0;
xlator_t *this = NULL;
br_private_t *priv = NULL;
br_child_t *child = NULL;
br_child_t *child = NULL;
struct br_child_event *childev = NULL;
this = arg;
priv = this->private;
@ -1304,17 +1330,20 @@ br_handle_events (void *arg)
while (list_empty (&priv->bricks))
pthread_cond_wait (&priv->cond, &priv->lock);
child = list_first_entry
(&priv->bricks, br_child_t, list);
list_del_init (&child->list);
childev = list_first_entry
(&priv->bricks, struct br_child_event, list);
list_del_init (&childev->list);
}
pthread_mutex_unlock (&priv->lock);
ret = br_brick_connect (this, child);
child = childev->child;
ret = childev->call (this, child);
if (ret)
gf_msg (this->name, GF_LOG_ERROR, 0,
BRB_MSG_SUBVOL_CONNECT_FAILED, "failed to "
"connect to subvolume %s", child->xl->name);
BRB_MSG_SUBVOL_CONNECT_FAILED,
"callback handler for subvolume [%s] failed",
child->xl->name);
GF_FREE (childev);
}
return NULL;
@ -1339,6 +1368,29 @@ mem_acct_init (xlator_t *this)
return ret;
}
static void
_br_qchild_event (xlator_t *this, br_child_t *child, br_child_handler *call)
{
br_private_t *priv = NULL;
struct br_child_event *childev = NULL;
priv = this->private;
childev = GF_CALLOC (1, sizeof (*childev), gf_br_mt_br_child_event_t);
if (!childev) {
gf_log (this->name, GF_LOG_ERROR, "Event unhandled for "
"child.. [Brick: %s]", child->xl->name);
return;
}
INIT_LIST_HEAD (&childev->list);
childev->this = this;
childev->child = child;
childev->call = call;
list_add_tail (&childev->list, &priv->bricks);
}
int
notify (xlator_t *this, int32_t event, void *data, ...)
{
@ -1368,14 +1420,14 @@ notify (xlator_t *this, int32_t event, void *data, ...)
child = &priv->children[idx];
if (child->child_up == 1)
goto unblock;
priv->up_children++;
child->child_up = 1;
child->xl = subvol;
child->table = inode_table_new (4096, subvol);
if (!child->table)
child->table = inode_table_new (4096, subvol);
priv->up_children++;
list_add_tail (&child->list, &priv->bricks);
_br_qchild_event (this, child, br_brick_connect);
pthread_cond_signal (&priv->cond);
}
unblock:
@ -1405,6 +1457,7 @@ notify (xlator_t *this, int32_t event, void *data, ...)
if (priv->up_children == 0)
default_notify (this, event, data);
break;
default:
default_notify (this, event, data);
}
@ -1558,59 +1611,94 @@ br_signer_init (xlator_t *this, br_private_t *priv)
}
static void
br_free_children (xlator_t *this, br_private_t *priv, int count)
{
br_child_t *child = NULL;
for (--count; count >= 0; count--) {
child = &priv->children[count];
mem_pool_destroy (child->timer_pool);
LOCK_DESTROY (&child->lock);
}
GF_FREE (priv->children);
priv->children = NULL;
}
static int
br_init_children (xlator_t *this, br_private_t *priv)
{
int i = 0;
br_child_t *child = NULL;
xlator_list_t *trav = NULL;
priv->child_count = xlator_subvolume_count (this);
priv->children = GF_CALLOC (priv->child_count, sizeof (*priv->children),
gf_br_mt_br_child_t);
if (!priv->children)
goto err;
trav = this->children;
while (trav) {
child = &priv->children[i];
LOCK_INIT (&child->lock);
br_set_child_state (child, BR_CHILD_STATE_DISCONNECTED);
child->this = this;
child->xl = trav->xlator;
child->timer_pool = mem_pool_new
(struct gf_tw_timer_list, 4096);
if (!child->timer_pool) {
gf_log (this->name, GF_LOG_ERROR,
"failed to allocate mem-pool for timer");
errno = ENOMEM;
goto freechild;
}
INIT_LIST_HEAD (&child->list);
i++;
trav = trav->next;
}
return 0;
freechild:
br_free_children (this, priv, i);
err:
return -1;
}
int32_t
init (xlator_t *this)
{
int i = 0;
int32_t ret = -1;
br_private_t *priv = NULL;
xlator_list_t *trav = NULL;
int32_t ret = -1;
br_private_t *priv = NULL;
if (!this->children) {
gf_msg (this->name, GF_LOG_ERROR, 0, BRB_MSG_NO_CHILD,
"FATAL: no children");
goto out;
}
if (!this->children) {
gf_log (this->name, GF_LOG_ERROR, "FATAL: no children");
goto out;
}
priv = GF_CALLOC (1, sizeof (*priv), gf_br_mt_br_private_t);
if (!priv) {
gf_msg (this->name, GF_LOG_ERROR, ENOMEM, BRB_MSG_NO_MEMORY,
gf_log (this->name, GF_LOG_ERROR,
"failed to allocate memory (->priv)");
goto out;
}
GF_OPTION_INIT ("scrubber", priv->iamscrubber, bool, out);
priv->child_count = xlator_subvolume_count (this);
priv->children = GF_CALLOC (priv->child_count, sizeof (*priv->children),
gf_br_mt_br_child_t);
if (!priv->children)
ret = br_init_children (this, priv);
if (ret)
goto free_priv;
trav = this->children;
while (trav) {
priv->children[i].this = this;
priv->children[i].xl = trav->xlator;
priv->children[i].timer_pool =
mem_pool_new (struct gf_tw_timer_list, 4096);
if (!priv->children[i].timer_pool) {
gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
BRB_MSG_NO_MEMORY, "failed to allocate mem-pool"
" for timer");
errno = ENOMEM;
goto free_children;
}
i++;
trav = trav->next;
}
pthread_mutex_init (&priv->lock, NULL);
pthread_cond_init (&priv->cond, NULL);
for (i = 0; i < priv->child_count; i++)
INIT_LIST_HEAD (&priv->children[i].list);
INIT_LIST_HEAD (&priv->bricks);
INIT_LIST_HEAD (&priv->signing);
@ -1619,7 +1707,7 @@ init (xlator_t *this)
gf_msg (this->name, GF_LOG_ERROR, 0,
BRB_MSG_TIMER_WHEEL_UNAVAILABLE,
"global timer wheel unavailable");
goto cleanup_mutex;
goto cleanup;
}
this->private = priv;
@ -1635,7 +1723,7 @@ init (xlator_t *this)
}
if (ret)
goto cleanup_mutex;
goto cleanup;
ret = gf_thread_create (&priv->thread, NULL, br_handle_events, this);
if (ret != 0) {
@ -1651,16 +1739,12 @@ init (xlator_t *this)
return 0;
}
cleanup_mutex:
cleanup:
(void) pthread_cond_destroy (&priv->cond);
(void) pthread_mutex_destroy (&priv->lock);
free_children:
for (i = 0; i < priv->child_count; i++) {
if (priv->children[i].timer_pool)
mem_pool_destroy (priv->children[i].timer_pool);
}
GF_FREE (priv->children);
br_free_children (this, priv, priv->child_count);
free_priv:
GF_FREE (priv);
out:
@ -1678,7 +1762,7 @@ fini (xlator_t *this)
if (!priv->iamscrubber)
br_fini_signer (this, priv);
br_free_children (this);
br_free_children (this, priv, priv->child_count);
this->private = NULL;
GF_FREE (priv);
@ -1686,64 +1770,96 @@ fini (xlator_t *this)
return;
}
int
reconfigure (xlator_t *this, dict_t *options)
static void
br_reconfigure_child (xlator_t *this,
br_child_t *child, struct br_scrubber *fsscrub)
{
int i = 0;
int32_t ret = -1;
br_child_t *child = NULL;
br_private_t *priv = NULL;
struct br_scanfs *fsscan = NULL;
int32_t ret = 0;
struct br_scanfs *fsscan = &child->fsscan;
ret = br_fsscan_reschedule (this, child, fsscan, fsscrub, _gf_true);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"Could not reschedule scrubber for brick: %s. "
"Scubbing will continue according to old frequency.",
child->brick_path);
}
}
static int
br_reconfigure_scrubber (xlator_t *this, dict_t *options)
{
int i = 0;
int32_t ret = -1;
br_child_t *child = NULL;
br_private_t *priv = NULL;
struct br_scrubber *fsscrub = NULL;
priv = this->private;
if (!priv->iamscrubber) {
ret = br_signer_handle_options (this, priv, options);
if (ret)
goto err;
return 0;
}
ret = br_scrubber_handle_options (this, priv, options);
if (ret)
goto err;
fsscrub = &priv->fsscrub;
/* reschedule all _up_ subvolume(s) */
pthread_mutex_lock (&priv->lock);
{
for (; i < priv->child_count; i++) {
child = &priv->children[i];
if (!child->child_up) {
gf_msg (this->name, GF_LOG_INFO, 0,
BRB_MSG_BRICK_INFO, "Brick %s is "
"offline, skipping rescheduling (scrub"
" would auto- schedule when brick is "
"back online).", child->brick_path);
continue;
}
fsscan = &child->fsscan;
ret = br_fsscan_reschedule (this, child,
fsscan, fsscrub, _gf_true);
if (ret) {
gf_msg (this->name, GF_LOG_ERROR, 0,
BRB_MSG_RESCHEDULE_SCRUBBER_FAILED,
"Could not reschedule scrubber for "
"brick: %s. Scubbing will continue "
"according to old frequency.",
child->brick_path);
}
}
ret = br_scrubber_handle_options (this, priv, options);
}
pthread_mutex_unlock (&priv->lock);
return 0;
if (ret)
goto err;
/* reschedule all _up_ subvolume(s) */
for (; i < priv->child_count; i++) {
child = &priv->children[i];
LOCK (&child->lock);
{
if (_br_child_failed_conn (child)) {
gf_log (this->name, GF_LOG_INFO,
"Scrubber for brick [%s] failed "
"initialization, rescheduling is "
"skipped", child->brick_path);
goto unblock;
}
if (_br_is_child_connected (child))
br_reconfigure_child (this, child, fsscrub);
/**
* for the rest.. either the child is in initialization
* phase or is disconnected. either way, updated values
* would be reflected on successful connection.
*/
}
unblock:
UNLOCK (&child->lock);
}
err:
return -1;
return ret;
}
static int
br_reconfigure_signer (xlator_t *this, dict_t *options)
{
br_private_t *priv = this->private;
return br_signer_handle_options (this, priv, options);
}
int
reconfigure (xlator_t *this, dict_t *options)
{
int ret = 0;
br_private_t *priv = NULL;
priv = this->private;
if (priv->iamscrubber)
ret = br_reconfigure_scrubber (this, options);
else
ret = br_reconfigure_signer (this, options);
return ret;
}
struct xlator_fops fops;

View File

@ -71,7 +71,18 @@ struct br_scanfs {
struct gf_tw_timer_list *timer;
};
/* just need three states to track child status */
typedef enum br_child_state {
BR_CHILD_STATE_CONNECTED = 1,
BR_CHILD_STATE_INITIALIZING,
BR_CHILD_STATE_CONNFAILED,
BR_CHILD_STATE_DISCONNECTED,
} br_child_state_t;
struct br_child {
gf_lock_t lock;
br_child_state_t c_state;
char child_up; /* Indicates whether this child is
up or not */
xlator_t *xl; /* client xlator corresponding to
@ -135,8 +146,8 @@ typedef struct br_obj_n_workers br_obj_n_workers_t;
struct br_private {
pthread_mutex_t lock;
struct list_head bricks; /* list of bricks from which CHILD_UP
has been received */
struct list_head bricks; /* list of bricks from which enents
have been received */
struct list_head signing;
@ -202,5 +213,22 @@ br_prepare_loc (xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *);
gf_boolean_t
bitd_is_bad_file (xlator_t *, br_child_t *, loc_t *, fd_t *);
static inline void
_br_set_child_state (br_child_t *child, br_child_state_t state)
{
child->c_state = state;
}
static inline int
_br_is_child_connected (br_child_t *child)
{
return (child->c_state == BR_CHILD_STATE_CONNECTED);
}
static inline int
_br_child_failed_conn (br_child_t *child)
{
return (child->c_state == BR_CHILD_STATE_CONNFAILED);
}
#endif /* __BIT_ROT_H__ */

View File

@ -31,6 +31,7 @@ enum br_mem_types {
gf_br_stub_mt_br_stub_fd_t,
gf_br_stub_mt_br_scanner_freq_t,
gf_br_stub_mt_sigstub_t,
gf_br_mt_br_child_event_t,
gf_br_stub_mt_end,
};