features/bit-rot: filesystem scrubber

Scrubber performs signature verification for objects that were
signed by signer. This is done by recalculating the signature
(using the hash algorithm the object was signed with) and
verifying it against the object's persisted signature. Since the
object could be undergoing an IO operation at the time of hash
calculation, the signature may not match the object's persisted
signature. Bitrot stub provides additional information about
the staleness of an object's signature (determined by its
versioning mechanism). This additional bit of information is
used by scrubber to determine the staleness of the signature,
and in such cases the object is skipped verification (although
the signature staleness check is performed twice: once before
initiation of hash calculation and once after it, since an object
could be modified after the first staleness check).

The implementation is a part of the bitrot xlator (signer) which
acts as a signer or scrubber based on a translator option. As
of now the scrub process is ever running (but has some form of
weak throttling mechanism during filesystem scan). Going forward,
there needs to be some form of scrub scheduling and IO throttling
(during hash calculation) tunables (via CLI).

Change-Id: I665ce90208f6074b98c5a1dd841ce776627cc6f9
BUG: 1170075
Original-Author: Raghavendra Bhat <rabhat@redhat.com>
Original-Author: Venky Shankar <vshankar@redhat.com>
Signed-off-by: Venky Shankar <vshankar@redhat.com>
Reviewed-on: http://review.gluster.org/9914
Tested-by: Vijay Bellur <vbellur@redhat.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
This commit is contained in:
Venky Shankar 2015-03-13 21:23:20 +05:30 committed by Vijay Bellur
parent 7927e8747c
commit 866c64ba5e
7 changed files with 503 additions and 61 deletions

View File

@ -118,6 +118,7 @@ enum _gf_client_pid
GF_CLIENT_PID_AFR_SELF_HEALD = -6,
GF_CLIENT_PID_GLFS_HEAL = -7,
GF_CLIENT_PID_BITD = -8,
GF_CLIENT_PID_SCRUB = -9,
};
enum _gf_xlator_ipc_targets {

View File

@ -9,11 +9,11 @@ AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
-I$(CONTRIBDIR)/timer-wheel \
-I$(top_srcdir)/xlators/features/bit-rot/src/stub
bit_rot_la_SOURCES = bit-rot.c
bit_rot_la_SOURCES = bit-rot.c bit-rot-scrub.c
bit_rot_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la \
$(top_builddir)/xlators/features/changelog/lib/src/libgfchangelog.la
noinst_HEADERS = bit-rot.h
noinst_HEADERS = bit-rot.h bit-rot-scrub.h
AM_CFLAGS = -Wall $(GF_CFLAGS)

View File

@ -0,0 +1,291 @@
/*
Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
General Public License, version 3 or any later version (LGPLv3 or
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif
#include <ctype.h>
#include <sys/uio.h>
#include "glusterfs.h"
#include "xlator.h"
#include "logging.h"
#include "bit-rot.h"
#include "bit-rot-scrub.h"
#include <pthread.h>
/**
 * Fetch the signature information of the object behind @fd.
 *
 * Issues an fgetxattr for GLUSTERFS_GET_OBJECT_SIGNATURE on child->xl
 * and copies the returned br_isignature_out_t into @sign.
 *
 * Returns 0 on success (with @sign filled in), the negative value
 * returned by syncop_fgetxattr() when the getxattr fails, or the
 * non-zero result of dict_get_ptr() when the key cannot be extracted
 * from the reply. @sign is left untouched on failure.
 */
static inline int32_t
bitd_fetch_signature (xlator_t *this,
                      br_child_t *child, fd_t *fd, br_isignature_out_t *sign)
{
        dict_t              *reply  = NULL;
        br_isignature_out_t *stored = NULL;
        int32_t              op_ret = 0;

        op_ret = syncop_fgetxattr (child->xl, fd, &reply,
                                   GLUSTERFS_GET_OBJECT_SIGNATURE, NULL);
        if (op_ret < 0) {
                br_log_object (this, "getxattr", fd->inode->gfid, -op_ret);
                return op_ret;
        }

        op_ret = dict_get_ptr (reply, GLUSTERFS_GET_OBJECT_SIGNATURE,
                               (void **)&stored);
        if (op_ret == 0)
                (void) memcpy (sign, stored, sizeof (*sign));
        else
                gf_log (this->name, GF_LOG_ERROR,
                        "failed to extract signature info [GFID: %s]",
                        uuid_utoa (fd->inode->gfid));

        /* the reply dictionary is released on both outcomes */
        dict_unref (reply);
        return op_ret;
}
/**
 * Staleness check performed after the checksum has been computed.
 *
 * Re-fetches the object's signature into @sign (the caller goes on to
 * compare against it) and reports whether it is marked stale.
 *
 * Returns 0 when the signature was fetched and is not stale, -1 when
 * it is stale, or the error from bitd_fetch_signature().
 */
static inline int32_t
bitd_scrub_post_compute_check (xlator_t *this,
                               br_child_t *child,
                               br_isignature_out_t *sign, fd_t *fd)
{
        int32_t status = bitd_fetch_signature (this, child, fd, sign);

        if (status != 0)
                return status;

        return (sign->stale) ? -1 : 0;
}
/**
 * Checks performed before the (expensive) checksum computation.
 *
 * The object is rejected when it is already marked bad or when its
 * signature is stale.
 *
 * Returns 0 when the object should be scrubbed, -1 when it is bad or
 * stale, or the error from bitd_fetch_signature().
 */
static inline int32_t
bitd_scrub_pre_compute_check (xlator_t *this, br_child_t *child, fd_t *fd)
{
        br_isignature_out_t current = {0,};
        int32_t             status  = 0;

        /* an object already flagged as bad needs no further checking */
        if (bitd_is_bad_file (this, child, NULL, fd))
                return -1;

        /* otherwise look at the staleness of its signature */
        status = bitd_fetch_signature (this, child, fd, &current);
        if (status)
                return status;

        return (current.stale) ? -1 : 0;
}
/**
 * Compare the freshly calculated checksum @md with the signature held
 * in @sign. On a mismatch the object is flagged as bad by setting the
 * "trusted.glusterfs.bad-file" xattr through @fd on child->xl.
 *
 * Returns 0 when the checksums match or when the object was marked
 * bad successfully; non-zero when an argument is missing, the
 * dictionary could not be set up, or the setxattr failed.
 */
static inline int
bitd_compare_ckum (xlator_t *this,
                   br_isignature_out_t *sign,
                   unsigned char *md, inode_t *linked_inode,
                   gf_dirent_t *entry, fd_t *fd, br_child_t *child)
{
        int     ret   = -1;
        dict_t *xattr = NULL;

        GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
        GF_VALIDATE_OR_GOTO (this->name, sign, out);
        GF_VALIDATE_OR_GOTO (this->name, fd, out);
        GF_VALIDATE_OR_GOTO (this->name, child, out);
        GF_VALIDATE_OR_GOTO (this->name, linked_inode, out);
        GF_VALIDATE_OR_GOTO (this->name, md, out);
        GF_VALIDATE_OR_GOTO (this->name, entry, out);

        /*
         * NOTE(review): strncmp()/strlen() treat the signature as a NUL
         * terminated string. If signatures are raw digests that may
         * contain zero bytes the comparison stops early -- confirm and
         * consider memcmp() with an explicit length.
         */
        if (strncmp (sign->signature, (char *)md,
                     strlen (sign->signature)) == 0) {
                /* matching checksums are the success case */
                ret = 0;
                goto out;
        }

        gf_log (this->name, GF_LOG_WARNING, "checksums does not match "
                "for the entry %s (gfid: %s)", entry->d_name,
                uuid_utoa (linked_inode->gfid));

        /*
         * Use a properly constructed (refcounted) dictionary instead of
         * a zero-filled one on the stack, and release it when done.
         */
        xattr = dict_new ();
        if (!xattr) {
                gf_log (this->name, GF_LOG_ERROR, "failed to allocate "
                        "dict for bad-file (entry: %s, gfid: %s)",
                        entry->d_name, uuid_utoa (linked_inode->gfid));
                ret = -1;
                goto out;
        }

        ret = dict_set_int32 (xattr, "trusted.glusterfs.bad-file",
                              _gf_true);
        if (ret) {
                gf_log (this->name, GF_LOG_ERROR, "dict-set for "
                        "bad-file (entry: %s, gfid: %s) failed",
                        entry->d_name, uuid_utoa (linked_inode->gfid));
                goto unref_dict;
        }

        ret = syncop_fsetxattr (child->xl, fd, xattr, 0);
        if (ret)
                gf_log (this->name, GF_LOG_ERROR, "setxattr to mark "
                        "the file %s (gfid: %s) as bad failed",
                        entry->d_name, uuid_utoa (linked_inode->gfid));

 unref_dict:
        dict_unref (xattr);
 out:
        return ret;
}
/**
 * This is the scrubber. As of now there's a mix of fd and inode
 * operations. Better to move them to fd based to be clean and
 * avoid code cluttering.
 *
 * Invoked per directory entry by the filesystem walk started in
 * br_scrubber(); @data is the br_child_t being scrubbed. The entry is
 * looked up, and for regular files an fd is opened, the signature
 * staleness is checked, the checksum is calculated, staleness is
 * checked again and finally the checksum is compared against the
 * persisted signature.
 *
 * Returns 0 for entries that are not regular files and for objects
 * that were verified (or successfully marked bad); non-zero when the
 * object was skipped or an operation failed.
 */
int
bitd_start_scrub (xlator_t *subvol,
                  gf_dirent_t *entry, loc_t *parent, void *data)
{
        int32_t              ret          = -1;
        fd_t                *fd           = NULL;
        loc_t                loc          = {0, };
        struct iatt          iatt         = {0, };
        struct iatt          parent_buf   = {0, };
        pid_t                pid          = 0;
        br_child_t          *child        = NULL;
        xlator_t            *this         = NULL;
        unsigned char       *md           = NULL;
        inode_t             *linked_inode = NULL;
        br_isignature_out_t  sign         = {0,};

        GF_VALIDATE_OR_GOTO ("bit-rot", subvol, out);
        GF_VALIDATE_OR_GOTO ("bit-rot", data, out);

        child = data;
        this = child->this;
        pid = GF_CLIENT_PID_SCRUB;

        ret = br_prepare_loc (this, child, parent, entry, &loc);
        if (!ret)
                goto out;

        /* operations below are issued with the scrubber's pid */
        syncopctx_setfspid (&pid);

        ret = syncop_lookup (child->xl, &loc, NULL, &iatt, NULL, &parent_buf);
        if (ret) {
                br_log_object_path (this, "lookup", loc.path, -ret);
                goto out;
        }

        linked_inode = inode_link (loc.inode, parent->inode, loc.name, &iatt);
        if (linked_inode)
                inode_lookup (linked_inode);

        if (iatt.ia_type != IA_IFREG) {
                gf_log (this->name, GF_LOG_DEBUG, "%s is not a regular "
                        "file", entry->d_name);
                ret = 0;
                goto unref_inode;
        }

        /*
         * Everything below dereferences linked_inode, so bail out if
         * linking failed instead of crashing later on.
         */
        if (!linked_inode) {
                gf_log (this->name, GF_LOG_ERROR, "failed to link inode "
                        "for %s", entry->d_name);
                ret = -1;
                goto out;
        }

        /**
         * open() an fd for subsequent operations
         */
        fd = fd_create (linked_inode, 0);
        if (!fd) {
                gf_log (this->name, GF_LOG_ERROR, "failed to create fd for "
                        "inode %s", uuid_utoa (linked_inode->gfid));
                /* ret still holds 0 from the lookup: report the failure */
                ret = -1;
                goto unref_inode;
        }

        ret = syncop_open (child->xl, &loc, O_RDWR, fd);
        if (ret) {
                br_log_object (this, "open", linked_inode->gfid, -ret);
                ret = -1;
                goto unrefd;
        }

        fd_bind (fd);

        /**
         * perform pre compute checks before initiating checksum
         * computation
         *  - presence of bad object
         *  - signature staleness
         */
        ret = bitd_scrub_pre_compute_check (this, child, fd);
        if (ret)
                goto unrefd; /* skip this object */

        /* if all's good, proceed to calculate the hash */
        md = GF_CALLOC (SHA256_DIGEST_LENGTH, sizeof (*md),
                        gf_common_mt_char);
        if (!md) {
                /* ret is 0 from the pre-compute check: report the failure */
                ret = -1;
                goto unrefd;
        }

        ret = br_calculate_obj_checksum (md, child, fd, &iatt);
        if (ret) {
                gf_log (this->name, GF_LOG_ERROR, "error calculating hash "
                        "for object [GFID: %s]", uuid_utoa (fd->inode->gfid));
                ret = -1;
                goto free_md;
        }

        /**
         * perform post compute checks as an object's signature may have
         * become stale while scrubber calculated checksum.
         */
        ret = bitd_scrub_post_compute_check (this, child, &sign, fd);
        if (ret)
                goto free_md;

        ret = bitd_compare_ckum (this, &sign, md,
                                 linked_inode, entry, fd, child);

        /** fd_unref() takes care of closing fd.. like syncop_close() */

 free_md:
        GF_FREE (md);
 unrefd:
        fd_unref (fd);
 unref_inode:
        inode_unref (linked_inode);
 out:
        loc_wipe (&loc);
        return ret;
}
/*
 * Throttle parameters handed to syncop_ftw_throttle(); their product is
 * also used as the pause, in seconds, between two filesystem passes.
 * NOTE(review): the unit in which syncop_ftw_throttle() interprets
 * BR_SCRUB_THROTTLE_ZZZ is not visible here -- confirm.
 */
#define BR_SCRUB_THROTTLE_COUNT 10
#define BR_SCRUB_THROTTLE_ZZZ   100

/**
 * Scrubber thread routine; @arg is the br_child_t to scrub.
 *
 * Walks the child's filesystem from the inode table root, calling
 * bitd_start_scrub() for the entries, then sleeps before starting the
 * next pass. The loop never terminates on its own.
 */
void *
br_scrubber (void *arg)
{
        br_child_t *child = arg;
        xlator_t   *this  = child->this;
        loc_t       root  = {0,};

        THIS = this;

        root.inode = child->table->root;

        for (;;) {
                (void) syncop_ftw_throttle (child->xl, &root,
                                            GF_CLIENT_PID_SCRUB, child,
                                            bitd_start_scrub,
                                            BR_SCRUB_THROTTLE_COUNT,
                                            BR_SCRUB_THROTTLE_ZZZ);

                sleep (BR_SCRUB_THROTTLE_ZZZ * BR_SCRUB_THROTTLE_COUNT);
        }

        return NULL;
}

View File

@ -0,0 +1,16 @@
/*
Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
General Public License, version 3 or any later version (LGPLv3 or
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
#ifndef __BIT_ROT_SCRUB_H__
#define __BIT_ROT_SCRUB_H__

/*
 * Scrubber thread start routine; spawned with a br_child_t * as its
 * argument.
 */
void *br_scrubber (void *);

#endif /* __BIT_ROT_SCRUB_H__ */

View File

@ -22,6 +22,7 @@
#include "compat-errno.h"
#include "bit-rot.h"
#include "bit-rot-scrub.h"
#include <pthread.h>
static int
@ -146,6 +147,38 @@ br_prepare_signature (const unsigned char *sign,
return signature;
}
/**
 * Tell whether an object has been marked bad (corrupted).
 *
 * The object is identified by @fd and/or @loc; at least one of them
 * must be given. The "trusted.glusterfs.bad-file" xattr is queried on
 * child->xl (through @fd when available, otherwise through @loc) and
 * the object is considered bad when that query succeeds.
 *
 * Returns _gf_true for a bad object, _gf_false otherwise (including
 * when the arguments are invalid or the xattr query fails).
 */
gf_boolean_t
bitd_is_bad_file (xlator_t *this, br_child_t *child, loc_t *loc, fd_t *fd)
{
        int32_t       ret      = -1;
        dict_t       *xattr    = NULL;
        inode_t      *inode    = NULL;
        gf_boolean_t  bad_file = _gf_false;

        GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
        GF_VALIDATE_OR_GOTO (this->name, child, out);

        /* without either handle there is nothing to query (or to log) */
        if (!loc && !fd) {
                gf_log (this->name, GF_LOG_ERROR, "neither loc nor fd "
                        "given to check for a bad object");
                goto out;
        }

        inode = (loc) ? loc->inode : fd->inode;

        if (fd)
                ret = syncop_fgetxattr (child->xl, fd, &xattr,
                                        "trusted.glusterfs.bad-file", NULL);
        else if (loc)
                ret = syncop_getxattr (child->xl, loc, &xattr,
                                       "trusted.glusterfs.bad-file", NULL);

        /* a successful getxattr means the marker is present */
        if (!ret) {
                gf_log (this->name, GF_LOG_ERROR, "[GFID: %s] is marked "
                        "corrupted", uuid_utoa (inode->gfid));
                bad_file = _gf_true;
        }

        if (xattr)
                dict_unref (xattr);

 out:
        return bad_file;
}
/**
* Do a lookup on the gfid present within the object.
*/
@ -222,6 +255,7 @@ br_object_open (xlator_t *this,
ret = syncop_open (object->child->xl, &loc, O_RDONLY, fd);
if (ret) {
br_log_object (this, "open", inode->gfid, -ret);
fd_unref (fd);
fd = NULL;
} else {
@ -284,8 +318,8 @@ br_object_read_block_and_sign (xlator_t *this, fd_t *fd, br_child_t *child,
}
int32_t
br_object_checksum (unsigned char *md,
br_object_t *object, fd_t *fd, struct iatt *iatt)
br_calculate_obj_checksum (unsigned char *md,
br_child_t *child, fd_t *fd, struct iatt *iatt)
{
int32_t ret = -1;
off_t offset = 0;
@ -294,16 +328,16 @@ br_object_checksum (unsigned char *md,
SHA256_CTX sha256;
GF_VALIDATE_OR_GOTO ("bit-rot", object, out);
GF_VALIDATE_OR_GOTO ("bit-rot", child, out);
GF_VALIDATE_OR_GOTO ("bit-rot", iatt, out);
GF_VALIDATE_OR_GOTO ("bit-rot", fd, out);
this = object->this;
this = child->this;
SHA256_Init (&sha256);
while (1) {
ret = br_object_read_block_and_sign (this, fd, object->child,
ret = br_object_read_block_and_sign (this, fd, child,
offset, block, &sha256);
if (ret < 0) {
gf_log (this->name, GF_LOG_ERROR, "reading block with "
@ -325,6 +359,13 @@ br_object_checksum (unsigned char *md,
return ret;
}
/**
 * Object based front end for br_calculate_obj_checksum(): the checksum
 * is calculated against the child the object belongs to.
 */
static inline int32_t
br_object_checksum (unsigned char *md,
                    br_object_t *object, fd_t *fd, struct iatt *iatt)
{
        br_child_t *owner = object->child;

        return br_calculate_obj_checksum (md, owner, fd, iatt);
}
static inline int32_t
br_object_read_sign (inode_t *linked_inode, fd_t *fd, br_object_t *object,
struct iatt *iatt)
@ -396,7 +437,8 @@ br_object_read_sign (inode_t *linked_inode, fd_t *fd, br_object_t *object,
/**
 * Classify an errno as a "soft" error: returns non-zero when @op_errno
 * is ENOENT, ESTALE or ENODATA, and 0 for anything else.
 *
 * ESTALE is compared with '==': an assignment there would overwrite
 * @op_errno and make the expression true for every input.
 */
static inline int br_object_sign_softerror (int32_t op_errno)
{
        return ((op_errno == ENOENT) || (op_errno == ESTALE)
                || (op_errno == ENODATA));
}
void
@ -459,8 +501,6 @@ static inline int32_t br_sign_object (br_object_t *object)
* we have an open file descriptor on the object. from here on,
* do not be generous to file operation errors.
*/
/* change this to DEBUG log level later */
gf_log (this->name, GF_LOG_DEBUG,
"Signing object [%s]", uuid_utoa (linked_inode->gfid));
@ -878,6 +918,9 @@ bitd_oneshot_crawl (xlator_t *subvol,
* if there are any fds present for that inode) and handle properly.
*/
if (bitd_is_bad_file (this, child, &loc, NULL))
goto unref_inode;
ret = syncop_getxattr (child->xl, &loc, &xattr,
GLUSTERFS_GET_OBJECT_SIGNATURE, NULL);
if (ret < 0) {
@ -993,11 +1036,26 @@ br_enact_signer (xlator_t *this, br_child_t *child, br_stub_init_t *stub)
return -1;
}
/**
 * Start scrubbing a child: spawn the br_scrubber() thread for it, with
 * the thread handle kept in child->thread.
 *
 * Returns 0 on success, -1 when the thread could not be created.
 */
static inline int32_t
br_enact_scrubber (xlator_t *this, br_child_t *child)
{
        if (gf_thread_create (&child->thread, NULL, br_scrubber, child) != 0) {
                gf_log (this->name, GF_LOG_ERROR, "failed to spawn scrubber");
                return -1;
        }

        return 0;
}
/**
* This routine fetches various attributes associated with a child which
* is basically a subvolume. Attributes include brick path and the stub
* birth time. This is done by performing a lookup on the root followed
* by getxattr() on a virtual key.
* by getxattr() on a virtual key. Depending on the configuration, the
* process either acts as a signer or a scrubber.
*/
static inline int32_t
br_brick_connect (xlator_t *this, br_child_t *child)
@ -1008,12 +1066,15 @@ br_brick_connect (xlator_t *this, br_child_t *child)
struct iatt parent = {0, };
br_stub_init_t *stub = NULL;
dict_t *xattr = NULL;
br_private_t *priv = NULL;
int op_errno = 0;
GF_VALIDATE_OR_GOTO ("bit-rot", this, out);
GF_VALIDATE_OR_GOTO (this->name, child, out);
GF_VALIDATE_OR_GOTO (this->name, this->private, out);
priv = this->private;
loc.inode = inode_ref (child->table->root);
uuid_copy (loc.gfid, loc.inode->gfid);
loc.path = gf_strdup ("/");
@ -1049,7 +1110,10 @@ br_brick_connect (xlator_t *this, br_child_t *child)
child->tv.tv_sec = ntohl (stub->timebuf[0]);
child->tv.tv_usec = ntohl (stub->timebuf[0]);
ret = br_enact_signer (this, child, stub);
if (priv->iamscrubber)
ret = br_enact_scrubber (this, child);
else
ret = br_enact_signer (this, child, stub);
free_dict:
dict_unref (xattr);
@ -1208,6 +1272,78 @@ out:
return 0;
}
/**
 * Tear down signer specific state: clean up each of the BR_WORKERS
 * worker threads via gf_thread_cleanup_xint(), destroy the object
 * condition variable and clean up the timer wheel.
 */
static inline void
br_fini_signer (xlator_t *this, br_private_t *priv)
{
        int worker = 0;

        while (worker < BR_WORKERS) {
                (void) gf_thread_cleanup_xint
                                (priv->obj_queue->workers[worker]);
                worker++;
        }

        pthread_cond_destroy (&priv->object_cond);

        gf_tw_cleanup_timers (priv->timer_wheel);
}
/**
 * Initialize signer specific structures and spawn the worker threads:
 * the gfchangelog context, the timer wheel, the object condition
 * variable, the object queue and BR_WORKERS br_process_object()
 * threads.
 *
 * Returns 0 on success and -1 on failure. On failure the threads
 * spawned so far, the object queue, the condition variable and the
 * timer wheel are cleaned up, and the corresponding pointers in @priv
 * are reset so they cannot be used after being released.
 */
static inline int32_t
br_init_signer (xlator_t *this, br_private_t *priv)
{
        int     i   = 0;
        int32_t ret = -1;

        /* initialize gfchangelog xlator context */
        ret = gf_changelog_init (this);
        if (ret)
                goto out;

        priv->timer_wheel = gf_tw_init_timers ();
        if (!priv->timer_wheel) {
                gf_log (this->name, GF_LOG_ERROR,
                        "failed to initialize the timer wheel");
                goto out;
        }

        pthread_cond_init (&priv->object_cond, NULL);

        priv->obj_queue = GF_CALLOC (1, sizeof (*priv->obj_queue),
                                     gf_br_mt_br_ob_n_wk_t);
        if (!priv->obj_queue)
                goto cleanup_timer;
        INIT_LIST_HEAD (&priv->obj_queue->objects);

        for (i = 0; i < BR_WORKERS; i++) {
                ret = gf_thread_create (&priv->obj_queue->workers[i], NULL,
                                        br_process_object, this);
                if (ret != 0) {
                        /*
                         * pthread_create() style calls report failures as
                         * positive error numbers; negating one would make
                         * strerror() print an "unknown error". Accept
                         * either sign.
                         * NOTE(review): presumably gf_thread_create()
                         * returns pthread_create()'s value -- confirm.
                         */
                        gf_log (this->name, GF_LOG_ERROR,
                                "thread creation failed (%s)",
                                strerror ((ret < 0) ? -ret : ret));
                        ret = -1;
                        goto cleanup_threads;
                }
        }

        return 0;

 cleanup_threads:
        /* only the workers that were actually created */
        for (i--; i >= 0; i--) {
                (void) gf_thread_cleanup_xint (priv->obj_queue->workers[i]);
        }

        GF_FREE (priv->obj_queue);
        priv->obj_queue = NULL;

 cleanup_timer:
        /* that's explicit */
        pthread_cond_destroy (&priv->object_cond);
        gf_tw_cleanup_timers (priv->timer_wheel);
        priv->timer_wheel = NULL;

 out:
        return -1;
}
int32_t
init (xlator_t *this)
{
@ -1228,18 +1364,14 @@ init (xlator_t *this)
goto out;
}
/* initialize gfchangelog xlator context */
ret = gf_changelog_init (this);
if (ret)
goto out;
GF_OPTION_INIT ("scrubber", priv->iamscrubber, bool, out);
GF_OPTION_INIT ("expiry-time", priv->expiry_time, int32, out);
priv->child_count = xlator_subvolume_count (this);
priv->children = GF_CALLOC (priv->child_count, sizeof (*priv->children),
gf_br_mt_br_child_t);
if (!priv->children)
goto out;
goto free_priv;
trav = this->children;
while (trav) {
@ -1252,7 +1384,7 @@ init (xlator_t *this)
gf_log (this->name, GF_LOG_ERROR,
"failed to allocate mem-pool for timer");
errno = ENOMEM;
goto out;
goto free_children;
}
i++;
@ -1268,55 +1400,41 @@ init (xlator_t *this)
this->private = priv;
ret = gf_thread_create (&priv->thread, NULL, br_handle_events,
this);
if (!priv->iamscrubber) {
ret = br_init_signer (this, priv);
if (ret)
goto cleanup_mutex;
}
ret = gf_thread_create (&priv->thread, NULL, br_handle_events, this);
if (ret != 0) {
gf_log (this->name, GF_LOG_ERROR,
"thread creation failed (%s)", strerror (errno));
goto out;
"thread creation failed (%s)", strerror (-ret));
ret = -1;
}
priv->timer_wheel = gf_tw_init_timers ();
if (!priv->timer_wheel) {
gf_log (this->name, GF_LOG_ERROR, "failed to initialize the "
"timer wheel");
goto out;
if (!ret) {
gf_log (this->name, GF_LOG_INFO,
"bit-rot xlator loaded in \"%s\" mode",
(priv->iamscrubber) ? "SCRUBBER" : "SIGNER");
return 0;
}
pthread_cond_init (&priv->object_cond, NULL);
priv->obj_queue = GF_CALLOC (1, sizeof (*priv->obj_queue),
gf_br_mt_br_ob_n_wk_t);
if (!priv->obj_queue) {
gf_log (this->name, GF_LOG_ERROR, "memory allocation failed");
goto out;
cleanup_mutex:
(void) pthread_cond_destroy (&priv->cond);
(void) pthread_mutex_destroy (&priv->lock);
free_children:
for (i = 0; i < priv->child_count; i++) {
if (priv->children[i].timer_pool)
mem_pool_destroy (priv->children[i].timer_pool);
}
INIT_LIST_HEAD (&priv->obj_queue->objects);
for (i = 0; i < BR_WORKERS; i++) {
gf_thread_create (&priv->obj_queue->workers[i], NULL,
br_process_object, this);
if (ret != 0) {
gf_log (this->name, GF_LOG_ERROR,
"thread creation failed (%s)",
strerror (errno));
goto out;
}
}
ret = 0;
out:
if (ret) {
if (priv->children)
GF_FREE (priv->children);
if (priv->timer_wheel)
gf_tw_cleanup_timers (priv->timer_wheel);
GF_FREE (priv);
}
gf_log (this->name, GF_LOG_DEBUG, "bit-rot xlator loaded");
return ret;
GF_FREE (priv->children);
free_priv:
GF_FREE (priv);
out:
this->private = NULL;
return -1;
}
void
@ -1327,9 +1445,12 @@ fini (xlator_t *this)
if (!priv)
return;
if (!priv->iamscrubber)
br_fini_signer (this, priv);
br_free_children (this);
if (priv->timer_wheel)
gf_tw_cleanup_timers (priv->timer_wheel);
this->private = NULL;
GF_FREE (priv);
@ -1347,5 +1468,10 @@ struct volume_options options[] = {
.description = "default time duration for which an object waits "
"before it is signed",
},
{ .key = {"scrubber"},
.type = GF_OPTION_TYPE_BOOL,
.default_value = "false",
.description = "option to run as a scrubber",
},
{ .key = {NULL} },
};

View File

@ -91,6 +91,8 @@ struct br_private {
signing and the workers which sign
the objects */
int32_t expiry_time; /* objects "wait" time */
gf_boolean_t iamscrubber; /* function as a fs scrubber */
};
typedef struct br_private br_private_t;
@ -118,9 +120,13 @@ void
br_log_object_path (xlator_t *, char *, const char *, int32_t);
int32_t
br_object_checksum (unsigned char *, br_object_t *, fd_t *, struct iatt *);
br_calculate_obj_checksum (unsigned char *,
br_child_t *, fd_t *, struct iatt *);
int32_t
br_prepare_loc (xlator_t *, br_child_t *, loc_t *, gf_dirent_t *, loc_t *);
gf_boolean_t
bitd_is_bad_file (xlator_t *, br_child_t *, loc_t *, fd_t *);
#endif /* __BIT_ROT_H__ */

View File

@ -927,6 +927,8 @@ br_stub_open (call_frame_t *frame, xlator_t *this,
if (!flags)
goto wind;
if (frame->root->pid == GF_CLIENT_PID_SCRUB)
goto wind;
cookie = (void *) BR_STUB_REQUEST_COOKIE;
wind: