cluster/afr: Add afr-v1 xattr compatibility

This patch handles all the special cases that afr-v1 handles, and also
handles the self-accusing pending changelog left behind by a v1 pre-op.

Change-Id: Ie10f71633fb20276f01ecafbd728f20483e7029c
BUG: 1128721
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
Reviewed-on: http://review.gluster.org/8536
Reviewed-by: Ravishankar N <ravishankar@redhat.com>
Tested-by: Gluster Build System <jenkins@build.gluster.com>
This commit is contained in:
Pranith Kumar K 2014-08-20 21:50:06 +05:30 committed by Pranith Kumar Karampuri
parent f95a25c35e
commit 443e48abf9
6 changed files with 330 additions and 83 deletions

View File

@ -287,7 +287,36 @@ afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
return 0;
}
/*
 * Rebuild sinks[] from scratch: every child that is NOT flagged in
 * sources[] but IS flagged in locked_on[] becomes a sink; all other
 * entries end up as 0.  The arrays are indexed by child and hold
 * priv->child_count entries.
 */
void
afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
                       unsigned char *locked_on, unsigned char *sinks)
{
        afr_private_t *conf  = this->private;
        int            child = 0;

        /* Wipe any previous marks so stale sinks never survive. */
        memset (sinks, 0, sizeof (*sinks) * conf->child_count);

        for (child = 0; child < conf->child_count; child++) {
                if (sources[child] || !locked_on[child])
                        continue;
                sinks[child] = 1;
        }
}
/*
 * Tell whether any child has a non-zero entry in witness[].
 *
 * Returns _gf_true as soon as one non-zero counter is seen among the
 * first priv->child_count entries, _gf_false otherwise.
 */
gf_boolean_t
afr_does_witness_exist (xlator_t *this, uint64_t *witness)
{
        afr_private_t *conf  = this->private;
        gf_boolean_t   found = _gf_false;
        int            child = 0;

        for (child = 0; child < conf->child_count && !found; child++) {
                if (witness[child] != 0)
                        found = _gf_true;
        }

        return found;
}
/*
* This function determines if a self-heal is required for a given inode,
@ -309,22 +338,29 @@ afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
*/
int
afr_selfheal_find_direction (xlator_t *this, struct afr_reply *replies,
afr_transaction_type type, unsigned char *locked_on,
unsigned char *sources, unsigned char *sinks)
afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
struct afr_reply *replies,
afr_transaction_type type,
unsigned char *locked_on, unsigned char *sources,
unsigned char *sinks, uint64_t *witness)
{
afr_private_t *priv = NULL;
int i = 0;
int j = 0;
int *dirty = NULL;
int **matrix = NULL;
char *accused = NULL;
afr_private_t *priv = NULL;
int i = 0;
int j = 0;
int *dirty = NULL; /* Denotes if dirty xattr is set */
int **matrix = NULL;/* Changelog matrix */
char *accused = NULL;/* Accused others without any self-accusal */
char *pending = NULL;/* Have pending operations on others */
char *self_accused = NULL; /* Accused itself */
priv = this->private;
dirty = alloca0 (priv->child_count * sizeof (int));
accused = alloca0 (priv->child_count);
pending = alloca0 (priv->child_count);
self_accused = alloca0 (priv->child_count);
matrix = ALLOC_MATRIX(priv->child_count, int);
memset (witness, 0, sizeof (*witness) * priv->child_count);
if (afr_success_count (replies,
priv->child_count) < AFR_SH_MIN_PARTICIPANTS) {
@ -335,11 +371,23 @@ afr_selfheal_find_direction (xlator_t *this, struct afr_reply *replies,
/* First construct the pending matrix for further analysis */
afr_selfheal_extract_xattr (this, replies, type, dirty, matrix);
/* short list all self-accused */
for (i = 0; i < priv->child_count; i++) {
if (matrix[i][i])
self_accused[i] = 1;
}
/* Next short list all accused to exclude them from being sources */
/* Self-accused can't accuse others as they are FOOLs */
for (i = 0; i < priv->child_count; i++) {
for (j = 0; j < priv->child_count; j++) {
if (matrix[i][j])
accused[j] = 1;
if (matrix[i][j]) {
if (!self_accused[i])
accused[j] = 1;
if (i != j)
pending[i] = 1;
}
}
}
@ -350,38 +398,47 @@ afr_selfheal_find_direction (xlator_t *this, struct afr_reply *replies,
sources[i] = 1;
}
/* Everyone accused by sources are sinks */
memset (sinks, 0, priv->child_count);
for (i = 0; i < priv->child_count; i++) {
if (!sources[i])
continue;
for (j = 0; j < priv->child_count; j++) {
if (matrix[i][j])
sinks[j] = 1;
}
}
/* Everyone accused by non-self-accused sources are sinks */
memset (sinks, 0, priv->child_count);
for (i = 0; i < priv->child_count; i++) {
if (!sources[i])
continue;
if (self_accused[i])
continue;
for (j = 0; j < priv->child_count; j++) {
if (matrix[i][j])
sinks[j] = 1;
}
}
/* If any source has 'dirty' bit, pick first
'dirty' source and make everybody else sinks */
for (i = 0; i < priv->child_count; i++) {
if (sources[i] && dirty[i]) {
for (j = 0; j < priv->child_count; j++) {
if (j != i) {
sources[j] = 0;
sinks[j] = 1;
}
}
break;
}
}
/* For breaking ties provide with number of fops they witnessed */
/* If no sources, all locked nodes are sinks - split brain */
if (AFR_COUNT (sources, priv->child_count) == 0) {
for (i = 0; i < priv->child_count; i++) {
if (locked_on[i])
sinks[i] = 1;
}
}
/*
* count the pending fops witnessed from itself to others when it is
* self-accused
*/
for (i = 0; i < priv->child_count; i++) {
if (!self_accused[i])
continue;
for (j = 0; j < priv->child_count; j++) {
if (i == j)
continue;
witness[i] += matrix[i][j];
}
}
/* In afr-v1 if a file is self-accused but didn't have any pending
* operations on others then it is similar to 'dirty' in afr-v2.
* Consider such cases as witness.
*/
for (i = 0; i < priv->child_count; i++) {
if (self_accused[i] && !pending[i])
witness[i] += matrix[i][i];
}
/* count the number of dirty fops witnessed */
for (i = 0; i < priv->child_count; i++)
witness[i] += dirty[i];
return 0;
}

View File

@ -372,21 +372,160 @@ __afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this,
return 0;
}
/*
 * Tell whether at least one child that is currently marked as a source
 * also has a non-zero witness counter.
 *
 * Returns _gf_true on the first such child, _gf_false if none exists.
 */
gf_boolean_t
afr_has_source_witnesses (xlator_t *this, unsigned char *sources,
                          uint64_t *witness)
{
        afr_private_t *conf  = this->private;
        int            child = 0;

        for (child = 0; child < conf->child_count; child++) {
                /* Witness counts of non-sources are irrelevant here. */
                if (!sources[child])
                        continue;

                if (witness[child])
                        return _gf_true;
        }

        return _gf_false;
}
/*
 * Tell whether the children marked as sources disagree on file size.
 *
 * Only children that are flagged in sources[] AND whose reply is valid
 * and successful (op_ret >= 0) take part in the comparison.  Sinks are
 * deliberately ignored: a sink differing in size from the sources must
 * not trigger source tie-breaking.
 *
 * Returns _gf_true when the smallest and largest poststat.ia_size among
 * the participating children differ; _gf_false otherwise, including
 * when no child participates.
 */
static gf_boolean_t
afr_does_size_mismatch (xlator_t *this, unsigned char *sources,
                        struct afr_reply *replies)
{
        int            i    = 0;
        afr_private_t *priv = NULL;
        struct iatt   *min  = NULL;
        struct iatt   *max  = NULL;

        priv = this->private;

        for (i = 0; i < priv->child_count; i++) {
                /* Only sources are compared against each other. */
                if (!sources[i])
                        continue;

                if (!replies[i].valid)
                        continue;

                if (replies[i].op_ret < 0)
                        continue;

                /* The first usable reply seeds both extremes. */
                if (!min)
                        min = &replies[i].poststat;

                if (!max)
                        max = &replies[i].poststat;

                if (min->ia_size > replies[i].poststat.ia_size)
                        min = &replies[i].poststat;

                if (max->ia_size < replies[i].poststat.ia_size)
                        max = &replies[i].poststat;
        }

        if (min && max) {
                if (min->ia_size != max->ia_size)
                        return _gf_true;
        }

        return _gf_false;
}
/*
* If by chance there are multiple sources with differing sizes, select
* the largest file as the source.
*
* This can only happen if data was directly modified in the backend.
* This can happen if data was directly modified in the backend or for snapshots
*/
static void
afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources,
                                 struct afr_reply *replies)
{
        afr_private_t *conf    = this->private;
        uint64_t       largest = 0;
        int            child   = 0;

        /* Pass 1: largest poststat size reported by any current source. */
        for (child = 0; child < conf->child_count; child++) {
                if (sources[child] &&
                    replies[child].poststat.ia_size > largest)
                        largest = replies[child].poststat.ia_size;
        }

        /* Pass 2: every source smaller than that stops being a source.
         * Sources tied at the largest size all remain marked. */
        for (child = 0; child < conf->child_count; child++) {
                if (sources[child] &&
                    replies[child].poststat.ia_size < largest)
                        sources[child] = 0;
        }
}
/*
 * Keep as sources only those children whose witness counter equals the
 * highest counter found among the current sources; every source with a
 * lower counter is cleared in sources[].  Ties are all kept.
 */
static void
afr_mark_biggest_witness_as_source (xlator_t *this, unsigned char *sources,
                                    uint64_t *witness)
{
        afr_private_t *conf  = this->private;
        uint64_t       top   = 0;
        int            child = 0;

        /* Pass 1: highest witness count among the sources. */
        for (child = 0; child < conf->child_count; child++) {
                if (sources[child] && witness[child] > top)
                        top = witness[child];
        }

        /* Pass 2: drop the sources that witnessed less than that. */
        for (child = 0; child < conf->child_count; child++) {
                if (sources[child] && witness[child] < top)
                        sources[child] = 0;
        }
}
/*
 * Tie-breaker: on return at most one child is marked in sources[].
 *
 * Among the children currently marked as sources, the one with the
 * latest poststat.ia_ctime wins; because the comparison is '<=', a tie
 * on ctime is resolved in favour of the highest child index.
 *
 * If no child is marked as a source on entry, sources[] is left all
 * zeroes instead of being indexed with -1.
 */
static void
afr_mark_newest_file_as_source (xlator_t *this, unsigned char *sources,
                                struct afr_reply *replies)
{
        int            i         = 0;
        afr_private_t *priv      = NULL;
        int            source    = -1;
        uint32_t       max_ctime = 0;

        priv = this->private;

        /* Find source with latest ctime */
        for (i = 0; i < priv->child_count; i++) {
                if (!sources[i])
                        continue;

                if (max_ctime <= replies[i].poststat.ia_ctime) {
                        source = i;
                        max_ctime = replies[i].poststat.ia_ctime;
                }
        }

        /* Only mark one of the files as source to break ties */
        memset (sources, 0, sizeof (*sources) * priv->child_count);

        /* Nothing was a source to begin with: do not write sources[-1]. */
        if (source >= 0)
                sources[source] = 1;
}
static int
__afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,
unsigned char *healed_sinks,
unsigned char *locked_on,
struct afr_reply *replies)
struct afr_reply *replies,
uint64_t *witness)
{
int i = 0;
afr_private_t *priv = NULL;
uint64_t size = 0;
int source = -1;
int sources_count = 0;
@ -400,24 +539,24 @@ __afr_selfheal_data_finalize_source (xlator_t *this, unsigned char *sources,
return -EIO;
}
for (i = 0; i < priv->child_count; i++) {
if (!sources[i])
continue;
if (size <= replies[i].poststat.ia_size) {
size = replies[i].poststat.ia_size;
source = i;
}
}
/* If there are no witnesses/size-mismatches on sources we are done*/
if (!afr_does_size_mismatch (this, sources, replies) &&
!afr_has_source_witnesses (this, sources, witness))
goto out;
for (i = 0; i < priv->child_count; i++) {
if (!sources[i])
continue;
if (replies[i].poststat.ia_size < size) {
sources[i] = 0;
healed_sinks[i] = 1;
}
}
afr_mark_largest_file_as_source (this, sources, replies);
afr_mark_biggest_witness_as_source (this, sources, witness);
afr_mark_newest_file_as_source (this, sources, replies);
out:
afr_mark_active_sinks (this, sources, locked_on, healed_sinks);
for (i = 0; i < priv->child_count; i++) {
if (sources[i]) {
source = i;
break;
}
}
return source;
}
@ -439,6 +578,7 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
int ret = -1;
int source = -1;
afr_private_t *priv = NULL;
uint64_t *witness = NULL;
priv = this->private;
@ -447,15 +587,16 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
if (ret)
return ret;
ret = afr_selfheal_find_direction (this, replies, AFR_DATA_TRANSACTION,
locked_on, sources, sinks);
witness = alloca0(priv->child_count * sizeof (*witness));
ret = afr_selfheal_find_direction (frame, this, replies,
AFR_DATA_TRANSACTION,
locked_on, sources, sinks, witness);
if (ret)
return ret;
/* Initialize the healed_sinks[] array optimistically to
the intersection of to-be-healed (i.e sinks[]) and
the list of servers which are up (i.e locked_on[]).
As we encounter failures in the healing process, we
will unmark the respective servers in the healed_sinks[]
array.
@ -464,7 +605,7 @@ __afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
source = __afr_selfheal_data_finalize_source (this, sources,
healed_sinks, locked_on,
replies);
replies, witness);
if (source < 0)
return -EIO;

View File

@ -326,7 +326,9 @@ __afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
static int
__afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources,
unsigned char *healed_sinks,
unsigned char *locked_on)
unsigned char *locked_on,
struct afr_reply *replies,
uint64_t *witness)
{
int i = 0;
afr_private_t *priv = NULL;
@ -338,7 +340,10 @@ __afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources,
sources_count = AFR_COUNT (sources, priv->child_count);
if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
|| !sources_count) {
|| !sources_count || afr_does_witness_exist (this, witness)) {
memset (sources, 0, sizeof (*sources) * priv->child_count);
afr_mark_active_sinks (this, sources, locked_on, healed_sinks);
return -1;
}
@ -362,6 +367,7 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
int ret = -1;
int source = -1;
afr_private_t *priv = NULL;
uint64_t *witness = NULL;
priv = this->private;
@ -370,8 +376,10 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
if (ret)
return ret;
ret = afr_selfheal_find_direction (this, replies, AFR_ENTRY_TRANSACTION,
locked_on, sources, sinks);
witness = alloca0 (sizeof (*witness) * priv->child_count);
ret = afr_selfheal_find_direction (frame, this, replies,
AFR_ENTRY_TRANSACTION,
locked_on, sources, sinks, witness);
if (ret)
return ret;
@ -386,7 +394,10 @@ __afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this, fd_t *fd,
AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count);
source = __afr_selfheal_entry_finalize_source (this, sources,
healed_sinks, locked_on);
healed_sinks,
locked_on, replies,
witness);
if (source < 0) {
/* If source is < 0 (typically split-brain), we perform a
conservative merge of entries rather than erroring out */

View File

@ -169,7 +169,6 @@ __afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
return source;
}
static int
__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode,
unsigned char *locked_on, unsigned char *sources,
@ -179,6 +178,8 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
int ret = -1;
int source = -1;
afr_private_t *priv = NULL;
int i = 0;
uint64_t *witness = NULL;
priv = this->private;
@ -187,9 +188,10 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
if (ret)
return ret;
ret = afr_selfheal_find_direction (this, replies,
AFR_METADATA_TRANSACTION,
locked_on, sources, sinks);
witness = alloca0 (sizeof (*witness) * priv->child_count);
ret = afr_selfheal_find_direction (frame, this, replies,
AFR_METADATA_TRANSACTION,
locked_on, sources, sinks, witness);
if (ret)
return ret;
@ -203,9 +205,28 @@ __afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *i
*/
AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count);
/* If any source has witness, pick first
* witness source and make everybody else sinks */
for (i = 0; i < priv->child_count; i++) {
if (sources[i] && witness[i]) {
source = i;
break;
}
}
if (source != -1) {
for (i = 0; i < priv->child_count; i++) {
if (i != source && sources[i]) {
sources[i] = 0;
healed_sinks[i] = 1;
}
}
}
source = __afr_selfheal_metadata_finalize_source (frame, this, sources,
healed_sinks,
locked_on, replies);
locked_on, replies);
if (source < 0)
return -EIO;

View File

@ -457,7 +457,9 @@ __afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
int
__afr_selfheal_name_finalize_source (xlator_t *this, unsigned char *sources,
unsigned char *healed_sinks,
unsigned char *locked_on)
unsigned char *locked_on,
struct afr_reply *replies,
uint64_t *witness)
{
int i = 0;
afr_private_t *priv = NULL;
@ -469,7 +471,9 @@ __afr_selfheal_name_finalize_source (xlator_t *this, unsigned char *sources,
sources_count = AFR_COUNT (sources, priv->child_count);
if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
|| !sources_count) {
|| !sources_count || afr_does_witness_exist (this, witness)) {
memset (sources, 0, sizeof (*sources) * priv->child_count);
afr_mark_active_sinks (this, sources, locked_on, healed_sinks);
return -1;
}
@ -483,7 +487,6 @@ __afr_selfheal_name_finalize_source (xlator_t *this, unsigned char *sources,
return source;
}
int
__afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *parent,
uuid_t pargfid, unsigned char *locked_on,
@ -494,6 +497,7 @@ __afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *paren
int source = -1;
afr_private_t *priv = NULL;
struct afr_reply *replies = NULL;
uint64_t *witness = NULL;
priv = this->private;
@ -503,8 +507,10 @@ __afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *paren
if (ret)
goto out;
ret = afr_selfheal_find_direction (this, replies, AFR_ENTRY_TRANSACTION,
locked_on, sources, sinks);
witness = alloca0 (sizeof (*witness) * priv->child_count);
ret = afr_selfheal_find_direction (frame, this, replies,
AFR_ENTRY_TRANSACTION,
locked_on, sources, sinks, witness);
if (ret)
goto out;
@ -519,7 +525,9 @@ __afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *paren
AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count);
source = __afr_selfheal_name_finalize_source (this, sources,
healed_sinks, locked_on);
healed_sinks,
locked_on, replies,
witness);
if (source < 0) {
/* If source is < 0 (typically split-brain), we perform a
conservative merge of entries rather than erroring out */

View File

@ -136,9 +136,11 @@ afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
unsigned char *lookup_on, dict_t *xattr);
int
afr_selfheal_find_direction (xlator_t *this, struct afr_reply *replies,
afr_transaction_type type, unsigned char *locked_on,
unsigned char *sources, unsigned char *sinks);
afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
struct afr_reply *replies,
afr_transaction_type type,
unsigned char *locked_on, unsigned char *sources,
unsigned char *sinks, uint64_t *witness);
int
afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
@ -189,4 +191,11 @@ afr_success_count (struct afr_reply *replies, unsigned int count);
void
afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type,
int source, unsigned char *healed_sinks);
/* Clear sinks[] and then mark as sink every child that is not set in
 * sources[] but is set in locked_on[]. */
void
afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
unsigned char *locked_on, unsigned char *sinks);
/* Return _gf_true if any child has a non-zero entry in witness[],
 * _gf_false otherwise. */
gf_boolean_t
afr_does_witness_exist (xlator_t *this, uint64_t *witness);
#endif /* !_AFR_SELFHEAL_H */