afr: do not set arbiter as a readable subvol in inode context

Problem:
If afr_lookup_done() or afr_read_subvol_select_by_policy() chooses the
arbiter brick to serve the stat() data, file size will be reported as
zero from the mount, despite other data bricks being available. This can
break programs like tar which use the stat info to decide how much to read.

Fix:
In the inode context, mark the arbiter as a non-readable subvol for both
data and metadata.
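
As a rough standalone sketch of that masking (this is not the actual
afr-common.c code: the afr_private_t layout, the helper function, and the
ARBITER_BRICK_INDEX / AFR_IS_ARBITER_BRICK definitions below are simplified
assumptions, with only their usage mirroring the diff further down):

#include <stdio.h>

/* Assumed definitions for this sketch only; in glusterfs they live in afr.h.
 * The assumption here is that the arbiter is always the third brick of a
 * "replica 3 arbiter 1" set. */
#define ARBITER_BRICK_INDEX 2
#define AFR_IS_ARBITER_BRICK(priv, index) \
        (((priv)->arbiter_count == 1) && ((index) == ARBITER_BRICK_INDEX))

typedef struct {
        int child_count;   /* bricks in the replica set */
        int arbiter_count; /* 1 for arbiter volumes, 0 otherwise */
} afr_private_t;

/* Even when the arbiter replied successfully, never leave it marked as a
 * readable source for data or metadata in the inode context. */
static void
afr_mask_arbiter_readable (afr_private_t *priv, unsigned char *data_readable,
                           unsigned char *metadata_readable)
{
        if (AFR_IS_ARBITER_BRICK (priv, ARBITER_BRICK_INDEX)) {
                data_readable[ARBITER_BRICK_INDEX] = 0;
                metadata_readable[ARBITER_BRICK_INDEX] = 0;
        }
}

int
main (void)
{
        afr_private_t priv = { .child_count = 3, .arbiter_count = 1 };
        unsigned char data_readable[3]     = { 1, 1, 1 };
        unsigned char metadata_readable[3] = { 1, 1, 1 };

        afr_mask_arbiter_readable (&priv, data_readable, metadata_readable);

        printf ("data readable:     %d %d %d\n", data_readable[0],
                data_readable[1], data_readable[2]);
        printf ("metadata readable: %d %d %d\n", metadata_readable[0],
                metadata_readable[1], metadata_readable[2]);
        return 0;
}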

It is to be noted that with this fix, we are *not* going to serve
metadata FOPs from the arbiter brick anymore, even though the brick does
store the metadata. This makes sense because the ever-increasing set of
overloaded FOPs (getxattr returning stat data, etc.) and compound FOPs in
gluster would otherwise make it difficult to add checks in the code to
handle such corner cases.
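
A visible consequence, exercised by the new test below: when the arbiter is
the only brick left to read from, lookups are failed with ENOTCONN instead of
unwinding the arbiter's zero-size stat. A minimal sketch of that check,
assuming simplified placeholder types rather than the real afr_local_t and
afr_lookup_done():

#include <errno.h>
#include <stdio.h>

/* Assumed definitions for this sketch only (arbiter taken to be brick 2). */
#define ARBITER_BRICK_INDEX 2
#define AFR_IS_ARBITER_BRICK(priv, index) \
        (((priv)->arbiter_count == 1) && ((index) == ARBITER_BRICK_INDEX))

typedef struct { int arbiter_count; } afr_private_t;
typedef struct { int op_ret; int op_errno; } afr_local_t;

/* If the chosen read subvolume is the arbiter, fail the lookup rather than
 * serving the arbiter's stat data to the application. */
static void
afr_fail_arbiter_read_subvol (afr_private_t *priv, afr_local_t *local,
                              int read_subvol)
{
        if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) {
                local->op_ret   = -1;
                local->op_errno = ENOTCONN;
        }
}

int
main (void)
{
        afr_private_t priv  = { .arbiter_count = 1 };
        afr_local_t   local = { .op_ret = 0, .op_errno = 0 };

        /* Both data bricks are down, so the arbiter (index 2) was chosen. */
        afr_fail_arbiter_read_subvol (&priv, &local, 2);
        printf ("op_ret=%d op_errno=%d (ENOTCONN=%d)\n",
                local.op_ret, local.op_errno, ENOTCONN);
        return 0;
}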

Change-Id: Ic60b25d77fd05e0897481b7fcb3716d4f2101001
BUG: 1310171
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reported-by: Mat Clayton <mat@mixcloud.com>
Reviewed-on: http://review.gluster.org/13539
Reviewed-by: Anuradha Talur <atalur@redhat.com>
Reviewed-by: Krutika Dhananjay <kdhananj@redhat.com>
Smoke: Gluster Build System <jenkins@build.gluster.com>
NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
CentOS-regression: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Jeff Darcy <jdarcy@redhat.com>
Ravishankar N authored 2016-02-29 05:16:50 +00:00, committed by Jeff Darcy
parent bf80b90052
commit 8ab87137c8
3 changed files with 67 additions and 4 deletions


@@ -0,0 +1,44 @@
#!/bin/bash
. $(dirname $0)/../../include.rc
. $(dirname $0)/../../volume.rc
. $(dirname $0)/../../afr.rc
. $(dirname $0)/../../nfs.rc
cleanup;
# Check that mounting fails when only the arbiter brick is up.
TEST glusterd;
TEST pidof glusterd
TEST $CLI volume create $V0 replica 3 arbiter 1 $H0:$B0/${V0}{0,1,2}
TEST $CLI volume start $V0
EXPECT 'Started' volinfo_field $V0 'Status'
EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available;
TEST kill_brick $V0 $H0 $B0/${V0}0
TEST kill_brick $V0 $H0 $B0/${V0}1
# `mount -t glusterfs $H0:$V0 $M0` would fail right away, but that check does
# not work on NetBSD, so check that stat on the mount point fails instead.
TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0
TEST ! stat $M0
TEST umount $M0
mount_nfs $H0:/$V0 $N0
TEST [ $? -ne 0 ]
TEST $CLI volume start $V0 force
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}0
EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1
EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available;
TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0
TEST stat $M0
TEST umount $M0
mount_nfs $H0:/$V0 $N0
TEST [ $? -eq 0 ]
TEST umount $N0
cleanup


@@ -28,9 +28,13 @@ TEST pidof glusterd
TEST mkdir -p $B0/${V0}{0,1,2}
TEST $CLI volume create $V0 replica 3 arbiter 1 $H0:$B0/${V0}{0,1,2}
TEST $CLI volume set $V0 performance.write-behind off
TEST $CLI volume set $V0 performance.stat-prefetch off
TEST $CLI volume set $V0 cluster.self-heal-daemon off
TEST $CLI volume set $V0 cluster.metadata-self-heal off
TEST $CLI volume set $V0 cluster.data-self-heal off
TEST $CLI volume set $V0 cluster.entry-self-heal off
TEST $CLI volume start $V0
TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --attribute-timeout=0 --entry-timeout=0 $M0;
TEST stat $M0/.meta/graphs/active/$V0-replicate-0/options/arbiter-count
EXPECT "1" cat $M0/.meta/graphs/active/$V0-replicate-0/options/arbiter-count
@@ -48,9 +52,11 @@ TEST kill_brick $V0 $H0 $B0/${V0}1
echo "B2 is down, B3 is the only source, writes will fail" >> $M0/file
EXPECT_NOT "0" echo $?
TEST ! cat $M0/file
# Metadata I/O should still succeed.
TEST getfattr -n user.name $M0/file
TEST setfattr -n user.name -v value3 $M0/file
# Though metadata IO could have been served from arbiter, we do not allow it
# anymore as FOPS like getfattr could be overloaded to return iatt buffers for
# use by other translators.
TEST ! getfattr -n user.name $M0/file
TEST ! setfattr -n user.name -v value3 $M0/file
#shd should not data self-heal from arbiter to the sinks.
TEST $CLI volume set $V0 cluster.self-heal-daemon on


@@ -691,6 +691,10 @@ afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode,
                data_readable[i] = 1;
                metadata_readable[i] = 1;
        }
        if (AFR_IS_ARBITER_BRICK (priv, ARBITER_BRICK_INDEX)) {
                data_readable[ARBITER_BRICK_INDEX] = 0;
                metadata_readable[ARBITER_BRICK_INDEX] = 0;
        }
        for (i = 0; i < priv->child_count; i++) {
                if (!replies[i].valid) {
@@ -1773,9 +1777,14 @@ unwind:
                        read_subvol = spb_choice;
                else
                        read_subvol = afr_first_up_child (frame, this);
        }
        par_read_subvol = afr_get_parent_read_subvol (this, parent, replies,
                                                      readable);
        if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) {
                local->op_ret = -1;
                local->op_errno = ENOTCONN;
        }
        AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
                          local->inode, &local->replies[read_subvol].poststat,
@@ -2222,6 +2231,10 @@ unwind:
                else
                        read_subvol = afr_first_up_child (frame, this);
        }
        if (AFR_IS_ARBITER_BRICK (priv, read_subvol) && local->op_ret == 0) {
                local->op_ret = -1;
                local->op_errno = ENOTCONN;
        }
        AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
                          local->inode, &local->replies[read_subvol].poststat,