cluster/dht: Fix min-free-disk calculations when quota-deem-statfs is on

PROBLEM:

As part of file creation, DHT sends a statfs call to all of its
sub-volumes and expects in return the local space consumption and
availability on each one of them. This information is used by DHT to
ensure that at least min-free-disk amount of space is left on each
sub-volume in the event that there ARE other sub-volumes with more
space available.
But when quota-deem-statfs is enabled, quota xlator on every brick
unwinds the statfs call with volume-wide consumption of disk space.
This leads to miscalculation in min-free-disk algo, thereby misleading
DHT at some point, into thinking all sub-volumes have equal available
space, in which case DHT keeps sending new file creates to subvol-0,
causing it to become 100% full at some point although there ARE other
subvols with ample space available.

FIX:

The fix is to make quota_statfs() behave as if quota xlator weren't
enabled, thereby making every brick return only its local consumption
and disk space availability.

Change-Id: I211371a1eddb220037bd36a128973938ea8124c2
BUG: 1099890
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: http://review.gluster.org/7845
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
Reviewed-by: Vijay Bellur <vbellur@redhat.com>
This commit is contained in:
Krutika Dhananjay 2014-05-21 17:47:03 +05:30 committed by Vijay Bellur
parent 0d26de1b0a
commit db022ef7ec
5 changed files with 167 additions and 5 deletions

View File

@ -89,6 +89,7 @@
#define GF_XATTR_USER_PATHINFO_KEY "glusterfs.pathinfo"
#define QUOTA_LIMIT_KEY "trusted.glusterfs.quota.limit-set"
#define VIRTUAL_QUOTA_XATTR_CLEANUP_KEY "glusterfs.quota-xattr-cleanup"
/* Internal dict key set by DHT on statfs requests asking the quota
 * xlator to skip its quota-deem-statfs rewriting and unwind with the
 * brick-local statvfs instead of volume-wide consumption. */
#define GF_INTERNAL_IGNORE_DEEM_STATFS "ignore-deem-statfs"
#define GF_READDIR_SKIP_DIRS "readdir-filter-directories"

120
tests/bugs/bug-1099890.t Normal file
View File

@ -0,0 +1,120 @@
#!/bin/bash
. $(dirname $0)/../include.rc
. $(dirname $0)/../volume.rc
. $(dirname $0)/../dht.rc
## Regression test for BZ 1099890: with quota-deem-statfs enabled, every
## brick must still report only its LOCAL usage to DHT, so that the
## cluster.min-free-disk placement logic keeps working per-subvolume.
## TO-DO: Fix the following once the dht du refresh interval issue is fixed:
## 1. Do away with sleep(1).
## 2. Do away with creation of empty files.
cleanup;
TEST glusterd;
TEST pidof glusterd;
# Create 2 loop devices, one per brick.
# Using small (100M) fixed-size XFS filesystems makes the free-space
# arithmetic below deterministic.
TEST truncate -s 100M $B0/brick1
TEST truncate -s 100M $B0/brick2
TEST L1=`losetup --find --show $B0/brick1`
TEST mkfs.xfs $L1
TEST L2=`losetup --find --show $B0/brick2`
TEST mkfs.xfs $L2
TEST mkdir -p $B0/${V0}{1,2}
TEST mount -t xfs $L1 $B0/${V0}1
TEST mount -t xfs $L2 $B0/${V0}2
# Create a plain distribute volume with 2 subvols.
TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
TEST $CLI volume start $V0;
EXPECT "Started" volinfo_field $V0 'Status';
# Enable quota with deem-statfs (the configuration that triggered the bug)
# and a 50% min-free-disk so either subvol can cross the threshold.
TEST $CLI volume quota $V0 enable;
TEST $CLI volume set $V0 features.quota-deem-statfs on
TEST $CLI volume quota $V0 limit-usage / 150MB;
TEST $CLI volume set $V0 cluster.min-free-disk 50%
TEST glusterfs -s $H0 --volfile-id=$V0 $M0
# Make sure quota-deem-statfs is working as expected
EXPECT "150M" echo `df -h $M0 -P | tail -1 | awk {'print $2'}`
# Create a new file 'foo' under the root of the volume, which hashes to subvol-0
# of DHT, that consumes 40M
TEST dd if=/dev/zero of=$M0/foo bs=5M count=8
TEST stat $B0/${V0}1/foo
TEST ! stat $B0/${V0}2/foo
# Create a new file 'bar' under the root of the volume, which hashes to subvol-1
# of DHT, that consumes 40M
TEST dd if=/dev/zero of=$M0/bar bs=5M count=8
TEST ! stat $B0/${V0}1/bar
TEST stat $B0/${V0}2/bar
# Touch a zero-byte file on the root of the volume to make sure the statfs data
# on DHT is refreshed
sleep 1;
TEST touch $M0/empty1;
# At this point, the available space on each subvol {60M,60M} is greater than
# their min-free-disk {50M,50M}, but if this bug still exists, then
# the total available space on the volume as perceived by DHT should be less
# than min-free-disk, i.e.,
#
# consumed space returned per subvol by quota = (40M + 40M) = 80M
#
# Therefore, consumed space per subvol computed by DHT WITHOUT the fix would be:
# (80M/150M)*100 = 53%
#
# Available space per subvol as perceived by DHT with the bug = 47%
# which is less than min-free-disk
# Now I create a file that hashes to subvol-1 (counting from 0) of DHT.
# If this bug still exists, then DHT should be routing this creation to subvol-0.
# If this bug is fixed, then DHT should be routing the creation to subvol-1 only
# as it has more than min-free-disk space available.
TEST dd if=/dev/zero of=$M0/file bs=1K count=1
sleep 1;
TEST ! stat $B0/${V0}1/file
TEST stat $B0/${V0}2/file
# Touch another zero-byte file on the root of the volume to refresh statfs
# values stored by DHT.
TEST touch $M0/empty2;
# Now I create a new file that hashes to subvol-0, at the end of which, there
# will be less than min-free-disk space available on it.
TEST dd if=/dev/zero of=$M0/fil bs=5M count=4
sleep 1;
TEST stat $B0/${V0}1/fil
TEST ! stat $B0/${V0}2/fil
# Touch to refresh statfs info cached by DHT
TEST touch $M0/empty3;
# Now I create a file that hashes to subvol-0 but since it has less than
# min-free-disk space available, its data will be cached on subvol-1.
# subvol-0 should hold only a linkto file pointing at the real data.
TEST dd if=/dev/zero of=$M0/zz bs=5M count=1
TEST stat $B0/${V0}1/zz
TEST stat $B0/${V0}2/zz
EXPECT "$V0-client-1" dht_get_linkto_target "$B0/${V0}1/zz"
EXPECT "1" is_dht_linkfile "$B0/${V0}1/zz"
cleanup;

View File

@ -91,3 +91,22 @@ function remove_brick_completed()
echo $val
return $val
}
# Print the value of the trusted.glusterfs.dht.linkto xattr on $1, i.e.
# the name of the subvolume a DHT linkfile points to.
# Fixes: quote "$path" so paths containing spaces work; use $(...) in
# place of deprecated backticks. echo is kept so the output is collapsed
# onto a single newline-terminated line for EXPECT comparisons.
function dht_get_linkto_target()
{
local path=$1;
echo $(getfattr -d -m . -e text --only-values --absolute-names --name=trusted.glusterfs.dht.linkto "$path")
}
# Echo (and return) 1 if $1 is a DHT linkfile, else 0.
# A DHT linkfile has octal mode 1000: sticky bit set, no permission bits.
# Fixes: declare retval 'local' so it no longer leaks into the caller's
# scope; quote "$path" and "$output" so empty/space-containing values do
# not break the [ -eq ] test; use $(...) over deprecated backticks.
function is_dht_linkfile()
{
local path=$1
local retval=0
local output=$(stat --format=%a "$path")
if [ "$output" -eq 1000 ]; then
retval=1
fi
echo $retval
return $retval
}

View File

@ -135,6 +135,7 @@ int
dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc)
{
int i = 0;
int ret = -1;
dht_conf_t *conf = NULL;
call_frame_t *statfs_frame = NULL;
dht_local_t *statfs_local = NULL;
@ -164,12 +165,25 @@ dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc)
goto err;
}
statfs_local->params = dict_new ();
if (!statfs_local->params)
goto err;
ret = dict_set_int8 (statfs_local->params,
GF_INTERNAL_IGNORE_DEEM_STATFS, 1);
if (ret) {
gf_log (this->name, GF_LOG_ERROR,
"Failed to set "
GF_INTERNAL_IGNORE_DEEM_STATFS" in dict");
goto err;
}
statfs_local->call_cnt = conf->subvolume_cnt;
for (i = 0; i < conf->subvolume_cnt; i++) {
STACK_WIND (statfs_frame, dht_du_info_cbk,
conf->subvolumes[i],
conf->subvolumes[i]->fops->statfs,
&tmp_loc, NULL);
&tmp_loc, statfs_local->params);
}
conf->last_stat_fetch.tv_sec = tv.tv_sec;

View File

@ -3808,16 +3808,24 @@ out:
int32_t
quota_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
quota_local_t *local = NULL;
int op_errno = 0;
int ret = -1;
quota_priv_t *priv = NULL;
int op_errno = 0;
int ret = -1;
int8_t ignore_deem_statfs = 0;
quota_priv_t *priv = NULL;
quota_local_t *local = NULL;
priv = this->private;
GF_ASSERT (loc);
WIND_IF_QUOTAOFF (priv->is_quota_on, off);
ret = dict_get_int8 (xdata, GF_INTERNAL_IGNORE_DEEM_STATFS,
&ignore_deem_statfs);
ret = 0;
if (ignore_deem_statfs)
goto off;
if (priv->consider_statfs && loc->inode) {
local = quota_local_new ();
if (!local) {