cluster/tier: stop tier migration after graph switch

On a graph switch, a new xlator and private structures are created. The tier migration daemon must stop using the old xlator and private structures and begin using the new ones. Otherwise, when RPCs arrive (such as counter queries from glusterd), the new xlator will be consulted, but it will not have up-to-date information. The fix detects a graph switch and exits the daemon in this case. A typical cause of a graph switch in the tier case is turning off performance translators. Change-Id: Ibfbd4720dc82ea179b77c81b8f534abced21e3c8 BUG: 1226005 Signed-off-by: Dan Lambright <dlambrig@redhat.com> Reviewed-on: http://review.gluster.org/11372
2015-06-23 16:35:03 -04:00 · 2015-06-23 16:35:03 -04:00 · 875aa01ec8
commit 875aa01ec8
parent 26ef697318
2 changed files with 34 additions and 3 deletions
--- a/tests/basic/tier/tier.t
+++ b/tests/basic/tier/tier.t
@ -52,6 +52,15 @@ function confirm_vol_stopped {
    fi
 }

+function check_counters_nonzero {
+    $CLI volume rebalance $V0 tier status | grep ' 0 '
+    if [ $? == 0 ]; then
+        echo "1"
+    else
+        echo "0"
+    fi
+}
+
 DEMOTE_TIMEOUT=12
 PROMOTE_TIMEOUT=5
 MIGRATION_TIMEOUT=10
@ -62,12 +71,17 @@ TEST pidof glusterd

 TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0..$LAST_BRICK}
 # testing bug 1215122, ie should fail if replica count and bricks are not compatible.
+
 TEST ! $CLI volume attach-tier $V0 replica 5 $H0:$B0/${V0}$CACHE_BRICK_FIRST $H0:$B0/${V0}$CACHE_BRICK_LAST

 TEST $CLI volume start $V0

+# The following two commands instigate a graph switch. Do them
+# before attaching the tier. If done on a tiered volume the rebalance
+# daemon will terminate and must be restarted manually.
 TEST $CLI volume set $V0 performance.quick-read off
 TEST $CLI volume set $V0 performance.io-cache off
+
 TEST $CLI volume set $V0 features.ctr-enabled on

 #Not a tier volume
@ -78,6 +92,8 @@ TEST ! $CLI volume detach-tier $V0 commit force

 TEST $CLI volume attach-tier $V0 replica 2 $H0:$B0/${V0}$CACHE_BRICK_FIRST $H0:$B0/${V0}$CACHE_BRICK_LAST

+$CLI volume rebalance $V0 tier status
+
 #Tier options expect non-negative value
 TEST ! $CLI volume set $V0 cluster.tier-promote-frequency -1

@ -128,13 +144,12 @@ sleep 5
 EXPECT_WITHIN $PROMOTE_TIMEOUT "0" file_on_fast_tier d1/data2.txt
 EXPECT_WITHIN $PROMOTE_TIMEOUT "0" file_on_fast_tier d1/data3.txt

+EXPECT "0" check_counters_nonzero
+
 # stop gluster, when it comes back info file should have tiered volume
 killall glusterd
 TEST glusterd

-# Test rebalance commands
-TEST $CLI volume rebalance $V0 tier status
-
 TEST $CLI volume detach-tier $V0 start

 TEST $CLI volume detach-tier $V0 commit force
--- a/xlators/cluster/dht/src/tier.c
+++ b/xlators/cluster/dht/src/tier.c
@ -776,6 +776,8 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag)
        pthread_t demote_thread;
        gf_boolean_t  is_promotion_triggered = _gf_false;
        gf_boolean_t  is_demotion_triggered = _gf_false;
+        xlator_t                *any        = NULL;
+        xlator_t                *xlator       = NULL;

        conf   = this->private;

@ -798,6 +800,20 @@ tier_start (xlator_t *this, gf_defrag_info_t *defrag)

        while (1) {

+                /*
+                 * Check if a graph switch occurred. If so, stop migration
+                 * thread. It will need to be restarted manually.
+                 */
+                any = THIS->ctx->active->first;
+                xlator = xlator_search_by_name(any, this->name);
+
+                if (xlator != this) {
+                        gf_msg (this->name, GF_LOG_INFO, 0,
+                                DHT_MSG_LOG_TIER_STATUS,
+                                "Detected graph switch. Exiting migration daemon.");
+                        goto out;
+                }
+
                sleep(1);

                if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {