From f02e097485722badf27523c706adb99f21342f56 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 10 Jan 2022 13:22:19 +1100 Subject: [PATCH] ctdb-tools: recovery master -> leader The following command names are changed: recmaster -> leader setrecmasterrole -> setleaderrole Command output changed for the following commands: status getcapabilities Documentation and tests are updated to reflect these changes. Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs --- ctdb/doc/ctdb.1.xml | 44 ++++--- .../database/recovery.001.volatile.sh | 62 +++++----- .../simple/cluster.002.recmaster_yield.sh | 2 +- .../UNIT/tool/ctdb.getcapabilities.001.sh | 2 +- .../UNIT/tool/ctdb.getcapabilities.002.sh | 2 +- .../UNIT/tool/ctdb.getcapabilities.004.sh | 6 +- ...db.recmaster.001.sh => ctdb.leader.001.sh} | 0 ...db.recmaster.002.sh => ctdb.leader.002.sh} | 0 ctdb/tests/UNIT/tool/ctdb.status.001.sh | 2 +- ctdb/tests/UNIT/tool/ctdb.status.002.sh | 2 +- ctdb/tests/scripts/integration.bash | 2 +- ctdb/tools/ctdb.c | 114 +++++++++++------- 12 files changed, 139 insertions(+), 99 deletions(-) rename ctdb/tests/UNIT/tool/{ctdb.recmaster.001.sh => ctdb.leader.001.sh} (100%) rename ctdb/tests/UNIT/tool/{ctdb.recmaster.002.sh => ctdb.leader.002.sh} (100%) diff --git a/ctdb/doc/ctdb.1.xml b/ctdb/doc/ctdb.1.xml index e0e05d8e542..6f9a1764ee4 100644 --- a/ctdb/doc/ctdb.1.xml +++ b/ctdb/doc/ctdb.1.xml @@ -299,10 +299,10 @@ RECOVERY - The cluster databases have all been frozen, pausing all services while the cluster awaits a recovery process to complete. A recovery process should finish within seconds. If a cluster is stuck in the RECOVERY state this would indicate a cluster malfunction which needs to be investigated. 
- Once the recovery master detects an inconsistency, for example a node + Once the leader detects an inconsistency, for example a node becomes disconnected/connected, the recovery daemon will trigger a cluster recovery process, where all databases are remerged across the - cluster. When this process starts, the recovery master will first + cluster. When this process starts, the leader will first "freeze" all databases to prevent applications such as samba from accessing the databases and it will also mark the recovery mode as RECOVERY. @@ -316,13 +316,16 @@ - Recovery master + Leader - This is the cluster node that is currently designated as the recovery master. This node is responsible of monitoring the consistency of the cluster and to perform the actual recovery process when reqired. + This is the cluster node that is currently designated as the + leader. This node is responsible for monitoring the + consistency of the cluster and for performing the actual + recovery process when required. - Only one node at a time can be the designated recovery master. Which - node is designated the recovery master is decided by an election + Only one node at a time can be the designated leader. Which + node is designated the leader is decided by an election process in the recovery daemons running on each node. @@ -343,7 +346,7 @@ hash:1 lmaster:1 hash:2 lmaster:2 hash:3 lmaster:3 Recovery mode:NORMAL (0) -Recovery master:0 +Leader:0 @@ -397,9 +400,9 @@ pnn:1 10.0.0.31 OK - recmaster + leader - This command shows the pnn of the node which is currently the recmaster. + This command shows the pnn of the node which is currently the leader. @@ -939,7 +942,7 @@ pnn:3 10.0.0.14 OK Example output: -RECMASTER: YES +LEADER: YES LMASTER: YES @@ -1217,13 +1220,20 @@ DB Statistics: locking.tdb - setrecmasterrole on|off + setleaderrole on|off - This command is used to enable/disable the RECMASTER capability for a node at runtime. 
This capability determines whether or not a node can be used as an RECMASTER for the cluster. A node that does not have the RECMASTER capability can not win a recmaster election. A node that already is the recmaster for the cluster when the capability is stripped off the node will remain the recmaster until the next cluster election. + This command is used to enable/disable the LEADER capability + for a node at runtime. This capability determines whether or + not a node can be elected leader of the cluster. A node that + does not have the LEADER capability can not be elected + leader. If the current leader has this capability removed then + an election will occur. - Nodes will by default have this capability, but it can be stripped off nodes by the setting in the sysconfig file or by using this command. + Nodes have this capability enabled by default, but it can be + removed via the cluster:leader capability + configuration setting or by using this command. See also "ctdb getcapabilities" @@ -1740,7 +1750,13 @@ HEALTH: NO-HEALTHY-NODES - ERROR - Backup of corrupted TDB in '/usr/local/var/li ipreallocate, sync - This command will force the recovery master to perform a full ip reallocation process and redistribute all ip addresses. This is useful to "reset" the allocations back to its default state if they have been changed using the "moveip" command. While a "recover" will also perform this reallocation, a recovery is much more hevyweight since it will also rebuild all the databases. + This command will force the leader to perform a full ip + reallocation process and redistribute all ip addresses. This + is useful to "reset" the allocations back to its default state + if they have been changed using the "moveip" command. While a + "recover" will also perform this reallocation, a recovery is + much more heavyweight since it will also rebuild all the + databases. 
diff --git a/ctdb/tests/INTEGRATION/database/recovery.001.volatile.sh b/ctdb/tests/INTEGRATION/database/recovery.001.volatile.sh index ffe322037f5..d7aaa3b8552 100755 --- a/ctdb/tests/INTEGRATION/database/recovery.001.volatile.sh +++ b/ctdb/tests/INTEGRATION/database/recovery.001.volatile.sh @@ -4,22 +4,22 @@ # Recovery can under certain circumstances lead to old record copies # resurrecting: Recovery selects the newest record copy purely by RSN. At -# the end of the recovery, the recovery master is the dmaster for all +# the end of the recovery, the leader is the dmaster for all # records in all (non-persistent) databases. And the other nodes locally # hold the complete copy of the databases. The bug is that the recovery -# process does not increment the RSN on the recovery master at the end of -# the recovery. Now clients acting directly on the Recovery master will -# directly change a record's content on the recmaster without migration +# process does not increment the RSN on the leader at the end of +# the recovery. Now clients acting directly on the leader will +# directly change a record's content on the leader without migration # and hence without RSN bump. So a subsequent recovery can not tell that -# the recmaster's copy is newer than the copies on the other nodes, since -# their RSN is the same. Hence, if the recmaster is not node 0 (or more +# the leader's copy is newer than the copies on the other nodes, since +# their RSN is the same. Hence, if the leader is not node 0 (or more # precisely not the active node with the lowest node number), the recovery # will choose copies from nodes with lower number and stick to these. # 1. Create a test database -# 2. Add a record with value value1 on recovery master +# 2. Add a record with value value1 on leader # 3. Force a recovery -# 4. Update the record with value value2 on recovery master +# 4. Update the record with value value2 on leader # 5. Force a recovery # 6. 
Confirm that the value is value2 @@ -36,15 +36,15 @@ TESTDB="rec_test.tdb" status=0 -# Make sure node 0 is not the recovery master -echo "find out which node is recmaster" -ctdb_onnode 0 recmaster -recmaster="$out" -if [ "$recmaster" = "0" ]; then - echo "node 0 is recmaster, disable recmasterrole on node 0" +# Make sure node 0 is not the leader +echo "find out which node is leader" +ctdb_onnode 0 leader +leader="$out" +if [ "$leader" = "0" ]; then + echo "node 0 is leader, disable leader role on node 0" # # Note: - # It should be sufficient to run "ctdb setrecmasterrole off" + # It should be sufficient to run "ctdb setleaderrole off" # on node 0 and wait for election and recovery to finish. # But there were problems related to this in this automatic # test, so for now use "ctdb stop" and "ctdb continue". @@ -56,57 +56,57 @@ if [ "$recmaster" = "0" ]; then try_command_on_node 0 $CTDB continue wait_until_node_has_status 0 notstopped - ctdb_onnode 0 recmaster - recmaster="$out" - if [ "$recmaster" = "0" ]; then - echo "failed to move recmaster to different node" + ctdb_onnode 0 leader + leader="$out" + if [ "$leader" = "0" ]; then + echo "failed to move leader to different node" exit 1 fi fi -echo "Recmaster:$recmaster" +echo "Leader:${leader}" # Create a temporary non-persistent database to test with echo "create test database $TESTDB" -try_command_on_node $recmaster $CTDB attach $TESTDB +ctdb_onnode "$leader" attach "$TESTDB" # Wipe Test database echo "wipe test database" -try_command_on_node $recmaster $CTDB wipedb $TESTDB +ctdb_onnode "$leader" wipedb "$TESTDB" # Add a record key=test1 data=value1 echo "store key(test1) data(value1)" -try_command_on_node $recmaster $CTDB writekey $TESTDB test1 value1 +ctdb_onnode "$leader" writekey "$TESTDB" test1 value1 # Fetch a record key=test1 echo "read key(test1)" -try_command_on_node $recmaster $CTDB readkey $TESTDB test1 +ctdb_onnode "$leader" readkey "$TESTDB" test1 cat "$outfile" # Do a recovery echo "force recovery" 
-try_command_on_node $recmaster $CTDB recover +ctdb_onnode "$leader" recover -wait_until_node_has_status $recmaster recovered +wait_until_node_has_status "$leader" recovered # Add a record key=test1 data=value2 echo "store key(test1) data(value2)" -try_command_on_node $recmaster $CTDB writekey $TESTDB test1 value2 +ctdb_onnode "$leader" writekey "$TESTDB" test1 value2 # Fetch a record key=test1 echo "read key(test1)" -try_command_on_node $recmaster $CTDB readkey $TESTDB test1 +ctdb_onnode "$leader" readkey "$TESTDB" test1 cat "$outfile" # Do a recovery echo "force recovery" -try_command_on_node $recmaster $CTDB recover +ctdb_onnode "$leader" recover -wait_until_node_has_status $recmaster recovered +wait_until_node_has_status "$leader" recovered # Verify record key=test1 echo "read key(test1)" -try_command_on_node $recmaster $CTDB readkey $TESTDB test1 +ctdb_onnode "$leader" readkey "$TESTDB" test1 cat "$outfile" if [ "$out" = "Data: size:6 ptr:[value2]" ]; then echo "GOOD: Recovery did not corrupt database" diff --git a/ctdb/tests/INTEGRATION/simple/cluster.002.recmaster_yield.sh b/ctdb/tests/INTEGRATION/simple/cluster.002.recmaster_yield.sh index 4a2932d42d1..317ecbd427b 100755 --- a/ctdb/tests/INTEGRATION/simple/cluster.002.recmaster_yield.sh +++ b/ctdb/tests/INTEGRATION/simple/cluster.002.recmaster_yield.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# Verify that 'ctdb stop' causes a node to yield the recovery master role +# Verify that 'ctdb stop' causes a node to yield the leader role . 
"${TEST_SCRIPTS_DIR}/integration.bash" diff --git a/ctdb/tests/UNIT/tool/ctdb.getcapabilities.001.sh b/ctdb/tests/UNIT/tool/ctdb.getcapabilities.001.sh index d04db04b60d..da71f22e264 100755 --- a/ctdb/tests/UNIT/tool/ctdb.getcapabilities.001.sh +++ b/ctdb/tests/UNIT/tool/ctdb.getcapabilities.001.sh @@ -12,7 +12,7 @@ NODEMAP EOF required_result 0 <cmd_pnn, vnnmap, - recmode, recmaster); + print_status(mem_ctx, + ctdb, + nodemap, + ctdb->cmd_pnn, + vnnmap, + recmode, + leader); return 0; } @@ -2282,13 +2290,13 @@ static int control_getcapabilities(TALLOC_CTX *mem_ctx, if (options.machinereadable == 1) { printf("%s%s%s%s%s\n", options.sep, - "RECMASTER", options.sep, + "LEADER", options.sep, "LMASTER", options.sep); printf("%s%d%s%d%s\n", options.sep, !! (caps & CTDB_CAP_RECMASTER), options.sep, !! (caps & CTDB_CAP_LMASTER), options.sep); } else { - printf("RECMASTER: %s\n", + printf("LEADER: %s\n", (caps & CTDB_CAP_RECMASTER) ? "YES" : "NO"); printf("LMASTER: %s\n", (caps & CTDB_CAP_LMASTER) ? 
"YES" : "NO"); @@ -2923,23 +2931,28 @@ static int control_shutdown(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, static int get_generation(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, uint32_t *generation) { - uint32_t recmaster; + uint32_t leader; int recmode; struct ctdb_vnn_map *vnnmap; int ret; again: - ret = get_leader(mem_ctx, ctdb, &recmaster); + ret = get_leader(mem_ctx, ctdb, &leader); if (ret != 0) { - fprintf(stderr, "Failed to find recovery master\n"); + fprintf(stderr, "Failed to find leader\n"); return ret; } - ret = ctdb_ctrl_get_recmode(mem_ctx, ctdb->ev, ctdb->client, - recmaster, TIMEOUT(), &recmode); + ret = ctdb_ctrl_get_recmode(mem_ctx, + ctdb->ev, + ctdb->client, + leader, + TIMEOUT(), + &recmode); if (ret != 0) { - fprintf(stderr, "Failed to get recovery mode from node %u\n", - recmaster); + fprintf(stderr, + "Failed to get recovery mode from node %u\n", + leader); return ret; } @@ -2948,11 +2961,16 @@ again: goto again; } - ret = ctdb_ctrl_getvnnmap(mem_ctx, ctdb->ev, ctdb->client, - recmaster, TIMEOUT(), &vnnmap); + ret = ctdb_ctrl_getvnnmap(mem_ctx, + ctdb->ev, + ctdb->client, + leader, + TIMEOUT(), + &vnnmap); if (ret != 0) { - fprintf(stderr, "Failed to get generation from node %u\n", - recmaster); + fprintf(stderr, + "Failed to get generation from node %u\n", + leader); return ret; } @@ -3873,7 +3891,7 @@ static int rebalancenode(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, CTDB_BROADCAST_CONNECTED, pnn); if (ret != 0) { fprintf(stderr, - "Failed to ask recovery master to distribute IPs\n"); + "Failed to ask leader to distribute IPs\n"); return ret; } @@ -4586,18 +4604,20 @@ failed: return ret; } -static int control_recmaster(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb, - int argc, const char **argv) +static int control_leader(TALLOC_CTX *mem_ctx, + struct ctdb_context *ctdb, + int argc, + const char **argv) { - uint32_t recmaster; + uint32_t leader; int ret; - ret = get_leader(mem_ctx, ctdb, &recmaster); + ret = 
get_leader(mem_ctx, ctdb, &leader); if (ret != 0) { return ret; } - print_pnn(recmaster); + print_pnn(leader); return 0; } @@ -4752,28 +4772,32 @@ static int control_setlmasterrole(TALLOC_CTX *mem_ctx, return 0; } -static int control_setrecmasterrole(TALLOC_CTX *mem_ctx, - struct ctdb_context *ctdb, - int argc, const char **argv) +static int control_setleaderrole(TALLOC_CTX *mem_ctx, + struct ctdb_context *ctdb, + int argc, + const char **argv) { - uint32_t recmasterrole = 0; + uint32_t leaderrole = 0; int ret; if (argc != 1) { - usage("setrecmasterrole"); + usage("setleaderrole"); } if (strcmp(argv[0], "on") == 0) { - recmasterrole = 1; + leaderrole = 1; } else if (strcmp(argv[0], "off") == 0) { - recmasterrole = 0; + leaderrole = 0; } else { - usage("setrecmasterrole"); + usage("setleaderrole"); } - ret = ctdb_ctrl_set_recmasterrole(mem_ctx, ctdb->ev, ctdb->client, - ctdb->cmd_pnn, TIMEOUT(), - recmasterrole); + ret = ctdb_ctrl_set_recmasterrole(mem_ctx, + ctdb->ev, + ctdb->client, + ctdb->cmd_pnn, + TIMEOUT(), + leaderrole); if (ret != 0) { return ret; } @@ -6013,8 +6037,8 @@ static const struct ctdb_cmd { "dump database from a backup file", "" }, { "wipedb", control_wipedb, false, false, "wipe the contents of a database.", ""}, - { "recmaster", control_recmaster, false, true, - "show the pnn for the recovery master", NULL }, + { "leader", control_leader, false, true, + "show the pnn of the leader", NULL }, { "event", control_event, true, false, "event and event script commands", NULL }, { "scriptstatus", control_scriptstatus, true, false, @@ -6026,8 +6050,8 @@ static const struct ctdb_cmd { "get recovery lock file", NULL }, { "setlmasterrole", control_setlmasterrole, false, true, "set LMASTER role", "on|off" }, - { "setrecmasterrole", control_setrecmasterrole, false, true, - "set RECMASTER role", "on|off"}, + { "setleaderrole", control_setleaderrole, false, true, + "set LEADER role", "on|off"}, { "setdbreadonly", control_setdbreadonly, false, true, "enable 
readonly records", "" }, { "setdbsticky", control_setdbsticky, false, true,