1
0
mirror of https://github.com/samba-team/samba.git synced 2025-01-03 01:18:10 +03:00

ctdb-tool: Add UNKNOWN pseudo state

When a node is starting, CTDB reports remote nodes as unhealthy by
default.  This can be misleading.

To hide this, report an "UNKNOWN" pseudo state when a remote node is
not disconnected and the runstate is less than or equal to
"FIRST_RECOVERY".

Signed-off-by: Vinit Agnihotri <vagnihotri@ddn.com>
Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
This commit is contained in:
Vinit Agnihotri 2022-04-26 17:20:21 +10:00 committed by Amitay Isaacs
parent 428bc71f98
commit 794f125802
12 changed files with 122 additions and 47 deletions

View File

@ -25,9 +25,9 @@ EOF
simple_test all
required_result 0 <<EOF
|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|0|0|0|0|N|
|1|192.168.20.42|0|0|0|0|0|0|0|N|
|2|192.168.20.43|0|0|0|0|0|0|0|Y|
|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|0|0|0|0|0|N|
|1|192.168.20.42|0|0|0|0|0|0|0|0|N|
|2|192.168.20.43|0|0|0|0|0|0|0|0|Y|
EOF
simple_test -X all

View File

@ -25,9 +25,9 @@ EOF
simple_test all
required_result 1 <<EOF
|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|0|0|0|0|N|
|1|192.168.20.42|1|0|0|0|0|1|0|N|
|2|192.168.20.43|0|0|0|0|0|0|0|Y|
|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|0|0|0|0|0|N|
|1|192.168.20.42|1|0|0|0|0|0|1|0|N|
|2|192.168.20.43|0|0|0|0|0|0|0|0|Y|
EOF
simple_test -X all

View File

@ -25,9 +25,9 @@ EOF
simple_test all
required_result 2 <<EOF
|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|1|0|0|0|N|
|1|192.168.20.42|0|0|0|0|0|0|0|N|
|2|192.168.20.43|0|0|0|0|0|0|0|Y|
|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|0|1|0|0|0|N|
|1|192.168.20.42|0|0|0|0|0|0|0|0|N|
|2|192.168.20.43|0|0|0|0|0|0|0|0|Y|
EOF
simple_test -X all

View File

@ -22,7 +22,7 @@ EOF
simple_test
required_result 0 <<EOF
|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|2|192.168.20.43|0|0|0|0|0|0|0|Y|
|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|2|192.168.20.43|0|0|0|0|0|0|0|0|Y|
EOF
simple_test -X

View File

@ -22,7 +22,7 @@ EOF
simple_test 0
required_result 2 <<EOF
|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|1|0|0|0|N|
|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|0|1|0|0|0|N|
EOF
simple_test -X 0

View File

@ -22,8 +22,8 @@ EOF
simple_test 0
required_result 36 <<EOF
|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|1|0|1|1|0|N|
|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|1|0|1|1|0|N|
EOF
simple_test -X 0

View File

@ -38,9 +38,9 @@ EOF
simple_test
required_result 0 <<EOF
|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|0|0|0|0|Y|
|1|192.168.20.42|0|0|0|0|0|0|0|N|
|2|192.168.20.43|0|0|0|0|0|0|0|N|
|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|0|0|0|0|0|Y|
|1|192.168.20.42|0|0|0|0|0|0|0|0|N|
|2|192.168.20.43|0|0|0|0|0|0|0|0|N|
EOF
simple_test -X

View File

@ -38,9 +38,9 @@ EOF
simple_test
required_result 0 <<EOF
|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|1|0|0|0|N|
|1|192.168.20.42|0|0|0|0|0|0|0|Y|
|2|192.168.20.43|0|0|0|0|0|0|0|N|
|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
|0|192.168.20.41|0|0|0|0|1|0|0|0|N|
|1|192.168.20.42|0|0|0|0|0|0|0|0|Y|
|2|192.168.20.43|0|0|0|0|0|0|0|0|N|
EOF
simple_test -X

View File

@ -446,16 +446,16 @@ node_has_status ()
local bits
case "$status" in
unhealthy) bits="?|?|?|1|*" ;;
healthy) bits="?|?|?|0|*" ;;
unhealthy) bits="?|?|?|?|1|*" ;;
healthy) bits="?|?|?|?|0|*" ;;
disconnected) bits="1|*" ;;
connected) bits="0|*" ;;
banned) bits="?|1|*" ;;
unbanned) bits="?|0|*" ;;
disabled) bits="?|?|1|*" ;;
enabled) bits="?|?|0|*" ;;
stopped) bits="?|?|?|?|1|*" ;;
notstopped) bits="?|?|?|?|0|*" ;;
banned) bits="?|?|1|*" ;;
unbanned) bits="?|?|0|*" ;;
disabled) bits="?|?|?|1|*" ;;
enabled) bits="?|?|?|0|*" ;;
stopped) bits="?|?|?|?|?|1|*" ;;
notstopped) bits="?|?|?|?|?|0|*" ;;
*)
echo "node_has_status: unknown status \"$status\""
return 1

View File

@ -52,6 +52,8 @@
#define SRVID_CTDB_TOOL (CTDB_SRVID_TOOL_RANGE | 0x0001000000000000LL)
#define SRVID_CTDB_PUSHDB (CTDB_SRVID_TOOL_RANGE | 0x0002000000000000LL)
#define NODE_FLAGS_UNKNOWN 0x00000040
static struct {
const char *debuglevelstr;
int timelimit;
@ -111,6 +113,7 @@ static const char *pretty_print_flags(TALLOC_CTX *mem_ctx, uint32_t flags)
const char *name;
} flag_names[] = {
{ NODE_FLAGS_DISCONNECTED, "DISCONNECTED" },
{ NODE_FLAGS_UNKNOWN, "UNKNOWN" },
{ NODE_FLAGS_PERMANENTLY_DISABLED, "DISABLED" },
{ NODE_FLAGS_BANNED, "BANNED" },
{ NODE_FLAGS_UNHEALTHY, "UNHEALTHY" },
@ -367,6 +370,64 @@ done:
return true;
}
/*
 * Remote nodes are initialised as UNHEALTHY in the daemon and their
 * true status is updated after they are connected.  However, there
 * is a small window when a healthy node may be shown as unhealthy
 * between connecting and the status update.  Hide this for nodes
 * that are not DISCONNECTED by reporting them as UNKNOWN until the
 * runstate passes FIRST_RECOVERY.  Code paths where this is used do
 * not make any control decisions depending upon unknown/unhealthy
 * state.
 *
 * The runstate is queried from the node given by ctdb->cmd_pnn.  The
 * result is a new node map obtained from talloc_nodemap(mem_ctx, ...)
 * holding a copy of every entry of nodemap_in; nodemap_in itself is
 * not modified.  For each entry that is not DELETED, is not
 * DISCONNECTED and whose PNN differs from ctdb->cmd_pnn, the copied
 * flags are replaced (not OR-ed) with NODE_FLAGS_UNKNOWN while the
 * runstate is still <= FIRST_RECOVERY, so any other flag bits of that
 * entry are not visible in the returned map.
 *
 * Returns the new node map, or NULL if the runstate could not be
 * fetched or the copy could not be created.  A diagnostic is written
 * to stderr in both failure cases.
 */
static struct ctdb_node_map *get_nodemap_unknown(
	TALLOC_CTX *mem_ctx,
	struct ctdb_context *ctdb,
	struct ctdb_node_map *nodemap_in)
{
	unsigned int i;
	int ret;
	enum ctdb_runstate runstate;
	struct ctdb_node_map *nodemap;

	ret = ctdb_ctrl_get_runstate(mem_ctx,
				     ctdb->ev,
				     ctdb->client,
				     ctdb->cmd_pnn,
				     TIMEOUT(),
				     &runstate);
	if (ret != 0) {
		/*
		 * Diagnostics go to stderr, newline terminated, so they
		 * never get mixed into the status output that callers
		 * print on stdout.
		 */
		fprintf(stderr, "Unable to get runstate\n");
		return NULL;
	}

	nodemap = talloc_nodemap(mem_ctx, nodemap_in);
	if (nodemap == NULL) {
		fprintf(stderr, "Unable to get nodemap\n");
		return NULL;
	}

	nodemap->num = nodemap_in->num;
	for (i = 0; i < nodemap->num; i++) {
		struct ctdb_node_and_flags *node_in = &nodemap_in->node[i];
		struct ctdb_node_and_flags *node = &nodemap->node[i];

		/* Start from an exact copy of the input entry */
		*node = *node_in;

		/* Deleted entries are passed through untouched */
		if (node->flags & NODE_FLAGS_DELETED) {
			continue;
		}

		/*
		 * Only connected nodes other than cmd_pnn are masked,
		 * and only until the first recovery has been passed.
		 * The assignment deliberately drops every other flag.
		 */
		if ((runstate <= CTDB_RUNSTATE_FIRST_RECOVERY) &&
		    !(node->flags & NODE_FLAGS_DISCONNECTED) &&
		    (node->pnn != ctdb->cmd_pnn)) {
			node->flags = NODE_FLAGS_UNKNOWN;
		}
	}

	return nodemap;
}
/* Compare IP address */
static bool ctdb_same_ip(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
{
@ -826,11 +887,12 @@ static void print_nodemap_machine(TALLOC_CTX *mem_ctx,
struct ctdb_node_and_flags *node;
unsigned int i;
printf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
printf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
options.sep,
"Node", options.sep,
"IP", options.sep,
"Disconnected", options.sep,
"Unknown", options.sep,
"Banned", options.sep,
"Disabled", options.sep,
"Unhealthy", options.sep,
@ -845,12 +907,13 @@ static void print_nodemap_machine(TALLOC_CTX *mem_ctx,
continue;
}
printf("%s%u%s%s%s%d%s%d%s%d%s%d%s%d%s%d%s%d%s%c%s\n",
printf("%s%u%s%s%s%d%s%d%s%d%s%d%s%d%s%d%s%d%s%d%s%c%s\n",
options.sep,
node->pnn, options.sep,
ctdb_sock_addr_to_string(mem_ctx, &node->addr, false),
options.sep,
!! (node->flags & NODE_FLAGS_DISCONNECTED), options.sep,
!! (node->flags & NODE_FLAGS_UNKNOWN), options.sep,
!! (node->flags & NODE_FLAGS_BANNED), options.sep,
!! (node->flags & NODE_FLAGS_PERMANENTLY_DISABLED),
options.sep,
@ -935,6 +998,7 @@ static void print_status(TALLOC_CTX *mem_ctx,
static int control_status(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb,
int argc, const char **argv)
{
struct ctdb_node_map *nodemap_in;
struct ctdb_node_map *nodemap;
struct ctdb_vnn_map *vnnmap;
int recmode;
@ -945,7 +1009,12 @@ static int control_status(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb,
usage("status");
}
nodemap = get_nodemap(ctdb, false);
nodemap_in = get_nodemap(ctdb, false);
if (nodemap_in == NULL) {
return 1;
}
nodemap = get_nodemap_unknown(mem_ctx, ctdb, nodemap_in);
if (nodemap == NULL) {
return 1;
}
@ -5603,6 +5672,7 @@ static int control_nodestatus(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb,
int argc, const char **argv)
{
const char *nodestring = NULL;
struct ctdb_node_map *nodemap_in;
struct ctdb_node_map *nodemap;
unsigned int i;
int ret;
@ -5619,7 +5689,12 @@ static int control_nodestatus(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb,
}
}
if (! parse_nodestring(mem_ctx, ctdb, nodestring, &nodemap)) {
if (! parse_nodestring(mem_ctx, ctdb, nodestring, &nodemap_in)) {
return 1;
}
nodemap = get_nodemap_unknown(mem_ctx, ctdb, nodemap_in);
if (nodemap == NULL) {
return 1;
}

View File

@ -32,7 +32,7 @@ EOF
nodestatus_X=""
# Fields are:
# Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode
# Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode
get_nodestatus_X ()
{
# Result is cached in global variable nodestatus_X
@ -100,11 +100,11 @@ filter_nodes ()
# them, so the first to succeed will print the nodes.
# First try for a fully active and healthy node, so must not
# be DISABLED, UNHEALTHY or INACTIVE (last covers
# be UNKNOWN, DISABLED, UNHEALTHY or INACTIVE (last covers
# DISCONNECTED, BANNED or STOPPED)
awk -F '|' -v ns="$_ns" '
BEGIN { ret = 255 }
ns ~ "@" $2 "@" && $5 == 0 && $6 == 0 && $8 == 0 {
ns ~ "@" $2 "@" && $4 == 0 && $6 == 0 && $7 == 0 && $9 == 0 {
print $1, $2 ; ret=0
}
END { exit ret }
@ -115,7 +115,7 @@ EOF
# DISABLED
awk -F '|' -v ns="$_ns" '
BEGIN { ret = 255 }
ns ~ "@" $2 "@" && $5 == 0 && $8 == 0 {
ns ~ "@" $2 "@" && $6 == 0 && $9 == 0 {
print $1, $2 ; ret=0
}
END { exit ret }

View File

@ -32,7 +32,7 @@ EOF
nodestatus_X=""
# Fields are:
# Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode
# Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode
get_nodestatus_X ()
{
# Result is cached in global variable nodestatus_X
@ -102,12 +102,12 @@ EOF
# the first to succeed will select the leader node.
# First try for a fully active and healthy node, so must not be
# DISABLED, UNHEALTHY or INACTIVE (last covers DISCONNECTED,
# UNKNOWN, DISABLED, UNHEALTHY or INACTIVE (last covers DISCONNECTED,
# BANNED or STOPPED)
awk -F '|' -v ms="$_ms" \
'BEGIN { ret = 2 }
ms ~ "@" $2 "@" &&
$5 == 0 && $6 == 0 && $8 == 0 { print $1, $2 ; ret=0 ; exit }
$4 == 0 && $6 == 0 && $7 == 0 && $9 == 0 { print $1, $2 ; ret=0 ; exit }
END { exit ret }' <<EOF ||
$nodestatus_X
EOF
@ -116,7 +116,7 @@ EOF
awk -F '|' -v ms="$_ms" \
'BEGIN { ret = 2 }
ms ~ "@" $2 "@" &&
$3 == 0 && $5 == 0 && $7 == 0 { print $1, $2 ; ret=0 ; exit }
$3 == 0 && $6 == 0 && $8 == 0 { print $1, $2 ; ret=0 ; exit }
END { exit ret }' <<EOF ||
$nodestatus_X
EOF
@ -125,7 +125,7 @@ EOF
awk -F '|' -v ms="$_ms" \
'BEGIN { ret = 2 }
ms ~ "@" $2 "@" &&
$3 == 0 && $5 == 0 { print $1, $2 ; ret=0 ; exit }
$3 == 0 && $6 == 0 { print $1, $2 ; ret=0 ; exit }
END { exit ret }' <<EOF
$nodestatus_X
EOF