Ensure only primary sender drives slot ownership updates (#754)
Fixes a regression introduced in PR #445, which allowed a message from a replica to update the slot ownership of its primary. The regression results in a `replicaof` cycle, causing server crashes due to the cycle-detection assert. The fix restores the previous behavior, where only primary senders can trigger `clusterUpdateSlotsConfigWith`.

Additional changes:

* Handling of primaries without slots is obsoleted by the new handling for when a sender that was a replica announces that it is now a primary.
* The replication loop detection code is unchanged but shifted downwards.
* Some variables are renamed for better readability, and some are introduced to avoid repeated memcmp() calls.

Fixes #753.

---------

Signed-off-by: Ping Xie <pingxie@google.com>
commit 66d0f7d9a1
parent 1a8bd045f3
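
Before the diff, a minimal standalone sketch of the guard this commit restores. It is not the actual valkey source: msg_header, NULL_NAME, may_update_slot_ownership, and the fixed buffer sizes are illustrative stand-ins (40 bytes for CLUSTER_NAMELEN, 2048 bytes for the 16384-slot bitmap). It shows the shape of the check: slot-ownership processing is gated on the sender itself claiming to be a primary, so gossip from a replica can no longer reassign its primary's slots.

/* Hedged sketch, not the actual valkey source. Names and sizes are
 * illustrative stand-ins for the real cluster message header fields. */
#include <string.h>

typedef struct {
    unsigned char replicaof[40]; /* claimed primary; all-zero means "I am a primary" */
    unsigned char myslots[2048]; /* slot bitmap the sender claims to serve */
} msg_header;

static const unsigned char NULL_NAME[40] = {0};

/* Returns 1 when the message may drive a slot-ownership update
 * (a call to clusterUpdateSlotsConfigWith in the real code). */
static int may_update_slot_ownership(const msg_header *hdr,
                                     int sender_known,
                                     int sender_was_replica,
                                     const unsigned char *sender_slots) {
    int claims_primary = memcmp(hdr->replicaof, NULL_NAME, sizeof(NULL_NAME)) == 0;
    if (!sender_known || !claims_primary) return 0; /* replicas never qualify */
    /* Only act when something changed: a replica -> primary role flip,
     * or a claimed slot bitmap that differs from our view of the sender. */
    return sender_was_replica ||
           memcmp(sender_slots, hdr->myslots, sizeof(hdr->myslots)) != 0;
}

Folding the role-flip check into this single guard is what lets the patch drop the old sender_primary/dirty_slots bookkeeping seen in the removed lines below.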
@@ -2927,8 +2927,11 @@ int clusterProcessPacket(clusterLink *link) {
     mstime_t now = mstime();
 
     uint16_t flags = ntohs(hdr->flags);
-    uint64_t senderCurrentEpoch = 0, senderConfigEpoch = 0;
+    uint64_t sender_claimed_current_epoch = 0, sender_claimed_config_epoch = 0;
     clusterNode *sender = getNodeFromLinkAndMsg(link, hdr);
+    int sender_claims_to_be_primary = !memcmp(hdr->replicaof, CLUSTER_NODE_NULL_NAME, CLUSTER_NAMELEN);
+    int sender_last_reported_as_replica = sender && nodeIsReplica(sender);
+    int sender_last_reported_as_primary = sender && nodeIsPrimary(sender);
 
     if (sender && (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA)) {
         sender->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED;
@@ -2942,13 +2945,13 @@ int clusterProcessPacket(clusterLink *link) {
 
     if (sender && !nodeInHandshake(sender)) {
         /* Update our currentEpoch if we see a newer epoch in the cluster. */
-        senderCurrentEpoch = ntohu64(hdr->currentEpoch);
-        senderConfigEpoch = ntohu64(hdr->configEpoch);
-        if (senderCurrentEpoch > server.cluster->currentEpoch) server.cluster->currentEpoch = senderCurrentEpoch;
+        sender_claimed_current_epoch = ntohu64(hdr->currentEpoch);
+        sender_claimed_config_epoch = ntohu64(hdr->configEpoch);
+        if (sender_claimed_current_epoch > server.cluster->currentEpoch)
+            server.cluster->currentEpoch = sender_claimed_current_epoch;
         /* Update the sender configEpoch if it is a primary publishing a newer one. */
-        if (!memcmp(hdr->replicaof, CLUSTER_NODE_NULL_NAME, sizeof(hdr->replicaof)) &&
-            senderConfigEpoch > sender->configEpoch) {
-            sender->configEpoch = senderConfigEpoch;
+        if (sender_claims_to_be_primary && sender_claimed_config_epoch > sender->configEpoch) {
+            sender->configEpoch = sender_claimed_config_epoch;
             clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG);
         }
         /* Update the replication offset info for this node. */
@@ -3110,36 +3113,36 @@ int clusterProcessPacket(clusterLink *link) {
     /* Check for role switch: replica -> primary or primary -> replica. */
     if (sender) {
         serverLog(LL_DEBUG, "node %.40s (%s) announces that it is a %s in shard %.40s", sender->name,
-                  sender->human_nodename,
-                  !memcmp(hdr->replicaof, CLUSTER_NODE_NULL_NAME, sizeof(hdr->replicaof)) ? "primary" : "replica",
-                  sender->shard_id);
-        if (!memcmp(hdr->replicaof, CLUSTER_NODE_NULL_NAME, sizeof(hdr->replicaof))) {
+                  sender->human_nodename, sender_claims_to_be_primary ? "primary" : "replica", sender->shard_id);
+        if (sender_claims_to_be_primary) {
             /* Node is a primary. */
             clusterSetNodeAsPrimary(sender);
         } else {
             /* Node is a replica. */
-            clusterNode *primary = clusterLookupNode(hdr->replicaof, CLUSTER_NAMELEN);
+            clusterNode *sender_claimed_primary = clusterLookupNode(hdr->replicaof, CLUSTER_NAMELEN);
 
-            if (clusterNodeIsPrimary(sender)) {
+            if (sender_last_reported_as_primary) {
                 /* Primary turned into a replica! Reconfigure the node. */
-                if (primary && areInSameShard(primary, sender)) {
+                if (sender_claimed_primary && areInSameShard(sender_claimed_primary, sender)) {
                     /* `sender` was a primary and was in the same shard as its new primary */
-                    if (sender->configEpoch > senderConfigEpoch) {
+                    if (sender->configEpoch > sender_claimed_config_epoch) {
                         serverLog(LL_NOTICE,
                                   "Ignore stale message from %.40s (%s) in shard %.40s;"
                                   " gossip config epoch: %llu, current config epoch: %llu",
                                   sender->name, sender->human_nodename, sender->shard_id,
-                                  (unsigned long long)senderConfigEpoch, (unsigned long long)sender->configEpoch);
+                                  (unsigned long long)sender_claimed_config_epoch,
+                                  (unsigned long long)sender->configEpoch);
                     } else {
                         /* `primary` is still a `replica` in this observer node's view;
                          * update its role and configEpoch */
-                        clusterSetNodeAsPrimary(primary);
-                        primary->configEpoch = senderConfigEpoch;
+                        clusterSetNodeAsPrimary(sender_claimed_primary);
+                        sender_claimed_primary->configEpoch = sender_claimed_config_epoch;
                         serverLog(LL_NOTICE,
                                   "A failover occurred in shard %.40s; node %.40s (%s)"
                                   " failed over to node %.40s (%s) with a config epoch of %llu",
-                                  sender->shard_id, sender->name, sender->human_nodename, primary->name,
-                                  primary->human_nodename, (unsigned long long)primary->configEpoch);
+                                  sender->shard_id, sender->name, sender->human_nodename,
+                                  sender_claimed_primary->name, sender_claimed_primary->human_nodename,
+                                  (unsigned long long)sender_claimed_primary->configEpoch);
                     }
                 } else {
                     /* `sender` was moved to another shard and has become a replica, remove its slot assignment */
@@ -3148,9 +3151,9 @@ int clusterProcessPacket(clusterLink *link) {
                               "Node %.40s (%s) is no longer primary of shard %.40s;"
                               " removed all %d slot(s) it used to own",
                               sender->name, sender->human_nodename, sender->shard_id, slots);
-                    if (primary != NULL) {
+                    if (sender_claimed_primary != NULL) {
                         serverLog(LL_NOTICE, "Node %.40s (%s) is now part of shard %.40s", sender->name,
-                                  sender->human_nodename, primary->shard_id);
+                                  sender->human_nodename, sender_claimed_primary->shard_id);
                     }
                 }
 
@@ -3162,17 +3165,17 @@ int clusterProcessPacket(clusterLink *link) {
             }
 
             /* Primary node changed for this replica? */
-            if (primary && sender->replicaof != primary) {
+            if (sender_claimed_primary && sender->replicaof != sender_claimed_primary) {
                 if (sender->replicaof) clusterNodeRemoveReplica(sender->replicaof, sender);
                 serverLog(LL_NOTICE, "Node %.40s (%s) is now a replica of node %.40s (%s) in shard %.40s",
-                          sender->name, sender->human_nodename, primary->name, primary->human_nodename,
-                          sender->shard_id);
-                clusterNodeAddReplica(primary, sender);
-                sender->replicaof = primary;
+                          sender->name, sender->human_nodename, sender_claimed_primary->name,
+                          sender_claimed_primary->human_nodename, sender->shard_id);
+                clusterNodeAddReplica(sender_claimed_primary, sender);
+                sender->replicaof = sender_claimed_primary;
 
                 /* Update the shard_id when a replica is connected to its
                  * primary in the very first time. */
-                updateShardId(sender, primary->shard_id);
+                updateShardId(sender, sender_claimed_primary->shard_id);
 
                 /* Update config. */
                 clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
@@ -3187,66 +3190,41 @@ int clusterProcessPacket(clusterLink *link) {
 
     /* Many checks are only needed if the set of served slots this
      * instance claims is different compared to the set of slots we have
-     * for it. Check this ASAP to avoid other computational expansive
-     * checks later. */
-    clusterNode *sender_primary = NULL; /* Sender or its primary if replica. */
-    int dirty_slots = 0;                /* Sender claimed slots don't match my view? */
+     * for it or if there was a failover in the sender's shard. Check
+     * this ASAP to avoid other computational expensive checks later.*/
 
-    if (sender) {
-        sender_primary = clusterNodeIsPrimary(sender) ? sender : sender->replicaof;
-        if (sender_primary) {
-            dirty_slots = memcmp(sender_primary->slots, hdr->myslots, sizeof(hdr->myslots)) != 0;
-
-            /* Force dirty when the sending shard owns no slots so that
-             * we have a chance to examine and repair slot migrating/importing
-             * states that involve empty shards. */
-            dirty_slots |= sender_primary->numslots == 0;
-        }
-    }
-
-    /* 1) If the sender of the message is a primary, and we detected that
-     * the set of slots it claims changed, scan the slots to see if we
-     * need to update our configuration. */
-    if (sender_primary && dirty_slots)
-        clusterUpdateSlotsConfigWith(sender_primary, senderConfigEpoch, hdr->myslots);
-
-    /* Explicitly check for a replication loop before attempting the replication
-     * chain folding logic. */
-    if (myself->replicaof && myself->replicaof->replicaof && myself->replicaof->replicaof != myself) {
-        /* Safeguard against sub-replicas. A replica's primary can turn itself
-         * into a replica if its last slot is removed. If no other node takes
-         * over the slot, there is nothing else to trigger replica migration. */
-        serverLog(LL_NOTICE, "I'm a sub-replica! Reconfiguring myself as a replica of %.40s from %.40s",
-                  myself->replicaof->replicaof->name, myself->replicaof->name);
-        clusterSetPrimary(myself->replicaof->replicaof, 1);
-        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
-    }
-
-    /* 2) We also check for the reverse condition, that is, the sender
-     * claims to serve slots we know are served by a primary with a
-     * greater configEpoch. If this happens we inform the sender.
-     *
-     * This is useful because sometimes after a partition heals, a
-     * reappearing primary may be the last one to claim a given set of
-     * hash slots, but with a configuration that other instances know to
-     * be deprecated. Example:
-     *
-     * A and B are primary and replica for slots 1,2,3.
-     * A is partitioned away, B gets promoted.
-     * B is partitioned away, and A returns available.
-     *
-     * Usually B would PING A publishing its set of served slots and its
-     * configEpoch, but because of the partition B can't inform A of the
-     * new configuration, so other nodes that have an updated table must
-     * do it. In this way A will stop to act as a primary (or can try to
-     * failover if there are the conditions to win the election). */
-    if (sender && dirty_slots) {
-        int j;
-
-        for (j = 0; j < CLUSTER_SLOTS; j++) {
+    if (sender && sender_claims_to_be_primary &&
+        (sender_last_reported_as_replica || memcmp(sender->slots, hdr->myslots, sizeof(hdr->myslots)))) {
+        /* Make sure CLUSTER_NODE_PRIMARY has already been set by now on sender */
+        serverAssert(nodeIsPrimary(sender));
+
+        /* 1) If the sender of the message is a primary, and we detected that
+         * the set of slots it claims changed, scan the slots to see if we
+         * need to update our configuration. */
+        clusterUpdateSlotsConfigWith(sender, sender_claimed_config_epoch, hdr->myslots);
+
+        /* 2) We also check for the reverse condition, that is, the sender
+         * claims to serve slots we know are served by a primary with a
+         * greater configEpoch. If this happens we inform the sender.
+         *
+         * This is useful because sometimes after a partition heals, a
+         * reappearing primary may be the last one to claim a given set of
+         * hash slots, but with a configuration that other instances know to
+         * be deprecated. Example:
+         *
+         * A and B are primary and replica for slots 1,2,3.
+         * A is partitioned away, B gets promoted.
+         * B is partitioned away, and A returns available.
+         *
+         * Usually B would PING A publishing its set of served slots and its
+         * configEpoch, but because of the partition B can't inform A of the
+         * new configuration, so other nodes that have an updated table must
+         * do it. In this way A will stop to act as a primary (or can try to
+         * failover if there are the conditions to win the election). */
+        for (int j = 0; j < CLUSTER_SLOTS; j++) {
             if (bitmapTestBit(hdr->myslots, j)) {
                 if (server.cluster->slots[j] == sender || isSlotUnclaimed(j)) continue;
-                if (server.cluster->slots[j]->configEpoch > senderConfigEpoch) {
+                if (server.cluster->slots[j]->configEpoch > sender_claimed_config_epoch) {
                     serverLog(LL_VERBOSE,
                               "Node %.40s has old slots configuration, sending "
                               "an UPDATE message about %.40s",
@@ -3262,10 +3240,22 @@ int clusterProcessPacket(clusterLink *link) {
             }
         }
 
+    /* Explicitly check for a replication loop before attempting the replication
+     * chain folding logic. */
+    if (myself->replicaof && myself->replicaof->replicaof && myself->replicaof->replicaof != myself) {
+        /* Safeguard against sub-replicas. A replica's primary can turn itself
+         * into a replica if its last slot is removed. If no other node takes
+         * over the slot, there is nothing else to trigger replica migration. */
+        serverLog(LL_NOTICE, "I'm a sub-replica! Reconfiguring myself as a replica of %.40s from %.40s",
+                  myself->replicaof->replicaof->name, myself->replicaof->name);
+        clusterSetPrimary(myself->replicaof->replicaof, 1);
+        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
+    }
+
     /* If our config epoch collides with the sender's try to fix
      * the problem. */
-    if (sender && clusterNodeIsPrimary(myself) && clusterNodeIsPrimary(sender) &&
-        senderConfigEpoch == myself->configEpoch) {
+    if (sender && nodeIsPrimary(myself) && nodeIsPrimary(sender) &&
+        sender_claimed_config_epoch == myself->configEpoch) {
         clusterHandleConfigEpochCollision(sender);
     }
 
@@ -3317,7 +3307,7 @@ int clusterProcessPacket(clusterLink *link) {
         /* We consider this vote only if the sender is a primary serving
         * a non zero number of slots, and its currentEpoch is greater or
         * equal to epoch where this node started the election. */
-        if (clusterNodeIsVotingPrimary(sender) && senderCurrentEpoch >= server.cluster->failover_auth_epoch) {
+        if (clusterNodeIsVotingPrimary(sender) && sender_claimed_current_epoch >= server.cluster->failover_auth_epoch) {
            server.cluster->failover_auth_count++;
            /* Maybe we reached a quorum here, set a flag to make sure
             * we check ASAP. */
@@ -156,18 +156,18 @@ test "Verify the nodes configured with prefer hostname only show hostname for ne
     # to accept our isolated nodes connections. At this point they will
     # start showing up in cluster slots.
     wait_for_condition 50 100 {
-        [llength [R 6 CLUSTER SLOTS]] eq 3
+        [llength [R 6 CLUSTER SLOTS]] eq 2
     } else {
         fail "Node did not learn about the 2 shards it can talk to"
     }
     wait_for_condition 50 100 {
-        [lindex [get_slot_field [R 6 CLUSTER SLOTS] 1 2 3] 1] eq "shard-1.com"
+        [lindex [get_slot_field [R 6 CLUSTER SLOTS] 0 2 3] 1] eq "shard-1.com"
     } else {
         fail "hostname for shard-1 didn't reach node 6"
     }
 
     wait_for_condition 50 100 {
-        [lindex [get_slot_field [R 6 CLUSTER SLOTS] 2 2 3] 1] eq "shard-2.com"
+        [lindex [get_slot_field [R 6 CLUSTER SLOTS] 1 2 3] 1] eq "shard-2.com"
     } else {
         fail "hostname for shard-2 didn't reach node 6"
     }
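
For context on the test change: with the fix, node 6 sees two shard entries in the CLUSTER SLOTS reply instead of three, so the entry indices passed to the suite's get_slot_field helper shift down by one. A hypothetical condensed restatement of the updated assertions follows; it assumes the first numeric argument of get_slot_field is a zero-based entry index into the reply (an inference from the diff, not confirmed from the helper's source).

# Hypothetical condensed form of the updated checks, using the suite's
# assert_equal helper. Two entries expected, so valid indices are 0 and 1.
set slots [R 6 CLUSTER SLOTS]
assert_equal 2 [llength $slots]
assert_equal "shard-1.com" [lindex [get_slot_field $slots 0 2 3] 1]
assert_equal "shard-2.com" [lindex [get_slot_field $slots 1 2 3] 1]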