mirror of
git://git.proxmox.com/git/pve-ha-manager.git
synced 2025-01-04 09:17:59 +03:00
crm: factor out giving up the watchdog protection
See the added comment for full details why the watchdog protection of the CRM needs less strict safety requirements compared to the one of an (active) LRM. In short, CRM does not manages services but directs them through the manager_status state-file. This means the watchdog mainly protects it from a hung system where locks would timeout before writing the state out and thus a race with the new CRM would happen. So the CRM basically can always give up the watchdog safely when it stops being the active CRM anyway. Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
This commit is contained in:
parent
b4a7a92d57
commit
af71e0049f
@ -112,6 +112,26 @@ sub get_protected_ha_manager_lock {
|
||||
return 0;
|
||||
}
|
||||
|
||||
# NOTE: This is disabling the self-fence mechanism for the CRM, which has almost *no* safety or
|
||||
# intergrity implications for the HA stack. The reason for this is that fencing is protecting
|
||||
# against running the same services multiple times in a cluster, e.g. due to split brain, which then
|
||||
# can cause corruption, especially of shared resources. But as the CRM itself does not run any
|
||||
# resources, but only handles delegation and orchestration of them, disabling fencing in the CRM is
|
||||
# fine as long as we do not touch the manager HA state anymore, which normaly means transition out
|
||||
# of the "master" (= active manager) state in our FSM.
|
||||
#
|
||||
# If we would actually always fence we might reset a node that currently has no active HA service
|
||||
# and thus an idle LRM that does not need fencing. This then could make a partial outage even worse,
|
||||
# as one nodes less is available in such a situation.
|
||||
my sub give_up_watchdog_protection {
|
||||
my ($self) = @_;
|
||||
|
||||
if ($self->{ha_manager_wd}) {
|
||||
$self->{haenv}->watchdog_close($self->{ha_manager_wd});
|
||||
delete $self->{ha_manager_wd}; # only delete after close!
|
||||
}
|
||||
}
|
||||
|
||||
# checks quorum, for no active pending fence jobs and if services are configured
|
||||
sub can_get_active {
|
||||
my ($self, $allow_no_service) = @_;
|
||||
@ -233,10 +253,8 @@ sub work {
|
||||
|
||||
if ($self->{shutdown_request}) {
|
||||
|
||||
if ($self->{ha_manager_wd}) {
|
||||
$haenv->watchdog_close($self->{ha_manager_wd});
|
||||
delete $self->{ha_manager_wd};
|
||||
}
|
||||
# safety: service gets shutdown and thus won't change manager state again.
|
||||
give_up_watchdog_protection($self);
|
||||
|
||||
# release the manager lock, so another CRM slave can get it
|
||||
# and continue to work without waiting for the lock timeout
|
||||
@ -268,10 +286,8 @@ sub work {
|
||||
|
||||
} elsif ($state eq 'lost_manager_lock') {
|
||||
|
||||
if ($self->{ha_manager_wd}) {
|
||||
$haenv->watchdog_close($self->{ha_manager_wd});
|
||||
delete $self->{ha_manager_wd};
|
||||
}
|
||||
# safety: not touching manager state in this or next FSM state
|
||||
give_up_watchdog_protection($self);
|
||||
|
||||
return 0 if $self->{shutdown_request};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user