5
0
mirror of git://git.proxmox.com/git/pve-ha-manager.git synced 2025-01-03 05:17:57 +03:00

LRM: release lock and close watchdog if no service configured for >10min

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
This commit is contained in:
Thomas Lamprecht 2021-07-01 15:55:43 +02:00
parent abc1499bc6
commit 21051707f6
7 changed files with 93 additions and 0 deletions

View File

@ -20,6 +20,10 @@ my $valid_states = {
lost_agent_lock => "lost agent_lock",
};
# we sleep ~10s per 'active' round, so if no service is available for >= 10 min we'd go into wait
# state, voluntarily giving up the watchdog and the LRM lock, ensuring the WD can do no harm
my $max_active_idle_rounds = 60;
sub new {
my ($this, $haenv) = @_;
@ -36,6 +40,7 @@ sub new {
# mode can be: active, reboot, shutdown, restart
mode => 'active',
cluster_state_update => 0,
active_idle_rounds => 0,
}, $class;
$self->set_local_status({ state => 'wait_for_agent_lock' });
@ -216,6 +221,23 @@ sub get_protected_ha_agent_lock {
return 0;
}
# only cares if any service has the local node as their node, independent of which req.state it is
sub has_configured_service_on_local_node {
    my ($self) = @_;

    my $local_node = $self->{haenv}->nodename();
    my $status = $self->{service_status};

    # a service counts as "configured here" iff its node field is set and matches us
    for my $sid (keys %$status) {
        my $node = $status->{$sid}->{node};
        return 1 if $node && $node eq $local_node;
    }

    return 0;
}
sub active_service_count {
my ($self) = @_;
@ -326,6 +348,21 @@ sub work {
$self->set_local_status({ state => 'lost_agent_lock'});
} elsif ($self->{mode} eq 'maintenance') {
$self->set_local_status({ state => 'maintenance'});
} else {
if (!$self->has_configured_service_on_local_node() && !$self->run_workers()) {
# no active service configured for this node and all (old) workers are done
$self->{active_idle_rounds}++;
if ($self->{active_idle_rounds} > $max_active_idle_rounds) {
$haenv->log('info', "node had no service configured for $max_active_idle_rounds rounds, going idle.\n");
# safety: no active service & no running worker for quite some time -> OK
$haenv->release_ha_agent_lock();
give_up_watchdog_protection($self);
$self->set_local_status({ state => 'wait_for_agent_lock'});
$self->{active_idle_rounds} = 0;
}
} elsif ($self->{active_idle_rounds}) {
$self->{active_idle_rounds} = 0;
}
}
} elsif ($state eq 'maintenance') {

View File

@ -0,0 +1,7 @@
Test a user-triggered service removal from a previously active LRM, which
should make said LRM go idle and drop the lock once enough cycles have passed
without any new service.
We use some delays to stall execution, as otherwise we'd exit the test
simulation earlier than the 60 idle rounds required before the LRM gives up
its lock and watchdog.

View File

@ -0,0 +1,11 @@
[
[ "power node1 on", "power node2 on", "power node3 on"],
[ "service vm:103 delete" ],
[ "delay 0" ],
[ "delay 0" ],
[ "delay 0" ],
[ "delay 0" ],
[ "delay 0" ],
[ "delay 0" ],
[ "delay 0" ]
]

View File

@ -0,0 +1,5 @@
{
"node1": { "power": "off", "network": "off" },
"node2": { "power": "off", "network": "off" },
"node3": { "power": "off", "network": "off" }
}

View File

@ -0,0 +1,29 @@
info 0 hardware: starting simulation
info 20 cmdlist: execute power node1 on
info 20 node1/crm: status change startup => wait_for_quorum
info 20 node1/lrm: status change startup => wait_for_agent_lock
info 20 cmdlist: execute power node2 on
info 20 node2/crm: status change startup => wait_for_quorum
info 20 node2/lrm: status change startup => wait_for_agent_lock
info 20 cmdlist: execute power node3 on
info 20 node3/crm: status change startup => wait_for_quorum
info 20 node3/lrm: status change startup => wait_for_agent_lock
info 20 node1/crm: got lock 'ha_manager_lock'
info 20 node1/crm: status change wait_for_quorum => master
info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
info 20 node1/crm: adding new service 'vm:103' on node 'node3'
info 22 node2/crm: status change wait_for_quorum => slave
info 24 node3/crm: status change wait_for_quorum => slave
info 25 node3/lrm: got lock 'ha_agent_node3_lock'
info 25 node3/lrm: status change wait_for_agent_lock => active
info 25 node3/lrm: starting service vm:103
info 25 node3/lrm: service status vm:103 started
info 120 cmdlist: execute service vm:103 delete
info 120 node1/crm: removing stale service 'vm:103' (no config)
info 122 node2/crm: status change slave => wait_for_quorum
info 124 node3/crm: status change slave => wait_for_quorum
info 1325 node3/lrm: node had no service configured for 60 rounds, going idle.
info 1325 node3/lrm: status change active => wait_for_agent_lock
info 1420 hardware: exit simulation - done

View File

@ -0,0 +1 @@
{}

View File

@ -0,0 +1,3 @@
{
"vm:103": { "node": "node3", "state": "enabled" }
}