mirror of
git://git.proxmox.com/git/pve-ha-manager.git
synced 2025-01-04 09:17:59 +03:00
LRM: release lock and close watchdog if no service configured for >10min
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
This commit is contained in:
parent
abc1499bc6
commit
21051707f6
@ -20,6 +20,10 @@ my $valid_states = {
|
||||
lost_agent_lock => "lost agent_lock",
|
||||
};
|
||||
|
||||
# we sleep ~10s per 'active' round, so if no service is available for >= 10 min we'd go into wait
|
||||
# state, giving up the watchdog and the LRM lock voluntarily, ensuring the WD can do no harm
|
||||
my $max_active_idle_rounds = 60;
|
||||
|
||||
sub new {
|
||||
my ($this, $haenv) = @_;
|
||||
|
||||
@ -36,6 +40,7 @@ sub new {
|
||||
# mode can be: active, reboot, shutdown, restart
|
||||
mode => 'active',
|
||||
cluster_state_update => 0,
|
||||
active_idle_rounds => 0,
|
||||
}, $class;
|
||||
|
||||
$self->set_local_status({ state => 'wait_for_agent_lock' });
|
||||
@ -216,6 +221,23 @@ sub get_protected_ha_agent_lock {
|
||||
return 0;
|
||||
}
|
||||
|
||||
# Tell whether the cached service status lists at least one service whose node is the local node.
# Only the node assignment is looked at, the requested state of the service is ignored.
sub has_configured_service_on_local_node {
    my ($self) = @_;

    my $local_node = $self->{haenv}->nodename();
    my $status_by_sid = $self->{service_status};

    for my $sid (keys %$status_by_sid) {
        my $entry = $status_by_sid->{$sid};
        my $assigned_node = $entry->{node};

        # entries without a node, or with a node other than the local one, do not count
        return 1 if $assigned_node && $assigned_node eq $local_node;
    }

    return 0;
}
|
||||
|
||||
sub active_service_count {
|
||||
my ($self) = @_;
|
||||
|
||||
@ -326,6 +348,21 @@ sub work {
|
||||
$self->set_local_status({ state => 'lost_agent_lock'});
|
||||
} elsif ($self->{mode} eq 'maintenance') {
|
||||
$self->set_local_status({ state => 'maintenance'});
|
||||
} else {
|
||||
if (!$self->has_configured_service_on_local_node() && !$self->run_workers()) {
|
||||
# no active service configured for this node and all (old) workers are done
|
||||
$self->{active_idle_rounds}++;
|
||||
if ($self->{active_idle_rounds} > $max_active_idle_rounds) {
|
||||
$haenv->log('info', "node had no service configured for $max_active_idle_rounds rounds, going idle.\n");
|
||||
# safety: no active service & no running worker for quite some time -> OK
|
||||
$haenv->release_ha_agent_lock();
|
||||
give_up_watchdog_protection($self);
|
||||
$self->set_local_status({ state => 'wait_for_agent_lock'});
|
||||
$self->{active_idle_rounds} = 0;
|
||||
}
|
||||
} elsif ($self->{active_idle_rounds}) {
|
||||
$self->{active_idle_rounds} = 0;
|
||||
}
|
||||
}
|
||||
} elsif ($state eq 'maintenance') {
|
||||
|
||||
|
7
src/test/test-lrm-going-idle1/README
Normal file
7
src/test/test-lrm-going-idle1/README
Normal file
@ -0,0 +1,7 @@
|
||||
Test a user-triggered service removal from a previously active LRM, which
|
||||
should make said LRM go idle and drop the lock once enough cycles passed
|
||||
without any new service.
|
||||
|
||||
We use some delays to stall execution, as else we'd exit the test-simulation
|
||||
earlier than the 60 rounds idle time required before the LRM gives up its
|
||||
lock and watchdog.
|
11
src/test/test-lrm-going-idle1/cmdlist
Normal file
11
src/test/test-lrm-going-idle1/cmdlist
Normal file
@ -0,0 +1,11 @@
|
||||
[
|
||||
[ "power node1 on", "power node2 on", "power node3 on"],
|
||||
[ "service vm:103 delete" ],
|
||||
[ "delay 0" ],
|
||||
[ "delay 0" ],
|
||||
[ "delay 0" ],
|
||||
[ "delay 0" ],
|
||||
[ "delay 0" ],
|
||||
[ "delay 0" ],
|
||||
[ "delay 0" ]
|
||||
]
|
5
src/test/test-lrm-going-idle1/hardware_status
Normal file
5
src/test/test-lrm-going-idle1/hardware_status
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"node1": { "power": "off", "network": "off" },
|
||||
"node2": { "power": "off", "network": "off" },
|
||||
"node3": { "power": "off", "network": "off" }
|
||||
}
|
29
src/test/test-lrm-going-idle1/log.expect
Normal file
29
src/test/test-lrm-going-idle1/log.expect
Normal file
@ -0,0 +1,29 @@
|
||||
info 0 hardware: starting simulation
|
||||
info 20 cmdlist: execute power node1 on
|
||||
info 20 node1/crm: status change startup => wait_for_quorum
|
||||
info 20 node1/lrm: status change startup => wait_for_agent_lock
|
||||
info 20 cmdlist: execute power node2 on
|
||||
info 20 node2/crm: status change startup => wait_for_quorum
|
||||
info 20 node2/lrm: status change startup => wait_for_agent_lock
|
||||
info 20 cmdlist: execute power node3 on
|
||||
info 20 node3/crm: status change startup => wait_for_quorum
|
||||
info 20 node3/lrm: status change startup => wait_for_agent_lock
|
||||
info 20 node1/crm: got lock 'ha_manager_lock'
|
||||
info 20 node1/crm: status change wait_for_quorum => master
|
||||
info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
|
||||
info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
|
||||
info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
|
||||
info 20 node1/crm: adding new service 'vm:103' on node 'node3'
|
||||
info 22 node2/crm: status change wait_for_quorum => slave
|
||||
info 24 node3/crm: status change wait_for_quorum => slave
|
||||
info 25 node3/lrm: got lock 'ha_agent_node3_lock'
|
||||
info 25 node3/lrm: status change wait_for_agent_lock => active
|
||||
info 25 node3/lrm: starting service vm:103
|
||||
info 25 node3/lrm: service status vm:103 started
|
||||
info 120 cmdlist: execute service vm:103 delete
|
||||
info 120 node1/crm: removing stale service 'vm:103' (no config)
|
||||
info 122 node2/crm: status change slave => wait_for_quorum
|
||||
info 124 node3/crm: status change slave => wait_for_quorum
|
||||
info 1325 node3/lrm: node had no service configured for 60 rounds, going idle.
|
||||
info 1325 node3/lrm: status change active => wait_for_agent_lock
|
||||
info 1420 hardware: exit simulation - done
|
1
src/test/test-lrm-going-idle1/manager_status
Normal file
1
src/test/test-lrm-going-idle1/manager_status
Normal file
@ -0,0 +1 @@
|
||||
{}
|
3
src/test/test-lrm-going-idle1/service_config
Normal file
3
src/test/test-lrm-going-idle1/service_config
Normal file
@ -0,0 +1,3 @@
|
||||
{
|
||||
"vm:103": { "node": "node3", "state": "enabled" }
|
||||
}
|
Loading…
Reference in New Issue
Block a user