diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm index 82f78ca..97aa1e0 100644 --- a/src/PVE/HA/LRM.pm +++ b/src/PVE/HA/LRM.pm @@ -20,6 +20,10 @@ my $valid_states = { lost_agent_lock => "lost agent_lock", }; +# we sleep ~10s per 'active' round, so if no service is available for >= 10 min we'd go in wait +# state giving up the watchdog and the LRM lock voluntarily, ensuring the WD can do no harm +my $max_active_idle_rounds = 60; + sub new { my ($this, $haenv) = @_; @@ -36,6 +40,7 @@ sub new { # mode can be: active, reboot, shutdown, restart mode => 'active', cluster_state_update => 0, + active_idle_rounds => 0, }, $class; $self->set_local_status({ state => 'wait_for_agent_lock' }); @@ -216,6 +221,23 @@ sub get_protected_ha_agent_lock { return 0; } +# only cares if any service has the local node as its node, independent of which req.state it is +sub has_configured_service_on_local_node { + my ($self) = @_; + + my $haenv = $self->{haenv}; + my $nodename = $haenv->nodename(); + + my $ss = $self->{service_status}; + foreach my $sid (keys %$ss) { + my $sd = $ss->{$sid}; + next if !$sd->{node} || $sd->{node} ne $nodename; + + return 1; + } + return 0; +} + sub active_service_count { my ($self) = @_; @@ -326,6 +348,21 @@ sub work { $self->set_local_status({ state => 'lost_agent_lock'}); } elsif ($self->{mode} eq 'maintenance') { $self->set_local_status({ state => 'maintenance'}); + } else { + if (!$self->has_configured_service_on_local_node() && !$self->run_workers()) { + # no active service configured for this node and all (old) workers are done + $self->{active_idle_rounds}++; + if ($self->{active_idle_rounds} > $max_active_idle_rounds) { + $haenv->log('info', "node had no service configured for $max_active_idle_rounds rounds, going idle.\n"); + # safety: no active service & no running worker for quite some time -> OK + $haenv->release_ha_agent_lock(); + give_up_watchdog_protection($self); + $self->set_local_status({ state => 'wait_for_agent_lock'}); + 
$self->{active_idle_rounds} = 0; + } + } elsif ($self->{active_idle_rounds}) { + $self->{active_idle_rounds} = 0; + } + } } elsif ($state eq 'maintenance') { diff --git a/src/test/test-lrm-going-idle1/README b/src/test/test-lrm-going-idle1/README new file mode 100644 index 0000000..ed44463 --- /dev/null +++ b/src/test/test-lrm-going-idle1/README @@ -0,0 +1,7 @@ +Test a user-triggered service removal from a previously active LRM, which +should make said LRM go idle and drop the lock once enough cycles passed +without any new service. + +We use some delays to stall execution, as otherwise we'd exit the test-simulation +earlier than the 60 rounds idle time required before the LRM gives up its +lock and watchdog. diff --git a/src/test/test-lrm-going-idle1/cmdlist b/src/test/test-lrm-going-idle1/cmdlist new file mode 100644 index 0000000..8567d6a --- /dev/null +++ b/src/test/test-lrm-going-idle1/cmdlist @@ -0,0 +1,11 @@ +[ + [ "power node1 on", "power node2 on", "power node3 on"], + [ "service vm:103 delete" ], + [ "delay 0" ], + [ "delay 0" ], + [ "delay 0" ], + [ "delay 0" ], + [ "delay 0" ], + [ "delay 0" ], + [ "delay 0" ] +] diff --git a/src/test/test-lrm-going-idle1/hardware_status b/src/test/test-lrm-going-idle1/hardware_status new file mode 100644 index 0000000..451beb1 --- /dev/null +++ b/src/test/test-lrm-going-idle1/hardware_status @@ -0,0 +1,5 @@ +{ + "node1": { "power": "off", "network": "off" }, + "node2": { "power": "off", "network": "off" }, + "node3": { "power": "off", "network": "off" } +} diff --git a/src/test/test-lrm-going-idle1/log.expect b/src/test/test-lrm-going-idle1/log.expect new file mode 100644 index 0000000..2dce3e9 --- /dev/null +++ b/src/test/test-lrm-going-idle1/log.expect @@ -0,0 +1,29 @@ +info 0 hardware: starting simulation +info 20 cmdlist: execute power node1 on +info 20 node1/crm: status change startup => wait_for_quorum +info 20 node1/lrm: status change startup => wait_for_agent_lock +info 20 cmdlist: execute power node2 on 
+info 20 node2/crm: status change startup => wait_for_quorum +info 20 node2/lrm: status change startup => wait_for_agent_lock +info 20 cmdlist: execute power node3 on +info 20 node3/crm: status change startup => wait_for_quorum +info 20 node3/lrm: status change startup => wait_for_agent_lock +info 20 node1/crm: got lock 'ha_manager_lock' +info 20 node1/crm: status change wait_for_quorum => master +info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online' +info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online' +info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online' +info 20 node1/crm: adding new service 'vm:103' on node 'node3' +info 22 node2/crm: status change wait_for_quorum => slave +info 24 node3/crm: status change wait_for_quorum => slave +info 25 node3/lrm: got lock 'ha_agent_node3_lock' +info 25 node3/lrm: status change wait_for_agent_lock => active +info 25 node3/lrm: starting service vm:103 +info 25 node3/lrm: service status vm:103 started +info 120 cmdlist: execute service vm:103 delete +info 120 node1/crm: removing stale service 'vm:103' (no config) +info 122 node2/crm: status change slave => wait_for_quorum +info 124 node3/crm: status change slave => wait_for_quorum +info 1325 node3/lrm: node had no service configured for 60 rounds, going idle. +info 1325 node3/lrm: status change active => wait_for_agent_lock +info 1420 hardware: exit simulation - done diff --git a/src/test/test-lrm-going-idle1/manager_status b/src/test/test-lrm-going-idle1/manager_status new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/src/test/test-lrm-going-idle1/manager_status @@ -0,0 +1 @@ +{} diff --git a/src/test/test-lrm-going-idle1/service_config b/src/test/test-lrm-going-idle1/service_config new file mode 100644 index 0000000..c6860e7 --- /dev/null +++ b/src/test/test-lrm-going-idle1/service_config @@ -0,0 +1,3 @@ +{ + "vm:103": { "node": "node3", "state": "enabled" } +}