do simple fallback if node comes back online from maintenance

We simply remember the node we were on, if moved away for maintenance.
This record gets dropped once we move to _any_ other node, be it:
* our previous node, as it came back from maintenance
* another node due to manual migration, group priority changes, or
  fencing

The first point is handled explicitly by this patch. In select_service_node
we check for an old fallback node; if that one is found in the online node
list with top priority, we _always_ move back to it - even if there's no
other reason for a move.

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
Thomas Lamprecht 2019-11-25 17:48:42 +01:00
parent 99278e06a8
commit 2167dd1e60
3 changed files with 71 additions and 4 deletions
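
The selection change below boils down to one extra preference: if the
remembered maintenance fallback shows up in the top-priority node list, it
wins even over staying on the current node. A minimal standalone sketch of
that idea, assuming plain array/scalar arguments rather than the module's
real data structures (pick_node is an illustrative helper, not part of
PVE/HA/Manager.pm):

use strict;
use warnings;

# Sketch: choose a target from the top-priority node list, preferring
# the remembered maintenance fallback over the current node.
sub pick_node {
    my ($top_pri_nodes, $current_node, $maintenance_fallback) = @_;

    my ($current, $fallback);
    for my $node (@$top_pri_nodes) {
        $current = $node if $node eq $current_node;
        $fallback = $node
            if defined($maintenance_fallback) && $node eq $maintenance_fallback;
    }

    # the fallback wins unconditionally - move back even if there is
    # no other reason for a move; otherwise prefer staying put
    return $fallback // $current // $top_pri_nodes->[0];
}

print pick_node([qw(node1 node2 node3)], 'node1', 'node3'), "\n"; # node3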

View File

@@ -93,7 +93,7 @@ sub get_node_priority_groups {
 }

 sub select_service_node {
-    my ($groups, $online_node_usage, $service_conf, $current_node, $try_next, $tried_nodes) = @_;
+    my ($groups, $online_node_usage, $service_conf, $current_node, $try_next, $tried_nodes, $maintenance_fallback) = @_;

     my $group = get_service_group($groups, $online_node_usage, $service_conf);
@@ -123,12 +123,19 @@ sub select_service_node {
     } keys %{$pri_groups->{$top_pri}};

     my $found;
+    my $found_maintenance_fallback;
     for (my $i = scalar(@nodes) - 1; $i >= 0; $i--) {
         my $node = $nodes[$i];
         if ($node eq $current_node) {
             $found = $i;
             last;
         }
+        if (defined($maintenance_fallback) && $node eq $maintenance_fallback) {
+            $found_maintenance_fallback = $i;
+        }
     }

+    if (defined($found_maintenance_fallback)) {
+        return $nodes[$found_maintenance_fallback];
+    }
+
     if ($try_next) {
@@ -207,6 +214,7 @@ my $change_service_state = sub {
     my $old_state = $sd->{state};
     my $old_node = $sd->{node};
     my $old_failed_nodes = $sd->{failed_nodes};
+    my $old_maintenance_node = $sd->{maintenance_node};

     die "no state change" if $old_state eq $new_state; # just to be sure
@@ -217,6 +225,7 @@ my $change_service_state = sub {
     $sd->{state} = $new_state;
     $sd->{node} = $old_node;
     $sd->{failed_nodes} = $old_failed_nodes if defined($old_failed_nodes);
+    $sd->{maintenance_node} = $old_maintenance_node if defined($old_maintenance_node);

     my $text_state = '';
     foreach my $k (sort keys %params) {
@@ -641,6 +650,10 @@ sub next_state_started {
         }

         if ($ns->get_node_state($sd->{node}) ne 'maintenance') {
             return;
+        } else {
+            # save current node as fallback for when it comes out of
+            # maintenance
+            $sd->{maintenance_node} = $sd->{node};
         }
     }
@@ -733,11 +746,29 @@ sub next_state_started {
             }
         }

-        my $node = select_service_node($self->{groups}, $self->{online_node_usage},
-                                       $cd, $sd->{node}, $try_next, $sd->{failed_nodes});
+        my $node = select_service_node(
+            $self->{groups},
+            $self->{online_node_usage},
+            $cd,
+            $sd->{node},
+            $try_next,
+            $sd->{failed_nodes},
+            $sd->{maintenance_node},
+        );

         if ($node && ($sd->{node} ne $node)) {
             $self->{online_node_usage}->{$node}++;
+
+            if (defined(my $fallback = $sd->{maintenance_node})) {
+                if ($node eq $fallback) {
+                    $haenv->log('info', "moving service '$sid' back to '$fallback', node came back from maintenance.");
+                    delete $sd->{maintenance_node};
+                } elsif ($sd->{node} ne $fallback) {
+                    $haenv->log('info', "dropping maintenance fallback node '$fallback' for '$sid'");
+                    delete $sd->{maintenance_node};
+                }
+            }
+
             if ($cd->{type} eq 'vm') {
                 $haenv->log('info', "migrate service '$sid' to node '$node' (running)");
                 &$change_service_state($self, $sid, 'migrate', node => $sd->{node}, target => $node);
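
Putting the Manager.pm hunks above together, the fallback record has a small,
self-contained lifecycle: it is written when the service's node enters
maintenance, it survives state changes, and it is consumed (or dropped) on
the next move. A condensed sketch of that bookkeeping, assuming $sd is a
per-service state hash like the manager's (the helper itself is illustrative,
not part of the module):

use strict;
use warnings;

# Sketch of the $sd->{maintenance_node} bookkeeping from the hunks above.
sub track_maintenance_fallback {
    my ($sd, $node_state, $target) = @_;

    # current node entered maintenance: remember it as the fallback
    $sd->{maintenance_node} = $sd->{node} if $node_state eq 'maintenance';

    my $fallback = $sd->{maintenance_node};
    return if !defined($fallback) || !defined($target) || $target eq $sd->{node};

    if ($target eq $fallback) {
        # fallback node came back and won the selection: record consumed
        delete $sd->{maintenance_node};
    } elsif ($sd->{node} ne $fallback) {
        # moving somewhere else while already away from the fallback: drop it
        # (the initial move _off_ the node under maintenance keeps the record)
        delete $sd->{maintenance_node};
    }
}

my $sd = { node => 'node3' };
track_maintenance_fallback($sd, 'maintenance', 'node1');  # record set, kept
$sd->{node} = 'node1';
track_maintenance_fallback($sd, 'online', 'node3');       # record consumed
print defined($sd->{maintenance_node}) ? "kept\n" : "dropped\n"; # dropped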

View File

@@ -102,5 +102,21 @@ info 345 reboot: execute power node3 on
 info 345 node3/crm: status change startup => wait_for_quorum
 info 340 node3/lrm: status change startup => wait_for_agent_lock
 info 360 node1/crm: node 'node3': state changed from 'maintenance' => 'online'
+info 360 node1/crm: moving service 'fa:109' back to 'node3', node came back from maintenance.
+info 360 node1/crm: relocate service 'fa:109' to node 'node3'
+info 360 node1/crm: service 'fa:109': state changed from 'started' to 'relocate' (node = node2, target = node3)
+info 360 node1/crm: moving service 'vm:103' back to 'node3', node came back from maintenance.
+info 360 node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info 360 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node1, target = node3)
+info 361 node1/lrm: service vm:103 - start migrate to node 'node3'
+info 361 node1/lrm: service vm:103 - end migrate to node 'node3'
+err 363 node2/lrm: service fa:109 not moved (migration error)
 info 364 node3/crm: status change wait_for_quorum => slave
+err 380 node1/crm: service 'fa:109' - migration failed (exit code 1)
+info 380 node1/crm: service 'fa:109': state changed from 'relocate' to 'started' (node = node2)
+info 380 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node3)
+info 385 node3/lrm: got lock 'ha_agent_node3_lock'
+info 385 node3/lrm: status change wait_for_agent_lock => active
+info 385 node3/lrm: starting service vm:103
+info 385 node3/lrm: service status vm:103 started
 info 720 hardware: exit simulation - done

View File

@@ -55,5 +55,25 @@ info 165 reboot: execute power node3 on
 info 165 node3/crm: status change startup => wait_for_quorum
 info 160 node3/lrm: status change startup => wait_for_agent_lock
 info 180 node1/crm: node 'node3': state changed from 'maintenance' => 'online'
+info 180 node1/crm: moving service 'ct:102' back to 'node3', node came back from maintenance.
+info 180 node1/crm: relocate service 'ct:102' to node 'node3'
+info 180 node1/crm: service 'ct:102': state changed from 'started' to 'relocate' (node = node1, target = node3)
+info 180 node1/crm: moving service 'vm:103' back to 'node3', node came back from maintenance.
+info 180 node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info 180 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node1, target = node3)
+info 181 node1/lrm: service ct:102 - start relocate to node 'node3'
+info 181 node1/lrm: stopping service ct:102 (relocate)
+info 181 node1/lrm: service status ct:102 stopped
+info 181 node1/lrm: service ct:102 - end relocate to node 'node3'
+info 181 node1/lrm: service vm:103 - start migrate to node 'node3'
+info 181 node1/lrm: service vm:103 - end migrate to node 'node3'
 info 184 node3/crm: status change wait_for_quorum => slave
+info 200 node1/crm: service 'ct:102': state changed from 'relocate' to 'started' (node = node3)
+info 200 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node3)
+info 205 node3/lrm: got lock 'ha_agent_node3_lock'
+info 205 node3/lrm: status change wait_for_agent_lock => active
+info 205 node3/lrm: starting service ct:102
+info 205 node3/lrm: service status ct:102 started
+info 205 node3/lrm: starting service vm:103
+info 205 node3/lrm: service status vm:103 started
 info 720 hardware: exit simulation - done