From 2167dd1e6093362ccfbd3ad55c4c4bd0dcebc73d Mon Sep 17 00:00:00 2001
From: Thomas Lamprecht
Date: Mon, 25 Nov 2019 17:48:42 +0100
Subject: [PATCH] do simple fallback if node comes back online from maintenance

We simply remember the node we were on, if moved for maintenance. This
record gets dropped once we move to _any_ other node, be it:
* our previous node, as it came back from maintenance
* another node due to manual migration, group priority changes or
  fencing

The first point is handled explicitly by this patch. In
select_service_node we check for an old fallback node; if that node is
found in the online node list with top priority, we _always_ move to it
- even if there's no real reason for a move.

Signed-off-by: Thomas Lamprecht
---
 src/PVE/HA/Manager.pm                     | 39 +++++++++++++++++--
 .../log.expect                            | 16 ++++++++
 src/test/test-shutdown-policy3/log.expect | 20 ++++++++++
 3 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 1f14754..9e46f19 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -93,7 +93,7 @@ sub get_node_priority_groups {
 }
 
 sub select_service_node {
-    my ($groups, $online_node_usage, $service_conf, $current_node, $try_next, $tried_nodes) = @_;
+    my ($groups, $online_node_usage, $service_conf, $current_node, $try_next, $tried_nodes, $maintenance_fallback) = @_;
 
     my $group = get_service_group($groups, $online_node_usage, $service_conf);
 
@@ -123,12 +123,19 @@ sub select_service_node {
     } keys %{$pri_groups->{$top_pri}};
 
     my $found;
+    my $found_maintenance_fallback;
     for (my $i = scalar(@nodes) - 1; $i >= 0; $i--) {
         my $node = $nodes[$i];
         if ($node eq $current_node) {
             $found = $i;
-            last;
         }
+        if (defined($maintenance_fallback) && $node eq $maintenance_fallback) {
+            $found_maintenance_fallback = $i;
+        }
+    }
+
+    if (defined($found_maintenance_fallback)) {
+        return $nodes[$found_maintenance_fallback];
     }
 
     if ($try_next) {
@@ -207,6 +214,7 @@ my $change_service_state = sub {
     my $old_state = $sd->{state};
     my $old_node = $sd->{node};
     my $old_failed_nodes = $sd->{failed_nodes};
+    my $old_maintenance_node = $sd->{maintenance_node};
 
     die "no state change" if $old_state eq $new_state; # just to be sure
 
@@ -217,6 +225,7 @@ my $change_service_state = sub {
     $sd->{state} = $new_state;
     $sd->{node} = $old_node;
     $sd->{failed_nodes} = $old_failed_nodes if defined($old_failed_nodes);
+    $sd->{maintenance_node} = $old_maintenance_node if defined($old_maintenance_node);
 
     my $text_state = '';
     foreach my $k (sort keys %params) {
@@ -641,6 +650,10 @@ sub next_state_started {
         }
         if ($ns->get_node_state($sd->{node}) ne 'maintenance') {
             return;
+        } else {
+            # save current node as fallback for when it comes out of
+            # maintenance
+            $sd->{maintenance_node} = $sd->{node};
         }
     }
 
@@ -733,11 +746,29 @@ sub next_state_started {
         }
     }
 
-    my $node = select_service_node($self->{groups}, $self->{online_node_usage},
-                                   $cd, $sd->{node}, $try_next, $sd->{failed_nodes});
+    my $node = select_service_node(
+        $self->{groups},
+        $self->{online_node_usage},
+        $cd,
+        $sd->{node},
+        $try_next,
+        $sd->{failed_nodes},
+        $sd->{maintenance_node},
+    );
 
     if ($node && ($sd->{node} ne $node)) {
         $self->{online_node_usage}->{$node}++;
+
+        if (defined(my $fallback = $sd->{maintenance_node})) {
+            if ($node eq $fallback) {
+                $haenv->log('info', "moving service '$sid' back to '$fallback', node came back from maintenance.");
+                delete $sd->{maintenance_node};
+            } elsif ($sd->{node} ne $fallback) {
+                $haenv->log('info', "dropping maintenance fallback node '$fallback' for '$sid'");
+                delete $sd->{maintenance_node};
+            }
+        }
+
         if ($cd->{type} eq 'vm') {
             $haenv->log('info', "migrate service '$sid' to node '$node' (running)");
             &$change_service_state($self, $sid, 'migrate', node => $sd->{node}, target => $node);
diff --git a/src/test/test-shutdown-policy-migrate-fail1/log.expect b/src/test/test-shutdown-policy-migrate-fail1/log.expect
index 79664c7..1bb2291 100644
--- a/src/test/test-shutdown-policy-migrate-fail1/log.expect
+++ b/src/test/test-shutdown-policy-migrate-fail1/log.expect
@@ -102,5 +102,21 @@ info 345 reboot: execute power node3 on
 info 345 node3/crm: status change startup => wait_for_quorum
 info 340 node3/lrm: status change startup => wait_for_agent_lock
 info 360 node1/crm: node 'node3': state changed from 'maintenance' => 'online'
+info 360 node1/crm: moving service 'fa:109' back to 'node3', node came back from maintenance.
+info 360 node1/crm: relocate service 'fa:109' to node 'node3'
+info 360 node1/crm: service 'fa:109': state changed from 'started' to 'relocate' (node = node2, target = node3)
+info 360 node1/crm: moving service 'vm:103' back to 'node3', node came back from maintenance.
+info 360 node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info 360 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node1, target = node3)
+info 361 node1/lrm: service vm:103 - start migrate to node 'node3'
+info 361 node1/lrm: service vm:103 - end migrate to node 'node3'
+err 363 node2/lrm: service fa:109 not moved (migration error)
 info 364 node3/crm: status change wait_for_quorum => slave
+err 380 node1/crm: service 'fa:109' - migration failed (exit code 1)
+info 380 node1/crm: service 'fa:109': state changed from 'relocate' to 'started' (node = node2)
+info 380 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node3)
+info 385 node3/lrm: got lock 'ha_agent_node3_lock'
+info 385 node3/lrm: status change wait_for_agent_lock => active
+info 385 node3/lrm: starting service vm:103
+info 385 node3/lrm: service status vm:103 started
 info 720 hardware: exit simulation - done
diff --git a/src/test/test-shutdown-policy3/log.expect b/src/test/test-shutdown-policy3/log.expect
index 6ecf211..921c9f3 100644
--- a/src/test/test-shutdown-policy3/log.expect
+++ b/src/test/test-shutdown-policy3/log.expect
@@ -55,5 +55,25 @@ info 165 reboot: execute power node3 on
 info 165 node3/crm: status change startup => wait_for_quorum
 info 160 node3/lrm: status change startup => wait_for_agent_lock
 info 180 node1/crm: node 'node3': state changed from 'maintenance' => 'online'
+info 180 node1/crm: moving service 'ct:102' back to 'node3', node came back from maintenance.
+info 180 node1/crm: relocate service 'ct:102' to node 'node3'
+info 180 node1/crm: service 'ct:102': state changed from 'started' to 'relocate' (node = node1, target = node3)
+info 180 node1/crm: moving service 'vm:103' back to 'node3', node came back from maintenance.
+info 180 node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info 180 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node1, target = node3)
+info 181 node1/lrm: service ct:102 - start relocate to node 'node3'
+info 181 node1/lrm: stopping service ct:102 (relocate)
+info 181 node1/lrm: service status ct:102 stopped
+info 181 node1/lrm: service ct:102 - end relocate to node 'node3'
+info 181 node1/lrm: service vm:103 - start migrate to node 'node3'
+info 181 node1/lrm: service vm:103 - end migrate to node 'node3'
 info 184 node3/crm: status change wait_for_quorum => slave
+info 200 node1/crm: service 'ct:102': state changed from 'relocate' to 'started' (node = node3)
+info 200 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node3)
+info 205 node3/lrm: got lock 'ha_agent_node3_lock'
+info 205 node3/lrm: status change wait_for_agent_lock => active
+info 205 node3/lrm: starting service ct:102
+info 205 node3/lrm: service status ct:102 started
+info 205 node3/lrm: starting service vm:103
+info 205 node3/lrm: service status vm:103 started
 info 720 hardware: exit simulation - done
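
For illustration only, here is a small standalone Perl sketch (not part of the
patch; pick_node and the node lists are made up) of the selection behaviour the
change adds to select_service_node: once the remembered maintenance node shows
up again among the top-priority online nodes, it wins over simply staying on
the current node.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Simplified stand-in for the selection loop: prefer the remembered
    # maintenance fallback if it is back in the top-priority node list,
    # otherwise stay on the current node if possible.
    sub pick_node {
        my ($top_pri_nodes, $current_node, $maintenance_fallback) = @_;

        if (defined($maintenance_fallback)) {
            for my $node (@$top_pri_nodes) {
                return $node if $node eq $maintenance_fallback;
            }
        }

        for my $node (@$top_pri_nodes) {
            return $node if $node eq $current_node;
        }

        return $top_pri_nodes->[0]; # any top-priority node as last resort
    }

    # service sits on node2 after node3 went into maintenance
    print pick_node(['node1', 'node2', 'node3'], 'node2', 'node3'), "\n"; # node3
    # without a remembered fallback the service simply stays on node2
    print pick_node(['node1', 'node2', 'node3'], 'node2', undef), "\n";   # node2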