mirror of git://git.proxmox.com/git/pve-ha-manager.git
do simple fallback if node comes back online from maintenance
We simply remember the node we were on if the service was moved away for maintenance. This record gets dropped once we move to _any_ other node, be it:

* our previous node, as it came back from maintenance
* another node, due to manual migration, group priority changes or fencing

The first point is handled explicitly by this patch: in select_service_node we check for an old fallback node, and if that node is found in the online node list with top priority we _always_ move back to it, even if there's no other reason for a move.

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
parent 99278e06a8
commit 2167dd1e60
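In short: once a service was moved away because its node entered maintenance, the old node is recorded, and select_service_node returns that node as soon as it reappears in the top-priority set. Below is a minimal standalone sketch of that preference in plain Perl; pick_node is a hypothetical helper name, not the module's actual API:

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Hypothetical distillation of the new preference in select_service_node:
    # among the online top-priority nodes, a remembered maintenance fallback
    # wins unconditionally; otherwise the service stays put if it can.
    sub pick_node {
        my ($top_pri_nodes, $current_node, $maintenance_fallback) = @_;

        if (defined($maintenance_fallback)
            && grep { $_ eq $maintenance_fallback } @$top_pri_nodes) {
            # always move back, even if there is no other reason to move
            return $maintenance_fallback;
        }
        return $current_node if grep { $_ eq $current_node } @$top_pri_nodes;
        return $top_pri_nodes->[0];
    }

    print pick_node([qw(node1 node2 node3)], 'node2', 'node3'), "\n"; # node3
    print pick_node([qw(node1 node2)], 'node2', 'node3'), "\n";       # node2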
@@ -93,7 +93,7 @@ sub get_node_priority_groups {
     }
 }
 
 sub select_service_node {
-    my ($groups, $online_node_usage, $service_conf, $current_node, $try_next, $tried_nodes) = @_;
+    my ($groups, $online_node_usage, $service_conf, $current_node, $try_next, $tried_nodes, $maintenance_fallback) = @_;
 
     my $group = get_service_group($groups, $online_node_usage, $service_conf);
@@ -123,12 +123,19 @@ sub select_service_node {
     } keys %{$pri_groups->{$top_pri}};
 
     my $found;
+    my $found_maintenace_fallback;
     for (my $i = scalar(@nodes) - 1; $i >= 0; $i--) {
         my $node = $nodes[$i];
         if ($node eq $current_node) {
             $found = $i;
-            last;
         }
+        if (defined($maintenance_fallback) && $node eq $maintenance_fallback) {
+            $found_maintenace_fallback = $i;
+        }
+    }
+
+    if (defined($found_maintenace_fallback)) {
+        return $nodes[$found_maintenace_fallback];
     }
 
     if ($try_next) {
@@ -207,6 +214,7 @@ my $change_service_state = sub {
     my $old_state = $sd->{state};
     my $old_node = $sd->{node};
     my $old_failed_nodes = $sd->{failed_nodes};
+    my $old_maintenance_node = $sd->{maintenance_node};
 
     die "no state change" if $old_state eq $new_state; # just to be sure
 
@@ -217,6 +225,7 @@ my $change_service_state = sub {
     $sd->{state} = $new_state;
     $sd->{node} = $old_node;
     $sd->{failed_nodes} = $old_failed_nodes if defined($old_failed_nodes);
+    $sd->{maintenance_node} = $old_maintenance_node if defined($old_maintenance_node);
 
     my $text_state = '';
     foreach my $k (sort keys %params) {
@@ -641,6 +650,10 @@ sub next_state_started {
         }
         if ($ns->get_node_state($sd->{node}) ne 'maintenance') {
             return;
+        } else {
+            # save current node as fallback for when it comes out of
+            # maintenance
+            $sd->{maintenance_node} = $sd->{node};
         }
     }
 
@@ -733,11 +746,29 @@ sub next_state_started {
             }
         }
 
-        my $node = select_service_node($self->{groups}, $self->{online_node_usage},
-                                       $cd, $sd->{node}, $try_next, $sd->{failed_nodes});
+        my $node = select_service_node(
+            $self->{groups},
+            $self->{online_node_usage},
+            $cd,
+            $sd->{node},
+            $try_next,
+            $sd->{failed_nodes},
+            $sd->{maintenance_node},
+        );
 
         if ($node && ($sd->{node} ne $node)) {
             $self->{online_node_usage}->{$node}++;
+
+            if (defined(my $fallback = $sd->{maintenance_node})) {
+                if ($node eq $fallback) {
+                    $haenv->log('info', "moving service '$sid' back to '$fallback', node came back from maintenance.");
+                    delete $sd->{maintenance_node};
+                } elsif ($sd->{node} ne $fallback) {
+                    $haenv->log('info', "dropping maintenance fallback node '$fallback' for '$sid'");
+                    delete $sd->{maintenance_node};
+                }
+            }
+
             if ($cd->{type} eq 'vm') {
                 $haenv->log('info', "migrate service '$sid' to node '$node' (running)");
                 &$change_service_state($self, $sid, 'migrate', node => $sd->{node}, target => $node);
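The bookkeeping above drops the remembered fallback in exactly two cases: the service arrived back on the fallback node, or it moved on to some other node while not currently placed on the fallback. Restated as a hypothetical standalone helper (the patch keeps this logic inline in next_state_started):

    # Hypothetical restatement of the inline record-dropping logic above;
    # $sd is the service state hash, $new_node the freshly selected target.
    sub update_maintenance_fallback {
        my ($sd, $new_node) = @_;
        my $fallback = $sd->{maintenance_node};
        return if !defined($fallback);

        if ($new_node eq $fallback) {
            delete $sd->{maintenance_node}; # came back home, record served its purpose
        } elsif ($sd->{node} ne $fallback) {
            delete $sd->{maintenance_node}; # moved elsewhere, record is stale
        }
    }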
@@ -102,5 +102,21 @@ info 345 reboot: execute power node3 on
 info 345 node3/crm: status change startup => wait_for_quorum
 info 340 node3/lrm: status change startup => wait_for_agent_lock
 info 360 node1/crm: node 'node3': state changed from 'maintenance' => 'online'
+info 360 node1/crm: moving service 'fa:109' back to 'node3', node came back from maintenance.
+info 360 node1/crm: relocate service 'fa:109' to node 'node3'
+info 360 node1/crm: service 'fa:109': state changed from 'started' to 'relocate' (node = node2, target = node3)
+info 360 node1/crm: moving service 'vm:103' back to 'node3', node came back from maintenance.
+info 360 node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info 360 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node1, target = node3)
+info 361 node1/lrm: service vm:103 - start migrate to node 'node3'
+info 361 node1/lrm: service vm:103 - end migrate to node 'node3'
+err 363 node2/lrm: service fa:109 not moved (migration error)
 info 364 node3/crm: status change wait_for_quorum => slave
+err 380 node1/crm: service 'fa:109' - migration failed (exit code 1)
+info 380 node1/crm: service 'fa:109': state changed from 'relocate' to 'started' (node = node2)
+info 380 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node3)
+info 385 node3/lrm: got lock 'ha_agent_node3_lock'
+info 385 node3/lrm: status change wait_for_agent_lock => active
+info 385 node3/lrm: starting service vm:103
+info 385 node3/lrm: service status vm:103 started
 info 720 hardware: exit simulation - done
@@ -55,5 +55,25 @@ info 165 reboot: execute power node3 on
 info 165 node3/crm: status change startup => wait_for_quorum
 info 160 node3/lrm: status change startup => wait_for_agent_lock
 info 180 node1/crm: node 'node3': state changed from 'maintenance' => 'online'
+info 180 node1/crm: moving service 'ct:102' back to 'node3', node came back from maintenance.
+info 180 node1/crm: relocate service 'ct:102' to node 'node3'
+info 180 node1/crm: service 'ct:102': state changed from 'started' to 'relocate' (node = node1, target = node3)
+info 180 node1/crm: moving service 'vm:103' back to 'node3', node came back from maintenance.
+info 180 node1/crm: migrate service 'vm:103' to node 'node3' (running)
+info 180 node1/crm: service 'vm:103': state changed from 'started' to 'migrate' (node = node1, target = node3)
+info 181 node1/lrm: service ct:102 - start relocate to node 'node3'
+info 181 node1/lrm: stopping service ct:102 (relocate)
+info 181 node1/lrm: service status ct:102 stopped
+info 181 node1/lrm: service ct:102 - end relocate to node 'node3'
+info 181 node1/lrm: service vm:103 - start migrate to node 'node3'
+info 181 node1/lrm: service vm:103 - end migrate to node 'node3'
 info 184 node3/crm: status change wait_for_quorum => slave
+info 200 node1/crm: service 'ct:102': state changed from 'relocate' to 'started' (node = node3)
+info 200 node1/crm: service 'vm:103': state changed from 'migrate' to 'started' (node = node3)
+info 205 node3/lrm: got lock 'ha_agent_node3_lock'
+info 205 node3/lrm: status change wait_for_agent_lock => active
+info 205 node3/lrm: starting service ct:102
+info 205 node3/lrm: service status ct:102 started
+info 205 node3/lrm: starting service vm:103
+info 205 node3/lrm: service status vm:103 started
 info 720 hardware: exit simulation - done