1
0
mirror of https://github.com/OpenNebula/one.git synced 2025-03-29 18:50:08 +03:00

Improve rescue of VMs when a node failure is detected

* Introduce new LCM states PROLOG_MIGRATE_UNKNOWN and PROLOG_MIGRATE_UNKNOWN_FAILURE

* Change VM migrate logic for when state is ACTIVE and lcm_state is UNKNOWN to
  call TM's PROLOG_MIGR action before VMM's BOOT

All non-empty core TM drivers skip disks for this action, so there is no impact on
the default behaviour.

The datastore addon drivers that implement access to raw block devices should check
whether LCM_STATE == 60 (PROLOG_MIGRATE_UNKNOWN) and, if so, remove block device access
from the failed node and provide access to the current node.
A simple script function has been added to get LCM_STATE; it can be used as follows:

```bash
LCM_STATE=$(lcm_state)
if [ "$LCM_STATE" = "60" ]; then
    : # PROLOG_MIGRATE_UNKNOWN: move block device access to the current node here
fi
```

(cherry picked from commit 676f36e0aa4f6ca705f60b826fd52d69888d7bb9)
This commit is contained in:
Anton Todorov 2015-08-31 15:10:53 +03:00 committed by Ruben S. Montero
parent d2b66f0ec5
commit f9375eccab
16 changed files with 91 additions and 24 deletions

View File

@ -166,7 +166,9 @@ public:
DISK_SNAPSHOT_DELETE_SUSPENDED = 56,
DISK_SNAPSHOT = 57,
DISK_SNAPSHOT_REVERT = 58,
DISK_SNAPSHOT_DELETE = 59
DISK_SNAPSHOT_DELETE = 59,
PROLOG_MIGRATE_UNKNOWN = 60,
PROLOG_MIGRATE_UNKNOWN_FAILURE = 61
};
static int lcm_state_from_str(string& st, LcmState& state)
@ -231,6 +233,8 @@ public:
else if ( st == "DISK_SNAPSHOT") { state = DISK_SNAPSHOT; }
else if ( st == "DISK_SNAPSHOT_REVERT") { state = DISK_SNAPSHOT_REVERT; }
else if ( st == "DISK_SNAPSHOT_DELETE") { state = DISK_SNAPSHOT_DELETE; }
else if ( st == "PROLOG_MIGRATE_UNKNOWN") { state = PROLOG_MIGRATE_UNKNOWN; }
else if ( st == "PROLOG_MIGRATE_UNKNOWN_FAILURE") { state = PROLOG_MIGRATE_UNKNOWN_FAILURE; }
else {return -1;}
return 0;
@ -298,6 +302,8 @@ public:
case DISK_SNAPSHOT: st = "DISK_SNAPSHOT"; break;
case DISK_SNAPSHOT_REVERT: st = "DISK_SNAPSHOT_REVERT"; break;
case DISK_SNAPSHOT_DELETE: st = "DISK_SNAPSHOT_DELETE"; break;
case PROLOG_MIGRATE_UNKNOWN: st = "PROLOG_MIGRATE_UNKNOWN"; break;
case PROLOG_MIGRATE_UNKNOWN_FAILURE: st = "PROLOG_MIGRATE_UNKNOWN_FAILURE"; break;
}
return st;

View File

@ -71,6 +71,8 @@ digraph OpenNebula {
prolog_migrate_suspend;
prolog_migrate_suspend_failure;
prolog_undeploy;
prolog_migrate_unknown;
prolog_migrate_unknown_failure;
color="white"
}
subgraph {
@ -154,7 +156,8 @@ digraph OpenNebula {
prolog_migrate -> boot_migrate [style="dashed", color="blue"];
boot_migrate -> running [style="dashed", color="blue"];
unknown -> boot [label="migrate"];
unknown -> prolog_migrate_unknown [label="migrate"];
prolog_migrate_unknown -> boot [style="dashed", color="blue"];
poweroff -> prolog_migrate_poweroff [label="migrate"];
prolog_migrate_poweroff -> poweroff [style="dashed", color="blue"];
@ -304,6 +307,9 @@ digraph OpenNebula {
prolog_resume -> stopped [style="dotted", color="red"];
prolog_undeploy -> undeployed [style="dotted", color="red"];
prolog_migrate_unknown -> prolog_migrate_unknown_failure [label=" ", style="dotted", color="red"];
prolog_migrate_unknown_failure -> prolog_migrate_unknown [label="migrate"];
boot -> boot_failure [label=" ", style="dotted", color="red"];
boot_migrate -> boot_migrate_failure [label=" ", style="dotted", color="red"];
boot_poweroff -> poweroff [style="dotted", color="red"];

View File

@ -174,7 +174,9 @@
DISK_SNAPSHOT_DELETE_SUSPENDED = 56,
DISK_SNAPSHOT = 57,
DISK_SNAPSHOT_REVERT = 58,
DISK_SNAPSHOT_DELETE = 59
DISK_SNAPSHOT_DELETE = 59,
PROLOG_MIGRATE_UNKNOWN = 60,
PROLOG_MIGRATE_UNKNOWN_FAILURE = 61
-->
<xs:element name="LCM_STATE" type="xs:integer"/>
<xs:element name="PREV_STATE" type="xs:integer"/>

View File

@ -105,7 +105,9 @@
DISK_SNAPSHOT_DELETE_SUSPENDED = 56,
DISK_SNAPSHOT = 57,
DISK_SNAPSHOT_REVERT = 58,
DISK_SNAPSHOT_DELETE = 59
DISK_SNAPSHOT_DELETE = 59,
PROLOG_MIGRATE_UNKNOWN = 60,
PROLOG_MIGRATE_UNKNOWN_FAILURE = 61
-->
<xs:element name="LCM_STATE" type="xs:integer"/>
<xs:element name="PREV_STATE" type="xs:integer"/>

View File

@ -336,6 +336,7 @@ class OneVMHelper < OpenNebulaHelper::OneHelper
:PROLOG_MIGRATE_FAILURE => :migrate,
:PROLOG_MIGRATE_POWEROFF_FAILURE => :migrate,
:PROLOG_MIGRATE_SUSPEND_FAILURE => :migrate,
:PROLOG_MIGRATE_UNKNOWN_FAILURE => :migrate,
:PROLOG_FAILURE => :prolog,
:PROLOG_RESUME_FAILURE => :resume,
:PROLOG_UNDEPLOY_FAILURE => :resume,

View File

@ -484,6 +484,7 @@ module OpenNebula
lcm_state_str == 'EPILOG_UNDEPLOY_FAILURE' ||
lcm_state_str == 'PROLOG_MIGRATE_POWEROFF_FAILURE' ||
lcm_state_str == 'PROLOG_MIGRATE_SUSPEND_FAILURE' ||
lcm_state_str == 'PROLOG_MIGRATE_UNKNOWN_FAILURE' ||
lcm_state_str == 'BOOT_UNDEPLOY_FAILURE' ||
lcm_state_str == 'BOOT_STOPPED_FAILURE' ||
lcm_state_str == 'PROLOG_RESUME_FAILURE' ||

View File

@ -296,12 +296,12 @@ void LifeCycleManager::migrate_action(int vid)
vm->get_lcm_state() == VirtualMachine::UNKNOWN)
{
//----------------------------------------------------
// Bypass SAVE_MIGRATE & PROLOG_MIGRATE goto BOOT
// Bypass SAVE_MIGRATE goto PROLOG_MIGRATE_UNKNOWN
//----------------------------------------------------
vm->set_resched(false);
vm->set_state(VirtualMachine::BOOT);
vm->set_state(VirtualMachine::PROLOG_MIGRATE_UNKNOWN);
vm->delete_snapshots();
@ -311,17 +311,7 @@ void LifeCycleManager::migrate_action(int vid)
vm->set_stime(the_time);
vm->set_previous_action(History::MIGRATE_ACTION);
vm->set_previous_etime(the_time);
vm->set_previous_vm_info();
vm->set_previous_running_etime(the_time);
vm->set_previous_reason(History::USER);
vmpool->update_previous_history(vm);
vm->set_prolog_stime(the_time);
vmpool->update_history(vm);
@ -334,7 +324,7 @@ void LifeCycleManager::migrate_action(int vid)
//----------------------------------------------------
vmm->trigger(VirtualMachineManager::DEPLOY, vid);
tm->trigger(TransferManager::PROLOG_MIGR,vid);
}
else
{
@ -1072,6 +1062,8 @@ void LifeCycleManager::clean_up_vm(VirtualMachine * vm, bool dispose, int& imag
case VirtualMachine::PROLOG_MIGRATE_POWEROFF_FAILURE:
case VirtualMachine::PROLOG_MIGRATE_SUSPEND:
case VirtualMachine::PROLOG_MIGRATE_SUSPEND_FAILURE:
case VirtualMachine::PROLOG_MIGRATE_UNKNOWN:
case VirtualMachine::PROLOG_MIGRATE_UNKNOWN_FAILURE:
vm->set_prolog_etime(the_time);
vmpool->update_history(vm);
@ -1388,6 +1380,14 @@ void LifeCycleManager::retry(VirtualMachine * vm)
tm->trigger(TransferManager::PROLOG_MIGR, vid);
break;
case VirtualMachine::PROLOG_MIGRATE_UNKNOWN_FAILURE:
vm->set_state(VirtualMachine::PROLOG_MIGRATE_UNKNOWN);
vmpool->update(vm);
tm->trigger(TransferManager::PROLOG_MIGR, vid);
break;
case VirtualMachine::PROLOG_RESUME_FAILURE:
vm->set_state(VirtualMachine::PROLOG_RESUME);
@ -1479,6 +1479,7 @@ void LifeCycleManager::retry(VirtualMachine * vm)
case VirtualMachine::PROLOG_MIGRATE:
case VirtualMachine::PROLOG_MIGRATE_POWEROFF:
case VirtualMachine::PROLOG_MIGRATE_SUSPEND:
case VirtualMachine::PROLOG_MIGRATE_UNKNOWN:
tm->trigger(TransferManager::PROLOG_MIGR,vid);
break;

View File

@ -639,6 +639,8 @@ void LifeCycleManager::prolog_success_action(int vid)
case VirtualMachine::PROLOG_MIGRATE_FAILURE: //recover success
case VirtualMachine::PROLOG:
case VirtualMachine::PROLOG_FAILURE: //recover success
case VirtualMachine::PROLOG_MIGRATE_UNKNOWN:
case VirtualMachine::PROLOG_MIGRATE_UNKNOWN_FAILURE: //recover success
switch (lcm_state)
{
case VirtualMachine::PROLOG_RESUME:
@ -665,6 +667,12 @@ void LifeCycleManager::prolog_success_action(int vid)
vm->set_state(VirtualMachine::BOOT);
break;
case VirtualMachine::PROLOG_MIGRATE_UNKNOWN:
case VirtualMachine::PROLOG_MIGRATE_UNKNOWN_FAILURE: //recover success
action = VirtualMachineManager::DEPLOY;
vm->set_state(VirtualMachine::BOOT);
break;
default:
return;
}

View File

@ -139,7 +139,9 @@ public class VirtualMachine extends PoolElement{
"DISK_SNAPSHOT_DELETE_SUSPENDED",
"DISK_SNAPSHOT",
"DISK_SNAPSHOT_REVERT",
"DISK_SNAPSHOT_DELETE"
"DISK_SNAPSHOT_DELETE",
"PROLOG_MIGRATE_UNKNOWN",
"PROLOG_MIGRATE_UNKNOWN_FAILURE"
};
private static final String[] SHORT_LCM_STATES =
@ -203,7 +205,9 @@ public class VirtualMachine extends PoolElement{
"snap", // DISK_SNAPSHOT_DELETE_SUSPENDED
"snap", // DISK_SNAPSHOT
"snap", // DISK_SNAPSHOT_REVERT
"snap" // DISK_SNAPSHOT_DELETE
"snap", // DISK_SNAPSHOT_DELETE
"migr", // PROLOG_MIGRATE_UNKNOWN
"fail" // PROLOG_MIGRATE_UNKNOWN_FAILURE
};
/**

View File

@ -113,6 +113,8 @@ module OpenNebula
DISK_SNAPSHOT
DISK_SNAPSHOT_REVERT
DISK_SNAPSHOT_DELETE
PROLOG_MIGRATE_UNKNOWN
PROLOG_MIGRATE_UNKNOWN_FAILURE
}
SHORT_VM_STATES={
@ -187,7 +189,9 @@ module OpenNebula
"DISK_SNAPSHOT_DELETE_SUSPENDED"=> "snap",
"DISK_SNAPSHOT" => "snap",
"DISK_SNAPSHOT_REVERT" => "snap",
"DISK_SNAPSHOT_DELETE" => "snap"
"DISK_SNAPSHOT_DELETE" => "snap",
"PROLOG_MIGRATE_UNKNOWN" => "migr",
"PROLOG_MIGRATE_UNKNOWN_FAILURE" => "fail"
}
MIGRATE_REASON=%w{NONE ERROR USER}

View File

@ -91,7 +91,9 @@ VNC_STATES = [
#56, #DISK_SNAPSHOT_DELETE_SUSPENDED
#57, #DISK_SNAPSHOT
#58, #DISK_SNAPSHOT_REVERT
#59 #DISK_SNAPSHOT_DELETE
#59, #DISK_SNAPSHOT_DELETE
#60, #PROLOG_MIGRATE_UNKNOWN
#61 #PROLOG_MIGRATE_UNKNOWN_FAILURE
]
class OpenNebulaVNC

View File

@ -109,6 +109,8 @@ define(function(require) {
"DISK_SNAPSHOT",
"DISK_SNAPSHOT_REVERT",
"DISK_SNAPSHOT_DELETE",
"PROLOG_MIGRATE_UNKNOWN",
"PROLOG_MIGRATE_UNKNOWN_FAILURE",
];
var LCM_STATES = {
@ -171,7 +173,9 @@ define(function(require) {
DISK_SNAPSHOT_DELETE_SUSPENDED : 56,
DISK_SNAPSHOT : 57,
DISK_SNAPSHOT_REVERT : 58,
DISK_SNAPSHOT_DELETE : 59
DISK_SNAPSHOT_DELETE : 59,
PROLOG_MIGRATE_UNKNOWN : 60,
PROLOG_MIGRATE_UNKNOWN_FAILURE : 61
};
var SHORT_LCM_STATES_STR = [
@ -235,6 +239,8 @@ define(function(require) {
Locale.tr("SNAPSHOT"), // DISK_SNAPSHOT
Locale.tr("SNAPSHOT"), // DISK_SNAPSHOT_REVERT
Locale.tr("SNAPSHOT"), // DISK_SNAPSHOT_DELETE
Locale.tr("MIGRATE"), // PROLOG_MIGRATE_UNKNOWN
Locale.tr("FAILURE"), // PROLOG_MIGRATE_UNKNOWN_FAILURE
];
var VNC_STATES = [
@ -543,6 +549,7 @@ define(function(require) {
case LCM_STATES.BOOT_STOPPED_FAILURE:
case LCM_STATES.PROLOG_RESUME_FAILURE:
case LCM_STATES.PROLOG_UNDEPLOY_FAILURE:
case LCM_STATES.PROLOG_MIGRATE_UNKNOWN_FAILURE:
return true;
default:

View File

@ -1003,6 +1003,7 @@ define(function(require) {
case OpenNebulaVM.LCM_STATES.PROLOG_MIGRATE:
case OpenNebulaVM.LCM_STATES.PROLOG_MIGRATE_POWEROFF:
case OpenNebulaVM.LCM_STATES.PROLOG_MIGRATE_SUSPEND:
case OpenNebulaVM.LCM_STATES.PROLOG_MIGRATE_UNKNOWN:
state_color = 'running';
state_str = Locale.tr("RUNNING");
break;
@ -1030,6 +1031,7 @@ define(function(require) {
case OpenNebulaVM.LCM_STATES.BOOT_STOPPED_FAILURE:
case OpenNebulaVM.LCM_STATES.PROLOG_RESUME_FAILURE:
case OpenNebulaVM.LCM_STATES.PROLOG_UNDEPLOY_FAILURE:
case OpenNebulaVM.LCM_STATES.PROLOG_MIGRATE_UNKNOWN_FAILURE:
state_color = 'error';
state_str = Locale.tr("ERROR");
break;

View File

@ -113,6 +113,8 @@ define(function(require) {
LCM_STATE_ACTIONS[ OpenNebulaVM.LCM_STATES.DISK_SNAPSHOT ] = [];
LCM_STATE_ACTIONS[ OpenNebulaVM.LCM_STATES.DISK_SNAPSHOT_REVERT ] = [];
LCM_STATE_ACTIONS[ OpenNebulaVM.LCM_STATES.DISK_SNAPSHOT_DELETE ] = [];
LCM_STATE_ACTIONS[ OpenNebulaVM.LCM_STATES.PROLOG_MIGRATE_UNKNOWN ] = [];
LCM_STATE_ACTIONS[ OpenNebulaVM.LCM_STATES.PROLOG_MIGRATE_UNKNOWN_FAILURE ] = [];
return {
'disableAllStateActions': disableAllStateActions,

View File

@ -123,6 +123,7 @@ void TransferManagerDriver::protocol(const string& message) const
case VirtualMachine::PROLOG_UNDEPLOY:
case VirtualMachine::PROLOG_MIGRATE_POWEROFF:
case VirtualMachine::PROLOG_MIGRATE_SUSPEND:
case VirtualMachine::PROLOG_MIGRATE_UNKNOWN:
lcm_action = LifeCycleManager::PROLOG_SUCCESS;
break;
@ -188,6 +189,7 @@ void TransferManagerDriver::protocol(const string& message) const
case VirtualMachine::PROLOG_UNDEPLOY:
case VirtualMachine::PROLOG_MIGRATE_POWEROFF:
case VirtualMachine::PROLOG_MIGRATE_SUSPEND:
case VirtualMachine::PROLOG_MIGRATE_UNKNOWN:
lcm_action = LifeCycleManager::PROLOG_FAILURE;
break;

View File

@ -94,4 +94,21 @@ function disk_type
DISK_TYPE="${XPATH_ELEMENTS[0]}"
echo $DISK_TYPE
}
}
#-------------------------------------------------------------------------------
# Print the LCM_STATE of the VM identified by $VMID.
#
# Fetches the VM description as XML with `onevm show -x` and pipes it through
# the datastore xpath.rb helper to extract /VM/LCM_STATE.
#
# Reads  : VMID, ONE_LOCAL_VAR
# Sets   : XPATH, XPATH_ELEMENTS, LCM_STATE (same globals as before)
# Outputs: the first extracted element on stdout
#
# Usage  : LCM_STATE=$(lcm_state)
#-------------------------------------------------------------------------------
function lcm_state
{
    # Holds "command + flag"; it is intentionally expanded unquoted below so
    # that it word-splits into the executable and its --stdin argument.
    XPATH="${ONE_LOCAL_VAR}/remotes/datastore/xpath.rb --stdin"

    # Start from a clean index/array so stale elements never leak in.
    unset i XPATH_ELEMENTS

    # The helper output is consumed as NUL-delimited records (read -d '').
    while IFS= read -r -d '' element; do
        XPATH_ELEMENTS[i++]="$element"
    done < <(onevm show -x "$VMID" | $XPATH \
                /VM/LCM_STATE )

    LCM_STATE="${XPATH_ELEMENTS[0]}"

    # Quoted so the value is emitted verbatim (no word splitting or globbing).
    echo "$LCM_STATE"
}