2011-07-11 20:42:22 +04:00
#!/usr/bin/env ruby
2011-01-17 17:27:10 +03:00
# -------------------------------------------------------------------------- #
2017-05-25 17:07:35 +03:00
# Copyright 2002-2017, OpenNebula Project, OpenNebula Systems #
2011-01-17 17:27:10 +03:00
# #
# Licensed under the Apache License, Version 2.0 (the "License"); you may #
# not use this file except in compliance with the License. You may obtain #
# a copy of the License at #
# #
# http://www.apache.org/licenses/LICENSE-2.0 #
# #
# Unless required by applicable law or agreed to in writing, software #
# distributed under the License is distributed on an "AS IS" BASIS, #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and #
# limitations under the License. #
#--------------------------------------------------------------------------- #
2014-08-24 14:05:01 +04:00
##############################################################################
2011-01-17 17:27:10 +03:00
# Script to implement host failure tolerance
2016-09-07 13:14:41 +03:00
# One of the following modes must be chosen
# -m resched VMs to another host. (Only for images in shared storage!)
2014-08-24 14:05:01 +04:00
# -r recreate VMs running in the host. State will be lost.
2011-07-11 20:42:22 +04:00
# -d delete VMs running in the host
2016-09-07 13:14:41 +03:00
#
2012-10-18 21:18:43 +04:00
# Additional flags
2016-09-07 13:14:41 +03:00
# -f resubmit suspended and powered off VMs (only for recreate)
# -p <n> avoid resubmission if host comes back after n monitoring
# cycles. 0 to disable it. Default is 2.
# -u disables fencing. Fencing is enabled by default. Don't disable it
# unless you are very sure about what you're doing
2014-08-24 14:05:01 +04:00
##############################################################################
2011-01-17 17:27:10 +03:00
# Installation layout: ONE_LOCATION is read from the environment and selects
# which set of paths the script uses. RUBY_LIB_LOCATION is later appended to
# the load path and LOG_FILE is the file the logging helper appends to; VMDIR
# and CONFIG_FILE are not referenced in the visible part of this file.
ONE_LOCATION = ENV [ " ONE_LOCATION " ]
# ONE_LOCATION unset: use the fixed, absolute paths.
if ! ONE_LOCATION
RUBY_LIB_LOCATION = " /usr/lib/one/ruby "
VMDIR = " /var/lib/one "
2012-10-18 21:18:43 +04:00
CONFIG_FILE = " /var/lib/one/config "
2016-09-07 13:14:41 +03:00
LOG_FILE = " /var/log/one/host_error.log "
2011-01-17 17:27:10 +03:00
# ONE_LOCATION set: derive every path from it.
else
RUBY_LIB_LOCATION = ONE_LOCATION + " /lib/ruby "
VMDIR = ONE_LOCATION + " /var "
2012-10-18 21:18:43 +04:00
CONFIG_FILE = ONE_LOCATION + " /var/config "
2016-09-07 13:14:41 +03:00
LOG_FILE = ONE_LOCATION + " /var/host_error.log "
2011-01-17 17:27:10 +03:00
end
2016-09-07 13:14:41 +03:00
# Path of the fencing script, derived from the directory of this file; it is
# the command executed further down when fencing is enabled.
FENCE_HOST = File . dirname ( __FILE__ ) + '/fence_host.sh'
2011-01-17 17:27:10 +03:00
$: << RUBY_LIB_LOCATION
2013-05-29 12:42:15 +04:00
require 'opennebula'
2012-12-07 15:12:23 +04:00
include OpenNebula
2012-10-19 15:17:27 +04:00
require 'getoptlong'
2016-09-07 13:14:41 +03:00
require 'base64'
require 'open3'
################################################################################
# Arguments
################################################################################
2011-01-17 17:27:10 +03:00
2016-09-07 13:14:41 +03:00
# The host ID is the first raw command-line argument; it is read before any
# option parsing takes place.
HOST_ID = ARGV[0]

# Nothing can be done without a host ID. The logging helpers are only defined
# further down, so the problem is reported on stderr instead of exiting
# without any trace.
if HOST_ID.nil?
  warn "#{$PROGRAM_NAME}: missing host ID (expected as first argument)"
  exit(-1)
end
2016-09-07 13:14:41 +03:00
################################################################################
# Methods
################################################################################
# Appends a message to LOG_FILE, writing one entry per line of +msg+.
#
# Each entry carries the current time, HOST_ID and the level tag. The file is
# opened in append mode and closed again when the block returns.
#
# @param msg   [#to_s]  text to record; may span several lines
# @param level [String] tag written between brackets (defaults to " I ")
def log(msg, level = " I ")
  File.open(LOG_FILE, 'a') do |f|
    # each_line replaces the deprecated block form of String#lines; to_s keeps
    # the logger from raising when it is handed nil or a non-String object.
    msg.to_s.each_line do |l|
      f.puts " [ #{Time.now} ][HOST #{HOST_ID} ][ #{level} ] #{l} "
    end
  end
end
# Records +msg+ through the logging helper, tagged with the error level.
#
# @param msg [String] text to record
def log_error(msg)
  log msg, " E "
end
# Writes a closing error entry to the log and terminates the script with
# exit status -1. Takes no arguments; callers log their own reason first.
def exit_error
  log_error " Exiting due to previous error. "
  Kernel.exit(-1)
end
# Builds the XPath predicate fragment that matches any of the given states.
#
# @param arr [Array] state codes to match
# @return [String] one STATE equality test per code, joined with "or";
#   empty when no code is given
def states_xpath(*arr)
  tests = arr.map { |code| " STATE= #{code} " }
  tests.join(" or ")
end
################################################################################
# Options
################################################################################
mode    = nil   # **must** be set to something other than nil using the options
force   = false # By default, don't recreate/delete suspended and poweroff VMs
repeat  = 2     # By default, wait for 2 monitoring cycles
fencing = true  # Fencing stays on unless --no-fencing is given

opts = GetoptLong.new(
  ['--migrate',    '-m', GetoptLong::NO_ARGUMENT],
  ['--delete',     '-d', GetoptLong::NO_ARGUMENT],
  ['--recreate',   '-r', GetoptLong::NO_ARGUMENT],
  ['--force',      '-f', GetoptLong::NO_ARGUMENT],
  ['--pause',      '-p', GetoptLong::REQUIRED_ARGUMENT],
  ['--no-fencing', '-u', GetoptLong::NO_ARGUMENT]
)

# Any parsing problem (unknown option, missing or malformed argument) is
# logged and ends the script.
begin
  opts.each do |opt, arg|
    case opt
    when '--migrate'
      mode = :migrate
    when '--delete'
      mode = :delete
    when '--recreate'
      mode = :recreate
    when '--force'
      force = true
    when '--pause'
      # Integer() rejects non-numeric input instead of silently turning it
      # into 0, which would skip the waiting period altogether.
      repeat = Integer(arg, 10)
    when '--no-fencing'
      fencing = false
    end
  end
rescue StandardError => e
  log_error e.to_s
  exit_error
end

# The mode options are the only way to give mode a value; refuse to run
# without one.
if mode.nil?
  log_error " Exiting. A mode must be supplied. "
  exit_error
end
2011-01-17 20:26:36 +03:00
2016-09-07 13:14:41 +03:00
################################################################################
# Main
################################################################################
log " Hook launched "

# Any error raised while building the client is logged before exiting.
begin
  client = Client.new
rescue StandardError => e
  log_error e.to_s
  exit_error
end

sys  = OpenNebula::System.new(client)
conf = sys.get_configuration

# Report a failed configuration request with its own message; previously it
# only surfaced as an unrelated lookup error in the block below.
if OpenNebula.is_error?(conf)
  log_error " Could not get MONITORING_INTERVAL "
  log_error conf.message.to_s
  exit_error
end

begin
  # Falls back to 60 when the configuration has no MONITORING_INTERVAL entry.
  MONITORING_INTERVAL = conf['MONITORING_INTERVAL'] || 60
rescue StandardError => e
  log_error " Could not get MONITORING_INTERVAL "
  log_error e.to_s
  exit_error
end
# Retrieve hostname
2016-09-07 13:14:41 +03:00
# Load the record of the host identified by HOST_ID; the script stops if the
# record cannot be retrieved.
host = OpenNebula :: Host . new_with_id ( HOST_ID , client )
rc = host . info
if OpenNebula . is_error? ( rc )
log_error " Could not get host info "
exit_error
end
log " hostname: #{ host . name } "
# Optional grace period, skipped when repeat is not positive: wait, then look
# at the host again before doing anything to its VMs.
if repeat > 0
log " Wait #{ repeat } cycles. "
2012-10-18 21:18:43 +04:00
# Sleep for the requested number of monitoring intervals
2016-09-07 13:14:41 +03:00
period = repeat * MONITORING_INTERVAL . to_i
log " Sleeping #{ period } seconds. "
sleep ( period )
# Refresh the host record so the state tested below is the current one.
rc = host . info
if OpenNebula . is_error? ( rc )
log_error " Could not get host info "
exit_error
end
2012-10-18 21:18:43 +04:00
# If the host came back, exit with success to avoid duplicated VMs.
# NOTE(review): 3 and 5 are presumably the ERROR and MONITORING_ERROR host
# states -- confirm against the OpenNebula host state codes.
2016-09-07 13:18:31 +03:00
if host . state != 3 && host . state != 5
2016-09-07 13:14:41 +03:00
log " Exiting. Host came back after waiting. "
exit 0
end
end
# Do fencing
# The host record is handed to the fencing script as Base64-encoded XML in
# its single argument. A failed fencing run stops the script before any VM
# is touched.
if fencing
  host64 = Base64.strict_encode64(host.to_xml)

  log " Fencing enabled "

  begin
    # capture2e drains the combined stdout/stderr while the script runs and
    # closes the pipes afterwards. Waiting for the exit status before reading
    # the output can block forever once the pipe buffer fills up.
    output, status = Open3.capture2e(FENCE_HOST, host64)

    if status.success?
      log output
      log " Fencing success "
    else
      raise(output << " \n " << " Fencing error ")
    end
  rescue StandardError => e
    log_error e.to_s
    exit_error
  end
else
  log " WARNING: Fencing disabled "
end
2011-01-17 17:27:10 +03:00
# Loop through all vms
2011-07-11 20:42:22 +04:00
# Fetch the VM pool; the VMs of the failed host are selected from it below.
vms = VirtualMachinePool.new(client)
rc  = vms.info_all

if OpenNebula.is_error?(rc)
  # Log the reason first: exit_error itself takes no message argument.
  log_error " Could not get vm pool "
  exit_error
end
2011-01-17 17:27:10 +03:00
2016-09-07 13:14:41 +03:00
# STATE=3: ACTIVE (LCM unknown)
# STATE=5: SUSPENDED
# STATE=8: POWEROFF
2011-01-17 17:27:10 +03:00
2016-09-07 13:14:41 +03:00
# Choose which VM states are affected: a recreate without --force is limited
# to state 3, every other combination also covers states 5 and 8.
state =
  if force || mode != :recreate
    log " states: 3, 5, 8 "
    states_xpath(3, 5, 8)
  else
    log " states: 3 "
    states_xpath(3)
  end
2011-01-17 17:27:10 +03:00
2016-09-07 13:14:41 +03:00
# Select the IDs of the VMs in the chosen states whose history names this host.
# NOTE(review): in XPath 1.0 a non-zero number combined with "and" is simply
# true, so "and last()" does not restrict the match to the most recent HISTORY
# entry ("position()=last()" would) -- confirm which one is intended.
xpath = " /VM_POOL/VM[ #{state} ]/HISTORY_RECORDS/HISTORY[HOSTNAME= \" #{host.name} \" and last()] "
vm_ids_array = vms.retrieve_elements(" #{xpath} /../../ID ")

if vm_ids_array
  log " vms: #{vm_ids_array} "

  vm_ids_array.each do |vm_id|
    vm = OpenNebula::VirtualMachine.new_with_id(vm_id, client)
    rc = vm.info

    # A VM that cannot be loaded is skipped; the remaining ones are still
    # processed.
    if OpenNebula.is_error?(rc)
      log_error " Could not get info of VM #{vm_id} "
      next
    end

    rc = case mode
         when :recreate
           log " recreate #{vm_id} "
           vm.delete(true)
         when :delete
           log " delete #{vm_id} "
           vm.delete
         when :migrate
           log " resched #{vm_id} "
           vm.resched
         else
           log_error " unknown mode ' #{mode} ' "
           exit_error
         end

    # Record a failed action instead of dropping its result; the loop still
    # carries on with the next VM.
    if OpenNebula.is_error?(rc)
      log_error "Could not #{mode} VM #{vm_id}: #{rc.message}"
    end
  end
else
  log " No VMs found. "
end

log " Hook finished "
exit 0