2011-07-11 18:42:22 +02:00
#!/usr/bin/env ruby
2011-01-17 15:27:10 +01:00
# -------------------------------------------------------------------------- #
2020-04-30 15:00:02 +02:00
# Copyright 2002-2020, OpenNebula Project, OpenNebula Systems #
2011-01-17 15:27:10 +01:00
# #
# Licensed under the Apache License, Version 2.0 (the "License"); you may #
# not use this file except in compliance with the License. You may obtain #
# a copy of the License at #
# #
# http://www.apache.org/licenses/LICENSE-2.0 #
# #
# Unless required by applicable law or agreed to in writing, software #
# distributed under the License is distributed on an "AS IS" BASIS, #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and #
# limitations under the License. #
#--------------------------------------------------------------------------- #
2014-08-24 12:05:01 +02:00
##############################################################################
2011-01-17 15:27:10 +01:00
# Script to implement host failure tolerance
2016-09-07 12:14:41 +02:00
# One of the following modes must be chosen
# -m resched VMs to another host. (Only for images in shared storage!)
2014-08-24 12:05:01 +02:00
# -r recreate VMs running in the host. State will be lost.
2011-07-11 18:42:22 +02:00
# -d delete VMs running in the host
2016-09-07 12:14:41 +02:00
#
2012-10-18 19:18:43 +02:00
# Additional flags
2016-09-07 12:14:41 +02:00
# -f resubmit suspended and powered off VMs (only for recreate)
# -p <n> avoid resubmission if host comes back after n monitoring
# cycles. 0 to disable it. Default is 2.
# -u disables fencing. Fencing is enabled by default. Don't disable it
# unless you are very sure about what you're doing
2014-08-24 12:05:01 +02:00
##############################################################################
2019-08-18 20:33:09 +02:00
# Installation layout. ONE_LOCATION is read from the environment: when it is
# not set the fixed system paths are used, otherwise every path is derived
# from it.
ONE_LOCATION = ENV['ONE_LOCATION']

RUBY_LIB_LOCATION = ONE_LOCATION ? ONE_LOCATION + '/lib/ruby' : '/usr/lib/one/ruby'
GEMS_LOCATION     = ONE_LOCATION ? ONE_LOCATION + '/share/gems' : '/usr/share/one/gems'
VMDIR             = ONE_LOCATION ? ONE_LOCATION + '/var' : '/var/lib/one'
CONFIG_FILE       = ONE_LOCATION ? ONE_LOCATION + '/var/config' : '/var/lib/one/config'
LOG_FILE          = ONE_LOCATION ? ONE_LOCATION + '/var/host_error.log' : '/var/log/one/host_error.log'
2016-09-07 12:14:41 +02:00
# Fencing script, expected in the same directory as this hook.
FENCE_HOST = File.dirname(__FILE__) + '/fence_host.sh'

# When the gems directory exists, remove the vendor_ruby entries from the
# load path and point RubyGems at that directory.
if File.directory?(GEMS_LOCATION)
    # The pattern must not be padded with blanks: a load path entry never
    # contains " vendor_ruby " with surrounding spaces, so a padded pattern
    # would make this filter a no-op.
    $LOAD_PATH.reject! {|l| l =~ /vendor_ruby/ }
    require 'rubygems'
    Gem.use_paths(File.realpath(GEMS_LOCATION))
end

# Make the OpenNebula Ruby libraries loadable.
$LOAD_PATH << RUBY_LIB_LOCATION
2011-01-17 15:27:10 +01:00
2013-05-29 10:42:15 +02:00
require 'opennebula'
2012-12-07 12:12:23 +01:00
include OpenNebula
2012-10-19 13:17:27 +02:00
require 'getoptlong'
2016-09-07 12:14:41 +02:00
require 'base64'
require 'open3'
################################################################################
# Arguments
################################################################################
2011-01-17 15:27:10 +01:00
2020-09-23 11:57:25 +02:00
# The hook arguments arrive on standard input as a blank-separated list;
# they are copied into ARGV so the option parser below can read them.
ARGV.replace(STDIN.read.split(' '))

# The first argument is the Base64-encoded host template (XML); the host ID
# is taken from its HOST/ID element.
host_doc = Nokogiri::XML(Base64.decode64(ARGV[0]))
HOST_ID  = host_doc.xpath('HOST/ID').text
2011-01-17 15:27:10 +01:00
2016-09-07 12:14:41 +02:00
################################################################################
# Methods
################################################################################
# Appends a message to LOG_FILE, writing one entry per line of the message.
# Every entry carries the current time, the host ID and the severity tag.
#
# @param msg   [String] text to log; may span several lines
# @param level [String] severity tag written in each entry
def log(msg, level = " I ")
    File.open(LOG_FILE, 'a') do |logfile|
        msg.each_line do |line|
            logfile.puts " [ #{Time.now} ][HOST #{HOST_ID} ][ #{level} ] #{line} "
        end
    end
end
# Logs a message through #log using the error severity tag.
#
# @param msg [String] text to log
def log_error(msg)
    error_tag = " E "
    log(msg, error_tag)
end
# Logs the reason for the failure (when one is given) followed by the generic
# exit notice, then terminates the hook with a failure status.
#
# The message parameter is optional so that both call styles used in this
# script work: `exit_error` and `exit_error "reason"`. Without the parameter
# the second form raises ArgumentError instead of exiting cleanly.
#
# @param msg [String, nil] optional description of the error
def exit_error(msg = nil)
    log_error(msg) unless msg.nil?
    log_error(" Exiting due to previous error. ")
    exit(-1)
end
# Builds an XPath condition that matches any of the given STATE values.
#
# @param arr [Array] state codes
# @return [String] one "STATE=<code>" clause per code, joined with "or"
def states_xpath(*arr)
    clauses = arr.map do |state_code|
        " STATE= #{state_code} "
    end

    clauses.join(" or ")
end
################################################################################
# Options
################################################################################
mode    = nil   # **must** be set to something other than nil using the options
force   = false # By default, don't recreate/delete suspended and poweroff VMs
repeat  = 2     # By default, wait for 2 monitoring cycles
fencing = true  # Fencing stays enabled unless --no-fencing is given

opts = GetoptLong.new(
    ['--migrate',    '-m', GetoptLong::NO_ARGUMENT],
    ['--delete',     '-d', GetoptLong::NO_ARGUMENT],
    ['--recreate',   '-r', GetoptLong::NO_ARGUMENT],
    ['--force',      '-f', GetoptLong::NO_ARGUMENT],
    ['--pause',      '-p', GetoptLong::REQUIRED_ARGUMENT],
    ['--no-fencing', '-u', GetoptLong::NO_ARGUMENT]
)

# Any parsing failure (unknown flag, missing or non-numeric --pause value) is
# logged and aborts the hook.
begin
    opts.each do |opt, arg|
        case opt
        when '--migrate'
            mode = :migrate
        when '--delete'
            mode = :delete
        when '--recreate'
            mode = :recreate
        when '--force'
            force = true
        when '--pause'
            # Integer() raises on non-numeric input. String#to_i would turn
            # it into 0, which silently disables the wait before acting on
            # the VMs. Base 10 keeps values such as "010" decimal.
            repeat = Integer(arg, 10)
        when '--no-fencing'
            fencing = false
        end
    end
rescue StandardError => e
    log_error e.to_s
    exit_error
end

# One of --migrate, --delete or --recreate is mandatory
if mode.nil?
    log_error " Exiting. A mode must be supplied. "
    exit_error
end
2011-01-17 18:26:36 +01:00
2016-09-07 12:14:41 +02:00
################################################################################
# Main
################################################################################
log " Hook launched "

# Create the OpenNebula client; a failure here is logged and ends the hook.
begin
    client = Client.new
rescue StandardError => e
    log_error e.to_s
    exit_error
end

# Read the host monitoring interval from the OpenNebula configuration
sys  = OpenNebula::System.new(client)
conf = sys.get_configuration

# Report the real cause when the configuration could not be retrieved,
# instead of failing later on the attribute lookup.
if OpenNebula.is_error?(conf)
    log_error " Could not get MONITORING_INTERVAL_HOST "
    log_error conf.message
    exit_error
end

begin
    # Falls back to 60 when the attribute is not present
    MONITORING_INTERVAL = conf['MONITORING_INTERVAL_HOST'] || 60
rescue StandardError => e
    log_error " Could not get MONITORING_INTERVAL_HOST "
    log_error e.to_s
    exit_error
end
# Retrieve hostname
host = OpenNebula::Host.new_with_id(HOST_ID, client)

if OpenNebula.is_error?(host.info)
    log_error " Could not get host info "
    exit_error
end

log " hostname: #{host.name} "

if repeat > 0
    log " Wait #{repeat} cycles. "

    # Sleep through the desired number of monitor intervals
    period = repeat * MONITORING_INTERVAL.to_i
    log " Sleeping #{period} seconds. "
    sleep(period)

    # Refresh the host information after the wait
    if OpenNebula.is_error?(host.info)
        log_error " Could not get host info "
        exit_error
    end

    # If the host came back, exit! avoid duplicated VMs.
    # Only host states 3 and 5 let the hook continue.
    unless [3, 5].include?(host.state)
        log " Exiting. Host came back after waiting. "
        exit 0
    end
end
# Do fencing
if fencing
    host64 = Base64.strict_encode64(host.to_xml)
    log " Fencing enabled "

    begin
        # The fence script receives the Base64-encoded host XML on its
        # standard input; its stdout and stderr are captured together.
        oe, w = Open3.capture2e(FENCE_HOST, :stdin_data => host64)

        # A non-zero exit status aborts the hook; the script output is part
        # of the logged error.
        raise(oe << " \n " << " Fencing error ") unless w.success?

        log oe
        log " Fencing success "
    rescue StandardError => e
        log_error e.to_s
        exit_error
    end
else
    log " WARNING: Fencing disabled "
end
2011-01-17 15:27:10 +01:00
# Loop through all vms
vms = VirtualMachinePool.new(client)
rc  = vms.info_all

if OpenNebula.is_error?(rc)
    # Log the reason first: exit_error only adds the generic exit notice.
    log_error " Could not get vm pool "
    exit_error
end

# STATE=3: ACTIVE (LCM unknown)
# STATE=5: SUSPENDED
# STATE=8: POWEROFF

# Recreate without --force only touches VMs in state 3; every other
# combination also includes suspended and powered off VMs.
if mode == :recreate && !force
    log " states: 3 "
    state = states_xpath(3)
else
    log " states: 3, 5, 8 "
    state = states_xpath(3, 5, 8)
end

# Select the VMs in the wanted states whose last history record is on the
# failed host. "position()=last()" is required: a bare "and last()" is a
# number converted to a boolean and is therefore always true. The hostname
# literal must not be padded with blanks or it never matches.
xpath = "/VM_POOL/VM[#{state}]/HISTORY_RECORDS/" \
        "HISTORY[HOSTNAME=\"#{host.name}\" and position()=last()]"
vm_ids_array = vms.retrieve_elements("#{xpath}/../../ID")

if vm_ids_array
    log " vms: #{vm_ids_array} "

    vm_ids_array.each do |vm_id|
        vm = OpenNebula::VirtualMachine.new_with_id(vm_id, client)
        rc = vm.info

        if OpenNebula.is_error?(rc)
            log_error " Could not get info of VM #{vm_id} "
            next
        end

        rc = case mode
             when :recreate
                 log " recreate #{vm_id} "
                 vm.delete(true)
             when :delete
                 log " delete #{vm_id} "
                 vm.delete
             when :migrate
                 log " resched #{vm_id} "
                 vm.resched
             else
                 log_error " unknown mode ' #{mode} ' "
                 exit_error
             end

        # Report a failed action instead of dropping it silently; the
        # remaining VMs are still processed.
        if OpenNebula.is_error?(rc)
            log_error "Action failed for VM #{vm_id}: #{rc.message}"
        end
    end
else
    log " No VMs found. "
end

log " Hook finished "
exit 0