mirror of
https://github.com/OpenNebula/one.git
synced 2024-12-23 17:33:56 +03:00
Feature #4659: Improve fault-tolerance hook
This commit is contained in:
parent
3d3d2c416c
commit
c2159551a4
@ -1224,7 +1224,8 @@ WEBSOCKIFY_SHARE_FILES="share/websockify/websocketproxy.py \
|
||||
# HOOK scripts, to be installed under $VAR_LOCATION/remotes/hooks
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
HOOK_FT_FILES="share/hooks/host_error.rb"
|
||||
HOOK_FT_FILES="share/hooks/host_error.rb \
|
||||
share/hooks/fence_host.sh"
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Installation scripts, to be installed under $SHARE_LOCATION
|
||||
|
@ -696,14 +696,17 @@ HM_MAD = [
|
||||
#*******************************************************************************
|
||||
# This hook is used to perform recovery actions when a host fails.
|
||||
# Script to implement host failure tolerance
|
||||
# It can be set to
|
||||
# -m migrate VMs to another host. Only for images in shared storage
|
||||
# One of the following modes must be chosen
|
||||
# -m resched VMs to another host. (Only for images in shared storage!)
|
||||
# -r recreate VMs running in the host. State will be lost.
|
||||
# -d delete VMs running in the host
|
||||
#
|
||||
# Additional flags
|
||||
# -f force resubmission of suspended VMs
|
||||
# -p <n> avoid resubmission if host comes
|
||||
# back after n monitoring cycles
|
||||
# -f resubmit suspended and powered off VMs (only for recreate)
|
||||
# -p <n> avoid resubmission if host comes back after n monitoring
|
||||
# cycles. 0 to disable it. Default is 2.
|
||||
# -u disables fencing. Fencing is enabled by default. Don't disable it
|
||||
# unless you are very sure about what you're doing
|
||||
#*******************************************************************************
|
||||
#
|
||||
#HOST_HOOK = [
|
||||
|
89
share/hooks/fence_host.sh
Executable file
89
share/hooks/fence_host.sh
Executable file
@ -0,0 +1,89 @@
|
||||
#!/bin/bash
|
||||
|
||||
# -------------------------------------------------------------------------- #
|
||||
# Copyright 2002-2016, OpenNebula Project, OpenNebula Systems #
|
||||
# #
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may #
|
||||
# not use this file except in compliance with the License. You may obtain #
|
||||
# a copy of the License at #
|
||||
# #
|
||||
# http://www.apache.org/licenses/LICENSE-2.0 #
|
||||
# #
|
||||
# Unless required by applicable law or agreed to in writing, software #
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, #
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
|
||||
# See the License for the specific language governing permissions and #
|
||||
# limitations under the License. #
|
||||
#--------------------------------------------------------------------------- #
|
||||
|
||||
##############################################################################
|
||||
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
|
||||
#
|
||||
# This script needs to be modified to enable fencing of the host. By default it
|
||||
# will fail, as the first line is 'exit 1'. You will need to remove it.
|
||||
#
|
||||
# In order to perform the fencing, you will probably need to install a fencing
|
||||
# utility. They are typically found in: fence-agents-all (CentOS) and fence-
|
||||
# agents (Ubuntu). They come with many utilities: fence_ilo, fence_ipmilan,
|
||||
# fence_apc, etc...
|
||||
#
|
||||
# To call the fencing utility, you will need to pass some parameters, which are
|
||||
# typically the iLO IP of the host, etc. We recommend you enter this information
|
||||
# in the host's template, and pick it up using the xpath example below. AS AN
|
||||
# EXAMPLE (only an example) the script below expects that you have defined a
|
||||
# parameter called FENCE_IP in the Host's template, and it will rely on that to
|
||||
# call the fencing mechanism. You should customize this to your needs. It is
|
||||
# perfectly OK to discard the code below and use a different mechanism, like
|
||||
# storing the information required to perform the fencing in a separate CMDB,
|
||||
# etc. However, you will probably need to get the host's NAME, which should be
|
||||
# done as shown below.
|
||||
#
|
||||
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
|
||||
#############################################################################
|
||||
|
||||
# @param $1 the host information in base64
|
||||
# @return 0 on success. Make sure this script does not return 0 if it fails.
|
||||
|
||||
# To enable remove this line
|
||||
exit 1
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Get host parameters with XPATH
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
if [ -z "$ONE_LOCATION" ]; then
|
||||
XPATH=/var/lib/one/remotes/datastore/xpath.rb
|
||||
else
|
||||
XPATH=$ONE_LOCATION/var/remotes/datastore/xpath.rb
|
||||
fi
|
||||
|
||||
if [ ! -x "$XPATH" ]; then
|
||||
echo "XPATH not found: $XPATH"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
XPATH="${XPATH} -b $1"
|
||||
|
||||
unset i j XPATH_ELEMENTS
|
||||
|
||||
while IFS= read -r -d '' element; do
|
||||
XPATH_ELEMENTS[i++]="$element"
|
||||
done < <($XPATH /HOST/ID \
|
||||
/HOST/NAME \
|
||||
/HOST/TEMPLATE/FENCE_IP )
|
||||
|
||||
HOST_ID="${XPATH_ELEMENTS[j++]}"
|
||||
NAME="${XPATH_ELEMENTS[j++]}"
|
||||
FENCE_IP="${XPATH_ELEMENTS[j++]}"
|
||||
|
||||
if [ -z "$FENCE_IP" ]; then
|
||||
echo "Fence ip not found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
#-------------------------------------------------------------------------------
|
||||
# Fence
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
# Example:
|
||||
# fence_ilo -a $FENCE_IP -l <username> -p <password>
|
@ -18,128 +18,256 @@
|
||||
|
||||
##############################################################################
|
||||
# Script to implement host failure tolerance
|
||||
# It can be set to
|
||||
# -m migrate VMs to another host. Only for images in shared storage
|
||||
# One of the following modes must be chosen
|
||||
# -m resched VMs to another host. (Only for images in shared storage!)
|
||||
# -r recreate VMs running in the host. State will be lost.
|
||||
# -d delete VMs running in the host
|
||||
#
|
||||
# Additional flags
|
||||
# -f force resubmission of suspended VMs
|
||||
# -p <n> avoid resubmission if host comes
|
||||
# back after n monitoring cycles
|
||||
# -f resubmit suspended and powered off VMs (only for recreate)
|
||||
# -p <n> avoid resubmission if host comes back after n monitoring
|
||||
# cycles. 0 to disable it. Default is 2.
|
||||
# -u disables fencing. Fencing is enabled by default. Don't disable it
|
||||
# unless you are very sure about what you're doing
|
||||
##############################################################################
|
||||
|
||||
##############################################################################
|
||||
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
|
||||
#
|
||||
# This script needs to fence the error host to prevent split brain VMs. You
|
||||
# may use any fence mechanism and invoke it around L105, using host_name
|
||||
#
|
||||
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
|
||||
#############################################################################
|
||||
|
||||
ONE_LOCATION=ENV["ONE_LOCATION"]
|
||||
|
||||
if !ONE_LOCATION
|
||||
RUBY_LIB_LOCATION="/usr/lib/one/ruby"
|
||||
VMDIR="/var/lib/one"
|
||||
CONFIG_FILE="/var/lib/one/config"
|
||||
LOG_FILE="/var/log/one/host_error.log"
|
||||
else
|
||||
RUBY_LIB_LOCATION=ONE_LOCATION+"/lib/ruby"
|
||||
VMDIR=ONE_LOCATION+"/var"
|
||||
CONFIG_FILE=ONE_LOCATION+"/var/config"
|
||||
LOG_FILE=ONE_LOCATION+"/var/host_error.log"
|
||||
end
|
||||
|
||||
FENCE_HOST = File.dirname(__FILE__) + '/fence_host.sh'
|
||||
|
||||
$: << RUBY_LIB_LOCATION
|
||||
|
||||
require 'opennebula'
|
||||
include OpenNebula
|
||||
|
||||
require 'getoptlong'
|
||||
require 'base64'
|
||||
require 'open3'
|
||||
|
||||
if !(host_id=ARGV[0])
|
||||
################################################################################
|
||||
# Arguments
|
||||
################################################################################
|
||||
|
||||
HOST_ID = ARGV[0]
|
||||
|
||||
if HOST_ID.nil?
|
||||
exit -1
|
||||
end
|
||||
|
||||
mode = "-r" # By default, recreate VMs
|
||||
force = "n" # By default, don't recreate/delete suspended VMs
|
||||
repeat = nil # By default, don't wait for monitorization cycles"
|
||||
################################################################################
|
||||
# Methods
|
||||
################################################################################
|
||||
|
||||
# Appends a message to the hook log file (LOG_FILE), writing one entry per
# line of text. Each entry is prefixed with the current time, the id of the
# host being handled (HOST_ID) and the severity tag.
#
# @param msg   [#to_s]  text to record; multi-line text yields several entries
# @param level [String] severity tag written in the entry ("I" by default)
def log(msg, level="I")
    File.open(LOG_FILE, 'a') do |f|
        # each_line replaces the deprecated block form of String#lines, and
        # to_s lets callers hand over non-String values without failing.
        msg.to_s.each_line do |l|
            f.puts "[#{Time.now}][HOST #{HOST_ID}][#{level}] #{l}"
        end
    end
end
|
||||
|
||||
# Records a message in the hook log using the error severity tag "E".
#
# @param msg [String] text to record
def log_error(msg)
    log(msg, "E")
end
|
||||
|
||||
# Logs that the hook is aborting and terminates the process with a failure
# exit status.
#
# @param msg [String, nil] optional description of the error; when given it is
#   logged at error level before the closing entry. Accepting it keeps callers
#   that pass the reason directly from raising ArgumentError.
def exit_error(msg = nil)
    log_error(msg) if msg
    log_error("Exiting due to previous error.")
    exit(-1)
end
|
||||
|
||||
# Builds the XPath predicate body that matches any of the given states.
#
# @param arr [Array] state codes, passed individually or as a single array
# @return [String] the codes joined as "STATE=a or STATE=b ..."
def states_xpath(*arr)
    arr.flatten.map { |e| "STATE=#{e}" }.join(" or ")
end
|
||||
|
||||
################################################################################
|
||||
# Options
|
||||
################################################################################
|
||||
|
||||
mode = nil # **must** be set to something other than nil using the options
|
||||
force = false # By default, don't recreate/delete suspended and poweroff VMs
|
||||
repeat = 2 # By default, wait for 2 monitorization cycles
|
||||
fencing = true
|
||||
|
||||
opts = GetoptLong.new(
|
||||
['--migrate', '-m',GetoptLong::NO_ARGUMENT],
|
||||
['--delete', '-d',GetoptLong::NO_ARGUMENT],
|
||||
['--recreate', '-r',GetoptLong::NO_ARGUMENT],
|
||||
['--force', '-f',GetoptLong::NO_ARGUMENT],
|
||||
['--pause', '-p',GetoptLong::REQUIRED_ARGUMENT]
|
||||
['--migrate', '-m', GetoptLong::NO_ARGUMENT],
|
||||
['--delete', '-d', GetoptLong::NO_ARGUMENT],
|
||||
['--recreate', '-r', GetoptLong::NO_ARGUMENT],
|
||||
['--force', '-f', GetoptLong::NO_ARGUMENT],
|
||||
['--pause', '-p', GetoptLong::REQUIRED_ARGUMENT],
|
||||
['--no-fencing', '-u', GetoptLong::NO_ARGUMENT]
|
||||
)
|
||||
|
||||
begin
|
||||
opts.each do |opt, arg|
|
||||
case opt
|
||||
when '--migrate'
|
||||
mode="-m"
|
||||
mode = :migrate
|
||||
when '--delete'
|
||||
mode="-d"
|
||||
mode = :delete
|
||||
when '--recreate'
|
||||
mode="-r"
|
||||
mode = :recreate
|
||||
when '--force'
|
||||
force = "y"
|
||||
force = true
|
||||
when '--pause'
|
||||
repeat = arg.to_i
|
||||
when '--no-fencing'
|
||||
fencing = false
|
||||
end
|
||||
end
|
||||
rescue Exception => e
|
||||
exit(-1)
|
||||
log_error e.to_s
|
||||
exit_error
|
||||
end
|
||||
|
||||
if mode.nil?
|
||||
log_error "Exiting. A mode must be supplied."
|
||||
exit_error
|
||||
end
|
||||
|
||||
################################################################################
|
||||
# Main
|
||||
################################################################################
|
||||
|
||||
log "Hook launched"
|
||||
|
||||
begin
|
||||
client = Client.new()
|
||||
rescue Exception => e
|
||||
puts "Error: #{e}"
|
||||
exit -1
|
||||
log_error e.to_s
|
||||
exit_error
|
||||
end
|
||||
|
||||
sys = OpenNebula::System.new(client)
|
||||
conf = sys.get_configuration
|
||||
|
||||
begin
|
||||
MONITORING_INTERVAL = conf['MONITORING_INTERVAL'] || 60
|
||||
rescue Exception => e
|
||||
log_error "Could not get MONITORING_INTERVAL"
|
||||
log_error e.to_s
|
||||
exit_error
|
||||
end
|
||||
|
||||
# Retrieve hostname
|
||||
host = OpenNebula::Host.new_with_id(host_id, client)
|
||||
rc = host.info
|
||||
exit -1 if OpenNebula.is_error?(rc)
|
||||
host_name = host.name
|
||||
host = OpenNebula::Host.new_with_id(HOST_ID, client)
|
||||
rc = host.info
|
||||
|
||||
if OpenNebula.is_error?(rc)
|
||||
log_error "Could not get host info"
|
||||
exit_error
|
||||
end
|
||||
|
||||
log "hostname: #{host.name}"
|
||||
|
||||
if repeat > 0
|
||||
log "Wait #{repeat} cycles."
|
||||
|
||||
if repeat
|
||||
# Retrieve host monitor interval
|
||||
monitor_interval = nil
|
||||
File.readlines(CONFIG_FILE).each{|line|
|
||||
monitor_interval = line.split("=").last.to_i if /MONITORING_INTERVAL/=~line
|
||||
}
|
||||
# Sleep through the desired number of monitor interval
|
||||
sleep (repeat * monitor_interval)
|
||||
period = repeat * MONITORING_INTERVAL.to_i
|
||||
|
||||
log "Sleeping #{period} seconds."
|
||||
sleep(period)
|
||||
|
||||
rc = host.info
|
||||
if OpenNebula.is_error?(rc)
|
||||
log_error "Could not get host info"
|
||||
exit_error
|
||||
end
|
||||
|
||||
# If the host came back, exit! avoid duplicated VMs
|
||||
exit 0 if host.state != 3
|
||||
if host.state != 3
|
||||
log "Exiting. Host came back after waiting."
|
||||
exit 0
|
||||
end
|
||||
end
|
||||
|
||||
# Do fencing
|
||||
if fencing
    # The fence script receives the host XML document encoded in base64 as
    # its only argument.
    host64 = Base64::strict_encode64(host.to_xml)

    log "Fencing enabled"

    begin
        # capture2e reads the combined stdout/stderr while the script runs and
        # closes all pipes when it finishes. Waiting for the exit status
        # before reading (as popen2e without a block did) can block forever
        # once the script fills the pipe buffer, and leaves the pipes open.
        output, status = Open3.capture2e(FENCE_HOST, host64)

        if status.success?
            log output
            log "Fencing success"
        else
            raise "#{output}\nFencing error"
        end
    rescue StandardError => e
        # Covers a failed fence as well as a missing or non-executable
        # script; recovery must not continue without a successful fence.
        log_error e.to_s
        exit_error
    end
else
    log "WARNING: Fencing disabled"
end
|
||||
|
||||
# Loop through all vms
|
||||
vms = VirtualMachinePool.new(client)
|
||||
rc = vms.info_all
|
||||
exit -1 if OpenNebula.is_error?(rc)
|
||||
rc = vms.info_all
|
||||
|
||||
|
||||
state = "STATE=3"
|
||||
state += " or STATE=5 or STATE=8" if force == "y"
|
||||
|
||||
vm_ids_array = vms.retrieve_elements("/VM_POOL/VM[#{state}]/HISTORY_RECORDS/HISTORY[HOSTNAME=\"#{host_name}\" and last()]/../../ID")
|
||||
|
||||
if vm_ids_array
|
||||
vm_ids_array.each do |vm_id|
|
||||
vm=OpenNebula::VirtualMachine.new_with_id(vm_id, client)
|
||||
vm.info
|
||||
|
||||
if mode == "-r"
|
||||
vm.delete(true)
|
||||
elsif mode == "-d"
|
||||
vm.delete
|
||||
elsif mode == "-m"
|
||||
vm.resched
|
||||
end
|
||||
end
|
||||
if OpenNebula.is_error?(rc)
    # Record the reason first; called without arguments, exit_error only
    # logs its generic closing entry before terminating the hook.
    log_error "Could not get vm pool"
    exit_error
end
|
||||
|
||||
# STATE=3: ACTIVE (LCM unknown)
|
||||
# STATE=5: SUSPENDED
|
||||
# STATE=8: POWEROFF
|
||||
|
||||
if mode == :recreate && !force
|
||||
log "states: 3"
|
||||
state = states_xpath(3)
|
||||
else
|
||||
log "states: 3, 5, 8"
|
||||
state = states_xpath(3, 5, 8)
|
||||
end
|
||||
|
||||
xpath = "/VM_POOL/VM[#{state}]/HISTORY_RECORDS/HISTORY[HOSTNAME=\"#{host.name}\" and last()]"
|
||||
vm_ids_array = vms.retrieve_elements("#{xpath}/../../ID")
|
||||
|
||||
if vm_ids_array
    log "vms: #{vm_ids_array}"

    # Apply the selected recovery mode to every VM found on the failed host.
    vm_ids_array.each do |vm_id|
        vm = OpenNebula::VirtualMachine.new_with_id(vm_id, client)
        rc = vm.info

        # A VM whose info cannot be read is skipped so the rest still get
        # processed.
        if OpenNebula.is_error?(rc)
            log_error "Could not get info of VM #{vm_id}"
            next
        end

        case mode
        when :recreate
            log "recreate #{vm_id}"
            rc = vm.delete(true)
        when :delete
            log "delete #{vm_id}"
            rc = vm.delete
        when :migrate
            log "resched #{vm_id}"
            rc = vm.resched
        else
            log_error "unknown mode '#{mode}'"
            exit_error
        end

        # Report a failed action instead of discarding its result; the loop
        # goes on with the remaining VMs.
        if OpenNebula.is_error?(rc)
            log_error "Could not #{mode} VM #{vm_id}: #{rc.message}"
        end
    end
else
    log "No VMs found."
end
|
||||
|
||||
log "Hook finished"
|
||||
exit 0
|
||||
|
Loading…
Reference in New Issue
Block a user