1
0
mirror of https://github.com/OpenNebula/one.git synced 2024-12-23 17:33:56 +03:00

Feature #4659: Improve fault-tolerance hook

This commit is contained in:
Jaime Melis 2016-09-07 12:14:41 +02:00
parent 3d3d2c416c
commit c2159551a4
4 changed files with 290 additions and 69 deletions

View File

@ -1224,7 +1224,8 @@ WEBSOCKIFY_SHARE_FILES="share/websockify/websocketproxy.py \
# HOOK scripts, to be installed under $VAR_LOCATION/remotes/hooks
#-------------------------------------------------------------------------------
HOOK_FT_FILES="share/hooks/host_error.rb"
HOOK_FT_FILES="share/hooks/host_error.rb \
share/hooks/fence_host.sh"
#-------------------------------------------------------------------------------
# Installation scripts, to be installed under $SHARE_LOCATION

View File

@ -696,14 +696,17 @@ HM_MAD = [
#*******************************************************************************
# This hook is used to perform recovery actions when a host fails.
# Script to implement host failure tolerance
# It can be set to
# -m migrate VMs to another host. Only for images in shared storage
# One of the following modes must be chosen
# -m resched VMs to another host. (Only for images in shared storage!)
# -r recreate VMs running in the host. State will be lost.
# -d delete VMs running in the host
#
# Additional flags
# -f force resubmission of suspended VMs
# -p <n> avoid resubmission if host comes
# back after n monitoring cycles
# -f resubmit suspended and powered off VMs (only for recreate)
# -p <n> avoid resubmission if host comes back after n monitoring
# cycles. 0 to disable it. Default is 2.
# -u disables fencing. Fencing is enabled by default. Don't disable it
# unless you are very sure about what you're doing
#*******************************************************************************
#
#HOST_HOOK = [

89
share/hooks/fence_host.sh Executable file
View File

@ -0,0 +1,89 @@
#!/bin/bash
# -------------------------------------------------------------------------- #
# Copyright 2002-2016, OpenNebula Project, OpenNebula Systems #
# #
# Licensed under the Apache License, Version 2.0 (the "License"); you may #
# not use this file except in compliance with the License. You may obtain #
# a copy of the License at #
# #
# http://www.apache.org/licenses/LICENSE-2.0 #
# #
# Unless required by applicable law or agreed to in writing, software #
# distributed under the License is distributed on an "AS IS" BASIS, #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and #
# limitations under the License. #
#--------------------------------------------------------------------------- #
##############################################################################
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
#
# This script needs to be modified to enable fencing of the host. By default it
# will fail, as its first command is 'exit 1'. You will need to remove it.
#
# In order to perform the fencing, you will probably need to install a fencing
# utility. They are typically found in: fence-agents-all (CentOS) and fence-
# agents (Ubuntu). They come with many utilities: fence_ilo, fence_ipmilan,
# fence_apc, etc...
#
# To call the fencing utility, you will need to pass some parameters, which are
# typically the iLO IP of the host, etc. We recommend you enter this information
# in the host's template, and pick it up using the xpath example below. AS AN
# EXAMPLE (only an example) the script below expects that you have defined a
# parameter called FENCE_IP in the Host's template, and it will rely on that to
# call the fencing mechanism. You should customize this to your needs. It is
# perfectly OK to discard the code below and use a different mechanism, like
# storing the information required to perform the fencing in a separate CMDB,
# etc. However, you will probably need to get the host's NAME, which should be
# done as shown below.
#
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
#############################################################################
# @param $1 the host information in base64
# @return 0 on success. Make sure this script does not return 0 if it fails.
# Safety stop: as shipped, the script always fails so that nobody relies on
# fencing that has not been configured. To enable, remove the 'exit 1' line
# below once a real fencing command has been added at the end of this file.
exit 1
#-------------------------------------------------------------------------------
# Get host parameters with XPATH
#-------------------------------------------------------------------------------
# Locate the xpath.rb helper: system-wide path when ONE_LOCATION is unset or
# empty, otherwise the path under $ONE_LOCATION.
if [ -z "$ONE_LOCATION" ]; then
XPATH=/var/lib/one/remotes/datastore/xpath.rb
else
XPATH=$ONE_LOCATION/var/remotes/datastore/xpath.rb
fi
# Abort if the helper is missing or not executable.
if [ ! -x "$XPATH" ]; then
echo "XPATH not found: $XPATH"
exit 1
fi
# From here on XPATH holds the full command line: helper, -b flag and the
# host document received in $1 (presumably -b tells xpath.rb that the document
# is base64-encoded -- confirm against xpath.rb).
XPATH="${XPATH} -b $1"
# Make sure the array and its indexes start clean; an unset variable evaluates
# to 0 in the arithmetic contexts used below.
unset i j XPATH_ELEMENTS
# Read the NUL-delimited values printed by the helper, one per requested path,
# into XPATH_ELEMENTS in the order the paths are listed.
while IFS= read -r -d '' element; do
XPATH_ELEMENTS[i++]="$element"
done < <($XPATH /HOST/ID \
/HOST/NAME \
/HOST/TEMPLATE/FENCE_IP )
# Unpack the values; the order must match the list of paths above.
HOST_ID="${XPATH_ELEMENTS[j++]}"
NAME="${XPATH_ELEMENTS[j++]}"
FENCE_IP="${XPATH_ELEMENTS[j++]}"
# FENCE_IP is required by the example fencing command below.
if [ -z "$FENCE_IP" ]; then
echo "Fence ip not found"
exit 1
fi
#-------------------------------------------------------------------------------
# Fence
#-------------------------------------------------------------------------------
# NOTE: if no command is added below, the script's exit status is that of the
# 'if' above, i.e. 0, which the caller interprets as a successful fence. Make
# the fencing command the last command of the script (or exit explicitly with
# its status) so that a failed fence is reported as a failure.
# Example:
# fence_ilo -a $FENCE_IP -l <username> -p <password>

View File

@ -18,128 +18,256 @@
##############################################################################
# Script to implement host failure tolerance
# It can be set to
# -m migrate VMs to another host. Only for images in shared storage
# One of the following modes must be chosen
# -m resched VMs to another host. (Only for images in shared storage!)
# -r recreate VMs running in the host. State will be lost.
# -d delete VMs running in the host
#
# Additional flags
# -f force resubmission of suspended VMs
# -p <n> avoid resubmission if host comes
# back after n monitoring cycles
# -f resubmit suspended and powered off VMs (only for recreate)
# -p <n> avoid resubmission if host comes back after n monitoring
# cycles. 0 to disable it. Default is 2.
# -u disables fencing. Fencing is enabled by default. Don't disable it
# unless you are very sure about what you're doing
##############################################################################
##############################################################################
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
#
# This script needs to fence the error host to prevent split brain VMs. You
# may use any fence mechanism: fencing is delegated to the fence_host.sh script
# located next to this file, which must be customized before use.
#
# WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
#############################################################################
ONE_LOCATION=ENV["ONE_LOCATION"]
if !ONE_LOCATION
RUBY_LIB_LOCATION="/usr/lib/one/ruby"
VMDIR="/var/lib/one"
CONFIG_FILE="/var/lib/one/config"
LOG_FILE="/var/log/one/host_error.log"
else
RUBY_LIB_LOCATION=ONE_LOCATION+"/lib/ruby"
VMDIR=ONE_LOCATION+"/var"
CONFIG_FILE=ONE_LOCATION+"/var/config"
LOG_FILE=ONE_LOCATION+"/var/host_error.log"
end
FENCE_HOST = File.dirname(__FILE__) + '/fence_host.sh'
$: << RUBY_LIB_LOCATION
require 'opennebula'
include OpenNebula
require 'getoptlong'
require 'base64'
require 'open3'
if !(host_id=ARGV[0])
################################################################################
# Arguments
################################################################################
HOST_ID = ARGV[0]
if HOST_ID.nil?
exit -1
end
mode = "-r" # By default, recreate VMs
force = "n" # By default, don't recreate/delete suspended VMs
repeat = nil # By default, don't wait for monitorization cycles"
################################################################################
# Methods
################################################################################
# Appends a message to LOG_FILE, writing one record per line of +msg+.
#
# Each record has the form "[<timestamp>][HOST <HOST_ID>][<level>] <line>".
# A message that contains no lines (the empty string) writes nothing.
#
# @param msg   [String] text to log; may span several lines
# @param level [String] severity tag, "I" (info) unless stated otherwise
def log(msg, level="I")
    File.open(LOG_FILE, 'a') do |f|
        # each_line replaces the deprecated block form of String#lines
        msg.each_line do |l|
            f.puts "[#{Time.now}][HOST #{HOST_ID}][#{level}] #{l}"
        end
    end
end
# Logs +msg+ with the error severity tag ("E").
#
# @param msg [String] text to log
def log_error(msg)
    error_level = "E"
    log(msg, error_level)
end
# Logs the final error record and terminates the hook with a failure status.
#
# The script also calls this helper with a reason string, so an optional
# message is accepted and logged first; calling it without arguments behaves
# exactly as before.
#
# @param msg [String, nil] optional reason to log before exiting
def exit_error(msg = nil)
    log_error(msg) if msg
    log_error("Exiting due to previous error.")
    exit(-1)
end
# Builds an XPath predicate that matches any of the given VM states.
#
# @param arr [Array] state values, e.g. 3, 5, 8
# @return [String] "STATE=<a> or STATE=<b> ..."; empty when no state is given
def states_xpath(*arr)
    predicates = arr.collect do |state|
        "STATE=#{state}"
    end

    predicates.join(" or ")
end
################################################################################
# Options
################################################################################
# Defaults, overridden by the command line options parsed below
mode = nil # **must** be set to something other than nil using the options
force = false # By default, don't recreate/delete suspended and poweroff VMs
repeat = 2 # By default, wait for 2 monitoring cycles
fencing = true # By default, fence the host; --no-fencing (-u) turns it off
opts = GetoptLong.new(
['--migrate', '-m',GetoptLong::NO_ARGUMENT],
['--delete', '-d',GetoptLong::NO_ARGUMENT],
['--recreate', '-r',GetoptLong::NO_ARGUMENT],
['--force', '-f',GetoptLong::NO_ARGUMENT],
['--pause', '-p',GetoptLong::REQUIRED_ARGUMENT]
['--migrate', '-m', GetoptLong::NO_ARGUMENT],
['--delete', '-d', GetoptLong::NO_ARGUMENT],
['--recreate', '-r', GetoptLong::NO_ARGUMENT],
['--force', '-f', GetoptLong::NO_ARGUMENT],
['--pause', '-p', GetoptLong::REQUIRED_ARGUMENT],
['--no-fencing', '-u', GetoptLong::NO_ARGUMENT]
)
begin
opts.each do |opt, arg|
case opt
when '--migrate'
mode="-m"
mode = :migrate
when '--delete'
mode="-d"
mode = :delete
when '--recreate'
mode="-r"
mode = :recreate
when '--force'
force = "y"
force = true
when '--pause'
repeat = arg.to_i
when '--no-fencing'
fencing = false
end
end
rescue Exception => e
exit(-1)
log_error e.to_s
exit_error
end
if mode.nil?
log_error "Exiting. A mode must be supplied."
exit_error
end
################################################################################
# Main
################################################################################
log "Hook launched"
begin
client = Client.new()
rescue Exception => e
puts "Error: #{e}"
exit -1
log_error e.to_s
exit_error
end
sys = OpenNebula::System.new(client)
conf = sys.get_configuration
begin
MONITORING_INTERVAL = conf['MONITORING_INTERVAL'] || 60
rescue Exception => e
log_error "Could not get MONITORING_INTERVAL"
log_error e.to_s
exit_error
end
# Retrieve hostname
host = OpenNebula::Host.new_with_id(host_id, client)
rc = host.info
exit -1 if OpenNebula.is_error?(rc)
host_name = host.name
host = OpenNebula::Host.new_with_id(HOST_ID, client)
rc = host.info
if OpenNebula.is_error?(rc)
log_error "Could not get host info"
exit_error
end
log "hostname: #{host.name}"
if repeat > 0
log "Wait #{repeat} cycles."
if repeat
# Retrieve host monitor interval
monitor_interval = nil
File.readlines(CONFIG_FILE).each{|line|
monitor_interval = line.split("=").last.to_i if /MONITORING_INTERVAL/=~line
}
# Sleep through the desired number of monitor interval
sleep (repeat * monitor_interval)
period = repeat * MONITORING_INTERVAL.to_i
log "Sleeping #{period} seconds."
sleep(period)
rc = host.info
if OpenNebula.is_error?(rc)
log_error "Could not get host info"
exit_error
end
# If the host came back, exit! avoid duplicated VMs
exit 0 if host.state != 3
if host.state != 3
log "Exiting. Host came back after waiting."
exit 0
end
end
# Do fencing
#
# Runs FENCE_HOST with the host XML (base64-encoded) as its only argument and
# aborts the hook unless the script exits successfully, so that no VM is
# touched while the failed host may still be running it.
if fencing
    host64 = Base64::strict_encode64(host.to_xml)

    log "Fencing enabled"

    begin
        # capture2e closes the child's stdin, drains stdout and stderr while
        # the child runs and then reaps it. The previous popen2e call waited
        # for the child before reading its output (which can block forever
        # once the pipe buffer fills) and never closed its streams.
        output, status = Open3.capture2e(FENCE_HOST, host64)

        if status.success?
            log output
            log "Fencing success"
        else
            raise "#{output}\nFencing error"
        end
    rescue Exception => e
        log_error e.to_s
        exit_error
    end
else
    log "WARNING: Fencing disabled"
end
# Loop through all vms
vms = VirtualMachinePool.new(client)
rc = vms.info_all
exit -1 if OpenNebula.is_error?(rc)
rc = vms.info_all
state = "STATE=3"
state += " or STATE=5 or STATE=8" if force == "y"
vm_ids_array = vms.retrieve_elements("/VM_POOL/VM[#{state}]/HISTORY_RECORDS/HISTORY[HOSTNAME=\"#{host_name}\" and last()]/../../ID")
if vm_ids_array
vm_ids_array.each do |vm_id|
vm=OpenNebula::VirtualMachine.new_with_id(vm_id, client)
vm.info
if mode == "-r"
vm.delete(true)
elsif mode == "-d"
vm.delete
elsif mode == "-m"
vm.resched
end
end
# Abort if the VM pool could not be retrieved: log the reason, then exit,
# following the same pattern as the other failure paths in this script.
if OpenNebula.is_error?(rc)
    log_error "Could not get vm pool"
    exit_error
end
# STATE=3: ACTIVE (LCM unknown)
# STATE=5: SUSPENDED
# STATE=8: POWEROFF
if mode == :recreate && !force
log "states: 3"
state = states_xpath(3)
else
log "states: 3, 5, 8"
state = states_xpath(3, 5, 8)
end
xpath = "/VM_POOL/VM[#{state}]/HISTORY_RECORDS/HISTORY[HOSTNAME=\"#{host.name}\" and last()]"
vm_ids_array = vms.retrieve_elements("#{xpath}/../../ID")
# Apply the selected recovery action to every VM found on the failed host.
# A failure on one VM is logged and does not stop the processing of the rest.
if vm_ids_array
    log "vms: #{vm_ids_array}"

    vm_ids_array.each do |vm_id|
        vm = OpenNebula::VirtualMachine.new_with_id(vm_id, client)
        rc = vm.info

        if OpenNebula.is_error?(rc)
            log_error "Could not get info of VM #{vm_id}"
            next
        end

        case mode
        when :recreate
            log "recreate #{vm_id}"
            rc = vm.delete(true)
        when :delete
            log "delete #{vm_id}"
            rc = vm.delete
        when :migrate
            log "resched #{vm_id}"
            rc = vm.resched
        else
            log_error "unknown mode '#{mode}'"
            exit_error
        end

        # The action result used to be discarded; report failures in the log
        if OpenNebula.is_error?(rc)
            log_error "Could not #{mode} VM #{vm_id}: #{rc.message}"
        end
    end
else
    log "No VMs found."
end
log "Hook finished"
exit 0